You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

327 lines
19 KiB

;* ======================================================================== *;
;* TEXAS INSTRUMENTS, INC. *;
;* *;
;* IMGLIB DSP Image/Video Processing Library *;
;* *;
;* Release: Revision 1.04b *;
;* CVS Revision: 1.3 Sun Sep 29 03:32:32 2002 (UTC) *;
;* Snapshot date: 23-Oct-2003 *;
;* *;
;* This library contains proprietary intellectual property of Texas *;
;* Instruments, Inc. The library and its source code are protected by *;
;* various copyrights, and portions may also be protected by patents or *;
;* other legal protections. *;
;* *;
;* This software is licensed for use with Texas Instruments TMS320 *;
;* family DSPs. This license was provided to you prior to installing *;
;* the software. You may review this license by consulting the file *;
;* TI_license.PDF which accompanies the files in this library. *;
;* ------------------------------------------------------------------------ *;
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
;* All Rights Reserved. *;
;* ======================================================================== *;
;* ======================================================================== *;
;* Assembler compatibility shim for assembling 4.30 and later code on *;
;* tools prior to 4.30. *;
;* ======================================================================== *;
; Record the assembler version in the substitution symbol $asmver.
; When .ASSEMBLER_VERSION is not defined, $asmver is set to 0 so that the
; version test below selects the compatibility aliases.
.if $isdefed(".ASSEMBLER_VERSION")
.asg .ASSEMBLER_VERSION, $asmver
.else
.asg 0, $asmver
.endif
; For assemblers older than 4.30: alias the call/return mnemonics to the
; plain branch mnemonics (B, and BNOP when building for C64x), and define
; .asmfunc / .endasmfunc as empty substitutions so those directives vanish.
.if ($asmver < 430)
.asg B, CALL ; Function Call
.asg B, RET ; Return from a Function
.asg B, CALLRET ; Function call with Call / Ret chaining.
.if .TMS320C6400
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
.endif
.asg , .asmfunc ; .func equivalent for hand-assembly code
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
.endif
;* ======================================================================== *;
;* End of assembler compatibility shim. *;
;* ======================================================================== *;
* ========================================================================= *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* IMG_yc_demux_le16 -- De-interleave a 4:2:2 LITTLE ENDIAN video stream *
* into three separate LITTLE ENDIAN 16-bit planes *
* *
* REVISION DATE *
* 15-Mar-2002 *
* *
* USAGE *
* This function is C callable, and is called as follows: *
* *
* void IMG_yc_demux_le16 *
* ( *
* int n, // Number of luma pixels // *
* const unsigned char * yc, // Interleaved luma/chroma // *
* short *restrict y, // Luma plane (16-bit) // *
* short *restrict cr, // Cr chroma plane (16-bit) // *
* short *restrict cb // Cb chroma plane (16-bit) // *
* ); *
* *
* The input array 'yc' is expected to be an interleaved 4:2:2 *
* video stream. The input is expected in LITTLE ENDIAN byte *
* order within each 4-byte word. This is consistent with reading *
* the video stream from a word-oriented LITTLE ENDIAN device *
* while the C6000 device is in a LITTLE ENDIAN configuration. *
* *
* In other words, the expected pixel order is: *
* *
* Word 0 Word 1 Word 2 *
* +---------------+---------------+---------------+-- *
* Byte# | 0 1 2 3 | 4 5 6 7 | 8 9 10 11 |... *
* | y0 cr0 y1 cb0 | y2 cr2 y3 cb2 | y4 cr4 y5 cb4 |... *
* +---------------+---------------+---------------+-- *
* *
* The output arrays 'y', 'cr', and 'cb' are expected to not *
* overlap. The de-interleaved pixels are written as half-words *
* in LITTLE ENDIAN order. *
* *
* Please see the IMGLIB function IMG_yc_demux_be16 for code which *
* handles input coming from a BIG ENDIAN device. *
* *
* DESCRIPTION *
* This function reads the byte-oriented pixel data, zero-extends *
* it, and then writes it to the appropriate result array. Both *
* the luma and chroma values are expected to be unsigned. *
* *
* The data is expected to be in an order consistent with reading *
* byte oriented data from a word-oriented peripheral that is *
* operating in LITTLE ENDIAN mode, while the CPU is in LITTLE *
* ENDIAN mode. This function unpacks the byte-oriented data *
* so that further processing may proceed in LITTLE ENDIAN mode. *
* *
* ASSUMPTIONS *
* Input and output arrays are double-word aligned. *
* The input must be a multiple of 16 luma pixels long. *
* *
* TECHNIQUES *
* The loop has been unrolled a total of 16 times to allow for *
* processing 8 pixels in each datapath. *
* *
* Double-word loads and stores maximize memory bandwidth *
* utilization. *
* *
* This code uses _gmpy4() to ease the L/S/D unit bottleneck on *
* ANDs. The _gmpy4(value, 0x00010001) is equivalent to *
* value & 0x00FF00FF, as long as the size field of GFPGFR is *
* equal to 7. (The polynomial does not matter.) *
* *
* NOTES *
* This code is fully interruptible. *
* *
* CYCLES *
* cycles = 3 * (num_luma / 8) + 18. *
* For num_luma = 1024, cycles = 402 *
* *
* This number includes 6 cycles of function call overhead. The *
* exact overhead will vary depending on compiler options used. *
* *
* CODESIZE *
* 352 bytes *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
* ------------------------------------------------------------------------- *
* _IMG_yc_demux_le16 -- entry point.
*
* Inputs (per the register assignments below):
*   A4 = n (luma pixel count), B4 = yc, A6 = y, B6 = cr, A8 = cb,
*   B3 = return address.
* Registers written: A1-A4, A6, A8, A16-A31, B4, B6, B8, B9, B17-B31.
* No stack access appears in this routine.
* GFPGFR is copied to B_gfpgfr in the prolog and written back in the last
* execute packet of the epilog.
* ------------------------------------------------------------------------- *
.sect ".text:_yc_demux_le16"
.global _IMG_yc_demux_le16
_IMG_yc_demux_le16
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
* Names of packed values list their fields from most- to least-significant,
* e.g. A_b0y1r0y0 is one input word (cb0:y1:cr0:y0) and A_y1y0 is a pair of
* 16-bit luma values.  Several names below map to the SAME physical register
* (for example A27 is A_b4r4, A_b6b4 and A_y7y6), so the instruction order
* in the kernel must not be changed without re-checking every such overlap.
.asg A4, A_i
.asg B4, B_yc
.asg A6, A_y
.asg B6, B_cr
.asg A8, A_cb
.asg B3, B_ret_addr
.asg A1, A_p
.asg A2, A_b0y1r0y0
.asg A3, A_b2y3r2y2
.asg A16, A_r0b0
.asg A17, A_b0r0
.asg A18, A_b6r6
.asg A19, A_r2b2
.asg A20, A_yc
.asg A21, A_r6b6
.asg A22, A_cr
.asg A23, A_00010001
.asg A24, A_r2r0
.asg A25, A_b2r2
.asg A25, A_r6r4
.asg A26, A_b2b0
.asg A26, A_y5y4
.asg A27, A_b4r4
.asg A27, A_b6b4
.asg A27, A_y7y6
.asg A28, A_b4y5r4y4
.asg A29, A_b6y7r6y6
.asg A30, A_r4b4
.asg A30, A_y1y0
.asg A31, A_y3y2
.asg B8, B_bCyDrCyC
.asg B9, B_bEyFrEyE
.asg B17, B_gfpgfr
.asg B18, B_rAbA
.asg B19, B_rCbC
.asg B20, B_rEbE
.asg B21, B_cb
.asg B22, B_00FF00FF
.asg B23, B_y
.asg B24, B_rAr8
.asg B24, B_y9y8
.asg B25, B_rErC
.asg B25, B_yByA
.asg B26, B_b8r8
.asg B26, B_bAb8
.asg B26, B_bErE
.asg B27, B_bArA
.asg B27, B_bEbC
.asg B28, B_b8y9r8y8
.asg B29, B_bAyBrAyA
.asg B29, B_bCrC
.asg B30, B_r8b8
.asg B30, B_yDyC
.asg B31, B_yFyE
* ========================================================================= *
* =========================== PIPE LOOP PROLOG ============================ *
; Packet 1: start the first load (double word 1 of the input, via B_yc),
; clear A_p, and begin building the second pointer of each pair:
; A_yc = yc + 8 and B_y = y + 8.
LDDW .D2T1 *B_yc[1], A_b6y7r6y6:A_b4y5r4y4 ;[ 1,1]
|| ZERO .S1 A_p
|| ADD .L1X B_yc, 8, A_yc
|| ADD .L2X A_y, 8, B_y
; Packet 2: set the upper half of A_p to 0x0700 (A_p = 0x07000000, nonzero),
; finish A_yc = yc + 16 and B_y = y + 16, and start the 0x00FF mask.
MVKLH .S1 0x0700, A_p
|| ADD .L1 A_yc, 8, A_yc
|| ADD .L2 B_y, 8, B_y
|| MVK .S2 0x00FF, B_00FF00FF
; Packet 3: load double word 0 (B_yc then advances by 4 double words) and
; double word 3 (via A_yc[1]); PACK2 duplicates the low half to form the
; 0x00FF00FF luma mask.
LDDW .D2T1 *B_yc++[4], A_b2y3r2y2:A_b0y1r0y0 ;[ 3,1]
|| LDDW .D1T2 *A_yc[1], B_bEyFrEyE:B_bCyDrCyC ;[ 3,1]
|| PACK2 .L2 B_00FF00FF, B_00FF00FF, B_00FF00FF
; Packet 4: save the caller's GFPGFR, derive the second chroma pointers
; (A_cr = cr + 8, B_cb = cb + 8), and convert n to a count of 16-pixel groups.
MVC .S2 GFPGFR, B_gfpgfr
|| ADD .L1X B_cr, 8, A_cr
|| ADD .L2X A_cb, 8, B_cb
|| SHRU .S1 A_i, 4, A_i
; Packet 5: load GFPGFR from A_p (the file header requires the GFPGFR size
; field to be 7 for the GMPY4 masking trick), build the GMPY4 multiplier
; 0x00010001 by shifting each half of the mask right by 7, and bias the
; loop counter by 2 for BDEC.
MVC .S2X A_p, GFPGFR
|| SHRU2 .S1X B_00FF00FF, 7, A_00010001
|| SUB .L1 A_i, 2, A_i
; Packet 6: first chroma (ANDN) / luma (AND) separation for the first group.
ANDN .L1X A_b6y7r6y6, B_00FF00FF, A_b6r6 ;[ 6,1]
|| AND .S1X A_b4y5r4y4, B_00FF00FF, A_y5y4 ;[ 6,1]
; ===== 1 prolog stage collapsed
* =========================== PIPE LOOP KERNEL ============================ *
* The kernel is six execute packets per pass and handles one group of
* 16 luma pixels (32 input bytes) per pass.  The trailing [c,i] tags look
* like cycle/iteration annotations from the scheduler (TODO confirm).
*
* Prolog collapse: A_p is nonzero on entry, so every [!A_p] store is
* skipped during the first pass, when no finished results exist yet.  The
* [A_p] DOTPN2 in the third packet then overwrites A_p; with both operands
* 0x00010001 the result is expected to be 0 (1*1 - 1*1), which enables the
* stores on later passes.  NOTE(review): this relies on the DOTPN2 result
* not landing before the end of the first pass -- confirm against the ISA
* latency tables.
*
* Masking: AND with 0x00FF00FF (or GMPY4 by 0x00010001) keeps the luma
* bytes; ANDN keeps the chroma bytes, ROTL by 8 moves them into the low
* byte of each half-word, and PACK2 / PACKH2 gather the Cb / Cr halves.
*
* NOTE(review): the LDDWs in the kernel are unconditional, so the final
* pass appears to read up to 32 bytes beyond the last group of yc (the
* values are never stored).  Confirm readable memory follows the input.
loop:
BDEC .S1 loop, A_i ;[13,1]
|| ROTL .M1 A_b4r4, 8, A_r4b4 ;[13,1]
|| ROTL .M2 B_b8r8, 8, B_r8b8 ;[13,1]
|| AND .L2 B_b8y9r8y8, B_00FF00FF, B_y9y8 ;[13,1]
|| AND .S2 B_bAyBrAyA, B_00FF00FF, B_yByA ;[13,1]
|| AND .L1X A_b6y7r6y6, B_00FF00FF, A_y7y6 ;[ 7,2]
|| LDDW .D1T2 *A_yc++[4], B_bAyBrAyA:B_b8y9r8y8 ;[ 7,2]
|| LDDW .D2T1 *B_yc[1], A_b6y7r6y6:A_b4y5r4y4 ;[ 1,3]
; Store luma 0-3 (A_y then advances by 4 double words) and luma 12-15.
[!A_p]STDW .D1T1 A_y3y2:A_y1y0, *A_y++[4] ;[14,1]
||[!A_p]STDW .D2T2 B_yFyE:B_yDyC, *B_y[1] ;[14,1]
|| PACK2 .S2 B_rEbE, B_rCbC, B_bEbC ;[14,1]
|| ROTL .M2 B_bArA, 8, B_rAbA ;[14,1]
|| ANDN .L1X A_b2y3r2y2, B_00FF00FF, A_b2r2 ;[ 8,2]
|| ANDN .L2 B_bEyFrEyE, B_00FF00FF, B_bErE ;[ 8,2]
|| GMPY4 .M1 A_b0y1r0y0, A_00010001, A_y1y0 ;[ 8,2]
|| AND .S1X A_b2y3r2y2, B_00FF00FF, A_y3y2 ;[ 8,2]
; This packet also carries the one-shot DOTPN2 that clears A_p.
PACKH2 .L1 A_r2b2, A_r0b0, A_r2r0 ;[15,1]
|| ANDN .S1X A_b0y1r0y0, B_00FF00FF, A_b0r0 ;[ 9,2]
|| ANDN .S2 B_bCyDrCyC, B_00FF00FF, B_bCrC ;[ 9,2]
|| AND .L2 B_bEyFrEyE, B_00FF00FF, B_yFyE ;[ 9,2]
|| LDDW .D2T1 *B_yc++[4], A_b2y3r2y2:A_b0y1r0y0 ;[ 3,3]
|| LDDW .D1T2 *A_yc[1], B_bEyFrEyE:B_bCyDrCyC ;[ 3,3]
||[ A_p]DOTPN2 .M1 A_00010001, A_00010001, A_p ; pro collapse
; Store luma 8-11 (B_y then advances).  The luma 4-7 store through A_y[1]
; is deliberately unpredicated: it belongs to the newer group.
[!A_p]STDW .D2T2 B_yByA:B_y9y8, *B_y++[4] ;[16,1]
|| PACK2 .S1 A_r2b2, A_r0b0, A_b2b0 ;[16,1]
|| PACK2 .L1 A_r6b6, A_r4b4, A_b6b4 ;[16,1]
|| PACK2 .S2 B_rAbA, B_r8b8, B_bAb8 ;[16,1]
|| PACKH2 .L2 B_rAbA, B_r8b8, B_rAr8 ;[16,1]
|| STDW .D1T1 A_y7y6:A_y5y4, *A_y[1] ;[10,2]
|| ROTL .M1 A_b6r6, 8, A_r6b6 ;[10,2]
|| ROTL .M2 B_bErE, 8, B_rEbE ;[10,2]
; Store the eight Cb values: first four via A_cb, last four via B_cb.
[!A_p]STDW .D1T1 A_b6b4:A_b2b0, *A_cb++[2] ;[17,1]
||[!A_p]STDW .D2T2 B_bEbC:B_bAb8, *B_cb++[2] ;[17,1]
|| PACKH2 .L1 A_r6b6, A_r4b4, A_r6r4 ;[17,1]
|| PACKH2 .S2 B_rEbE, B_rCbC, B_rErC ;[17,1]
|| ROTL .M1 A_b2r2, 8, A_r2b2 ;[11,2]
|| ROTL .M2 B_bCrC, 8, B_rCbC ;[11,2]
|| ANDN .S1X A_b4y5r4y4, B_00FF00FF, A_b4r4 ;[11,2]
|| AND .L2 B_bCyDrCyC, B_00FF00FF, B_yDyC ;[11,2]
; Store the eight Cr values: first four via B_cr, last four via A_cr.
[!A_p]STDW .D2T1 A_r6r4:A_r2r0, *B_cr++[2] ;[18,1]
||[!A_p]STDW .D1T2 B_rErC:B_rAr8, *A_cr++[2] ;[18,1]
|| ROTL .M1 A_b0r0, 8, A_r0b0 ;[12,2]
|| ANDN .L2 B_b8y9r8y8, B_00FF00FF, B_b8r8 ;[12,2]
|| ANDN .S2 B_bAyBrAyA, B_00FF00FF, B_bArA ;[12,2]
|| ANDN .L1X A_b6y7r6y6, B_00FF00FF, A_b6r6 ;[ 6,3]
|| AND .S1X A_b4y5r4y4, B_00FF00FF, A_y5y4 ;[ 6,3]
* =========================== PIPE LOOP EPILOG ============================ *
* The epilog finishes the last group only: it repeats the final-stage
* operations of the kernel with unpredicated, non-incrementing stores.
* RET is issued in the first epilog packet; the five packets after it are
* executed before control returns to the caller (branch delay slots), and
* the last of them restores the caller's GFPGFR.
; ===== 1 epilog stage collapsed
ROTL .M1 A_b4r4, 8, A_r4b4 ;[13,3]
|| ROTL .M2 B_b8r8, 8, B_r8b8 ;[13,3]
|| AND .L2 B_b8y9r8y8, B_00FF00FF, B_y9y8 ;[13,3]
|| AND .D2 B_bAyBrAyA, B_00FF00FF, B_yByA ;[13,3]
|| RET B_ret_addr
; Final luma 0-3 and luma 12-15 stores.
STDW .D1T1 A_y3y2:A_y1y0, *A_y ;[14,3]
|| STDW .D2T2 B_yFyE:B_yDyC, *B_y[1] ;[14,3]
|| PACK2 .S2 B_rEbE, B_rCbC, B_bEbC ;[14,3]
|| ROTL .M2 B_bArA, 8, B_rAbA ;[14,3]
PACKH2 .L1 A_r2b2, A_r0b0, A_r2r0 ;[15,3]
; Final luma 8-11 store and chroma packing.
STDW .D2T2 B_yByA:B_y9y8, *B_y ;[16,3]
|| PACK2 .S1 A_r2b2, A_r0b0, A_b2b0 ;[16,3]
|| PACK2 .L1 A_r6b6, A_r4b4, A_b6b4 ;[16,3]
|| PACK2 .S2 B_rAbA, B_r8b8, B_bAb8 ;[16,3]
|| PACKH2 .L2 B_rAbA, B_r8b8, B_rAr8 ;[16,3]
; Final Cb stores.
STDW .D1T1 A_b6b4:A_b2b0, *A_cb ;[17,3]
|| STDW .D2T2 B_bEbC:B_bAb8, *B_cb ;[17,3]
|| PACKH2 .L1 A_r6b6, A_r4b4, A_r6r4 ;[17,3]
|| PACKH2 .S2 B_rEbE, B_rCbC, B_rErC ;[17,3]
; Final Cr stores; restore the GFPGFR value saved in the prolog.
STDW .D2T1 A_r6r4:A_r2r0, *B_cr ;[18,3]
|| STDW .D1T2 B_rErC:B_rAr8, *A_cr ;[18,3]
|| MVC .S2 B_gfpgfr, GFPGFR
* ========================================================================= *
* End of file: img_yc_demux_le16.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *