You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

782 lines
50 KiB

;* ======================================================================== *;
;* TEXAS INSTRUMENTS, INC. *;
;* *;
;* IMGLIB DSP Image/Video Processing Library *;
;* *;
;* Release: Revision 1.04b *;
;* CVS Revision: 1.12 Mon Oct 21 15:28:45 2002 (UTC) *;
;* Snapshot date: 23-Oct-2003 *;
;* *;
;* This library contains proprietary intellectual property of Texas *;
;* Instruments, Inc. The library and its source code are protected by *;
;* various copyrights, and portions may also be protected by patents or *;
;* other legal protections. *;
;* *;
;* This software is licensed for use with Texas Instruments TMS320 *;
;* family DSPs. This license was provided to you prior to installing *;
;* the software. You may review this license by consulting the file *;
;* TI_license.PDF which accompanies the files in this library. *;
;* ------------------------------------------------------------------------ *;
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
;* All Rights Reserved. *;
;* ======================================================================== *;
;* ======================================================================== *;
;* Assembler compatibility shim for assembling 4.30 and later code on *;
;* tools prior to 4.30. *;
;* ======================================================================== *;
; Resolve the assembler version into the substitution symbol $asmver.
; When the assembler does not define .ASSEMBLER_VERSION, $asmver is 0.
.if $isdefed(".ASSEMBLER_VERSION")
.asg .ASSEMBLER_VERSION, $asmver
.else
.asg 0, $asmver
.endif
; For tools older than 4.30, map the newer call/return pseudo-mnemonics onto
; plain branches and make the function-boundary directives expand to nothing.
.if ($asmver < 430)
.asg B, CALL ; Function Call
.asg B, RET ; Return from a Function
.asg B, CALLRET ; Function call with Call / Ret chaining.
; The BNOP-based variants are only defined when assembling for C64x.
.if .TMS320C6400
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
.endif
.asg , .asmfunc ; .func equivalent for hand-assembly code
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
.endif
;* ======================================================================== *;
;* End of assembler compatibility shim. *;
;* ======================================================================== *;
* ========================================================================= *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* IMG_ycbcr422p_rgb565 -- Planarized YCbCr 4:2:2/4:2:0 to 16-bit *
* RGB 5:6:5 color space conversion. *
* *
* REVISION DATE *
* 21-Oct-2002 *
* *
* USAGE *
* This function is C callable, and is called according to this *
* C prototype: *
* *
* void IMG_ycbcr422p_rgb565 *
* ( *
* const short coeff[5], /* Matrix coefficients. */ *
* const unsigned char *y_data, /* Luminance data (Y') */ *
* const unsigned char *cb_data, /* Blue color-diff (B'-Y') */ *
* const unsigned char *cr_data, /* Red color-diff (R'-Y') */ *
* unsigned short *
* *restrict rgb_data, /* RGB 5:6:5 packed pixel out. */ *
* unsigned num_pixels /* # of luma pixels to process */ *
* ); *
* *
* The 'coeff[]' array contains the color-space-conversion matrix *
* coefficients. The 'y_data', 'cb_data' and 'cr_data' pointers *
* point to the separate input image planes. The 'rgb_data' pointer *
* points to the output image buffer. *
* *
* The kernel is designed to process arbitrary amounts of 4:2:2 *
* image data, although 4:2:0 image data may be processed as well. *
* For 4:2:2 input data, the 'y_data', 'cb_data' and 'cr_data' *
* arrays may hold an arbitrary amount of image data, including *
* multiple scan lines of image data. *
* *
* For 4:2:0 input data, only a single scan-line (or portion *
* thereof) may be processed at a time. This is achieved by *
* calling the function twice using the same row data for *
* 'cr_data' and 'cb_data', and providing new row data for *
* 'y_data'. This is numerically equivalent to replicating the Cr *
* and Cb pixels vertically. *
* *
* The coefficients in the coeff array must be in signed Q13 form. *
* These coefficients correspond to the following matrix equation: *
* *
* [ coeff[0] 0.0000 coeff[1] ] [ Y' - 16 ] [ R'] *
* [ coeff[0] coeff[2] coeff[3] ] * [ Cb - 128 ] = [ G'] *
* [ coeff[0] coeff[4] 0.0000 ] [ Cr - 128 ] [ B'] *
* *
* The output from this kernel is 16-bit RGB in 5:6:5 format. *
* The RGB components are packed into halfwords as shown below. *
* *
* 15 11 10 5 4 0 *
* +----------+----------+----------+ *
* | Red | Green | Blue | *
* +----------+----------+----------+ *
* *
* This kernel can also return the red, green, and blue values in *
* the opposite order if a particular application requires it. *
* This is achieved by exchanging the 'cb_data' and 'cr_data' *
* arguments when calling the function, and by reversing the order *
* of coefficients in coeff[1] through coeff[4]. This essentially *
* implements the following matrix multiply: *
* *
* [ coeff[0] 0.0000 coeff[4] ] [ Y' - 16 ] [ B'] *
* [ coeff[0] coeff[3] coeff[2] ] * [ Cr - 128 ] = [ G'] *
* [ coeff[0] coeff[1] 0.0000 ] [ Cb - 128 ] [ R'] *
* *
* The reversed RGB ordering output by this mode is as follows: *
* *
* 15 11 10 5 4 0 *
* +----------+----------+----------+ *
* | Blue | Green | Red | *
* +----------+----------+----------+ *
* *
* DESCRIPTION *
* This kernel performs Y'CbCr to RGB conversion. From the Color *
* FAQ, http://home.inforamp.net/~poynton/ColorFAQ.html : *
* *
* Various scale factors are applied to (B'-Y') and (R'-Y') *
* for different applications. The Y'PbPr scale factors are *
* optimized for component analog video. The Y'CbCr scaling *
* is appropriate for component digital video, JPEG and MPEG. *
* Kodak's PhotoYCC(tm) uses scale factors optimized for the *
* gamut of film colors. Y'UV scaling is appropriate as an *
* intermediate step in the formation of composite NTSC or PAL *
* video signals, but is not appropriate when the components *
* are kept separate. Y'UV nomenclature is now used rather *
* loosely, and it sometimes denotes any scaling of (B'-Y') *
* and (R'-Y'). Y'IQ coding is obsolete. *
* *
* This code can perform various flavors of Y'CbCr to RGB *
* conversion as long as the offsets on Y, Cb, and Cr are -16, *
* -128, and -128, respectively, and the coefficients match the *
* pattern shown. *
* *
* The kernel implements the following matrix form, which involves 5 *
* unique coefficients: *
* *
* [ coeff[0] 0.0000 coeff[1] ] [ Y' - 16 ] [ R'] *
* [ coeff[0] coeff[2] coeff[3] ] * [ Cb - 128 ] = [ G'] *
* [ coeff[0] coeff[4] 0.0000 ] [ Cr - 128 ] [ B'] *
* *
* *
* Below are some common coefficient sets, along with the matrix *
* equation that they correspond to. Coefficients are in signed *
* Q13 notation, which gives a suitable balance between precision *
* and range. *
* *
* 1. Y'CbCr -> RGB conversion with RGB levels that correspond to *
* the 219-level range of Y'. Expected ranges are [16..235] for *
* Y' and [16..240] for Cb and Cr. *
* *
* coeff[] = { 0x2000, 0x2BDD, -0x0AC5, -0x1658, 0x3770 }; *
* *
* [ 1.0000 0.0000 1.3707 ] [ Y' - 16 ] [ R'] *
* [ 1.0000 -0.3365 -0.6982 ] * [ Cb - 128 ] = [ G'] *
* [ 1.0000 1.7324 0.0000 ] [ Cr - 128 ] [ B'] *
* *
* 2. Y'CbCr -> RGB conversion with the 219-level range of Y' *
* expanded to fill the full RGB dynamic range. (The matrix *
* has been scaled by 255/219.) Expected ranges are [16..235] *
* for Y' and [16..240] for Cb and Cr. *
* *
* coeff[] = { 0x2543, 0x3313, -0x0C8A, -0x1A04, 0x408D }; *
* *
* [ 1.1644 0.0000 1.5960 ] [ Y' - 16 ] [ R'] *
* [ 1.1644 -0.3918 -0.8130 ] * [ Cb - 128 ] = [ G'] *
* [ 1.1644 2.0172 0.0000 ] [ Cr - 128 ] [ B'] *
* *
* 3. Y'CbCr -> BGR conversion with RGB levels that correspond to *
* the 219-level range of Y'. This is equivalent to #1 above, *
* except that the R, G, and B output order in the packed *
* pixels is reversed. Note: The 'cr_data' and 'cb_data' *
* input arguments must be exchanged for this example as *
* indicated under USAGE above. *
* *
* coeff[] = { 0x2000, 0x3770, -0x1658, -0x0AC5, 0x2BDD }; *
* *
* [ 1.0000 0.0000 1.7324 ] [ Y' - 16 ] [ B'] *
* [ 1.0000 -0.6982 -0.3365 ] * [ Cr - 128 ] = [ G'] *
* [ 1.0000 1.3707 0.0000 ] [ Cb - 128 ] [ R'] *
* *
* 4. Y'CbCr -> BGR conversion with the 219-level range of Y' *
* expanded to fill the full RGB dynamic range. This is *
* equivalent to #2 above, except that the R, G, and B output *
* order in the packed pixels is reversed. Note: The *
* 'cr_data' and 'cb_data' input arguments must be exchanged *
* for this example as indicated under USAGE above. *
* *
* coeff[] = { 0x2543, 0x408D, -0x1A04, -0x0C8A, 0x3313 }; *
* *
* [ 1.1644 0.0000 2.0172 ] [ Y' - 16 ] [ B'] *
* [ 1.1644 -0.8130 -0.3918 ] * [ Cr - 128 ] = [ G'] *
* [ 1.1644 1.5960 0.0000 ] [ Cb - 128 ] [ R'] *
* *
* Other scalings of the color differences (B'-Y') and (R'-Y') *
* (sometimes incorrectly referred to as U and V) are supported, as *
* long as the color differences are unsigned values centered around *
* 128 rather than signed values centered around 0, as noted above. *
* *
* In addition to performing plain color-space conversion, color *
* saturation can be adjusted by scaling coeff[1] through coeff[4]. *
* Similarly, brightness can be adjusted by scaling coeff[0]. *
* General hue adjustment can not be performed, however, due to the *
* two zeros hard-coded in the matrix. *
* *
* TECHNIQUES *
* Pixel replication is performed implicitly on chroma data to *
* reduce the total number of multiplies required. The chroma *
* portion of the matrix is calculated once for each Cb, Cr pair, *
* and the result is added to both Y' samples. *
* *
* Matrix Multiplication is performed as a combination of MPY2s and *
* DOTP2s. Saturation to 8bit values is performed using SPACKU4 *
* which takes in 4 signed 16-bit values and saturates them to *
* unsigned 8-bit values. The output of Matrix Multiplication would *
* ideally be in a Q13 format. This however, cannot be fed directly *
* to SPACKU4. *
* *
* This implies a shift left by 3 bits, which could be pretty *
* expensive in terms of the number of shifts to be performed. Thus, *
* to avoid being bottlenecked by so many shifts, the Y, Cr & Cb data *
* are shifted left by 3 before multiplication. This is possible *
* because they are 8-bit unsigned data. Due to this, the output of *
* Matrix Multiplication is in a Q16 format, which can be directly *
* fed to SPACKU4. *
* *
* Because the loop accesses four different arrays at three *
* different strides, no memory accesses are allowed to parallelize *
* in the loop. No bank conflicts occur, as a result. *
* *
* The epilog has been completely removed, while the prolog is left *
* as is. However, some cycles of the prolog are performed using the *
* kernel cycles to help reduce code-size. The setup code is merged *
* along with the prolog for speed. *
* *
* ASSUMPTIONS *
* The number of luma samples to be processed needs to be a multiple *
* of 8. *
* The input Y array needs to be double-word aligned. *
* The input Cr and Cb arrays need to be word aligned *
* The output image must be double-word aligned. *
* *
* NOTES *
* No bank conflicts occur. *
* *
* Codesize is 952 bytes. *
* *
* Memory bank conflicts will not occur, since the three loads and two *
* stores happen in different cycles of the loop. *
* *
* The kernel requires 3 words of stack space. *
* *
* CYCLES *
* 12 * num_pixels/8 + 50 *
* *
* CODESIZE *
* 952 bytes *
* *
* SOURCE *
* Poynton, Charles et al. "The Color FAQ," 1999. *
* http://home.inforamp.net/~poynton/ColorFAQ.html *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
; Entry point and symbolic register map for IMG_ycbcr422p_rgb565.
;
; Every .asg below binds a symbolic name to a physical register. Many
; physical registers carry several names: each name is only meaningful over
; the span of code where that value is live, so two names that share a
; register must never be assumed to hold distinct values at the same time.
;
; Incoming arguments, per the comments on the aliases below:
;   A4 = coeff, B4 = y_data, A6 = cb_data, B6 = cr_data,
;   A8 = rgb_data, B8 = num_pixels.
.sect ".text:_ycbcr422pl_to_rgb565"
.global _IMG_ycbcr422p_rgb565
_IMG_ycbcr422p_rgb565:
; ---- Pointers, constants, coefficients and early-pipeline values ----
.asg B0, B_k08 ; Constant of 0x08080808
.asg B1, B_loopcnt ; Loop counter
.asg A2, A_lp0 ; Predicate register, to reduce prolog size
.asg B2, B_rgb_ptr ; Pointer to output RGB data
.asg B3, B_ret ; Return address
.asg A4, A_coef ; Pointer to Co-efficients
.asg A4, A_y_data ; Pointer to 'Y' data on A datapath
.asg B4, B_y_data ; Pointer to 'Y' data on B datapath
.asg A5, A_cb_data ; Pointer to 'Cb' data
.asg B5, B_c2c3 ; g_cb : g_cr register
.asg A6, A_cb_data_in; Input pointer to 'Cb' data
.asg B6, B_cr_data ; Pointer to 'Cr' data
.asg A8, A_rgb_data ; Pointer to output RGB data
.asg B8, B_num_pix ; Number of pixels to be processed
.asg B8, B_cr6420 ; Loaded Word of 'Cr' data
.asg A9, A_g_cr ; g_cr co-efficient (loaded from coef[3])
.asg B9, B_coef ; Pointer to Co-efficients
.asg B10, B_reg10 ; Input B10 register; to be stored on stack
.asg B11, B_reg11 ; Input B11 register; to be stored on stack
.asg B11, B_cr6420_ ; Biased Word of 'Cr' data
.asg B15, B_SP ; Stack pointer, B datapath
.asg A16, A_rgb_ptr ; Pointer to output RGB data
.asg B16, B_n16 ; Constant 0x00800080
.asg A17, A_k80 ; Constant 0x80808080
.asg B17, B_c0 ; Luma co-efficient
.asg A18, A_k08 ; Constant 0x08080808
.asg B18, B_c1 ; r_cr co-efficient
.asg A19, A_c4 ; b_cb co-efficient
.asg B19, B_msk5 ; Mask to obtain the upper 5 bits of R & B
.asg A20, A_msk6 ; Mask to obtain the upper 6 bits of G
.asg B20, B_k01 ; Constant 0x01010101
.asg A21, A_c2c3 ; g_cb : g_cr register
.asg B21, B_c3c2 ; g_cr : g_cb register on B datapath
.asg A22, A_c3c2 ; g_cr : g_cb register on A datapath
.asg B22, B_y_3210 ; Loaded lower 4 pixels of 'Y' data
.asg A23, A_cb6420_ ; Biased 'Cb' data
.asg B23, B_y_7654 ; Loaded upper 4 pixels of 'Y' data
.asg A24, A_y_10_ ; Lower 2 pixels of 'Y' multiplied by 8
.asg B24, B_no_gie ; CSR w/ GIE bit cleared
.asg B24, B_cr20 ; Lower 2 samples 'Cr'
.asg A25, A_y_32_ ; Upper 2 pixels of 'Y' multiplied by 8
.asg B25, B_cr64 ; Biased Upper 'Cr' value multiplied by 8
.asg A26, A_cb20 ; Biased Lower 'Cb' value multiplied by 8
.asg B26, B_csr ; CSR's value
.asg A27, A_cb64 ; Biased Upper 'Cb' value multiplied by 8
.asg A28, A_cb6420 ; Loaded 4 samples of 'Cb'
.asg B28, B_y_54_ ; 'Y' samples multiplied by 8
.asg B29, B_y_76_ ; 'Y' samples multiplied by 8
.asg B30, B_g_cb ; g_cb co-efficient
; ---- Intermediate and output values used inside the pipelined loop ----
.asg A0, A_g_0 ; Generated G pixel in 11:8:13 format
.asg A0, A_y_10 ; Lower 2 pixels of Y after bias
.asg A3, A_g_32 ; Packed G3 and G2 pixels
.asg A3, A_g_2 ; Generated G pixel in 11:8:13 format
.asg A3, A_cg2 ; Additional input to generate G pixels
.asg B4, B_g_7654_ ; Upper 6 bits of Green pixels
.asg B4, B_g_54 ; Packed G5 and G4 pixels
.asg B4, B_g_7654 ; Saturated and packed 8bit values of G
.asg B4, B_g_5 ; Generated G pixel in 11:8:13 format
.asg B5, B_r_b54 ; Generated R & B pixel in 5:6:5 format
.asg B5, B_r_76 ; Packed R7 and R6 pixels
.asg B5, B_r_6 ; Generated R pixel in 11:8:13 format
.asg A6, A_y_32 ; Biased Y pixels
.asg A6, A_r_b32 ; Generated R & B pixel in 5:6:5 format
.asg A6, A_rgb10 ; Final RGB values for pixels 0 & 1
.asg A6, A_y_2_c0 ; Y2 and luma
.asg A7, A_r_3 ; Generated R pixel in 11:8:13 format
.asg A7, A_r_32 ; Packed R3 & R2 pixels
.asg A7, A_r_b10 ; Generated R & B pixel in 5:6:5 format
.asg A7, A_rgb32 ; Final RGB values for pixels 3 & 2
.asg A7, A_y_3_c0 ; Y3 and luma
.asg B7, B_b_54 ; Packed B5 and B4 pixels
.asg B7, B_b_5 ; Generated B pixel in 11:8:13 format
.asg B7, B_b_7654 ; Saturated and packed 8bit values of B
.asg B7, B_b_7654_ ; Upper 5 bits of Blue pixels
.asg B7, B_r_7 ; Generated R pixel in 11:8:13 format
.asg A8, A_r_10 ; Packed R1 and R0 pixels
.asg A8, A_r_3210 ; Saturated and packed 8bit values of R
.asg A8, A_b_1 ; Generated B pixel in 11:8:13 format
.asg A8, A_b_3210__ ; To shift right the B pixels by 3 (A-side twin of B_b_7654__)
.asg A8, A_r_3210_ ; Upper 5 bits of generated R pixels
.asg A8, A_r_1 ; Generated R pixel in 11:8:13 format
.asg B8, B_b_7 ; Generated B pixel in 11:8:13 format
.asg B8, B_cg4 ; Additional input to G pixel
.asg B8, B_rgb54 ; Final RGB values for pixels 5 & 4
.asg B8, B_y_54 ; Packed Y5 and Y4
.asg A9, A_g_3210 ; Saturated and packed 8bit values of G
.asg A9, A_r_2 ; Generated R pixel in 11:8:13 format
.asg A9, A_g_1 ; Generated G pixel in 11:8:13 format
.asg A9, A_g_10 ; Packed G1 and G0 pixels
.asg A9, A_b_2 ; Generated B pixel in 11:8:13 format
.asg A9, A_cg0 ; Additional input to G pixel
.asg B9, B_rgb76 ; Final RGB values for pixels 7 & 6
.asg B9, B_g_4 ; Generated G pixel in 11:8:13 format
.asg B9, B_g_6 ; Generated G pixel in 11:8:13 format
.asg B9, B_b_76 ; Packed B7 and B6 pixels
.asg B9, B_r_5 ; Generated R pixel in 11:8:13 format
.asg B9, B_cr6cb6 ; Packed Cr and Cb samples
.asg B10, B_r_7654_ ; Upper 5 bits of generated R pixels
.asg B10, B_r_54 ; Packed R5 and R4 pixels
.asg B11, B_g_76 ; Packed G7 and G6 pixels
.asg B11, B_b_7654__ ; To shift right the B pixels by 3
.asg B11, B_cr4cb4 ; Packed Cr and Cb sample
; The next ten aliases repeat, with identical register bindings, names that
; were already assigned in the first group above.
.asg A16, A_rgb_ptr ; Final RGB output pointer
.asg B16, B_n16 ; constant 0x00800080
.asg A17, A_k80 ; constant 0x80808080
.asg B17, B_c0 ; Luma = coef[0]
.asg A18, A_k08 ; constant 0x08080808
.asg B18, B_c1 ; r_cr = coef[1]
.asg A19, A_c4 ; b_cb = coef[4]
.asg B19, B_msk5 ; 5 bit mask to get upper 5 bits of R & B
.asg A20, A_msk6 ; 6 bit mask to get upper 6 bits of G
.asg B20, B_k01 ; constant 0x01010101
.asg A22, A_g_3210_ ; Upper 6 bits of generated G pixels
.asg A22, A_cb0cr0 ; Packed Cb and Cr pixels
.asg B22, B_b5_b4 ; Packed B5 and B4 pixels
.asg B22, B_r_7654 ; Saturated and packed 8bit values of R
.asg B22, B_b_6 ; Generated B pixel in 11:8:13 format
.asg A23, A_b_3210_ ; Upper 5 bits of generated B pixels
.asg A23, A_b_32 ; Packed B3 and B2 pixels
.asg B23, B_b7_b6 ; Packed B7 and B6 pixel
.asg B23, B_r_b76 ; Packed R and B pixels
.asg B23, B_b_4 ; Generated B pixel in 11:8:13 format
.asg B23, B_g_7 ; Generated G pixel in 11:8:13 format
.asg A24, A_b1_b0 ; Packed B1 and B0 pixels
.asg A24, A_b_3 ; Generated B pixel in 11:8:13 format
.asg B24, B_r_4 ; Generated R pixel in 11:8:13 format
.asg B24, B_cr4_c1 ; Packed Cr and r_cr
.asg B24, B_y_76 ; Packed Y7 and Y6 pixels
.asg A25, A_b3_b2 ; Packed B3 and B2 pixels
.asg A25, A_b_3210 ; Saturated and packed 8bit values of B
.asg A25, A_cb2cr2 ; Packed Cb and Cr
.asg B25, B_cg6 ; Additional input to G
.asg B25, B_cr6_c1 ; Packed Cr and r_cr
.asg A26, A_g1_g0 ; Packed G1 and G0 pixels
.asg B26, B_r5_r4 ; Packed R5 and R4 pixels
.asg B26, B_y_6_c0 ; Packed Y6 and luma
.asg A27, A_g3_g2 ; Packed G3 and G2 pixels
.asg A27, A_b_0 ; Generated B pixel in 11:8:13 format
.asg B27, B_r7_r6 ; Packed R7 and R6 pixels
.asg B27, B_y_7_c0 ; Packed Y7 and luma
.asg A28, A_r_0 ; Generated R pixel in 11:8:13 format
.asg A28, A_y_0_c0 ; Packed Y0 and luma
.asg A28, A_cb4_c4 ; Packed Cb and b_cb
.asg A28, A_cb6420 ; Loaded Cb pixels
.asg B28, B_g5_g4 ; Packed G5 and G4 pixels
.asg B28, B_y_4_c0 ; Packed Y4 and luma
.asg A29, A_cb6_c4 ; Packed Cb and b_cb
.asg A29, A_b_10 ; Packed B1 and B0
.asg A29, A_y_1_c0 ; Packed Y1 and luma
.asg B29, B_g7_g6 ; Packed G7 and G6
.asg B29, B_y_5_c0 ; Packed Y5 and luma
.asg A30, A_r1_r0 ; Packed R1 and R0
.asg A30, A_cb0_c4 ; Packed Cb and b_cb coef
.asg B30, B_cr0_c1 ; Packed Cr and r_cr coef
.asg A31, A_g_3 ; Generated G pixel in 11:8:13 format
.asg A31, A_r3_r2 ; Packed R3 and R2 pixels
.asg A31, A_cb2_c4 ; Packed Cb and b_cb
.asg B31, B_cr2_c1 ; Packed Cr and r_cr
* ===================== SETUP CODE ======================================== *
; Setup, merged with the first pipeline stages of iteration 1.
;
; In order, this block:
;   - copies the argument pointers to the datapath on which they are used,
;   - issues the first Cr, Cb and Y loads,
;   - reads CSR, clears bit 0 (AND with -2) and writes it back, so that
;     interrupts are disabled for the rest of the function,
;   - computes B_loopcnt = (num_pixels >> 3) - 2 for the BDEC-driven loop,
;   - builds the packed byte/halfword constants and masks with MVKL + PACK2,
;   - saves B11:B10 and the original CSR value on the stack,
;   - removes the 128 bias from Cb/Cr (XOR 0x80 per byte) and scales Cb, Cr
;     and Y by 8 through byte-wise multiplies by 0x08.
;
; Instruction order is cycle-exact: loaded values are consumed only after the
; load delay slots have elapsed, and some registers are reused while an
; earlier load into them is still in flight (B8 is read as num_pixels after
; the first Cr load into B8 has been issued). Do not reorder.
;
; NOTE(review): with num_pixels == 0 the counter starts at -2 and the kernel
; still appears to execute one pass (storing 8 pixels) - confirm callers
; never pass 0.
MV .L1 A_cb_data_in, A_cb_data ; copy of input Cb ptr
|| MV .L2x A_coef, B_coef ; copy of Co-ef ptr on B side
|| LDHU .D1T1 *A_coef[4], A_c4 ; Loading b_cb
|| LDW .D2T2 *B_cr_data++, B_cr6420 ; loading first 4 Cr pixels
|| MVKL .S1 0x80808080, A_k80 ; low half of the 0x80 bias constant
|| MVC .S2 CSR, B_csr ; Loading contents of CSR
LDW .D1T1 *A_cb_data++, A_cb6420 ; loading first 4 Cb pixels
|| LDHU .D2T2 *B_coef[1], B_c1 ; r_cr = coef[1]
|| AND .L2 B_csr, -2, B_no_gie;clear interrupt enable bit
|| MVKL .S1 0x08080808, A_k08 ; low half of the x8 multiplier constant
|| SHRU .S2 B_num_pix, 3, B_loopcnt; divide num_pix by 8
|| MV .L1x B_y_data, A_y_data ; copy of Y ptr
PACK2.S1 A_k08, A_k08, A_k08 ; replicate low half -> 0x08080808
|| MVC .S2 B_no_gie, CSR ; disable Interrupts
|| SUB .L2 B_loopcnt, 2, B_loopcnt; Sub 2 due to BDEC
|| LDHU .D2T1 *B_coef[3], A_g_cr ; g_cr = coef[3]
|| LDDW .D1T2 *A_y_data++, B_y_7654:B_y_3210; first set of 8 Y samples
|| PACK2.L1 A_k80, A_k80, A_k80 ; replicate low half -> 0x80808080
MV .L2x A_k08, B_k08 ; copy of constant
|| MVKL .S2 0x01010101, B_k01 ; creating constant
|| MV .L1 A_rgb_data, A_rgb_ptr ; copy of RGB pointer
|| ZERO .D1 A_lp0 ; predicate off: late-stage ops are skipped at first
|| LDHU .D2T2 *B_coef[2], B_g_cb ; g_cb = coef[2]
PACK2.L2 B_k01, B_k01, B_k01 ; creating constant
|| MVKL .S1 0xFCFCFCFC, A_msk6 ; creating mask for upper 6 bits
|| MVKL .S2 0xF8F8F8F8, B_msk5 ; creating mask for upper 5 bits
|| STDW .D2T2 B_reg11:B_reg10, *--B_SP ; Save B10 & B11 onto stack
PACK2 .L1 A_c4, A_c4, A_c4 ; b_cb:b_cb
|| PACK2 .L2 B_msk5, B_msk5, B_msk5; creating mask
|| PACK2 .S1 A_msk6, A_msk6, A_msk6; creating mask
|| MVKL .S2 0x00800080, B_n16 ; creating constant
|| LDHU .D2T2 *B_coef[0], B_c0 ; luma = coef[0]
XOR .L1 A_cb6420, A_k80, A_cb6420_; applying Cb bias
|| XOR .S2X B_cr6420, A_k80, B_cr6420_; applying Cr bias
|| PACK2 .L2 B_n16, B_n16, B_n16 ; 0x0080 per halfword: the Y' offset of 16, times 8
|| STW .D2T2 B_csr, *-B_SP[1] ; save CSR one word below the new SP
MPYSU4 .M1 A_cb6420_, A_k08, A_cb64:A_cb20; multiply Cb by 8
|| MPYSU4 .M2 B_cr6420_, B_k08, B_cr64:B_cr20; multiply Cr by 8
PACK2 .L2 B_c1, B_c1, B_c1 ; r_cr:r_cr
|| MPYU4 .M1X B_y_3210, A_k08, A_y_32_:A_y_10_; multiply Y by 8
|| MPYU4 .M2 B_y_7654, B_k08, B_y_76_:B_y_54_; multiply Y by 8
* =========================== PIPE LOOP PROLOG ============================ *
; Pipe loop prolog.
;
; Finishes packing the coefficient pairs, derives B_rgb_ptr = rgb_data + 8
; for the B-side stores, and runs the early stages of the first iterations.
; The trailing [c,i] tags on each line appear to denote pipeline cycle c of
; iteration i.
;
; To save code size, part of the prolog borrows execute packets from the
; kernel. C6000 branches take effect after five delay slots, so the three B
; instructions below chain as follows:
;   1. B h_loop_9 : after the packet tagged [20,1], control runs kernel
;                   packets h_loop_9..h_loop_11 (cycles 21-23 of iteration 1);
;   2. B h_loop_0 : instead of falling out of h_loop_11, control continues
;                   with h_loop_0 and h_loop_1 (cycles 24-25);
;   3. B o_loop_0 : control then returns to o_loop_0 below (cycle 26 onward)
;                   and finally falls through into the kernel at "loop".
; A_lp0 is still zero during those borrowed packets, so every [A_lp0]-guarded
; late-stage instruction and store in them is skipped. A_lp0 is set to 1
; further down, before the kernel is entered for real.
; Statement order here is the schedule; do not reorder.
PACK2 .L2x B_g_cb, A_g_cr, B_c2c3; g_cb:g_cr
|| PACK2 .L1x A_g_cr, B_g_cb, A_c3c2; g_cr: g_cb
MV .L1x B_c2c3, A_c2c3 ; Copy of g_cb:g_cr
|| PACK2 .L2 B_c0, B_c0, B_c0 ; luma:luma
|| MV .S2x A_c3c2, B_c3c2 ; Copy of g_cr:g_cb
PACKH2 .L2X B_cr64, A_cb64, B_cr6cb6; Pack Cr & Cb samples
|| PACK2 .S1X A_cb20, B_cr20, A_cb0cr0; Pack Cb & Cr samples
|| LDW .D2T2 *B_cr_data++, B_cr6420; Load next 4 samples of Cr
SUB2 .D1X A_y_32_, B_n16, A_y_32 ; remove the (scaled) Y offset
|| ADD .D2x A_rgb_data,8, B_rgb_ptr; B side writes the upper 4 pixels of each group
PACKH2 .L1X A_cb20, B_cr20, A_cb2cr2;[15,1]
|| MPY2 .M2 B_cr64, B_c1, B_cr6_c1:B_cr4_c1;[15,1]
|| SUB2 .D2 B_y_54_, B_n16, B_y_54 ;[15,1]
|| LDW .D1T1 *A_cb_data++, A_cb6420;[ 3,2]
|| B .S2 h_loop_9 ; step 1 of the chain described above
MPY2 .M2 B_cr20, B_c1, B_cr2_c1:B_cr0_c1;[16,1]
|| SUB2 .D2 B_y_76_, B_n16, B_y_76 ;[16,1]
|| LDDW .D1T2 *A_y_data++, B_y_7654:B_y_3210;[ 4,2]
PACK2 .L2X B_cr64, A_cb64, B_cr4cb4 ;[17,1]
|| MPY2 .M1 A_cb64, A_c4, A_cb6_c4:A_cb4_c4;[17,1]
|| MPY2 .M2 B_y_54, B_c0, B_y_5_c0:B_y_4_c0;[17,1]
MPY2 .M1 A_cb20, A_c4, A_cb2_c4:A_cb0_c4;[18,1]
|| MPY2 .M2 B_y_76, B_c0, B_y_7_c0:B_y_6_c0;[18,1]
|| B .S2 h_loop_0 ; step 2 of the chain described above
DOTP2 .M1 A_cb2cr2, A_c2c3, A_cg2 ;[19,1]
|| DOTP2 .M2 B_cr6cb6, B_c3c2, B_cg6 ;[19,1]
|| SUB2 .D1X A_y_10_, B_n16, A_y_10 ;[19,1]
DOTP2 .M2 B_cr4cb4, B_c3c2, B_cg4 ;[20,1]
|| MPY2 .M1X A_y_32, B_c0, A_y_3_c0:A_y_2_c0;[20,1]
|| XOR .L1 A_cb6420, A_k80, A_cb6420_ ;[ 8,2]
|| XOR .S2X B_cr6420, A_k80, B_cr6420_ ;[ 8,2]
|| B .S1 o_loop_0 ; step 3 of the chain described above
; Re-entry point after the borrowed kernel packets. The packets from here to
; the end of the prolog mirror kernel packets h_loop_2..h_loop_11, minus the
; final stages that have no data to work on yet.
o_loop_0:
PACKH2 .S2 B_g_7, B_g_6, B_g_76 ;[26,1]
|| PACKH2 .S1 A_r_3, A_r_2, A_r_32 ;[26,1]
|| ADD .L2X B_y_7_c0, A_cb6_c4, B_b_7 ;[26,1]
|| ADD .L1 A_y_2_c0, A_cg2, A_g_2 ;[26,1]
|| ADD .D2 B_y_4_c0, B_cg4, B_g_4 ;[26,1]
|| SUB2 .D1X A_y_32_, B_n16, A_y_32 ;[14,2]
PACKH2 .L2 B_b_7, B_b_6, B_b_76 ;[27,1]
|| PACKH2 .S2 B_g_5, B_g_4, B_g_54 ;[27,1]
|| PACK2 .S1X A_cb20, B_cr20, A_cb0cr0 ;[15,2]
|| PACKH2 .L1X A_cb20, B_cr20, A_cb2cr2 ;[15,2]
|| MPY2 .M2 B_cr64, B_c1, B_cr6_c1:B_cr4_c1;[15,2]
|| SUB2 .D2 B_y_54_, B_n16, B_y_54 ;[15,2]
|| LDW .D1T1 *A_cb_data++, A_cb6420 ;[ 3,3]
SPACKU4 .S2 B_b_76, B_b_54, B_b_7654 ;[28,1]
|| ADD .S1 A_y_1_c0, A_cb0_c4, A_b_1 ;[28,1]
|| ADD .L1 A_y_0_c0, A_cg0, A_g_0 ;[28,1]
|| PACKH2 .L2X B_cr64, A_cb64, B_cr6cb6 ;[16,2]
|| MPY2 .M2 B_cr20, B_c1, B_cr2_c1:B_cr0_c1;[16,2]
|| SUB2 .D2 B_y_76_, B_n16, B_y_76 ;[16,2]
|| LDDW .D1T2 *A_y_data++, B_y_7654:B_y_3210;[ 4,3]
AND .D2 B_b_7654, B_msk5, B_b_7654_ ;[29,1]
|| SPACKU4 .S2 B_g_76, B_g_54, B_g_7654 ;[29,1]
|| PACKH2 .L1 A_g_3, A_g_2, A_g_32 ;[29,1]
|| ADD .S1 A_y_0_c0, A_cb0_c4, A_b_0 ;[29,1]
|| ADD .D1 A_y_1_c0, A_cg0, A_g_1 ;[29,1]
|| PACK2 .L2X B_cr64, A_cb64, B_cr4cb4 ;[17,2]
|| MPY2 .M1 A_cb64, A_c4, A_cb6_c4:A_cb4_c4;[17,2]
|| MPY2 .M2 B_y_54, B_c0, B_y_5_c0:B_y_4_c0;[17,2]
AND .L2X B_g_7654, A_msk6, B_g_7654_ ;[30,1]
|| PACKH2 .L1 A_b_1, A_b_0, A_b_10 ;[30,1]
|| ADD .S1X A_y_0_c0, B_cr0_c1, A_r_0 ;[30,1]
|| ADD .D1X A_y_1_c0, B_cr0_c1, A_r_1 ;[30,1]
|| MPY2 .M1 A_cb20, A_c4, A_cb2_c4:A_cb0_c4;[18,2]
|| MPY2 .M2 B_y_76, B_c0, B_y_7_c0:B_y_6_c0;[18,2]
SPACKU4 .S2 B_r_76, B_r_54, B_r_7654 ;[31,1]
|| PACKH2 .L1 A_g_1, A_g_0, A_g_10 ;[31,1]
|| PACKH2 .S1 A_r_1, A_r_0, A_r_10 ;[31,1]
|| DOTP2 .M1 A_cb2cr2, A_c2c3, A_cg2 ;[19,2]
|| DOTP2 .M2 B_cr6cb6, B_c3c2, B_cg6 ;[19,2]
|| SUB2 .D1X A_y_10_, B_n16, A_y_10 ;[19,2]
AND .L2 B_r_7654, B_msk5, B_r_7654_ ;[32,1]
|| SPACKU4 .S1 A_b_32, A_b_10, A_b_3210 ;[32,1]
|| DOTP2 .M2 B_cr4cb4, B_c3c2, B_cg4 ;[20,2]
|| MPY2 .M1X A_y_32, B_c0, A_y_3_c0:A_y_2_c0;[20,2]
|| XOR .L1 A_cb6420, A_k80, A_cb6420_ ;[ 8,3]
|| XOR .S2X B_cr6420, A_k80, B_cr6420_ ;[ 8,3]
|| ADD .D1 A_lp0, 1, A_lp0 ; arm the predicate: kernel late stages now run
ROTL .M2 B_b_7654_, 29, B_b_7654__ ;[33,1]
|| AND .L1X A_b_3210, B_msk5, A_b_3210_ ;[33,1]
|| SPACKU4 .S1 A_r_32, A_r_10, A_r_3210 ;[33,1]
|| ADD .L2 B_y_4_c0, B_cr4_c1, B_r_4 ;[21,2]
|| ADD .S2 B_y_5_c0, B_cr4_c1, B_r_5 ;[21,2]
|| MPYSU4 .M1 A_cb6420_, A_k08, A_cb64:A_cb20 ;[ 9,3]
SPACKU4 .S1 A_g_32, A_g_10, A_g_3210 ;[34,1]
|| ADD .S2X B_y_6_c0, A_cb6_c4, B_b_6 ;[22,2]
|| ADD .D2 B_y_6_c0, B_cr6_c1, B_r_6 ;[22,2]
|| ADD .L2 B_y_7_c0, B_cr6_c1, B_r_7 ;[22,2]
|| MPYSU4 .M2 B_cr6420_, B_k08, B_cr64:B_cr20 ;[10,3]
|| MPYU4 .M1X B_y_3210, A_k08, A_y_32_:A_y_10_ ;[10,3]
AND .S1 A_g_3210, A_msk6, A_g_3210_ ;[35,1]
|| AND .L1X A_r_3210, B_msk5, A_r_3210_ ;[35,1]
|| PACKH2 .L2 B_r_7, B_r_6, B_r_76 ;[23,2]
|| ADD .D2X B_y_4_c0, A_cb4_c4, B_b_4 ;[23,2]
|| ADD .S2X B_y_5_c0, A_cb4_c4, B_b_5 ;[23,2]
|| DOTP2 .M1 A_cb0cr0, A_c2c3, A_cg0 ;[23,2]
|| MPYU4 .M2 B_y_7654, B_k08, B_y_76_:B_y_54_ ;[11,3]
*
* =========================== PIPE LOOP KERNEL ============================ *
; Software-pipelined kernel: 12 execute packets (h_loop_0 .. h_loop_11), each
; pass emitting one group of 8 RGB565 pixels while the next three groups are
; in earlier pipeline stages. The trailing [c,i] tags appear to denote
; pipeline cycle c of in-flight iteration i.
;
; Per group, as visible in the instructions below:
;   - Cb/Cr/Y are loaded, bias-corrected and scaled by 8 (XOR, MPYSU4/MPYU4,
;     SUB2), then multiplied by the coefficients (MPY2 / DOTP2);
;   - R, G, B sums are formed with ADD, their high halves packed (PACKH2) and
;     saturated to unsigned bytes (SPACKU4);
;   - the bytes are masked to 5/6/5 significant bits and repositioned with
;     byte multiplies and ROTL, then merged with ADDAH/ADD and stored.
;
; [A_lp0]-guarded instructions are the final stages and the stores. They are
; guarded so that the prolog can branch through h_loop_9..h_loop_1 with
; A_lp0 == 0 without storing anything; A_lp0 is non-zero whenever the kernel
; is entered by fall-through from the prolog.
;
; NOTE(review): there is no drain/epilog code after the loop, so the loads
; issued for later iterations appear to read beyond the last group actually
; stored - confirm the input buffers tolerate that over-read.
loop:
h_loop_0:
[A_lp0] MPYU4 .M2X B_r_7654_, A_k80, B_r7_r6:B_r5_r4 ;[36,1] r << 7
|| PACKH2 .L2 B_b_5, B_b_4, B_b_54 ;[24,2]
|| PACKH2 .S2 B_r_5, B_r_4, B_r_54 ;[24,2]
|| ADD .L1 A_y_2_c0, A_cb2_c4, A_b_2 ;[24,2]
|| ADD .S1 A_y_3_c0, A_cb2_c4, A_b_3 ;[24,2]
|| ADD .D1 A_y_3_c0, A_cg2, A_g_3 ;[24,2]
|| ADD .D2 B_y_6_c0, B_cg6, B_g_6 ;[24,2]
|| MPY2 .M1X A_y_10, B_c0, A_y_1_c0:A_y_0_c0;[24,2]
h_loop_1:
[A_lp0] ROTL .M1 A_b_3210_, 29, A_b_3210__ ;[37,1]
||[A_lp0] MPYU4 .M2 B_g_7654_, B_k08, B_g7_g6:B_g5_g4 ;[37,1] g << 3
|| PACKH2 .S1 A_b_3, A_b_2, A_b_32 ;[25,2]
|| ADD .S2 B_y_5_c0, B_cg4, B_g_5 ;[25,2]
|| ADD .L2 B_y_7_c0, B_cg6, B_g_7 ;[25,2]
|| ADD .L1X A_y_2_c0, B_cr2_c1, A_r_2 ;[25,2]
|| ADD .D1X A_y_3_c0, B_cr2_c1, A_r_3 ;[25,2]
|| LDW .D2T2 *B_cr_data++, B_cr6420 ;[ 1,4]
h_loop_2:
MPYU4 .M2 B_b_7654__, B_k01, B_b7_b6:B_b5_b4 ;[38,1] b >> 3
|| MPYU4 .M1 A_r_3210_, A_k80, A_r3_r2:A_r1_r0 ;[38,1] r << 7
|| PACKH2 .S2 B_g_7, B_g_6, B_g_76 ;[26,2]
|| PACKH2 .S1 A_r_3, A_r_2, A_r_32 ;[26,2]
|| ADD .L2X B_y_7_c0, A_cb6_c4, B_b_7 ;[26,2]
|| ADD .L1 A_y_2_c0, A_cg2, A_g_2 ;[26,2]
|| ADD .D2 B_y_4_c0, B_cg4, B_g_4 ;[26,2]
|| SUB2 .D1X A_y_32_, B_n16, A_y_32 ;[14,3]
h_loop_3:
MPYU4 .M1 A_g_3210_, A_k08, A_g3_g2:A_g1_g0 ;[39,1] g << 3
|| PACKH2 .L2 B_b_7, B_b_6, B_b_76 ;[27,2]
|| PACKH2 .S2 B_g_5, B_g_4, B_g_54 ;[27,2]
|| PACK2 .S1X A_cb20, B_cr20, A_cb0cr0 ;[15,3]
|| PACKH2 .L1X A_cb20, B_cr20, A_cb2cr2 ;[15,3]
|| MPY2 .M2 B_cr64, B_c1, B_cr6_c1:B_cr4_c1;[15,3]
|| SUB2 .D2 B_y_54_, B_n16, B_y_54 ;[15,3]
|| LDW .D1T1 *A_cb_data++, A_cb6420 ;[ 3,4]
h_loop_4:
MPYU4 .M1X A_b_3210__, B_k01, A_b3_b2:A_b1_b0 ;[40,1] b >> 3
|| SPACKU4 .S2 B_b_76, B_b_54, B_b_7654 ;[28,2]
|| ADD .S1 A_y_1_c0, A_cb0_c4, A_b_1 ;[28,2]
|| ADD .L1 A_y_0_c0, A_cg0, A_g_0 ;[28,2]
|| PACKH2 .L2X B_cr64, A_cb64, B_cr6cb6 ;[16,3]
|| MPY2 .M2 B_cr20, B_c1, B_cr2_c1:B_cr0_c1;[16,3]
|| SUB2 .D2 B_y_76_, B_n16, B_y_76 ;[16,3]
|| LDDW .D1T2 *A_y_data++, B_y_7654:B_y_3210;[ 4,4]
h_loop_5:
AND .D2 B_b_7654, B_msk5, B_b_7654_ ;[29,2]
|| SPACKU4 .S2 B_g_76, B_g_54, B_g_7654 ;[29,2]
|| PACKH2 .L1 A_g_3, A_g_2, A_g_32 ;[29,2]
|| ADD .S1 A_y_0_c0, A_cb0_c4, A_b_0 ;[29,2]
|| ADD .D1 A_y_1_c0, A_cg0, A_g_1 ;[29,2]
|| PACK2 .L2X B_cr64, A_cb64, B_cr4cb4 ;[17,3]
|| MPY2 .M1 A_cb64, A_c4, A_cb6_c4:A_cb4_c4;[17,3]
|| MPY2 .M2 B_y_54, B_c0, B_y_5_c0:B_y_4_c0;[17,3]
; Loop-back branch. It is issued here so that its delay slots are filled by
; h_loop_7..h_loop_11. B_loopcnt was preset to (num_pixels/8) - 2.
h_loop_6:
BDEC .S2 loop, B_loopcnt ;[42,1]
|| ADDAH .D2 B_b7_b6, B_r7_r6, B_r_b76 ;[42,1] (r<<8)|(b>>3)
|| AND .L2X B_g_7654, A_msk6, B_g_7654_ ;[30,2]
|| PACKH2 .L1 A_b_1, A_b_0, A_b_10 ;[30,2]
|| ADD .S1X A_y_0_c0, B_cr0_c1, A_r_0 ;[30,2]
|| ADD .D1X A_y_1_c0, B_cr0_c1, A_r_1 ;[30,2]
|| MPY2 .M1 A_cb20, A_c4, A_cb2_c4:A_cb0_c4;[18,3]
|| MPY2 .M2 B_y_76, B_c0, B_y_7_c0:B_y_6_c0;[18,3]
h_loop_7:
ADD .L2 B_r_b76, B_g7_g6, B_rgb76 ;[43,1]
|| ADDAH .D2 B_b5_b4, B_r5_r4, B_r_b54 ;[43,1] (r<<8)|(b>>3)
|| SPACKU4 .S2 B_r_76, B_r_54, B_r_7654 ;[31,2]
|| PACKH2 .L1 A_g_1, A_g_0, A_g_10 ;[31,2]
|| PACKH2 .S1 A_r_1, A_r_0, A_r_10 ;[31,2]
|| DOTP2 .M1 A_cb2cr2, A_c2c3, A_cg2 ;[19,3]
|| DOTP2 .M2 B_cr6cb6, B_c3c2, B_cg6 ;[19,3]
|| SUB2 .D1X A_y_10_, B_n16, A_y_10 ;[19,3]
h_loop_8:
ADD .D2 B_r_b54, B_g5_g4, B_rgb54 ;[44,1]
|| ADDAH .D1 A_b3_b2, A_r3_r2, A_r_b32 ;[44,1] (r<<8)|(b>>3)
|| AND .L2 B_r_7654, B_msk5, B_r_7654_ ;[32,2]
|| SPACKU4 .S1 A_b_32, A_b_10, A_b_3210 ;[32,2]
|| DOTP2 .M2 B_cr4cb4, B_c3c2, B_cg4 ;[20,3]
|| MPY2 .M1X A_y_32, B_c0, A_y_3_c0:A_y_2_c0;[20,3]
|| XOR .L1 A_cb6420, A_k80, A_cb6420_ ;[ 8,4]
|| XOR .S2X B_cr6420, A_k80, B_cr6420_ ;[ 8,4]
; Also the first kernel packet the prolog branches into (with A_lp0 == 0).
; The B-side store writes pixels 4..7 of the group; B_rgb_ptr advances by two
; doublewords so that it skips the four pixels written by the A side.
h_loop_9:
[A_lp0] STDW .D2T2 B_rgb76:B_rgb54, *B_rgb_ptr++[2] ;[45,1]
||[A_lp0] ADDAH .D1 A_b1_b0, A_r1_r0, A_r_b10 ;[45,1] (r<<8)|(b>>3)
||[A_lp0] ROTL .M2 B_b_7654_, 29, B_b_7654__ ;[33,2]
||[A_lp0] AND .L1X A_b_3210, B_msk5, A_b_3210_ ;[33,2]
||[A_lp0] SPACKU4 .S1 A_r_32, A_r_10, A_r_3210 ;[33,2]
|| ADD .L2 B_y_4_c0, B_cr4_c1, B_r_4 ;[21,3]
|| ADD .S2 B_y_5_c0, B_cr4_c1, B_r_5 ;[21,3]
|| MPYSU4 .M1 A_cb6420_, A_k08, A_cb64:A_cb20 ;[ 9,4]
h_loop_10:
[A_lp0] ADD .L1 A_r_b10, A_g1_g0, A_rgb10 ;[46,1]
||[A_lp0] ADD .D1 A_r_b32, A_g3_g2, A_rgb32 ;[46,1]
||[A_lp0] SPACKU4 .S1 A_g_32, A_g_10, A_g_3210 ;[34,2]
|| ADD .S2X B_y_6_c0, A_cb6_c4, B_b_6 ;[22,3]
|| ADD .D2 B_y_6_c0, B_cr6_c1, B_r_6 ;[22,3]
|| ADD .L2 B_y_7_c0, B_cr6_c1, B_r_7 ;[22,3]
|| MPYSU4 .M2 B_cr6420_, B_k08, B_cr64:B_cr20 ;[10,4]
|| MPYU4 .M1X B_y_3210, A_k08, A_y_32_:A_y_10_ ;[10,4]
; The A-side store writes pixels 0..3 of the group.
h_loop_11:
[A_lp0] STDW .D1T1 A_rgb32:A_rgb10, *A_rgb_ptr++[2] ;[47,1]
||[A_lp0] AND .S1 A_g_3210, A_msk6, A_g_3210_ ;[35,2]
||[A_lp0] AND .L1X A_r_3210, B_msk5, A_r_3210_ ;[35,2]
|| PACKH2 .L2 B_r_7, B_r_6, B_r_76 ;[23,3]
|| ADD .D2X B_y_4_c0, A_cb4_c4, B_b_4 ;[23,3]
|| ADD .S2X B_y_5_c0, A_cb4_c4, B_b_5 ;[23,3]
|| DOTP2 .M1 A_cb0cr0, A_c2c3, A_cg0 ;[23,3]
|| MPYU4 .M2 B_y_7654, B_k08, B_y_76_:B_y_54_ ;[11,4]
* =========================== PIPE LOOP EPILOG ============================ *
; Function exit (no pipeline drain is performed here).
; Restores the saved B11:B10 pair and releases its stack slot, reloads the
; CSR value that the setup code stored one word below that slot, and returns.
; The LDW is issued alongside the branch and the MVC follows the four NOP
; cycles of BNOP, so CSR is restored only once the loaded value has arrived.
; NOTE(review): the saved CSR word lies below the restored SP while it is
; being read; this relies on interrupts staying disabled until the MVC.
LDDW .D2T2 *B_SP++, B_reg11:B_reg10 ; Restore Regs 11 & 10
BNOP .S2 B_ret, 4 ; Return to caller
|| LDW .D2T2 *-B_SP[3], B_csr ; Reload the saved CSR value
MVC .S2 B_csr, CSR ; Restore CSR
; ===== Branch occurs =====
; ===== Interrupts may occur here =====
* ========================================================================= *
* End of file: img_ycbcr422p_rgb565.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *