c6416_sdk/imglib/ycbcr422p_.asm


								;* ======================================================================== *;

								;*  TEXAS INSTRUMENTS, INC.                                                 *;

								;*                                                                          *;

								;*  IMGLIB  DSP Image/Video Processing Library                              *;

								;*                                                                          *;

								;*      Release:        Revision 1.04b                                      *;

								;*      CVS Revision:   1.12    Mon Oct 21 15:28:45 2002 (UTC)              *;

								;*      Snapshot date:  23-Oct-2003                                         *;

								;*                                                                          *;

								;*  This library contains proprietary intellectual property of Texas        *;

								;*  Instruments, Inc.  The library and its source code are protected by     *;

								;*  various copyrights, and portions may also be protected by patents or    *;

								;*  other legal protections.                                                *;

								;*                                                                          *;

								;*  This software is licensed for use with Texas Instruments TMS320         *;

								;*  family DSPs.  This license was provided to you prior to installing      *;

								;*  the software.  You may review this license by consulting the file       *;

								;*  TI_license.PDF which accompanies the files in this library.             *;

								;* ------------------------------------------------------------------------ *;

								;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;

								;*                          All Rights Reserved.                            *;

								;* ======================================================================== *;


								;* ======================================================================== *;

								;*  Assembler compatibility shim for assembling 4.30 and later code on      *;

								;*  tools prior to 4.30.                                                    *;

								;* ======================================================================== *;


								        ; Resolve the assembler version into the substitution symbol $asmver:
								        ; use .ASSEMBLER_VERSION when the assembler defines it, otherwise 0,
								        ; so that an assembler lacking the symbol takes the "< 430" path below.
								        .if $isdefed(".ASSEMBLER_VERSION")

								        .asg    .ASSEMBLER_VERSION, $asmver

								        .else

								        .asg    0,    $asmver

								        .endif


								        ; For tools older than 4.30, alias the newer call/return mnemonics
								        ; to the plain branch instructions they stand for.
								        .if ($asmver < 430)


								        .asg    B,    CALL     ; Function Call

								        .asg    B,    RET      ; Return from a Function

								        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.


								        ; The BNOP-based aliases are only defined when targeting C64x.
								        .if .TMS320C6400

								        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call

								        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return

								        .asg    BNOP, CRNOP    ; C64x Fn call w/ Call/Ret chaining via BNOP.

								        .endif


								        ; .asmfunc / .endasmfunc are aliased to an empty string here.
								        .asg    , .asmfunc     ; .func equivalent for hand-assembly code

								        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code


								        .endif


								;* ======================================================================== *;

								;*  End of assembler compatibility shim.                                    *;

								;* ======================================================================== *;


								* ========================================================================= *

								*   TEXAS INSTRUMENTS, INC.                                                 *

								*                                                                           *

								*   NAME                                                                    *

								*       IMG_ycbcr422p_rgb565 -- Planarized YCbCr 4:2:2/4:2:0 to 16-bit      *

								*                               RGB 5:6:5 color space conversion.           *

								*                                                                           *

								*   REVISION DATE                                                           *

								*       21-Oct-2002                                                         *

								*                                                                           *

								*   USAGE                                                                   *

								*       This function is C callable, and is called according to this        *

								*       C prototype:                                                        *

								*                                                                           *

								*       void IMG_ycbcr422p_rgb565                                           *

								*       (                                                                   *

								*         const short         coeff[5],  /* Matrix coefficients.        */  *

								*         const unsigned char *y_data,   /* Luminance data  (Y')        */  *

								*         const unsigned char *cb_data,  /* Blue color-diff (B'-Y')     */  *

								*         const unsigned char *cr_data,  /* Red color-diff  (R'-Y')     */  *

								*         unsigned short                                                    *

								*                    *restrict rgb_data, /* RGB 5:6:5 packed pixel out. */  *

								*         unsigned            num_pixels /* # of luma pixels to process */  *

								*       );                                                                  *

								*                                                                           *

								*       The 'coeff[]' array contains the color-space-conversion matrix      *

								*       coefficients.  The 'y_data', 'cb_data' and 'cr_data' pointers       *

								*       point to the separate input image planes.  The 'rgb_data' pointer   *

								*       points to the output image buffer.                                  *

								*                                                                           *

								*       The kernel is designed to process arbitrary amounts of 4:2:2        *

								*       image data, although 4:2:0 image data may be processed as well.     *

								*       For 4:2:2 input data, the 'y_data', 'cb_data' and 'cr_data'         *

								*       arrays may hold an arbitrary amount of image data, including        *

								*       multiple scan lines of image data.                                  *

								*                                                                           *

								*       For 4:2:0 input data, only a single scan-line (or portion           *

								*       thereof) may be processed at a time.  This is achieved by           *

								*       calling the function twice using the same row data for              *

								*       'cr_data' and 'cb_data', and providing new row data for             *

								*       'y_data'.  This is numerically equivalent to replicating the Cr     *

								*       and Cb pixels vertically.                                           *

								*                                                                           *

								*       The coefficients in the coeff array must be in signed Q13 form.     *

								*       These coefficients correspond to the following matrix equation:     *

								*                                                                           *

								*           [ coeff[0] 0.0000   coeff[1] ]   [ Y' -  16 ]     [ R']         *

								*           [ coeff[0] coeff[2] coeff[3] ] * [ Cb - 128 ]  =  [ G']         *

								*           [ coeff[0] coeff[4] 0.0000   ]   [ Cr - 128 ]     [ B']         *

								*                                                                           *

								*       The output from this kernel is 16-bit RGB in 5:6:5 format.          *

								*       The RGB components are packed into halfwords as shown below.        *

								*                                                                           *

								*                      15      11 10       5 4        0                     *

								*                     +----------+----------+----------+                    *

								*                     |   Red    |  Green   |   Blue   |                    *

								*                     +----------+----------+----------+                    *

								*                                                                           *

								*       This kernel can also return the red, green, and blue values in      *

								*       the opposite order if a particular application requires it.         *

								*       This is achieved by exchanging the 'cb_data' and 'cr_data'          *

								*       arguments when calling the function, and by reversing the order     *

								*       of coefficients in coeff[1] through coeff[4].  This essentially     *

								*       implements the following matrix multiply:                           *

								*                                                                           *

								*           [ coeff[0] 0.0000   coeff[4] ]   [ Y' -  16 ]     [ B']         *

								*           [ coeff[0] coeff[3] coeff[2] ] * [ Cr - 128 ]  =  [ G']         *

								*           [ coeff[0] coeff[1] 0.0000   ]   [ Cb - 128 ]     [ R']         *

								*                                                                           *

								*       The reversed RGB ordering output by this mode is as follows:        *

								*                                                                           *

								*                      15      11 10       5 4        0                     *

								*                     +----------+----------+----------+                    *

								*                     |   Blue   |  Green   |   Red    |                    *

								*                     +----------+----------+----------+                    *

								*                                                                           *

								*   DESCRIPTION                                                             *

								*       This kernel performs Y'CbCr to RGB conversion.  From the Color      *

								*       FAQ, http://home.inforamp.net/~poynton/ColorFAQ.html :              *

								*                                                                           *

								*           Various scale factors are applied to (B'-Y') and (R'-Y')        *

								*           for different applications.  The Y'PbPr scale factors are       *

								*           optimized for component analog video.  The Y'CbCr scaling       *

								*           is appropriate for component digital video, JPEG and MPEG.      *

								*           Kodak's PhotoYCC(tm) uses scale factors optimized for the       *

								*           gamut of film colors.  Y'UV scaling is appropriate as an        *

								*           intermediate step in the formation of composite NTSC or PAL     *

								*           video signals, but is not appropriate when the components       *

								*           are kept separate.  Y'UV nomenclature is now used rather        *

								*           loosely, and it sometimes denotes any scaling of (B'-Y')        *

								*           and (R'-Y').  Y'IQ coding is obsolete.                          *

								*                                                                           *

								*       This code can perform various flavors of Y'CbCr to RGB              *

								*       conversion as long as the offsets on Y, Cb, and Cr are -16,         *

								*       -128, and -128, respectively, and the coefficients match the        *

								*       pattern shown.                                                      *

								*                                                                           *

								*       The kernel implements the following matrix form, which involves 5   *

								*       unique coefficients:                                                *

								*                                                                           *

								*           [ coeff[0] 0.0000   coeff[1] ]   [ Y' -  16 ]     [ R']         *

								*           [ coeff[0] coeff[2] coeff[3] ] * [ Cb - 128 ]  =  [ G']         *

								*           [ coeff[0] coeff[4] 0.0000   ]   [ Cr - 128 ]     [ B']         *

								*                                                                           *

								*                                                                           *

								*       Below are some common coefficient sets, along with the matrix       *

								*       equation that they correspond to.   Coefficients are in signed      *

								*       Q13 notation, which gives a suitable balance between precision      *

								*       and range.                                                          *

								*                                                                           *

								*       1.  Y'CbCr -> RGB conversion with RGB levels that correspond to     *

								*           the 219-level range of Y'.  Expected ranges are [16..235] for   *

								*           Y' and [16..240] for Cb and Cr.                                 *

								*                                                                           *

								*           coeff[] = { 0x2000, 0x2BDD, -0x0AC5, -0x1658, 0x3770 };         *

								*                                                                           *

								*           [ 1.0000    0.0000    1.3707 ]   [ Y' -  16 ]     [ R']         *

								*           [ 1.0000   -0.3365   -0.6982 ] * [ Cb - 128 ]  =  [ G']         *

								*           [ 1.0000    1.7324    0.0000 ]   [ Cr - 128 ]     [ B']         *

								*                                                                           *

								*       2.  Y'CbCr -> RGB conversion with the 219-level range of Y'         *

								*           expanded to fill the full RGB dynamic range.  (The matrix       *

								*           has been scaled by 255/219.)  Expected ranges are [16..235]     *

								*           for Y' and [16..240] for Cb and Cr.                             *

								*                                                                           *

								*           coeff[] = { 0x2543, 0x3313, -0x0C8A, -0x1A04, 0x408D };         *

								*                                                                           *

								*           [ 1.1644    0.0000    1.5960 ]   [ Y' -  16 ]     [ R']         *

								*           [ 1.1644   -0.3918   -0.8130 ] * [ Cb - 128 ]  =  [ G']         *

								*           [ 1.1644    2.0172    0.0000 ]   [ Cr - 128 ]     [ B']         *

								*                                                                           *

								*       3.  Y'CbCr -> BGR conversion with RGB levels that correspond to     *

								*           the 219-level range of Y'.  This is equivalent to #1 above,     *

								*           except that the R, G, and B output order in the packed          *

								*           pixels is reversed.  Note:  The 'cr_data' and 'cb_data'         *

								*           input arguments must be exchanged for this example as           *

								*           indicated under USAGE above.                                    *

								*                                                                           *

								*           coeff[] = { 0x2000, 0x3770, -0x1658, -0x0AC5, 0x2BDD };         *

								*                                                                           *

								*           [ 1.0000    0.0000    1.7324 ]   [ Y' -  16 ]     [ B']         *

								*           [ 1.0000   -0.6982   -0.3365 ] * [ Cr - 128 ]  =  [ G']         *

								*           [ 1.0000    1.3707    0.0000 ]   [ Cb - 128 ]     [ R']         *

								*                                                                           *

								*       4.  Y'CbCr -> BGR conversion with the 219-level range of Y'         *

								*           expanded to fill the full RGB dynamic range.  This is           *

								*           equivalent to #2 above, except that the R, G, and B output      *

								*           order in the packed pixels is reversed.  Note:  The             *

								*           'cr_data' and 'cb_data' input arguments must be exchanged       *

								*           for this example as indicated under USAGE above.                *

								*                                                                           *

								*           coeff[] = { 0x2543, 0x408D, -0x1A04, -0x0C8A, 0x3313 };         *

								*                                                                           *

								*           [ 1.1644    0.0000    2.0172 ]   [ Y' -  16 ]     [ B']         *

								*           [ 1.1644   -0.8130   -0.3918 ] * [ Cr - 128 ]  =  [ G']         *

								*           [ 1.1644    1.5960    0.0000 ]   [ Cb - 128 ]     [ R']         *

								*                                                                           *

								*       Other scalings of the color differences (B'-Y') and (R'-Y')         *

								*       (sometimes incorrectly referred to as U and V) are supported, as    *

								*       long as the color differences are unsigned values centered around   *

								*       128 rather than signed values centered around 0, as noted above.    *

								*                                                                           *

								*       In addition to performing plain color-space conversion, color       *

								*       saturation can be adjusted by scaling coeff[1] through coeff[4].    *

								*       Similarly, brightness can be adjusted by scaling coeff[0].          *

								*       General hue adjustment can not be performed, however, due to the    *

								*       two zeros hard-coded in the matrix.                                 *

								*                                                                           *

								*   TECHNIQUES                                                              *

								*       Pixel replication is performed implicitly on chroma data to         *

								*       reduce the total number of multiplies required.  The chroma         *

								*       portion of the matrix is calculated once for each Cb, Cr pair,      *

								*       and the result is added to both Y' samples.                         *

								*                                                                           *

								*       Matrix Multiplication is performed as a combination of MPY2s and    *

								*       DOTP2s.  Saturation to 8bit values is performed using SPACKU4       *

								*       which takes in 4 signed 16-bit values and saturates them to         *

								*       unsigned 8-bit values.  The output of Matrix Multiplication would   *

								*       ideally be in a Q13 format.  This however, cannot be fed directly   *

								*       to SPACKU4.                                                         *

								*                                                                           *

								*       This implies a shift left by 3 bits, which could be pretty          *

								*       expensive in terms of the number of shifts to be performed.  Thus,  *

								*       to avoid being bottlenecked by so many shifts, the Y, Cr & Cb data  *

								*       are shifted left by 3 before multiplication.  This is possible      *

								*       because they are 8-bit unsigned data.  Due to this, the output of   *

								*       Matrix Multiplication is in a Q16 format, which can be directly     *

								*       fed to SPACKU4.                                                     *

								*                                                                           *

								*       Because the loop accesses four different arrays at three            *

								*       different strides, no memory accesses are allowed to parallelize    *

								*       in the loop.  No bank conflicts occur, as a result.                 *

								*                                                                           *

								*       The epilog has been completely removed, while the prolog is left    *

								*       as is. However, some cycles of the prolog are performed using the   *

								*       kernel cycles to help reduce code-size. The setup code is merged    *

								*       along with the prolog for speed.                                    *

								*                                                                           *

								*   ASSUMPTIONS                                                             *

								*       The number of luma samples to be processed needs to be a multiple   *

								*       of 8.                                                               *

								*       The input Y array needs to be double-word aligned.                  *

								*       The input Cr and Cb arrays need to be word aligned.                 *

								*       The output image must be double-word aligned.                       *

								*                                                                           *

								*   NOTES                                                                   *

								*       No bank conflicts occur.                                            *

								*                                                                           *

								*       Codesize is 952 bytes.                                              *

								*                                                                           *

								*       Memory bank conflicts will not occur since the 3 loads and two      *

								*       stores happen in different cycles of the loop.                      *

								*                                                                           *

								*       The kernel requires 3 words of stack space.                         *

								*                                                                           *

								*   CYCLES                                                                  *

								*       12 * num_pixels/8 + 50                                              *

								*                                                                           *

								*   CODESIZE                                                                *

								*       952 bytes                                                           *

								*                                                                           *

								*   SOURCE                                                                  *

								*       Poynton, Charles et al.  "The Color FAQ,"  1999.                    *

								*           http://home.inforamp.net/~poynton/ColorFAQ.html                 *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *


								        .sect ".text:_ycbcr422pl_to_rgb565"

								        .global _IMG_ycbcr422p_rgb565

								_IMG_ycbcr422p_rgb565:


								         .asg       B0,         B_k08       ; Constant of 0x08080808

								         .asg       B1,         B_loopcnt   ; Loop counter

								         .asg       A2,         A_lp0       ; Predicate register, to reduce prolog size

								         .asg       B2,         B_rgb_ptr   ; Pointer to output RGB data

								         .asg       B3,         B_ret       ; Return address

								         .asg       A4,         A_coef      ; Pointer to Co-efficients

								         .asg       A4,         A_y_data    ; Pointer to 'Y' data on A datapath

								         .asg       B4,         B_y_data    ; Pointer to 'Y' data on B datapath

								         .asg       A5,         A_cb_data   ; Pointer to 'Cb' data

								         .asg       B5,         B_c2c3      ; g_cb : g_cr register

								         .asg       A6,         A_cb_data_in; Input pointer to 'Cb' data

								         .asg       B6,         B_cr_data   ; Pointer to 'Cr' data

								         .asg       A8,         A_rgb_data  ; Pointer to output RGB data

								         .asg       B8,         B_num_pix   ; Number of pixels to be processed

								         .asg       B8,         B_cr6420    ; Loaded Word of 'Cr' data

								         .asg       A9,         A_g_cr      ; g_cr : g_cr data

								         .asg       B9,         B_coef      ; Pointer to Co-efficients

								         .asg       B10,        B_reg10     ; Input B10 register; to be stored on stack

								         .asg       B11,        B_reg11     ; Input B11 register; to be stored on stack

								         .asg       B11,        B_cr6420_   ; Biased Word of 'Cr' data

								         .asg       B15,        B_SP        ; Stack pointer, B datapath

								         .asg       A16,        A_rgb_ptr   ; Pointer to output RGB data

								         .asg       B16,        B_n16       ; Constant 0x00800080

								         .asg       A17,        A_k80       ; Constant 0x80808080

								         .asg       B17,        B_c0        ; Luma co-efficient

								         .asg       A18,        A_k08       ; Constant 0x08080808

								         .asg       B18,        B_c1        ; r_cr co-efficient

								         .asg       A19,        A_c4        ; b_cb co-efficient

								         .asg       B19,        B_msk5      ; Mask to obtain the upper 5 bits of R & B

								         .asg       A20,        A_msk6      ; Mask to obtain the upper 6 bits of G

								         .asg       B20,        B_k01       ; Constant 0x01010101

								         .asg       A21,        A_c2c3      ; g_cb : g_cr register

								         .asg       B21,        B_c3c2      ; g_cr : g_cb register on B datapath

								         .asg       A22,        A_c3c2      ; g_cr : g_cb register on A datapath

								         .asg       B22,        B_y_3210    ; Loaded lower 4 pixels of 'Y' data

								         .asg       A23,        A_cb6420_   ; Biased 'Cb' data

								         .asg       B23,        B_y_7654    ; Loaded upper 4 pixels of 'Y' data

								         .asg       A24,        A_y_10_     ; Lower 2 pixels of 'Y' multiplied by 8

								         .asg       B24,        B_no_gie    ; CSR w/ GIE bit cleared

								         .asg       B24,        B_cr20      ; Lower 2 samples 'Cr'

								         .asg       A25,        A_y_32_     ; Upper 2 pixels of 'Y' multiplied by 8

								         .asg       B25,        B_cr64      ; Biased Upper 'Cr' value multiplied by 8

								         .asg       A26,        A_cb20      ; Biased Lower 'Cb' value multiplied by 8

								         .asg       B26,        B_csr       ; CSR's value

								         .asg       A27,        A_cb64      ; Biased Upper 'Cb' value multiplied by 8

								         .asg       A28,        A_cb6420    ; Loaded 4 samples of 'Cb'

								         .asg       B28,        B_y_54_     ; 'Y' samples multiplied by 8

								         .asg       B29,        B_y_76_     ; 'Y' samples multiplied by 8

								         .asg       B30,        B_g_cb      ; g_cb co-efficient


								         .asg       A0,     A_g_0       ; Generated G pixel in 11:8:13 format

								         .asg       A0,         A_y_10      ; Lower 2 pixels of Y after bias

								         .asg       A3,         A_g_32      ; Packed G3 and G2 pixels

								         .asg       A3,         A_g_2       ; Generated G pixel in 11:8:13 format

								         .asg       A3,         A_cg2       ; Additional input to generate G pixels

								         .asg       B4,         B_g_7654_   ; Upper 6 bits of Green pixels

								         .asg       B4,         B_g_54      ; Packed G5 and G4 pixels

								         .asg       B4,         B_g_7654    ; Saturated and packed 8bit values of G

								         .asg       B4,         B_g_5       ; Generated G pixel in 11:8:13 format

								         .asg       B5,         B_r_b54     ; Generated R & B pixel in 5:6:5 format

								         .asg       B5,         B_r_76      ; Packed R7 and R6 pixels

								         .asg       B5,         B_r_6       ; Generated R pixel in 11:8:13 format

								         .asg       A6,         A_y_32      ; Biased Y pixels

								         .asg       A6,         A_r_b32     ; Generated R & B pixel in 5:6:5 format

								         .asg       A6,         A_rgb10     ; Final RGB values for pixels 0 & 1

								         .asg       A6,         A_y_2_c0    ; Y2 and luma

								         .asg       A7,         A_r_3       ; Generated R pixel in 11:8:13 format

								         .asg       A7,         A_r_32      ; Packed R3 & R2 pixels

								         .asg       A7,     A_r_b10     ; Generated R & B pixel in 5:6:5 format

								         .asg       A7,         A_rgb32     ; Final RGB values for pixels 3 & 2

								         .asg       A7,         A_y_3_c0    ; Y3 and luma

								         .asg       B7,         B_b_54      ; Packed B5 and B4 pixels

								         .asg       B7,         B_b_5       ; Generated B pixel in 11:8:13 format

								         .asg       B7,         B_b_7654    ; Saturated and packed 8bit values of B

								         .asg       B7,         B_b_7654_   ; Upper 5 bits of Blue pixels

								         .asg       B7,         B_r_7       ; Generated R pixel in 11:8:13 format

								         .asg       A8,         A_r_10      ; Packed R1 and R0 pixels

								         .asg       A8,         A_r_3210    ; Saturated and packed 8bit values of R

								         .asg       A8,         A_b_1       ; Generated B pixel in 11:8:13 format

								         .asg       A8,         A_b_3210__  ;

								         .asg       A8,         A_r_3210_   ; Upper 5 bits of generated R pixels

								         .asg       A8,         A_r_1       ; Generated R pixel in 11:8:13 format

								         .asg       B8,         B_b_7       ; Generated B pixel in 11:8:13 format

								         .asg       B8,         B_cg4       ; Additional input to G pixel

								         .asg       B8,         B_rgb54     ; Final RGB values for pixels 5 & 4

								         .asg       B8,         B_y_54      ; Packed Y5 and Y4

								         .asg       A9,         A_g_3210    ; Saturated and packed 8bit values of G

								         .asg       A9,         A_r_2       ; Generated R pixel in 11:8:13 format

								         .asg       A9,         A_g_1       ; Generated G pixel in 11:8:13 format

								         .asg       A9,         A_g_10      ; Packed G1 and G0 pixels

								         .asg       A9,         A_b_2       ; Generated B pixel in 11:8:13 format

								         .asg       A9,         A_cg0       ; Additional input to G pixel

								         .asg       B9,         B_rgb76     ; Final RGB values for pixels 7 & 6

								         .asg       B9,         B_g_4       ; Generated G pixel in 11:8:13 format

								         .asg       B9,         B_g_6       ; Generated G pixel in 11:8:13 format

								         .asg       B9,         B_b_76      ; Packed B7 and B6 pixels

								         .asg       B9,         B_r_5       ; Generated R pixel in 11:8:13 format

								         .asg       B9,         B_cr6cb6    ; Packed Cr and Cb samples

								         .asg       B10,        B_r_7654_   ; Upper 5 bits of generated R pixels

								         .asg       B10,        B_r_54      ; Packed R5 and R4 pixels

								         .asg       B11,        B_g_76      ; Packed G7 and G6 pixels

								         .asg       B11,        B_b_7654__  ; To shift right the B pixels by 3

								         .asg       B11,        B_cr4cb4    ; Packed Cr and Cb sample

								         .asg       A16,        A_rgb_ptr   ; Final RGB output pointer

								         .asg       B16,        B_n16       ; constant 0x00800080

								         .asg       A17,        A_k80       ; constant 0x80808080

								         .asg       B17,        B_c0        ; Luma = coef[0]

								         .asg       A18,        A_k08       ; constant 0x08080808

								         .asg       B18,        B_c1        ; r_cr = coef[1]

								         .asg       A19,        A_c4        ; b_cb = coef[4]

								         .asg       B19,        B_msk5      ; 5 bit mask to get upper 5 bits of R & B

								         .asg       A20,        A_msk6      ; 6 bit mask to get upper 6 bits of G

								         .asg       B20,        B_k01       ; constant 0x01010101

								         .asg       A22,        A_g_3210_   ; Upper 6 bits of generated G pixels

								         .asg       A22,        A_cb0cr0    ; Packed Cb and Cr pixels

								         .asg       B22,        B_b5_b4     ; Packed B5 and B4 pixels

								         .asg       B22,        B_r_7654    ; Saturated and packed 8bit values of R

								         .asg       B22,        B_b_6       ; Generated B pixel in 11:8:13 format

								         .asg       A23,        A_b_3210_   ; Upper 5 bits of generated B pixels

								         .asg       A23,        A_b_32      ; Packed B3 and B2 pixels

								         .asg       B23,        B_b7_b6     ; Packed B7 and B6 pixel

								         .asg       B23,        B_r_b76     ; Packed R and B pixels

								         .asg       B23,        B_b_4       ; Generated B pixel in 11:8:13 format

								         .asg       B23,        B_g_7       ; Generated G pixel in 11:8:13 format

								         .asg       A24,        A_b1_b0     ; Packed B1 and B0 pixels

								         .asg       A24,        A_b_3       ; Generated B pixel in 11:8:13 format

								         .asg       B24,        B_r_4       ; Generated R pixel in 11:8:13 format

								         .asg       B24,        B_cr4_c1    ; Packed Cr and r_cr

								         .asg       B24,        B_y_76      ; Packed Y7 and Y6 pixels

								         .asg       A25,        A_b3_b2     ; Packed B3 and B2 pixels

								         .asg       A25,        A_b_3210    ; Saturated and packed 8bit values of B

								         .asg       A25,        A_cb2cr2    ; Packed Cb and Cr

								         .asg       B25,        B_cg6       ; Additional input to G

								         .asg       B25,        B_cr6_c1    ; Packed Cr and r_cr

								         .asg       A26,        A_g1_g0     ; Packed G1 and G0 pixels

								         .asg       B26,        B_r5_r4     ; Packed R5 and R4 pixels

								         .asg       B26,        B_y_6_c0    ; Packed Y6 and luma

								         .asg       A27,        A_g3_g2     ; Packed G3 and G2 pixels

								         .asg       A27,        A_b_0       ; Generated B pixel in 11:8:13 format

								         .asg       B27,        B_r7_r6     ; Packed R7 and R6 pixels

								         .asg       B27,        B_y_7_c0    ; Packed Y7 and luma

								         .asg       A28,        A_r_0       ; Generated R pixel in 11:8:13 format

								         .asg       A28,        A_y_0_c0    ; Packed Y0 and luma

								         .asg       A28,        A_cb4_c4    ; Packed Cb and b_cb

								         .asg       A28,        A_cb6420    ; Loaded Cb pixels

								         .asg       B28,        B_g5_g4     ; Packed G5 and G4 pixels

								         .asg       B28,        B_y_4_c0    ; Packed Y4 and luma

								         .asg       A29,        A_cb6_c4    ; Packed Cb and b_cb

								         .asg       A29,        A_b_10      ; Packed B1 and B0

								         .asg       A29,        A_y_1_c0    ; Packed Y1 and luma

								         .asg       B29,        B_g7_g6     ; Packed G7 and G6

								         .asg       B29,        B_y_5_c0    ; Packed Y5 and luma

								         .asg       A30,        A_r1_r0     ; Packed R1 and R0

								         .asg       A30,        A_cb0_c4    ; Packed Cb and b_cb coef

								         .asg       B30,        B_cr0_c1    ; Packed Cr and r_cr coef

								         .asg       A31,        A_g_3       ; Generated G pixel in 11:8:13 format

								         .asg       A31,        A_r3_r2     ; Packed R3 and R2 pixels

								         .asg       A31,        A_cb2_c4    ; Packed Cb and b_cb

								         .asg       B31,        B_cr2_c1    ; Packed Cr and r_cr


								* ===================== SETUP CODE ======================================== *


								         ; Setup: save CSR and clear its bit 0 (interrupt enable) so the loop
								         ; runs with interrupts disabled, save B11:B10 on the stack, load the
								         ; five coefficients coef[0..4], build the replicated constants and
								         ; masks, and issue the first Cb/Cr/Y loads.  Chroma bytes are XORed
								         ; with 0x80 and multiplied by 8 (MPYSU4, chroma operand signed); luma
								         ; bytes are multiplied by 8 (MPYU4, unsigned).
								         ; B_loopcnt = (num_pix >> 3) - 2 is the counter used by the kernel BDEC.

								         MV   .L1     A_cb_data_in, A_cb_data        ; copy of input Cb ptr

								||       MV   .L2x    A_coef,       B_coef           ; copy of Co-ef ptr on B side

								||       LDHU .D1T1   *A_coef[4],   A_c4             ; Loading b_cb

								||       LDW  .D2T2   *B_cr_data++, B_cr6420         ; loading first 4 Cr pixels

								||       MVKL .S1     0x80808080,   A_k80            ; Moving  constant

								||       MVC  .S2     CSR,          B_csr            ; Loading contents of CSR


								         LDW  .D1T1   *A_cb_data++, A_cb6420         ; loading first 4 Cb pixels

								||       LDHU .D2T2   *B_coef[1],   B_c1             ; r_cr = coef[1]

								||       AND  .L2     B_csr,        -2,        B_no_gie;clear interrupt enable bit

								||       MVKL .S1     0x08080808,   A_k08            ; Loading constant

								||       SHRU .S2     B_num_pix,    3,         B_loopcnt; divide num_pix by 8

								||       MV   .L1x    B_y_data,     A_y_data         ; copy of Y ptr


								         PACK2.S1     A_k08,        A_k08,     A_k08 ; creating constant

								||       MVC  .S2     B_no_gie,     CSR              ; disable Interrupts

								||       SUB  .L2     B_loopcnt,    2,         B_loopcnt; Sub 2 due to BDEC

								||       LDHU .D2T1   *B_coef[3],   A_g_cr           ; g_cr = coef[3]

								||       LDDW .D1T2   *A_y_data++,  B_y_7654:B_y_3210; first set of 8 Y samples

								||       PACK2.L1     A_k80,        A_k80,     A_k80 ; loading constant


								         MV   .L2x    A_k08,        B_k08            ; copy of constant

								||       MVKL .S2     0x01010101,   B_k01            ; creating constant

								||       MV   .L1     A_rgb_data,   A_rgb_ptr        ; copy of RGB pointer

								||       ZERO .D1     A_lp0                          ; A_lp0 = 0: [A_lp0] ops are off at first

								||       LDHU .D2T2   *B_coef[2],   B_g_cb           ; g_cb = coef[2]


								         PACK2.L2     B_k01,        B_k01,     B_k01 ; creating constant

								||       MVKL .S1     0xFCFCFCFC,   A_msk6           ; creating mask for upper 6 bits

								||       MVKL .S2     0xF8F8F8F8,   B_msk5           ; creating mask for upper 5 bits

								||       STDW .D2T2   B_reg11:B_reg10, *--B_SP       ; Save B10 & B11 onto stack


								         PACK2 .L1    A_c4,         A_c4,      A_c4  ; b_cb:b_cb

								||       PACK2 .L2    B_msk5,       B_msk5,    B_msk5; creating mask

								||       PACK2 .S1    A_msk6,       A_msk6,    A_msk6; creating mask

								||       MVKL .S2     0x00800080,   B_n16            ; creating constant

								||       LDHU .D2T2   *B_coef[0],   B_c0             ; luma = coef[0]


								         XOR   .L1    A_cb6420,     A_k80,     A_cb6420_; applying Cb bias

								||       XOR   .S2X   B_cr6420,     A_k80,     B_cr6420_; applying Cr bias

								||       PACK2 .L2    B_n16,        B_n16,     B_n16 ; creating constant

								||       STW  .D2T2   B_csr,        *-B_SP[1]        ; save CSR


								         MPYSU4 .M1   A_cb6420_,    A_k08,     A_cb64:A_cb20; multiply Cb by 8

								||       MPYSU4 .M2   B_cr6420_,    B_k08,     B_cr64:B_cr20; multiply Cr by 8


								         PACK2 .L2    B_c1,         B_c1,      B_c1  ; r_cr:r_cr

								||       MPYU4 .M1X   B_y_3210,     A_k08,     A_y_32_:A_y_10_; multiply Y by 8

								||       MPYU4 .M2    B_y_7654,     B_k08,     B_y_76_:B_y_54_; multiply Y by 8


								* =========================== PIPE LOOP PROLOG ============================ *

								        ; Prolog: finish packing the coefficients (g_cb/g_cr pairs for DOTP2,
								        ; luma:luma for MPY2) and execute the instructions tagged [15,1]..[20,1]
								        ; of the first iteration.  Three branches are issued here.  Counting five
								        ; delay slots for each, control passes through kernel packets
								        ; h_loop_9..h_loop_11, then h_loop_0 and h_loop_1, and then enters
								        ; o_loop_0.  A_lp0 is still zero during that pass, so the kernel
								        ; instructions predicated on [A_lp0] do not execute.

								        PACK2 .L2x       B_g_cb,    A_g_cr,     B_c2c3; g_cb:g_cr

								||      PACK2 .L1x       A_g_cr,    B_g_cb,     A_c3c2; g_cr: g_cb


								        MV    .L1x       B_c2c3,    A_c2c3            ; Copy of g_cb:g_cr

								||      PACK2 .L2        B_c0,      B_c0,       B_c0  ; luma:luma

								||      MV    .S2x       A_c3c2,    B_c3c2            ; Copy of g_cr:g_cb


								        PACKH2 .L2X      B_cr64,    A_cb64,     B_cr6cb6; Pack Cr & Cb samples

								||      PACK2  .S1X      A_cb20,    B_cr20,     A_cb0cr0; Pack low halfwords of Cb, Cr

								||      LDW    .D2T2     *B_cr_data++,          B_cr6420; Load next 4 samples of Cr


								        SUB2   .D1X      A_y_32_,   B_n16,      A_y_32  ; (Y*8) - 0x80 per halfword

								||      ADD    .D2x      A_rgb_data,8,          B_rgb_ptr; B-side output ptr = rgb_data + 8


								        PACKH2  .L1X     A_cb20,    B_cr20,     A_cb2cr2;[15,1]

								||      MPY2    .M2      B_cr64,    B_c1,       B_cr6_c1:B_cr4_c1;[15,1]

								||      SUB2    .D2      B_y_54_,   B_n16,      B_y_54  ;[15,1]

								||      LDW     .D1T1    *A_cb_data++,          A_cb6420;[ 3,2]

								||      B       .S2      h_loop_9


								        MPY2    .M2     B_cr20,     B_c1,       B_cr2_c1:B_cr0_c1;[16,1]

								||      SUB2    .D2     B_y_76_,    B_n16,      B_y_76           ;[16,1]

								||      LDDW    .D1T2   *A_y_data++,            B_y_7654:B_y_3210;[ 4,2]


								        PACK2   .L2X    B_cr64,     A_cb64,     B_cr4cb4         ;[17,1]

								||      MPY2    .M1     A_cb64,     A_c4,       A_cb6_c4:A_cb4_c4;[17,1]

								||      MPY2    .M2     B_y_54,     B_c0,       B_y_5_c0:B_y_4_c0;[17,1]


								        MPY2    .M1     A_cb20,     A_c4,       A_cb2_c4:A_cb0_c4;[18,1]

								||      MPY2    .M2     B_y_76,     B_c0,       B_y_7_c0:B_y_6_c0;[18,1]

								||      B       .S2     h_loop_0


								        DOTP2   .M1     A_cb2cr2,   A_c2c3,     A_cg2           ;[19,1]

								||      DOTP2   .M2     B_cr6cb6,   B_c3c2,     B_cg6           ;[19,1]

								||      SUB2    .D1X    A_y_10_,    B_n16,      A_y_10          ;[19,1]


								        DOTP2   .M2     B_cr4cb4,   B_c3c2,     B_cg4           ;[20,1]

								||      MPY2    .M1X    A_y_32,     B_c0,       A_y_3_c0:A_y_2_c0;[20,1]

								||      XOR     .L1     A_cb6420,   A_k80,      A_cb6420_       ;[ 8,2]

								||      XOR     .S2X    B_cr6420,   A_k80,      B_cr6420_       ;[ 8,2]

								||      B       .S1     o_loop_0


								; o_loop_0: one-time copy of kernel packets h_loop_2..h_loop_11.  Compared
								; with the kernel it leaves out the instructions tagged [38,1]..[47,1]
								; (final RGB565 assembly and the two STDW stores) and the BDEC, and it
								; runs the [33..35] instructions without the [A_lp0] predicate.  It is
								; entered by the "B o_loop_0" issued in the prolog.  A_lp0 is incremented
								; here, which enables the [A_lp0]-predicated kernel instructions once
								; execution falls through into "loop".
								o_loop_0:

								        PACKH2  .S2     B_g_7,      B_g_6,      B_g_76          ;[26,1]

								||      PACKH2  .S1     A_r_3,      A_r_2,      A_r_32          ;[26,1]

								||      ADD     .L2X    B_y_7_c0,   A_cb6_c4,   B_b_7           ;[26,1]

								||      ADD     .L1     A_y_2_c0,   A_cg2,      A_g_2           ;[26,1]

								||      ADD     .D2     B_y_4_c0,   B_cg4,      B_g_4           ;[26,1]

								||      SUB2    .D1X    A_y_32_,    B_n16,      A_y_32          ;[14,2]


								        PACKH2  .L2     B_b_7,      B_b_6,      B_b_76          ;[27,1]

								||      PACKH2  .S2     B_g_5,      B_g_4,      B_g_54          ;[27,1]

								||      PACK2   .S1X    A_cb20,     B_cr20,     A_cb0cr0        ;[15,2]

								||      PACKH2  .L1X    A_cb20,     B_cr20,     A_cb2cr2        ;[15,2]

								||      MPY2    .M2     B_cr64,     B_c1,       B_cr6_c1:B_cr4_c1;[15,2]

								||      SUB2    .D2     B_y_54_,    B_n16,      B_y_54          ;[15,2]

								||      LDW     .D1T1   *A_cb_data++,           A_cb6420        ;[ 3,3]


								        SPACKU4 .S2     B_b_76,     B_b_54,     B_b_7654        ;[28,1]

								||      ADD     .S1     A_y_1_c0,   A_cb0_c4,   A_b_1           ;[28,1]

								||      ADD     .L1     A_y_0_c0,   A_cg0,      A_g_0           ;[28,1]

								||      PACKH2  .L2X    B_cr64,     A_cb64,     B_cr6cb6        ;[16,2]

								||      MPY2    .M2     B_cr20,     B_c1,       B_cr2_c1:B_cr0_c1;[16,2]

								||      SUB2    .D2     B_y_76_,    B_n16,      B_y_76          ;[16,2]

								||      LDDW    .D1T2   *A_y_data++,            B_y_7654:B_y_3210;[ 4,3]


								        AND     .D2     B_b_7654,   B_msk5,     B_b_7654_       ;[29,1]

								||      SPACKU4 .S2     B_g_76,     B_g_54,     B_g_7654        ;[29,1]

								||      PACKH2  .L1     A_g_3,      A_g_2,      A_g_32          ;[29,1]

								||      ADD     .S1     A_y_0_c0,   A_cb0_c4,   A_b_0           ;[29,1]

								||      ADD     .D1     A_y_1_c0,   A_cg0,      A_g_1           ;[29,1]

								||      PACK2   .L2X    B_cr64,     A_cb64,     B_cr4cb4        ;[17,2]

								||      MPY2    .M1     A_cb64,     A_c4,       A_cb6_c4:A_cb4_c4;[17,2]

								||      MPY2    .M2     B_y_54,     B_c0,       B_y_5_c0:B_y_4_c0;[17,2]


								        AND     .L2X    B_g_7654,   A_msk6,     B_g_7654_       ;[30,1]

								||      PACKH2  .L1     A_b_1,      A_b_0,      A_b_10          ;[30,1]

								||      ADD     .S1X    A_y_0_c0,   B_cr0_c1,   A_r_0           ;[30,1]

								||      ADD     .D1X    A_y_1_c0,   B_cr0_c1,   A_r_1           ;[30,1]

								||      MPY2    .M1     A_cb20,     A_c4,       A_cb2_c4:A_cb0_c4;[18,2]

								||      MPY2    .M2     B_y_76,     B_c0,       B_y_7_c0:B_y_6_c0;[18,2]


								        SPACKU4 .S2     B_r_76,     B_r_54,     B_r_7654        ;[31,1]

								||      PACKH2  .L1     A_g_1,      A_g_0,      A_g_10          ;[31,1]

								||      PACKH2  .S1     A_r_1,      A_r_0,      A_r_10          ;[31,1]

								||      DOTP2   .M1     A_cb2cr2,   A_c2c3,     A_cg2           ;[19,2]

								||      DOTP2   .M2     B_cr6cb6,   B_c3c2,     B_cg6           ;[19,2]

								||      SUB2    .D1X    A_y_10_,    B_n16,      A_y_10          ;[19,2]


								        AND     .L2     B_r_7654,   B_msk5,     B_r_7654_       ;[32,1]

								||      SPACKU4 .S1     A_b_32,     A_b_10,     A_b_3210        ;[32,1]

								||      DOTP2   .M2     B_cr4cb4,   B_c3c2,     B_cg4           ;[20,2]

								||      MPY2    .M1X    A_y_32,     B_c0,       A_y_3_c0:A_y_2_c0;[20,2]

								||      XOR     .L1     A_cb6420,   A_k80,      A_cb6420_       ;[ 8,3]

								||      XOR     .S2X    B_cr6420,   A_k80,      B_cr6420_       ;[ 8,3]

								||      ADD     .D1     A_lp0,      1,          A_lp0           ; A_lp0 != 0: enable [A_lp0] ops


								        ROTL    .M2     B_b_7654_,  29,         B_b_7654__      ;[33,1]

								||      AND     .L1X    A_b_3210,   B_msk5,     A_b_3210_       ;[33,1]

								||      SPACKU4 .S1     A_r_32,     A_r_10,     A_r_3210        ;[33,1]

								||      ADD     .L2     B_y_4_c0,   B_cr4_c1,   B_r_4           ;[21,2]

								||      ADD     .S2     B_y_5_c0,   B_cr4_c1,   B_r_5           ;[21,2]

								||      MPYSU4  .M1     A_cb6420_,  A_k08,      A_cb64:A_cb20   ;[ 9,3]


								        SPACKU4 .S1     A_g_32,     A_g_10,     A_g_3210        ;[34,1]

								||      ADD     .S2X    B_y_6_c0,   A_cb6_c4,   B_b_6           ;[22,2]

								||      ADD     .D2     B_y_6_c0,   B_cr6_c1,   B_r_6           ;[22,2]

								||      ADD     .L2     B_y_7_c0,   B_cr6_c1,   B_r_7           ;[22,2]

								||      MPYSU4  .M2     B_cr6420_,  B_k08,      B_cr64:B_cr20   ;[10,3]

								||      MPYU4   .M1X    B_y_3210,   A_k08,      A_y_32_:A_y_10_ ;[10,3]


								        AND     .S1     A_g_3210,   A_msk6,     A_g_3210_       ;[35,1]

								||      AND     .L1X    A_r_3210,   B_msk5,     A_r_3210_       ;[35,1]

								||      PACKH2  .L2     B_r_7,      B_r_6,      B_r_76          ;[23,2]

								||      ADD     .D2X    B_y_4_c0,   A_cb4_c4,   B_b_4           ;[23,2]

								||      ADD     .S2X    B_y_5_c0,   A_cb4_c4,   B_b_5           ;[23,2]

								||      DOTP2   .M1     A_cb0cr0,   A_c2c3,     A_cg0           ;[23,2]

								||      MPYU4   .M2     B_y_7654,   B_k08,      B_y_76_:B_y_54_ ;[11,3]

								*


								* =========================== PIPE LOOP KERNEL ============================ *

								; Kernel: 12 execute packets (h_loop_0..h_loop_11).  Each packet mixes
								; work from several overlapped iterations; the [cycle,iteration] tag at
								; the end of each line is the original scheduling annotation.
								; Per pass, the A side stores the RGB words for pixels 3..0 through
								; A_rgb_ptr and the B side stores pixels 7..4 through B_rgb_ptr; each
								; STDW writes 8 bytes and post-increments its pointer by 2 doublewords.
								; BDEC in h_loop_6 closes the loop on B_loopcnt; its five delay slots
								; are h_loop_7..h_loop_11, so the branch lands back on "loop".
								; Instructions predicated on [A_lp0] are suppressed while A_lp0 is zero,
								; i.e. during the partial pass started by the branches in the prolog.
								loop:

								h_loop_0:

								  [A_lp0] MPYU4   .M2X    B_r_7654_,  A_k80,      B_r7_r6:B_r5_r4 ;[36,1] r << 7

								||        PACKH2  .L2     B_b_5,      B_b_4,      B_b_54          ;[24,2]

								||        PACKH2  .S2     B_r_5,      B_r_4,      B_r_54          ;[24,2]

								||        ADD     .L1     A_y_2_c0,   A_cb2_c4,   A_b_2           ;[24,2]

								||        ADD     .S1     A_y_3_c0,   A_cb2_c4,   A_b_3           ;[24,2]

								||        ADD     .D1     A_y_3_c0,   A_cg2,      A_g_3           ;[24,2]

								||        ADD     .D2     B_y_6_c0,   B_cg6,      B_g_6           ;[24,2]

								||        MPY2    .M1X    A_y_10,     B_c0,       A_y_1_c0:A_y_0_c0;[24,2]


								h_loop_1:

								  [A_lp0] ROTL    .M1     A_b_3210_,  29,         A_b_3210__      ;[37,1]

								||[A_lp0] MPYU4   .M2     B_g_7654_,  B_k08,      B_g7_g6:B_g5_g4 ;[37,1] g << 3

								||        PACKH2  .S1     A_b_3,      A_b_2,      A_b_32          ;[25,2]

								||        ADD     .S2     B_y_5_c0,   B_cg4,      B_g_5           ;[25,2]

								||        ADD     .L2     B_y_7_c0,   B_cg6,      B_g_7           ;[25,2]

								||        ADD     .L1X    A_y_2_c0,   B_cr2_c1,   A_r_2           ;[25,2]

								||        ADD     .D1X    A_y_3_c0,   B_cr2_c1,   A_r_3           ;[25,2]

								||        LDW     .D2T2   *B_cr_data++,           B_cr6420        ;[ 1,4]


								h_loop_2:

								          MPYU4   .M2     B_b_7654__, B_k01,      B_b7_b6:B_b5_b4 ;[38,1] b >> 3

								||        MPYU4   .M1     A_r_3210_,  A_k80,      A_r3_r2:A_r1_r0 ;[38,1] r << 7

								||        PACKH2  .S2     B_g_7,      B_g_6,      B_g_76          ;[26,2]

								||        PACKH2  .S1     A_r_3,      A_r_2,      A_r_32          ;[26,2]

								||        ADD     .L2X    B_y_7_c0,   A_cb6_c4,   B_b_7           ;[26,2]

								||        ADD     .L1     A_y_2_c0,   A_cg2,      A_g_2           ;[26,2]

								||        ADD     .D2     B_y_4_c0,   B_cg4,      B_g_4           ;[26,2]

								||        SUB2    .D1X    A_y_32_,    B_n16,      A_y_32          ;[14,3]


								h_loop_3:

								          MPYU4   .M1     A_g_3210_,  A_k08,      A_g3_g2:A_g1_g0 ;[39,1] g << 3

								||        PACKH2  .L2     B_b_7,      B_b_6,      B_b_76          ;[27,2]

								||        PACKH2  .S2     B_g_5,      B_g_4,      B_g_54          ;[27,2]

								||        PACK2   .S1X    A_cb20,     B_cr20,     A_cb0cr0        ;[15,3]

								||        PACKH2  .L1X    A_cb20,     B_cr20,     A_cb2cr2        ;[15,3]

								||        MPY2    .M2     B_cr64,     B_c1,       B_cr6_c1:B_cr4_c1;[15,3]

								||        SUB2    .D2     B_y_54_,    B_n16,      B_y_54          ;[15,3]

								||        LDW     .D1T1   *A_cb_data++,           A_cb6420        ;[ 3,4]


								h_loop_4:

								          MPYU4   .M1X    A_b_3210__, B_k01,      A_b3_b2:A_b1_b0 ;[40,1] b >> 3

								||        SPACKU4 .S2     B_b_76,     B_b_54,     B_b_7654        ;[28,2]

								||        ADD     .S1     A_y_1_c0,   A_cb0_c4,   A_b_1           ;[28,2]

								||        ADD     .L1     A_y_0_c0,   A_cg0,      A_g_0           ;[28,2]

								||        PACKH2  .L2X    B_cr64,     A_cb64,     B_cr6cb6        ;[16,3]

								||        MPY2    .M2     B_cr20,     B_c1,       B_cr2_c1:B_cr0_c1;[16,3]

								||        SUB2    .D2     B_y_76_,    B_n16,      B_y_76          ;[16,3]

								||        LDDW    .D1T2   *A_y_data++,            B_y_7654:B_y_3210;[ 4,4]


								h_loop_5:

								          AND     .D2     B_b_7654,   B_msk5,     B_b_7654_       ;[29,2]

								||        SPACKU4 .S2     B_g_76,     B_g_54,     B_g_7654        ;[29,2]

								||        PACKH2  .L1     A_g_3,      A_g_2,      A_g_32          ;[29,2]

								||        ADD     .S1     A_y_0_c0,   A_cb0_c4,   A_b_0           ;[29,2]

								||        ADD     .D1     A_y_1_c0,   A_cg0,      A_g_1           ;[29,2]

								||        PACK2   .L2X    B_cr64,     A_cb64,     B_cr4cb4        ;[17,3]

								||        MPY2    .M1     A_cb64,     A_c4,       A_cb6_c4:A_cb4_c4;[17,3]

								||        MPY2    .M2     B_y_54,     B_c0,       B_y_5_c0:B_y_4_c0;[17,3]


								h_loop_6:

								          BDEC    .S2     loop,       B_loopcnt                   ;[42,1]

								||        ADDAH   .D2     B_b7_b6,    B_r7_r6,    B_r_b76         ;[42,1] (r<<8)|(b>>3)

								||        AND     .L2X    B_g_7654,   A_msk6,     B_g_7654_       ;[30,2]

								||        PACKH2  .L1     A_b_1,      A_b_0,      A_b_10          ;[30,2]

								||        ADD     .S1X    A_y_0_c0,   B_cr0_c1,   A_r_0           ;[30,2]

								||        ADD     .D1X    A_y_1_c0,   B_cr0_c1,   A_r_1           ;[30,2]

								||        MPY2    .M1     A_cb20,     A_c4,       A_cb2_c4:A_cb0_c4;[18,3]

								||        MPY2    .M2     B_y_76,     B_c0,       B_y_7_c0:B_y_6_c0;[18,3]


								h_loop_7:

								          ADD     .L2     B_r_b76,    B_g7_g6,    B_rgb76         ;[43,1]

								||        ADDAH   .D2     B_b5_b4,    B_r5_r4,    B_r_b54         ;[43,1] (r<<8)|(b>>3)

								||        SPACKU4 .S2     B_r_76,     B_r_54,     B_r_7654        ;[31,2]

								||        PACKH2  .L1     A_g_1,      A_g_0,      A_g_10          ;[31,2]

								||        PACKH2  .S1     A_r_1,      A_r_0,      A_r_10          ;[31,2]

								||        DOTP2   .M1     A_cb2cr2,   A_c2c3,     A_cg2           ;[19,3]

								||        DOTP2   .M2     B_cr6cb6,   B_c3c2,     B_cg6           ;[19,3]

								||        SUB2    .D1X    A_y_10_,    B_n16,      A_y_10          ;[19,3]


								h_loop_8:

								          ADD     .D2     B_r_b54,    B_g5_g4,    B_rgb54         ;[44,1]

								||        ADDAH   .D1     A_b3_b2,    A_r3_r2,    A_r_b32         ;[44,1] (r<<8)|(b>>3)

								||        AND     .L2     B_r_7654,   B_msk5,     B_r_7654_       ;[32,2]

								||        SPACKU4 .S1     A_b_32,     A_b_10,     A_b_3210        ;[32,2]

								||        DOTP2   .M2     B_cr4cb4,   B_c3c2,     B_cg4           ;[20,3]

								||        MPY2    .M1X    A_y_32,     B_c0,       A_y_3_c0:A_y_2_c0;[20,3]

								||        XOR     .L1     A_cb6420,   A_k80,      A_cb6420_       ;[ 8,4]

								||        XOR     .S2X    B_cr6420,   A_k80,      B_cr6420_       ;[ 8,4]


								h_loop_9:

								  [A_lp0] STDW    .D2T2   B_rgb76:B_rgb54,        *B_rgb_ptr++[2] ;[45,1]

								||[A_lp0] ADDAH   .D1     A_b1_b0,    A_r1_r0,    A_r_b10         ;[45,1] (r<<8)|(b>>3)

								||[A_lp0] ROTL    .M2     B_b_7654_,  29,         B_b_7654__      ;[33,2]

								||[A_lp0] AND     .L1X    A_b_3210,   B_msk5,     A_b_3210_       ;[33,2]

								||[A_lp0] SPACKU4 .S1     A_r_32,     A_r_10,     A_r_3210        ;[33,2]

								||        ADD     .L2     B_y_4_c0,   B_cr4_c1,   B_r_4           ;[21,3]

								||        ADD     .S2     B_y_5_c0,   B_cr4_c1,   B_r_5           ;[21,3]

								||        MPYSU4  .M1     A_cb6420_,  A_k08,      A_cb64:A_cb20   ;[ 9,4]


								h_loop_10:

								  [A_lp0] ADD     .L1     A_r_b10,    A_g1_g0,    A_rgb10         ;[46,1]

								||[A_lp0] ADD     .D1     A_r_b32,    A_g3_g2,    A_rgb32         ;[46,1]

								||[A_lp0] SPACKU4 .S1     A_g_32,     A_g_10,     A_g_3210        ;[34,2]

								||        ADD     .S2X    B_y_6_c0,   A_cb6_c4,   B_b_6           ;[22,3]

								||        ADD     .D2     B_y_6_c0,   B_cr6_c1,   B_r_6           ;[22,3]

								||        ADD     .L2     B_y_7_c0,   B_cr6_c1,   B_r_7           ;[22,3]

								||        MPYSU4  .M2     B_cr6420_,  B_k08,      B_cr64:B_cr20   ;[10,4]

								||        MPYU4   .M1X    B_y_3210,   A_k08,      A_y_32_:A_y_10_ ;[10,4]


								h_loop_11:

								  [A_lp0] STDW    .D1T1   A_rgb32:A_rgb10,        *A_rgb_ptr++[2] ;[47,1]

								||[A_lp0] AND     .S1     A_g_3210,   A_msk6,     A_g_3210_       ;[35,2]

								||[A_lp0] AND     .L1X    A_r_3210,   B_msk5,     A_r_3210_       ;[35,2]

								||        PACKH2  .L2     B_r_7,      B_r_6,      B_r_76          ;[23,3]

								||        ADD     .D2X    B_y_4_c0,   A_cb4_c4,   B_b_4           ;[23,3]

								||        ADD     .S2X    B_y_5_c0,   A_cb4_c4,   B_b_5           ;[23,3]

								||        DOTP2   .M1     A_cb0cr0,   A_c2c3,     A_cg0           ;[23,3]

								||        MPYU4   .M2     B_y_7654,   B_k08,      B_y_76_:B_y_54_ ;[11,4]


								* =========================== PIPE LOOP EPILOG ============================ *


								        ; Exit sequence: reload B11:B10 from the stack (B_SP is post-incremented
								        ; by one doubleword), reload the CSR value that the setup code stored at
								        ; *-B_SP[1] -- now three words below the restored B_SP -- and write it
								        ; back to CSR.  BNOP supplies four NOP cycles, so the MVC executes in the
								        ; last delay slot of the return branch, after the LDW result has arrived.

								        LDDW      .D2T2   *B_SP++,    B_reg11:B_reg10         ; Restore Regs 11 & 10


								        BNOP      .S2     B_ret,      4                       ; Return to caller

								||      LDW       .D2T2   *-B_SP[3],  B_csr                   ; Reload saved CSR value


								        MVC       .S2     B_csr,      CSR                     ; Restore CSR


								; ===== Branch occurs =====

								; ===== Interrupts may occur here =====


								* ========================================================================= *

								*   End of file:  img_ycbcr422p_rgb565.asm                                  *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *