c6416_sdk/imglib/yc_demux_1.asm


								;* ======================================================================== *;

								;*  TEXAS INSTRUMENTS, INC.                                                 *;

								;*                                                                          *;

								;*  IMGLIB  DSP Image/Video Processing Library                              *;

								;*                                                                          *;

								;*      Release:        Revision 1.04b                                      *;

								;*      CVS Revision:   1.3     Sun Sep 29 03:32:32 2002 (UTC)              *;

								;*      Snapshot date:  23-Oct-2003                                         *;

								;*                                                                          *;

								;*  This library contains proprietary intellectual property of Texas        *;

								;*  Instruments, Inc.  The library and its source code are protected by     *;

								;*  various copyrights, and portions may also be protected by patents or    *;

								;*  other legal protections.                                                *;

								;*                                                                          *;

								;*  This software is licensed for use with Texas Instruments TMS320         *;

								;*  family DSPs.  This license was provided to you prior to installing      *;

								;*  the software.  You may review this license by consulting the file       *;

								;*  TI_license.PDF which accompanies the files in this library.             *;

								;* ------------------------------------------------------------------------ *;

								;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;

								;*                          All Rights Reserved.                            *;

								;* ======================================================================== *;


								;* ======================================================================== *;

								;*  Assembler compatibility shim for assembling 4.30 and later code on      *;

								;*  tools prior to 4.30.                                                    *;

								;* ======================================================================== *;


								; Capture the assembler version in the substitution symbol $asmver.
								; If the assembler does not define .ASSEMBLER_VERSION, $asmver is set to 0
								; so that the "< 430" test below selects the compatibility aliases.

								        .if $isdefed(".ASSEMBLER_VERSION")

								        .asg    .ASSEMBLER_VERSION, $asmver

								        .else

								        .asg    0,    $asmver

								        .endif


								; When $asmver is below 430, map the call/return pseudo-mnemonics onto the
								; plain branch mnemonics so that code using them still assembles.

								        .if ($asmver < 430)


								        .asg    B,    CALL     ; Function Call

								        .asg    B,    RET      ; Return from a Function

								        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.


								; The BNOP-based aliases are only defined when assembling for C6400.

								        .if .TMS320C6400

								        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call

								        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return

								        .asg    BNOP, CRNOP    ; C64x Fn call w/, Call/Ret chaining via BNOP.

								        .endif


								; .asmfunc / .endasmfunc are replaced by empty strings, i.e. they become
								; no-ops when $asmver is below 430.

								        .asg    , .asmfunc     ; .func equivalent for hand-assembly code

								        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code


								        .endif


								;* ======================================================================== *;

								;*  End of assembler compatibility shim.                                    *;

								;* ======================================================================== *;


								* ========================================================================= *

								*   TEXAS INSTRUMENTS, INC.                                                 *

								*                                                                           *

								*   NAME                                                                    *

								*       IMG_yc_demux_le16 -- De-interleave a 4:2:2 LITTLE ENDIAN video stream *

								*                        into three separate LITTLE ENDIAN 16-bit planes    *

								*                                                                           *

								*   REVISION DATE                                                           *

								*       15-Mar-2002                                                         *

								*                                                                           *

								*   USAGE                                                                   *

								*       This function is C callable, and is called as follows:              *

								*                                                                           *

								*       void IMG_yc_demux_le16                                              *

								*       (                                                                   *

								*           int n,                       // Number of luma pixels    //     *

								*           const unsigned char * yc,    // Interleaved luma/chroma  //     *

								*           short *restrict y,           // Luma plane (16-bit)      //     *

								*           short *restrict cr,          // Cr chroma plane (16-bit) //     *

								*           short *restrict cb           // Cb chroma plane (16-bit) //     *

								*       );                                                                  *

								*                                                                           *

								*       The input array 'yc' is expected to be an interleaved 4:2:2         *

								*       video stream.  The input is expected in LITTLE ENDIAN byte          *

								*       order within each 4-byte word.  This is consistent with reading     *

								*       the video stream from a word-oriented LITTLE ENDIAN device          *

								*       while the C6000 device is in a LITTLE ENDIAN configuration.         *

								*                                                                           *

								*       In other words, the expected pixel order is:                        *

								*                                                                           *

								*                   Word 0           Word 1          Word 2                 *

								*              +---------------+---------------+---------------+--          *

								*        Byte# | 0   1   2   3 | 4   5   6   7 | 8   9  10  11 |...         *

								*              | y0 cr0 y1 cb0 | y2 cr2 y3 cb2 | y4 cr4 y5 cb4 |...         *

								*              +---------------+---------------+---------------+--          *

								*                                                                           *

								*       The output arrays 'y', 'cr', and 'cb' are expected to not           *

								*       overlap.  The de-interleaved pixels are written as half-words       *

								*       in LITTLE ENDIAN order.                                             *

								*                                                                           *

								*       Please see the IMGLIB function IMG_yc_demux_be16 for code which     *

								*       handles input coming from a BIG ENDIAN device.                      *

								*                                                                           *

								*   DESCRIPTION                                                             *

								*       This function reads the byte-oriented pixel data, zero-extends      *

								*       it, and then writes it to the appropriate result array.  Both       *

								*       the luma and chroma values are expected to be unsigned.             *

								*                                                                           *

								*       The data is expected to be in an order consistent with reading      *

								*       byte oriented data from a word-oriented peripheral that is          *

								*       operating in LITTLE ENDIAN mode, while the CPU is in LITTLE         *

								*       ENDIAN mode.  This function unpacks the byte-oriented data          *

								*       so that further processing may proceed in LITTLE ENDIAN mode.       *

								*                                                                           *

								*   ASSUMPTIONS                                                             *

								*       Input and output arrays are double-word aligned.                    *

								*       The input length n must be a multiple of 16 luma pixels, and at     *

								*       least 16 (n = 0 is not handled).                                    *

								*                                                                           *

								*   TECHNIQUES                                                              *

								*       The loop has been unrolled a total of 16 times to allow for         *

								*       processing 8 pixels in each datapath.                               *

								*                                                                           *

								*       Double-word loads and stores maximize memory bandwidth              *

								*       utilization.                                                        *

								*                                                                           *

								*       This code uses _gmpy4() to ease the L/S/D unit bottleneck on        *

								*       ANDs.  The _gmpy4(value, 0x00010001) is equivalent to               *

								*       value & 0x00FF00FF, as long as the size field of GFPGFR is          *

								*       equal to 7.  (The polynomial does not matter.)                      *

								*                                                                           *

								*   NOTES                                                                   *

								*       This code is fully interruptible.                                   *

								*                                                                           *

								*   CYCLES                                                                  *

								*       cycles = 3 * (num_luma / 8) + 18.                                   *

								*       For num_luma = 1024, cycles = 402                                   *

								*                                                                           *

								*       This number includes 6 cycles of function call overhead.  The       *

								*       exact overhead will vary depending on compiler options used.        *

								*                                                                           *

								*   CODESIZE                                                                *

								*       352 bytes                                                           *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *


								; Entry point of IMG_yc_demux_le16(n, yc, y, cr, cb).
								; The code is placed in its own .text subsection and the symbol is exported.

								        .sect ".text:_yc_demux_le16"

								        .global _IMG_yc_demux_le16

								_IMG_yc_demux_le16


								* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *

								; Incoming arguments, in the order of the C prototype in the file header,
								; plus the return address used by the RET in the epilog.

								        .asg            A4,         A_i             ; n on entry; turned into the BDEC loop counter

								        .asg            B4,         B_yc            ; yc: interleaved input, first half of each 32-byte group

								        .asg            A6,         A_y             ; y: luma output, first half of each 16-pixel group

								        .asg            B6,         B_cr            ; cr: Cr output, first half of each group

								        .asg            A8,         A_cb            ; cb: Cb output, first half of each group

								        .asg            B3,         B_ret_addr      ; return address


								; A-side working registers.
								; Naming: a name such as A_b0y1r0y0 lists the bytes of the word from most
								; to least significant (b = Cb, y = luma, r = Cr); the hex digit is the
								; pixel index (0..F) within the current 16-pixel group.
								; Several names below are bound to the same physical register (for
								; example A25, A26, A27, A30): the schedule reuses a register for values
								; whose uses do not overlap in time.

								        .asg            A1,         A_p             ; prolog-collapse predicate; also the GFPGFR setup value

								        .asg            A2,         A_b0y1r0y0

								        .asg            A3,         A_b2y3r2y2

								        .asg            A16,        A_r0b0

								        .asg            A17,        A_b0r0

								        .asg            A18,        A_b6r6

								        .asg            A19,        A_r2b2

								        .asg            A20,        A_yc            ; input pointer, second half of each 32-byte group

								        .asg            A21,        A_r6b6

								        .asg            A22,        A_cr            ; Cr output pointer, second half of each group

								        .asg            A23,        A_00010001      ; GMPY4 multiplier constant

								        .asg            A24,        A_r2r0

								        .asg            A25,        A_b2r2

								        .asg            A25,        A_r6r4

								        .asg            A26,        A_b2b0

								        .asg            A26,        A_y5y4

								        .asg            A27,        A_b4r4

								        .asg            A27,        A_b6b4

								        .asg            A27,        A_y7y6

								        .asg            A28,        A_b4y5r4y4

								        .asg            A29,        A_b6y7r6y6

								        .asg            A30,        A_r4b4

								        .asg            A30,        A_y1y0

								        .asg            A31,        A_y3y2


								; B-side working registers (pixels 8..F of each group), same naming scheme.

								        .asg            B8,         B_bCyDrCyC

								        .asg            B9,         B_bEyFrEyE

								        .asg            B17,        B_gfpgfr        ; caller's GFPGFR, restored in the epilog

								        .asg            B18,        B_rAbA

								        .asg            B19,        B_rCbC

								        .asg            B20,        B_rEbE

								        .asg            B21,        B_cb            ; Cb output pointer, second half of each group

								        .asg            B22,        B_00FF00FF      ; byte mask selecting bytes 0 and 2 of a word

								        .asg            B23,        B_y             ; luma output pointer, second half of each group

								        .asg            B24,        B_rAr8

								        .asg            B24,        B_y9y8

								        .asg            B25,        B_rErC

								        .asg            B25,        B_yByA

								        .asg            B26,        B_b8r8

								        .asg            B26,        B_bAb8

								        .asg            B26,        B_bErE

								        .asg            B27,        B_bArA

								        .asg            B27,        B_bEbC

								        .asg            B28,        B_b8y9r8y8

								        .asg            B29,        B_bAyBrAyA

								        .asg            B29,        B_bCrC

								        .asg            B30,        B_r8b8

								        .asg            B30,        B_yDyC

								        .asg            B31,        B_yFyE
								* ========================================================================= *


								* =========================== PIPE LOOP PROLOG ============================ *

								; Prolog: set up constants and pointers, and issue the first-stage
								; operations (tagged [c,1]) of the first 16-pixel group.

								; Packet 1: start loading pixels 4..7 of the first group, clear A_p, and
								; begin deriving the second-half pointers (A_yc = yc + 8, B_y = y + 8).

								        LDDW    .D2T1   *B_yc[1],   A_b6y7r6y6:A_b4y5r4y4       ;[ 1,1]

								||      ZERO    .S1     A_p

								||      ADD     .L1X    B_yc,       8,          A_yc

								||      ADD     .L2X    A_y,        8,          B_y


								; Packet 2: A_p = 0x07000000 (non-zero, so the [!A_p] stores in the kernel
								; are disabled on the first pass); A_yc = yc + 16, B_y = y + 16;
								; start building the 0x00FF00FF mask.

								        MVKLH   .S1     0x0700,     A_p

								||      ADD     .L1     A_yc,       8,          A_yc

								||      ADD     .L2     B_y,        8,          B_y

								||      MVK     .S2     0x00FF,     B_00FF00FF


								; Packet 3: load pixels 0..3 (advancing B_yc by 32 bytes) and pixels C..F;
								; replicate the low half-word of the mask to form 0x00FF00FF.

								        LDDW    .D2T1   *B_yc++[4], A_b2y3r2y2:A_b0y1r0y0       ;[ 3,1]

								||      LDDW    .D1T2   *A_yc[1],   B_bEyFrEyE:B_bCyDrCyC       ;[ 3,1]

								||      PACK2   .L2     B_00FF00FF, B_00FF00FF, B_00FF00FF


								; Packet 4: save the caller's GFPGFR; A_cr = cr + 8, B_cb = cb + 8;
								; A_i = n / 16 (number of 16-pixel groups).

								        MVC     .S2     GFPGFR,     B_gfpgfr

								||      ADD     .L1X    B_cr,       8,          A_cr

								||      ADD     .L2X    A_cb,       8,          B_cb

								||      SHRU    .S1     A_i,        4,          A_i


								; Packet 5: write 0x07000000 to GFPGFR (the file header requires the
								; size field to be 7 for the GMPY4 masking trick); form 0x00010001 by
								; shifting each half-word of the mask right by 7; bias the group count
								; by -2 for BDEC, which branches while the counter is >= 0.

								        MVC     .S2X    A_p,        GFPGFR

								||      SHRU2   .S1X    B_00FF00FF, 7,          A_00010001

								||      SUB     .L1     A_i,        2,          A_i


								; Packet 6: the first loaded double-word is consumed here: split
								; pixels 4..7 into chroma bytes (ANDN) and luma bytes (AND).

								        ANDN    .L1X    A_b6y7r6y6, B_00FF00FF, A_b6r6          ;[ 6,1]

								||      AND     .S1X    A_b4y5r4y4, B_00FF00FF, A_y5y4          ;[ 6,1]

								; ===== 1 prolog stage collapsed

								* =========================== PIPE LOOP KERNEL ============================ *

								; Steady-state kernel: six execute packets per pass, one pass per group
								; of 16 luma pixels.  The [cycle,iteration] tags show that each pass
								; overlaps three groups: cycles 1-6 (loads and first masks for the next
								; group), cycles 7-12 (remaining load, masks and rotates for the current
								; group) and cycles 13-18 (packing and stores for the previous group).
								;
								; Prolog collapse: A_p is non-zero on the first pass, so the [!A_p] stores
								; (which belong to a group that has not been processed yet) are skipped.
								; The DOTPN2 in the third packet computes 1*1 - 1*1 = 0 into A_p, which
								; enables those stores from the second pass onward.
								;
								; NOTE(review): the last pass still issues the cycle 1-6 loads for a group
								; beyond the requested n, so up to 32 bytes past the end of 'yc' appear to
								; be read (never stored) -- confirm the input buffer tolerates this.

								loop:

								; Packet 1: loop branch (taken while A_i >= 0; the other five packets
								; fill its delay slots), plus loads for the current and next group.

								        BDEC    .S1     loop,       A_i                         ;[13,1]

								||      ROTL    .M1     A_b4r4,     8,          A_r4b4          ;[13,1]

								||      ROTL    .M2     B_b8r8,     8,          B_r8b8          ;[13,1]

								||      AND     .L2     B_b8y9r8y8, B_00FF00FF, B_y9y8          ;[13,1]

								||      AND     .S2     B_bAyBrAyA, B_00FF00FF, B_yByA          ;[13,1]

								||      AND     .L1X    A_b6y7r6y6, B_00FF00FF, A_y7y6          ;[ 7,2]

								||      LDDW    .D1T2   *A_yc++[4], B_bAyBrAyA:B_b8y9r8y8       ;[ 7,2]

								||      LDDW    .D2T1   *B_yc[1],   A_b6y7r6y6:A_b4y5r4y4       ;[ 1,3]


								; Packet 2: store luma 0..3 (A_y then advances by 32 bytes) and luma C..F;
								; GMPY4 by 0x00010001 extracts luma 0..1 on the .M unit in place of an AND.

								  [!A_p]STDW    .D1T1   A_y3y2:A_y1y0,          *A_y++[4]       ;[14,1]

								||[!A_p]STDW    .D2T2   B_yFyE:B_yDyC,          *B_y[1]         ;[14,1]

								||      PACK2   .S2     B_rEbE,     B_rCbC,     B_bEbC          ;[14,1]

								||      ROTL    .M2     B_bArA,     8,          B_rAbA          ;[14,1]

								||      ANDN    .L1X    A_b2y3r2y2, B_00FF00FF, A_b2r2          ;[ 8,2]

								||      ANDN    .L2     B_bEyFrEyE, B_00FF00FF, B_bErE          ;[ 8,2]

								||      GMPY4   .M1     A_b0y1r0y0, A_00010001, A_y1y0          ;[ 8,2]

								||      AND     .S1X    A_b2y3r2y2, B_00FF00FF, A_y3y2          ;[ 8,2]


								; Packet 3: loads for the next group; the predicated DOTPN2 clears A_p
								; once (it is itself disabled after A_p becomes zero).

								        PACKH2  .L1     A_r2b2,     A_r0b0,     A_r2r0          ;[15,1]

								||      ANDN    .S1X    A_b0y1r0y0, B_00FF00FF, A_b0r0          ;[ 9,2]

								||      ANDN    .S2     B_bCyDrCyC, B_00FF00FF, B_bCrC          ;[ 9,2]

								||      AND     .L2     B_bEyFrEyE, B_00FF00FF, B_yFyE          ;[ 9,2]

								||      LDDW    .D2T1   *B_yc++[4], A_b2y3r2y2:A_b0y1r0y0       ;[ 3,3]

								||      LDDW    .D1T2   *A_yc[1],   B_bEyFrEyE:B_bCyDrCyC       ;[ 3,3]

								||[ A_p]DOTPN2  .M1     A_00010001, A_00010001, A_p             ; pro collapse


								; Packet 4: store luma 8..B (B_y advances by 32 bytes); pack Cb/Cr pairs.
								; The luma 4..7 store is a [10,2] operation, i.e. it belongs to the
								; current group and is therefore not predicated on A_p.

								  [!A_p]STDW    .D2T2   B_yByA:B_y9y8,          *B_y++[4]       ;[16,1]

								||      PACK2   .S1     A_r2b2,     A_r0b0,     A_b2b0          ;[16,1]

								||      PACK2   .L1     A_r6b6,     A_r4b4,     A_b6b4          ;[16,1]

								||      PACK2   .S2     B_rAbA,     B_r8b8,     B_bAb8          ;[16,1]

								||      PACKH2  .L2     B_rAbA,     B_r8b8,     B_rAr8          ;[16,1]

								||      STDW    .D1T1   A_y7y6:A_y5y4,          *A_y[1]         ;[10,2]

								||      ROTL    .M1     A_b6r6,     8,          A_r6b6          ;[10,2]

								||      ROTL    .M2     B_bErE,     8,          B_rEbE          ;[10,2]


								; Packet 5: store the eight Cb samples of the previous group
								; (each Cb pointer advances by 16 bytes).

								  [!A_p]STDW    .D1T1   A_b6b4:A_b2b0,          *A_cb++[2]      ;[17,1]

								||[!A_p]STDW    .D2T2   B_bEbC:B_bAb8,          *B_cb++[2]      ;[17,1]

								||      PACKH2  .L1     A_r6b6,     A_r4b4,     A_r6r4          ;[17,1]

								||      PACKH2  .S2     B_rEbE,     B_rCbC,     B_rErC          ;[17,1]

								||      ROTL    .M1     A_b2r2,     8,          A_r2b2          ;[11,2]

								||      ROTL    .M2     B_bCrC,     8,          B_rCbC          ;[11,2]

								||      ANDN    .S1X    A_b4y5r4y4, B_00FF00FF, A_b4r4          ;[11,2]

								||      AND     .L2     B_bCyDrCyC, B_00FF00FF, B_yDyC          ;[11,2]


								; Packet 6: store the eight Cr samples of the previous group
								; (each Cr pointer advances by 16 bytes); the BDEC branch takes effect
								; after this packet.

								  [!A_p]STDW    .D2T1   A_r6r4:A_r2r0,          *B_cr++[2]      ;[18,1]

								||[!A_p]STDW    .D1T2   B_rErC:B_rAr8,          *A_cr++[2]      ;[18,1]

								||      ROTL    .M1     A_b0r0,     8,          A_r0b0          ;[12,2]

								||      ANDN    .L2     B_b8y9r8y8, B_00FF00FF, B_b8r8          ;[12,2]

								||      ANDN    .S2     B_bAyBrAyA, B_00FF00FF, B_bArA          ;[12,2]

								||      ANDN    .L1X    A_b6y7r6y6, B_00FF00FF, A_b6r6          ;[ 6,3]

								||      AND     .S1X    A_b4y5r4y4, B_00FF00FF, A_y5y4          ;[ 6,3]

								* =========================== PIPE LOOP EPILOG ============================ *

								; ===== 1 epilog stage collapsed

								; Epilog: the cycle 13-18 operations for the last 16-pixel group.  These
								; mirror the [13..18,1] operations of the kernel, but the stores are
								; unconditional and the output pointers are no longer advanced.
								;
								; The return branch is issued in the first packet; the five packets that
								; follow it execute in its delay slots.
								; NOTE(review): the AND for B_yByA uses .D2 here where the kernel uses
								; .S2 -- presumably to leave .S2 free for the RET; confirm.

								        ROTL    .M1     A_b4r4,     8,          A_r4b4          ;[13,3]

								||      ROTL    .M2     B_b8r8,     8,          B_r8b8          ;[13,3]

								||      AND     .L2     B_b8y9r8y8, B_00FF00FF, B_y9y8          ;[13,3]

								||      AND     .D2     B_bAyBrAyA, B_00FF00FF, B_yByA          ;[13,3]

								||      RET             B_ret_addr


								; Delay slot 1: store luma 0..3 and luma C..F of the last group.

								        STDW    .D1T1   A_y3y2:A_y1y0,          *A_y            ;[14,3]

								||      STDW    .D2T2   B_yFyE:B_yDyC,          *B_y[1]         ;[14,3]

								||      PACK2   .S2     B_rEbE,     B_rCbC,     B_bEbC          ;[14,3]

								||      ROTL    .M2     B_bArA,     8,          B_rAbA          ;[14,3]


								; Delay slot 2.

								        PACKH2  .L1     A_r2b2,     A_r0b0,     A_r2r0          ;[15,3]


								; Delay slot 3: store luma 8..B; pack the Cb/Cr pairs.

								        STDW    .D2T2   B_yByA:B_y9y8,          *B_y            ;[16,3]

								||      PACK2   .S1     A_r2b2,     A_r0b0,     A_b2b0          ;[16,3]

								||      PACK2   .L1     A_r6b6,     A_r4b4,     A_b6b4          ;[16,3]

								||      PACK2   .S2     B_rAbA,     B_r8b8,     B_bAb8          ;[16,3]

								||      PACKH2  .L2     B_rAbA,     B_r8b8,     B_rAr8          ;[16,3]


								; Delay slot 4: store the eight Cb samples.

								        STDW    .D1T1   A_b6b4:A_b2b0,          *A_cb           ;[17,3]

								||      STDW    .D2T2   B_bEbC:B_bAb8,          *B_cb           ;[17,3]

								||      PACKH2  .L1     A_r6b6,     A_r4b4,     A_r6r4          ;[17,3]

								||      PACKH2  .S2     B_rEbE,     B_rCbC,     B_rErC          ;[17,3]


								; Delay slot 5: store the eight Cr samples and restore the caller's
								; GFPGFR that was saved in the prolog.

								        STDW    .D2T1   A_r6r4:A_r2r0,          *B_cr           ;[18,3]

								||      STDW    .D1T2   B_rErC:B_rAr8,          *A_cr           ;[18,3]

								||      MVC     .S2     B_gfpgfr,   GFPGFR


								* ========================================================================= *

								*   End of file:  img_yc_demux_le16.asm                                     *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *