c6416_sdk/imglib/conv_3x3.asm


								;* ======================================================================== *;

								;*  TEXAS INSTRUMENTS, INC.                                                 *;

								;*                                                                          *;

								;*  IMGLIB  DSP Image/Video Processing Library                              *;

								;*                                                                          *;

								;*      Release:        Revision 1.04b                                      *;

								;*      CVS Revision:   1.5     Sun Sep 29 03:32:19 2002 (UTC)              *;

								;*      Snapshot date:  23-Oct-2003                                         *;

								;*                                                                          *;

								;*  This library contains proprietary intellectual property of Texas        *;

								;*  Instruments, Inc.  The library and its source code are protected by     *;

								;*  various copyrights, and portions may also be protected by patents or    *;

								;*  other legal protections.                                                *;

								;*                                                                          *;

								;*  This software is licensed for use with Texas Instruments TMS320         *;

								;*  family DSPs.  This license was provided to you prior to installing      *;

								;*  the software.  You may review this license by consulting the file       *;

								;*  TI_license.PDF which accompanies the files in this library.             *;

								;* ------------------------------------------------------------------------ *;

								;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;

								;*                          All Rights Reserved.                            *;

								;* ======================================================================== *;


								;* ======================================================================== *;

								;*  Assembler compatibility shim for assembling 4.30 and later code on      *;

								;*  tools prior to 4.30.                                                    *;

								;* ======================================================================== *;


								        ; Record the assembler version in the substitution symbol $asmver.
								        ; Tools that do not define .ASSEMBLER_VERSION are treated as version 0.
								        .if $isdefed(".ASSEMBLER_VERSION")

								        .asg    .ASSEMBLER_VERSION, $asmver

								        .else

								        .asg    0,    $asmver

								        .endif


								        ; Everything below applies only when $asmver is below 430: the newer
								        ; call/return mnemonics are aliased to the plain branch instructions.
								        .if ($asmver < 430)


								        .asg    B,    CALL     ; Function Call

								        .asg    B,    RET      ; Return from a Function

								        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.


								        ; The BNOP-based aliases are defined only when .TMS320C6400 is set.
								        .if .TMS320C6400

								        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call

								        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return

								        .asg    BNOP, CRNOP    ; C64x Fn call w/, Call/Ret chaining via BNOP.

								        .endif


								        ; .asmfunc / .endasmfunc are substituted with empty text, so any use of
								        ; them expands to nothing on these older tools.
								        .asg    , .asmfunc     ; .func equivalent for hand-assembly code

								        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code


								        .endif


								;* ======================================================================== *;

								;*  End of assembler compatibility shim.                                    *;

								;* ======================================================================== *;


								* ========================================================================= *

								*   NAME                                                                    *

								*       IMG_conv_3x3    -- 3x3 convolution                                  *

								*                                                                           *

								*   REVISION DATE                                                           *

								*       19-May-2002                                                         *

								*                                                                           *

								*   USAGE                                                                   *

								*       This routine has the following C prototype:                         *

								*                                                                           *

								*       void IMG_conv_3x3   (    const unsigned char *restrict inptr,       *

								*                                  unsigned char *restrict outptr,          *

								*                                           int            x_dim,           *

								*                            const          char *restrict mask,            *

								*                                           int            shift)           *

								*                                                                           *

								*      The convolution routine accepts three rows of 'x_dim' input points   *

								*      and convolves them with a 3 by 3 mask.  A total of 'x_dim' outputs   *

								*      are written to the output array. The 'mask' array holds the 3 by 3   *

								*      array of signed 8-bit coefficients, stored row by row.               *

								*                                                                           *

								*   DESCRIPTION                                                             *

								*                                                                           *

								*      The convolution kernel accepts three rows of 'x_dim' input points    *

								*      and produces one output row of 'x_dim' points using the input mask   *

								*      of 3 by 3. The user defined shift value is used to shift the convo-  *

								*      lution value, down to the byte range. The convolution sum is also    *

								*      range limited to 0..255. The shift amount is non-zero for low pass   *

								*      filters, and zero for high pass and sharpening filters.              *

								*                                                                           *

								*                                                                           *

								*      The following is the C code model for the algorithm:                 *

								*                                                                           *

								*                                                                           *

								*      void IMG_conv_3x3(   const unsigned char *restrict inptr,            *

								*                                unsigned char *restrict outptr,            *

								*                                         int            x_dim,             *

								*                          const          char *restrict mask,              *

								*                                         int            shift)             *

								*      {                                                                    *

								*           const   unsigned char   *IN1,*IN2,*IN3;                         *

								*           unsigned char           *OUT;                                   *

								*                                                                           *

								*           short    pix10,  pix20,  pix30;                                 *

								*           short    mask10, mask20, mask30;                                *

								*                                                                           *

								*           int      sum,      sum00,  sum11;                               *

								*           int      i;                                                     *

								*           int      sum22,    j;                                           *

								*                                                                           *

								*           IN1      =   inptr;                                             *

								*           IN2      =   IN1 + x_dim;                                       *

								*           IN3      =   IN2 + x_dim;                                       *

								*           OUT      =   outptr;                                            *

								*                                                                           *

								*           for (j = 0; j < x_dim ; j++)                                    *

								*           {                                                               *

								*               sum = 0;                                                    *

								*                                                                           *

								*               for (i = 0; i < 3; i++)                                     *

								*               {                                                           *

								*                   pix10  =   IN1[i];                                      *

								*                   pix20  =   IN2[i];                                      *

								*                   pix30  =   IN3[i];                                      *

								*                                                                           *

								*                   mask10 =   mask[i];                                     *

								*                   mask20 =   mask[i + 3];                                 *

								*                   mask30 =   mask[i + 6];                                 *

								*                                                                           *

								*                   sum00  =   pix10 * mask10;                              *

								*                   sum11  =   pix20 * mask20;                              *

								*                   sum22  =   pix30 * mask30;                              *

								*                                                                           *

								*                   sum   +=   sum00 + sum11+ sum22;                        *

								*               }                                                           *

								*                                                                           *

								*               IN1++;                                                      *

								*               IN2++;                                                      *

								*               IN3++;                                                      *

								*                                                                           *

								*               sum = (sum >> shift);                                       *

								*               if ( sum <  0  )       sum = 0;                             *

								*               if ( sum > 255 )       sum = 255;                           *

								*               *OUT++   =       sum;                                       *

								*           }                                                               *

								*      }                                                                    *

								*                                                                           *

								*                                                                           *

								*   TECHNIQUES                                                              *

								*      The inner loop that computes the convolution sum is completely       *

								*      unrolled and 8 output pixels are computed together. The mask         *

								*      values are loaded and packed as double words.                        *

								*                                                                           *

								*   ASSUMPTIONS                                                             *

								*      x_dim must be a multiple of 8.                                       *

								*                                                                           *

								*   NOTES                                                                   *

								*      None                                                                 *

								*                                                                           *

								*   CYCLES                                                                  *

								*      33 + x_dim/8 * 9                                                     *

								*                                                                           *

								*   CODESIZE                                                                *

								*      724 bytes                                                            *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *


								        ; Place the routine in its own .text subsection and export the entry
								        ; symbol _IMG_conv_3x3.
								        .sect ".text:_conv_3x3"

								        .global _IMG_conv_3x3

								        ; Entry point.  The two comment lines below list the five arguments in
								        ; prototype order (inptr, outptr, x_dim, mask, shift) and the register
								        ; in which each one arrives.
								_IMG_conv_3x3:  ; A_INPTR, B_OUTPTR, A_inputcols, B_mask, A_shift

								                ; A4,      B4,       A6,          B6,     A8


								* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
								        ; Each .asg binds a symbolic name to a physical register; the code below
								        ; uses only the symbolic names.  Many names share one register, so the
								        ; schedule relies on those values never being live at the same time.
								        ; A10, A11 and B10 are among the registers used; the routine saves them
								        ; on the stack at entry and reloads them before returning.
								        ; The table is sorted by register name as text (A0, A1, A10, A11, ...).

								        .asg            A0,         A_h01word

								        .asg            A0,         A_mask01

								        .asg            A0,         A_mask20

								        .asg            A1,         A_h20word

								        .asg            A10,        A_line00

								        .asg            A11,        A_line01

								        .asg            A16,        A_pixel45

								        .asg            A16,        A_prodA6

								        .asg            A16,        A_sum1d

								        .asg            A16,        A_sum2g

								        .asg            A16,        A_SP

								        .asg            A17,        A_pixel67a

								        .asg            A17,        A_prodA7

								        .asg            A17,        A_sum3b

								        .asg            A18,        A_line04

								        .asg            A18,        A_line14

								        .asg            A18,        A_prodC8

								        .asg            A18,        A_sum0g

								        .asg            A18,        A_sum2h

								        .asg            A19,        A_line05

								        .asg            A19,        A_line15

								        .asg            A19,        A_prodC9

								        .asg            A19,        A_sum0e

								        .asg            A19,        A_sum1a

								        .asg            A2,         A_h02word

								        .asg            A20,        A_line02

								        .asg            A20,        A_pixel23a

								        .asg            A20,        A_prodA4

								        .asg            A21,        A_line03

								        .asg            A21,        A_prodA5

								        .asg            A22,        A_outword0

								        .asg            A22,        A_pixel01a

								        .asg            A22,        A_prodA10

								        .asg            A22,        A_prodA8

								        .asg            A23,        A_outword1

								        .asg            A23,        A_prodA11

								        .asg            A23,        A_prodA9

								        .asg            A23,        A_sum0f

								        .asg            A24,        A_pixel45a

								        .asg            A24,        A_prodC2

								        .asg            A25,        A_prodC3

								        .asg            A26,        A_prodC0

								        .asg            A26,        A_sum3a

								        .asg            A27,        A_prodC1

								        .asg            A28,        A_line13

								        .asg            A28,        A_prodA2

								        .asg            A29,        A_prodA3

								        .asg            A3,         A_h22word

								        .asg            A30,        A_prodA0

								        .asg            A30,        A_sum1c

								        .asg            A31,        A_mask

								        .asg            A31,        A_mask00

								        .asg            A31,        A_mask02

								        .asg            A31,        A_prodA1

								        .asg            A4,         A_INPTR

								        .asg            A4,         A_sum0h

								        .asg            A4,         A_sum1b

								        .asg            A4,         A_sum2f

								        .asg            A4,         A_sum3c

								        .asg            A5,         A_IN1

								        .asg            A6,         A_inputcols

								        .asg            A6,         A_pixel01

								        .asg            A7,         A_OPTR

								        .asg            A8,         A_shift

								        .asg            A9,         A_h00word

								        .asg            B0,         B_h21word

								        .asg            B0,         B_mask21

								        .asg            B0,         B_roundval1

								        .asg            B1,         B_h12word

								        .asg            B1,         B_mask12

								        .asg            B10,        B_h10word

								        .asg            B15,        B_SP

								        .asg            B16,        B_line22

								        .asg            B16,        B_prodC4

								        .asg            B16,        B_sum1g

								        .asg            B16,        B_sum2a

								        .asg            B16,        B_sum3e

								        .asg            B17,        B_line23

								        .asg            B17,        B_prodC5

								        .asg            B17,        B_sum0b

								        .asg            B17,        B_sum0d

								        .asg            B18,        B_line12

								        .asg            B18,        B_line24

								        .asg            B18,        B_prodB4

								        .asg            B19,        B_line25

								        .asg            B19,        B_prodB5

								        .asg            B19,        B_sum0a

								        .asg            B2,         B_h11word

								        .asg            B2,         B_mask11

								        .asg            B20,        B_line20

								        .asg            B20,        B_prodC10

								        .asg            B20,        B_sum0c

								        .asg            B21,        B_line21

								        .asg            B21,        B_prodC11

								        .asg            B22,        B_line10

								        .asg            B22,        B_sum1h

								        .asg            B23,        B_line11

								        .asg            B23,        B_sum2d

								        .asg            B23,        B_sum2e

								        .asg            B23,        B_sum3f

								        .asg            B24,        B_prodB0

								        .asg            B24,        B_prodC6

								        .asg            B24,        B_sum1f

								        .asg            B24,        B_sum3d

								        .asg            B25,        B_prodB1

								        .asg            B25,        B_prodC7

								        .asg            B26,        B_prodB8

								        .asg            B26,        B_sum2c

								        .asg            B27,        B_pixel23

								        .asg            B27,        B_prodB9

								        .asg            B28,        B_prodB2

								        .asg            B28,        B_prodB6

								        .asg            B28,        B_sum1e

								        .asg            B29,        B_prodB3

								        .asg            B29,        B_prodB7

								        .asg            B30,        B_inputcols

								        .asg            B30,        B_prodB10

								        .asg            B31,        B_mask10

								        .asg            B31,        B_mask22

								        .asg            B31,        B_prodB11

								        .asg            B31,        B_sum2b

								        .asg            B4,         B_h22word

								        .asg            B4,         B_OUTPTR

								        .asg            B5,         B_roundval

								        .asg            B6,         B_count1

								        .asg            B6,         B_mask

								        .asg            B6,         B_pixel67

								        .asg            B6,         B_sum3g

								        .asg            B6,         B_sum3h

								        .asg            B7,         B_IN3

								        .asg            B8,         B_IN2

								        .asg            B9,         B_count

								        .asg            B9,         B_csr

								        .asg            B9,         B_no_gie

								* ========================================================================= *


								        ; ---- Setup -------------------------------------------------------------
								        ; Saves CSR and the registers A10/A11/B10 on the stack, clears CSR bit 0,
								        ; derives the three row pointers, loads and replicates the nine mask
								        ; bytes, and issues the loads tagged [n,1].  The packets are scheduled
								        ; around load delay slots and shared registers, so their order matters.
								        ;
								        ; Packet 1: read CSR; store A11 at the incoming stack pointer and then
								        ; move B_SP down by 4 words; A_SP receives the stack pointer value from
								        ; before that decrement; A_IN1 = inptr; B_inputcols = x_dim.
								        MVC     .S2     CSR,        B_csr

								||      STW     .D2T1   A11,        *B_SP--[4]

								||      MV      .L1X    B_SP,       A_SP

								||      MV      .S1     A_INPTR,    A_IN1                       ;[10,0]

								||      MV      .L2X    A_inputcols,            B_inputcols     ;[ 7,0]


								        ; Packet 2: B_no_gie = CSR with bit 0 cleared (B_csr and B_no_gie are
								        ; the same register, B9).  The unmodified CSR goes to B_SP[2], A10 goes
								        ; one word below the old stack pointer, and B_IN2 = inptr + x_dim.
								        AND     .L2     B_csr,      -2,         B_no_gie

								||      STW     .D2T2   B_csr,      *+B_SP[2]

								||      STW     .D1T1   A10,        *-A_SP[1]

								||      ADD     .S2X    A_IN1,      B_inputcols,B_IN2           ;[12,0]


								        ; Packet 3: write the masked value back to CSR (per the B_no_gie name
								        ; this disables global interrupts), save B10 at B_SP[1], and set
								        ; B_IN3 = B_IN2 + x_dim.
								        MVC     .S2     B_no_gie,   CSR

								||      STW     .D2T2   B10,        *+B_SP[1]

								||      ADD     .L2     B_IN2,      B_inputcols,B_IN3           ;[13,0]


								        ; Load the nine mask bytes with LDB (sign-extending byte load), from
								        ; mask[8] down to mask[0].  A_mask is a copy of the mask pointer so
								        ; both data paths can load in parallel.
								        LDB     .D2T2   *+B_mask[8],            B_mask22        ;[ 3,0]

								||      MV      .L1X    B_mask,     A_mask                      ;[ 3,0]


								        LDB     .D2T2   *+B_mask[7],            B_mask21        ;[ 4,0]

								||      LDB     .D1T1   *+A_mask[6],            A_mask20        ;[ 4,0]


								        LDB     .D2T2   *+B_mask[5],            B_mask12        ;[ 5,0]

								||      LDB     .D1T1   *+A_mask[2],            A_mask02        ;[ 5,0]


								        ; The rounding constant is zero: B_roundval1 = 0 here, and the next
								        ; packet packs it into both halfwords of B_roundval.
								        MVK     .S2     0,          B_roundval1                 ;[ 6,0]

								||      LDB     .D2T2   *+B_mask[4],            B_mask11        ;[ 6,0]

								||      LDB     .D1T1   *+A_mask[1],            A_mask01        ;[ 6,0]


								        PACK2   .L2     B_roundval1,B_roundval1,B_roundval      ;[ 7,0]

								||      LDB     .D2T2   *+B_mask[3],            B_mask10        ;[ 7,0]

								||      LDB     .D1T1   *+A_mask[0],            A_mask00        ;[ 7,0]


								        ; From here each mask byte is replicated into all four byte lanes of a
								        ; word: PACK2 copies the low halfword into both halves, then PACKL4
								        ; keeps the low byte of each halfword.  Every PACK2 is issued five
								        ; packets after its LDB.  In parallel: B_count1 = x_dim >> 3,
								        ; A_OPTR = outptr, and the loads tagged [n,1] begin.
								        ; B_h22word and B_OUTPTR are both B4; the MV in this packet reads outptr
								        ; in the same packet in which the PACK2 overwrites B4.
								        SHRU    .S2     B_inputcols,3,          B_count1        ;[ 8,0]

								||      PACK2   .L2     B_mask22,   B_mask22,   B_h22word       ;[ 8,0]

								||      MV      .D1X    B_OUTPTR,   A_OPTR                      ;[12,0]

								||      LDNDW   .D2T2   *B_IN3++(1),            B_line21:B_line20       ;[ 1,1]


								        PACKL4  .L2     B_h22word,  B_h22word,  B_h22word       ;[ 9,0]

								||      PACK2   .S2     B_mask21,   B_mask21,   B_h21word       ;[ 9,0]

								||      PACK2   .L1     A_mask20,   A_mask20,   A_h20word       ;[ 9,0]

								||      LDNDW   .D2T2   *B_IN2++(2),            B_line11:B_line10       ;[ 2,1]


								        PACKL4  .L2     B_h21word,  B_h21word,  B_h21word       ;[10,0]

								||      PACKL4  .L1     A_h20word,  A_h20word,  A_h20word       ;[10,0]

								||      PACK2   .S2     B_mask12,   B_mask12,   B_h12word       ;[10,0]

								||      PACK2   .S1     A_mask02,   A_mask02,   A_h02word       ;[10,0]

								||      LDNDW   .D2T1   *B_IN2++(6),            A_line15:A_line14       ;[ 3,1]


								        PACKL4  .L2     B_h12word,  B_h12word,  B_h12word       ;[11,0]

								||      PACK2   .S2     B_mask11,   B_mask11,   B_h11word       ;[11,0]

								||      PACKL4  .L1     A_h02word,  A_h02word,  A_h02word       ;[11,0]

								||      PACK2   .S1     A_mask01,   A_mask01,   A_h01word       ;[11,0]

								||      LDNDW   .D2T2   *B_IN3++(1),            B_line23:B_line22       ;[ 4,1]


								        PACKL4  .L2     B_h11word,  B_h11word,  B_h11word       ;[12,0]

								||      PACK2   .S2     B_mask10,   B_mask10,   B_h10word       ;[12,0]

								||      PACKL4  .L1     A_h01word,  A_h01word,  A_h01word       ;[12,0]

								||      PACK2   .S1     A_mask00,   A_mask00,   A_h00word       ;[12,0]

								||      LDNDW   .D1T1   *A_IN1++(1),            A_line01:A_line00       ;[ 5,1]


								        ; B_count = x_dim/8 - 2 is the counter later tested by BDEC.  A_h22word
								        ; is an A-side copy of the replicated mask[8] word.
								        SUB     .S2     B_count1,   2,          B_count         ;[13,0]

								||      MV      .S1X    B_h22word,  A_h22word                   ;[13,0]

								||      PACKL4  .L2     B_h10word,  B_h10word,  B_h10word       ;[13,0]

								||      PACKL4  .L1     A_h00word,  A_h00word,  A_h00word       ;[13,0]

								||      LDNDW   .D1T1   *A_IN1++(1),            A_line03:A_line02       ;[ 6,1]

								* =========================== PIPE LOOP PROLOG ============================ *
								        ; The prolog issues the multiplies and first sums tagged [n,1] while
								        ; already issuing the loads tagged [n,2].
								        ;
								        ; The prolog is collapsed into the kernel.  Starting with the third
								        ; packet, each packet issues a "B .S1" to a kernel label.  A branch
								        ; takes effect after five delay slots, so once the last packet before
								        ; L2_P has run, execution visits L7, L8 + 12, L9 and L1 one packet at
								        ; a time, then L2_P, then L3 onward inside the kernel.
								        ; "L8 + 12" and "L8 + 4" enter the L8 packet past its first three
								        ; instructions and past its first instruction respectively, so the
								        ; STNDW at L8 is skipped on those two passes.
								        ; NOTE(review): presumably this is because no finished output exists
								        ; yet on those passes - confirm against the original schedule.
								        ; The "label + byte offset" targets depend on the order of the
								        ; instructions inside the L8 packet; do not reorder that packet.
								        MPYUS4  .M2     B_line11,   B_h10word,  B_prodB3:B_prodB2       ;[ 7,1]


								        MPYSU4  .M1X    A_h20word,  B_line20,   A_prodC1:A_prodC0       ;[ 8,1]

								||      MPYSU4  .M2X    B_h12word,  A_line15,   B_prodB11:B_prodB10     ;[ 8,1]

								||      SHLMB   .S1     A_line14,   A_line15,   A_line13        ;[ 8,1]

								||      SHRMB   .S2     B_line11,   B_line10,   B_line12        ;[ 8,1]

								||      LDNDW   .D1T1   *A_IN1++(6),            A_line05:A_line04       ;[ 8,1]


								; -

								        MPYUS4  .M2     B_line23,   B_h21word,  B_prodC7:B_prodC6       ;[ 9,1]

								||      MPYSU4  .M1X    A_h20word,  B_line21,   A_prodC3:A_prodC2       ;[ 9,1]

								||      LDNDW   .D2T2   *B_IN3++(6),            B_line25:B_line24       ;[ 9,1]

								||      B       .S1     L7


								        MPYSU4  .M2X    B_h12word,  A_line14,   B_prodB9:B_prodB8       ;[10,1]

								||      MPYUS4  .M1     A_line00,   A_h00word,  A_prodA1:A_prodA0       ;[10,1]

								||      LDNDW   .D2T2   *B_IN3++(1),            B_line21:B_line20       ;[ 1,2]

								||      B       .S1     L8 + 12


								        MPYUS4  .M2     B_line22,   B_h21word,  B_prodC5:B_prodC4       ;[11,1]

								||      MPYUS4  .M1     A_line01,   A_h00word,  A_prodA3:A_prodA2       ;[11,1]

								||      LDNDW   .D2T2   *B_IN2++(2),            B_line11:B_line10       ;[ 2,2]

								||      B       .S1     L9


								        MPYUS4  .M2     B_line10,   B_h10word,  B_prodB1:B_prodB0       ;[12,1]

								||      MPYUS4  .M1     A_line02,   A_h01word,  A_prodA5:A_prodA4       ;[12,1]

								||      LDNDW   .D2T1   *B_IN2++(6),            A_line15:A_line14       ;[ 3,2]

								||      B       .S1     L1


								        ADD2    .L2     B_roundval, B_prodC6,   B_sum2a         ;[13,1]

								||      MPYUS4  .M2     B_line12,   B_h11word,  B_prodB5:B_prodB4       ;[13,1]

								||      MPYUS4  .M1     A_line04,   A_h02word,  A_prodA9:A_prodA8       ;[13,1]

								||      LDNDW   .D2T2   *B_IN3++(1),            B_line23:B_line22       ;[ 4,2]

								||      B       .S1     L2_P


								        ADD2    .S2     B_prodB11,  B_roundval, B_sum3d         ;[14,1]

								||      ADD2    .D2     B_sum2a,    B_prodB2,   B_sum2b         ;[14,1]

								||      ADD2    .L2     B_prodB9,   B_roundval, B_sum1e         ;[14,1]

								||      MPYUS4  .M2X    A_line13,   B_h11word,  B_prodB7:B_prodB6       ;[14,1]

								||      MPYUS4  .M1     A_line03,   A_h01word,  A_prodA7:A_prodA6       ;[14,1]

								||      LDNDW   .D1T1   *A_IN1++(1),            A_line01:A_line00       ;[ 5,2]

								||      B       .S1     L3

								        ; The four branches issued first land next, one per cycle:
								;   L7, L8+12, L9, L1


								        ; L2_P holds the same instructions as the second kernel packet except
								        ; for that packet's SHR2 of B_pixel67, and adds the branch to L8 + 4.
								L2_P:   ADD2    .S2     B_sum3f,    B_prodB7,   B_sum3g         ;[19,1]

								||      ADD2    .D1     A_prodA11,  A_prodA7,   A_sum3a         ;[19,1]

								||      ADD2    .L2     B_sum2c,    B_prodB10,  B_sum2d         ;[19,1]

								||      ADD2    .L1     A_sum0e,    A_prodC0,   A_sum0f         ;[19,1]

								||      MPYSU4  .M2X    B_h12word,  A_line14,   B_prodB9:B_prodB8       ;[10,2]

								||      MPYUS4  .M1     A_line00,   A_h00word,  A_prodA1:A_prodA0       ;[10,2]

								||      LDNDW   .D2T2   *B_IN3++(1),            B_line21:B_line20       ;[ 1,3]

								||      B       .S1     L8 + 4

								        ; Packets visited after L2_P, in order (L4-L6 are the unlabeled kernel
								        ; packets between L3 and L7):
								;   L3, L4, L5, L6, L7, L8+4, L9

								* =========================== PIPE LOOP KERNEL ============================ *
								        ; Nine execute packets per pass; each pass stores one doubleword
								        ; (eight output bytes) with the STNDW at L8.
								        ; The trailing ;[c,i] tags appear to give the cycle within a single
								        ; iteration's schedule and the iteration index - TODO confirm.
								        ;
								        ; Work mixed into each pass:
								        ;   LDNDW          non-aligned doubleword loads from the three row
								        ;                  pointers; per pass each pointer advances by 8 bytes
								        ;                  in total (IN1: 1+1+6, IN2: 2+6, IN3: 1+1+6).
								        ;   SHRMB/SHLMB    build B_line12 / A_line13 from the row-2 words
								        ;                  already loaded instead of loading them.
								        ;   MPYUS4/MPYSU4  four 8-bit x 8-bit products per instruction, using a
								        ;                  replicated mask word; results are 16-bit values
								        ;                  in a register pair.
								        ;   ADD2           accumulate the products as packed 16-bit pairs,
								        ;                  starting from B_roundval.
								        ;   SHR2           shift each 16-bit sum right by A_shift.
								        ;   SPACKU4        saturate four 16-bit values to unsigned bytes.
								        ;   STNDW          store the eight packed bytes, A_OPTR += 8.
								        ; NOTE(review): ADD2 does not saturate, so sums are kept in 16-bit
								        ; lanes, whereas the C model in the file header accumulates in int.
								        ; Confirm the mask/shift ranges callers use keep sums within 16 bits.
								loop:

								L1:     ADD2    .D1     A_sum2f,    A_prodA6,   A_sum2g         ;[18,2]

								||      ADD2    .L2     B_sum2b,    B_prodB6,   B_sum2c         ;[18,2]

								||      ADD2    .L1     A_sum1a,    A_prodC1,   A_sum1b         ;[18,2]

								||      ADD2    .S1     A_prodA4,   A_prodA8,   A_sum0e         ;[18,2]

								||      ADD2    .S2     B_sum0c,    B_prodB8,   B_sum0d         ;[18,2]

								||      MPYUS4  .M2     B_line23,   B_h21word,  B_prodC7:B_prodC6       ;[ 9,3]

								||      MPYSU4  .M1X    A_h20word,  B_line21,   A_prodC3:A_prodC2       ;[ 9,3]

								||      LDNDW   .D2T2   *B_IN3++(6),            B_line25:B_line24       ;[ 9,3]


								        SHR2    .S1X    B_pixel67,  A_shift,    A_pixel67a      ;[28,1]

								||      ADD2    .S2     B_sum3f,    B_prodB7,   B_sum3g         ;[19,2]

								||      ADD2    .D1     A_prodA11,  A_prodA7,   A_sum3a         ;[19,2]

								||      ADD2    .L2     B_sum2c,    B_prodB10,  B_sum2d         ;[19,2]

								||      ADD2    .L1     A_sum0e,    A_prodC0,   A_sum0f         ;[19,2]

								||      MPYSU4  .M2X    B_h12word,  A_line14,   B_prodB9:B_prodB8       ;[10,3]

								||      MPYUS4  .M1     A_line00,   A_h00word,  A_prodA1:A_prodA0       ;[10,3]

								||      LDNDW   .D2T2   *B_IN3++(1),            B_line21:B_line20       ;[ 1,4]


								L3:     SPACKU4 .S1     A_pixel67a, A_pixel45a, A_outword1      ;[29,1]

								||      ADD2    .L2     B_sum3g,    B_prodC11,  B_sum3h         ;[20,2]

								||      ADD2    .L1     A_sum3a,    A_prodC3,   A_sum3b         ;[20,2]

								||      ADD2    .S2     B_sum2d,    B_prodC10,  B_sum2e         ;[20,2]

								||      ADD2    .D1     A_sum0f,    A_prodC8,   A_sum0g         ;[20,2]

								||      MPYUS4  .M2     B_line22,   B_h21word,  B_prodC5:B_prodC4       ;[11,3]

								||      MPYUS4  .M1     A_line01,   A_h00word,  A_prodA3:A_prodA2       ;[11,3]

								||      LDNDW   .D2T2   *B_IN2++(2),            B_line11:B_line10       ;[ 2,4]


								        ; BDEC branches back to "loop" while B_count is not negative and
								        ; decrements the counter; the branch takes effect five packets later,
								        ; i.e. after the L9 packet.
								        BDEC    .S2     loop,       B_count                     ;[30,1]

								||      SHR2    .S1X    B_pixel23,  A_shift,    A_pixel23a      ;[30,1]

								||      ADD2    .L2     B_sum1f,    B_prodB1,   B_sum1g         ;[21,2]

								||      ADD2    .D1     A_sum1b,    A_prodC9,   A_sum1c         ;[21,2]

								||      ADD2    .L1     A_sum0g,    A_prodA0,   A_sum0h         ;[21,2]

								||      MPYUS4  .M2     B_line10,   B_h10word,  B_prodB1:B_prodB0       ;[12,3]

								||      MPYUS4  .M1     A_line02,   A_h01word,  A_prodA5:A_prodA4       ;[12,3]

								||      LDNDW   .D2T1   *B_IN2++(6),            A_line15:A_line14       ;[ 3,4]


								        SHR2    .S1     A_pixel01,  A_shift,    A_pixel01a      ;[31,1]

								||      ADD2    .D1     A_sum2g,    A_prodA10,  A_sum2h         ;[22,2]

								||      ADD2    .S2     B_sum1g,    B_prodB5,   B_sum1h         ;[22,2]

								||      ADD2    .L1     A_sum1c,    A_prodA1,   A_sum1d         ;[22,2]

								||      ADD2    .L2     B_roundval, B_prodC6,   B_sum2a         ;[13,3]

								||      MPYUS4  .M2     B_line12,   B_h11word,  B_prodB5:B_prodB4       ;[13,3]

								||      MPYUS4  .M1     A_line04,   A_h02word,  A_prodA9:A_prodA8       ;[13,3]

								||      LDNDW   .D2T2   *B_IN3++(1),            B_line23:B_line22       ;[ 4,4]


								        ; The A-side and B-side partial sums are merged through the cross
								        ; paths (.L1X, .S2X, .L2X) in this packet and the two that follow,
								        ; producing A_pixel01, B_pixel23, A_pixel45 and B_pixel67.
								        ADD2    .S1     A_sum3b,    A_prodA3,   A_sum3c         ;[23,2]

								||      ADD2    .L1X    B_sum0d,    A_sum0h,    A_pixel01       ;[23,2]

								||      ADD2    .S2     B_prodB11,  B_roundval, B_sum3d         ;[14,3]

								||      ADD2    .D2     B_sum2a,    B_prodB2,   B_sum2b         ;[14,3]

								||      ADD2    .L2     B_prodB9,   B_roundval, B_sum1e         ;[14,3]

								||      MPYUS4  .M2X    A_line13,   B_h11word,  B_prodB7:B_prodB6       ;[14,3]

								||      MPYUS4  .M1     A_line03,   A_h01word,  A_prodA7:A_prodA6       ;[14,3]

								||      LDNDW   .D1T1   *A_IN1++(1),            A_line01:A_line00       ;[ 5,4]


								L7:     SPACKU4 .S1     A_pixel23a, A_pixel01a, A_outword0      ;[33,1]

								||      ADD2    .L1X    A_sum2h,    B_sum2e,    A_pixel45       ;[24,2]

								||      ADD2    .S2X    B_sum1h,    A_sum1d,    B_pixel23       ;[24,2]

								||      ADD2    .L2     B_sum3d,    B_prodC7,   B_sum3e         ;[15,3]

								||      ADD2    .D2     B_roundval, B_prodC4,   B_sum0a         ;[15,3]

								||      MPYSU4  .M2     B_h22word,  B_line25,   B_prodC11:B_prodC10     ;[15,3]

								||      MPYUS4  .M1     A_line05,   A_h02word,  A_prodA11:A_prodA10     ;[15,3]

								||      LDNDW   .D1T1   *A_IN1++(1),            A_line03:A_line02       ;[ 6,4]

								        ; The prolog branches to "L8 + 12" and "L8 + 4", which address the
								        ; fourth and second instruction of this packet.  Keep the STNDW, SHR2
								        ; and ADD2 .L2X as the first three instructions, in this order.
								L8:

								        STNDW   .D1T1   A_outword1:A_outword0,  *A_OPTR++(8)    ;[34,1]

								||      SHR2    .S1     A_pixel45,  A_shift,    A_pixel45a      ;[25,2]

								||      ADD2    .L2X    B_sum3h,    A_sum3c,    B_pixel67       ;[25,2]

								||      ADD2    .L1     A_prodC2,   A_prodA2,   A_sum2f         ;[16,3]

								||      ADD2    .S2     B_sum1e,    B_prodC5,   B_sum1f         ;[16,3]

								||      ADD2    .D2     B_sum0a,    B_prodB0,   B_sum0b         ;[16,3]

								||      MPYSU4  .M1X    A_h22word,  B_line24,   A_prodC9:A_prodC8       ;[16,3]

								||      MPYUS4  .M2     B_line11,   B_h10word,  B_prodB3:B_prodB2       ;[ 7,4]


								L9:     ADD2    .D2     B_sum3e,    B_prodB3,   B_sum3f         ;[17,3]

								||      ADD2    .L1     A_prodA9,   A_prodA5,   A_sum1a         ;[17,3]

								||      ADD2    .L2     B_sum0b,    B_prodB4,   B_sum0c         ;[17,3]

								||      MPYSU4  .M1X    A_h20word,  B_line20,   A_prodC1:A_prodC0       ;[ 8,4]

								||      MPYSU4  .M2X    B_h12word,  A_line15,   B_prodB11:B_prodB10     ;[ 8,4]

								||      SHLMB   .S1     A_line14,   A_line15,   A_line13        ;[ 8,4]

								||      SHRMB   .S2     B_line11,   B_line10,   B_line12        ;[ 8,4]

								||      LDNDW   .D1T1   *A_IN1++(6),            A_line05:A_line04       ;[ 8,4]

								* =========================== PIPE LOOP EPILOG ============================ *
								        ; Finishes the shift / pack / store steps ([28]..[34]) of the last
								        ; iteration, reloads the saved CSR, A10, A11 and B10 from the stack,
								        ; restores the stack pointer and returns through B3.
								        ;
								        ; Timing constraints (seven packets in total):
								        ;   * RET is issued in packet 2; its five delay slots are packets 3..7,
								        ;     so control leaves right after the final STNDW.
								        ;   * A load has four delay slots.  B_csr is loaded in packet 1 and is
								        ;     therefore first readable in packet 6.  The MVC that restores CSR
								        ;     must not be issued earlier: B_csr shares B9 with B_count, so an
								        ;     earlier MVC would write the exhausted loop counter into CSR.
								        SHR2    .S1X    B_pixel67,  A_shift,    A_pixel67a      ;[28,4]

								||      LDW     .D2T2   *+B_SP[2],  B_csr

								        ; A_SP takes the current stack pointer so that B10 can be reloaded
								        ; through .D1 in the next packet while .D2 reloads A11.
								        SPACKU4 .S1     A_pixel67a, A_pixel45a, A_outword1      ;[29,4]

								||      RET     .S2     B3

								||      LDW     .D2T1   *+B_SP[3],  A10

								||      MV      .L1X    B_SP,       A_SP

								        ; The pre-increment by 4 words returns B_SP to its value at entry and
								        ; reloads A11 from that address.
								        SHR2    .S1X    B_pixel23,  A_shift,    A_pixel23a      ;[30,4]

								||      LDW     .D2T1   *++B_SP[4], A11

								||      LDW     .D1T2   *+A_SP[1],  B10


								        SHR2    .S1     A_pixel01,  A_shift,    A_pixel01a      ;[31,4]

								        ; Idle packet [32]: keeps the number of packets after RET at five while
								        ; the load of B_csr completes.
								        NOP

								        ; Packet 6: B_csr now holds the value loaded from the stack, so CSR can
								        ; be restored here on the otherwise unused .S2 unit.
								        SPACKU4 .S1     A_pixel23a, A_pixel01a, A_outword0      ;[33,4]

								||      MVC     .S2     B_csr,      CSR


								        STNDW   .D1T1   A_outword1:A_outword0,  *A_OPTR++(8)    ;[34,4]


								* ========================================================================= *

								*   End of file:  img_conv_3x3.asm                                          *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *