c6416_sdk/imglib/wave_horz.asm


								;* ======================================================================== *;

								;*  TEXAS INSTRUMENTS, INC.                                                 *;

								;*                                                                          *;

								;*  IMGLIB  DSP Image/Video Processing Library                              *;

								;*                                                                          *;

								;*      Release:        Revision 1.04b                                      *;

								;*      CVS Revision:   1.11    Sun Sep 29 03:32:31 2002 (UTC)              *;

								;*      Snapshot date:  23-Oct-2003                                         *;

								;*                                                                          *;

								;*  This library contains proprietary intellectual property of Texas        *;

								;*  Instruments, Inc.  The library and its source code are protected by     *;

								;*  various copyrights, and portions may also be protected by patents or    *;

								;*  other legal protections.                                                *;

								;*                                                                          *;

								;*  This software is licensed for use with Texas Instruments TMS320         *;

								;*  family DSPs.  This license was provided to you prior to installing      *;

								;*  the software.  You may review this license by consulting the file       *;

								;*  TI_license.PDF which accompanies the files in this library.             *;

								;* ------------------------------------------------------------------------ *;

								;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;

								;*                          All Rights Reserved.                            *;

								;* ======================================================================== *;


								;* ======================================================================== *;

								;*  Assembler compatibility shim for assembling 4.30 and later code on      *;

								;*  tools prior to 4.30.                                                    *;

								;* ======================================================================== *;


								; Record the assembler version in the substitution symbol $asmver.
								; When .ASSEMBLER_VERSION is not defined, $asmver is set to 0 so that
								; the "< 430" test below selects the compatibility definitions.
								        .if $isdefed(".ASSEMBLER_VERSION")

								        .asg    .ASSEMBLER_VERSION, $asmver

								        .else

								        .asg    0,    $asmver

								        .endif


								; For tools older than 4.30, define the call/return pseudo-mnemonics
								; as plain branch mnemonics.  Within the code visible in this file only
								; RET is used (in the loop epilog).
								        .if ($asmver < 430)


								        .asg    B,    CALL     ; Function Call

								        .asg    B,    RET      ; Return from a Function

								        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.


								; The BNOP-based variants are only defined when assembling for C64x.
								        .if .TMS320C6400

								        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call

								        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return

								        .asg    BNOP, CRNOP    ; C64x Fn call w/, Call/Ret chaining via BNOP.

								        .endif


								; .asmfunc / .endasmfunc are defined as empty substitutions, so any use
								; of them expands to nothing on these older tools.
								        .asg    , .asmfunc     ; .func equivalent for hand-assembly code

								        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code


								        .endif


								;* ======================================================================== *;

								;*  End of assembler compatibility shim.                                    *;

								;* ======================================================================== *;


								* ========================================================================= *

								*   TEXAS INSTRUMENTS, INC.                                                 *

								*                                                                           *

								*   NAME                                                                    *

								*       IMG_wave_horz : 1D Wavelet Transform                                *

								*                                                                           *

								*   REVISION DATE                                                           *

								*       21-Jan-1999                                                         *

								*                                                                           *

								*   USAGE                                                                   *

								*       This routine is C-callable and can be called as:                    *

								*                                                                           *

								*           void IMG_wave_horz                                              *

								*           (                                                               *

								*               const short *restrict in_data,  /* Row of input pixels  */  *

								*               const short *restrict qmf,      /* Low-pass QMF filter  */  *

								*               const short *restrict mqmf,     /* High-pass QMF filter */  *

								*               short       *restrict out_data, /* Row of output data   */  *

								*               int                   cols      /* Length of input.     */  *

								*           );                                                              *

								*                                                                           *

								*   DESCRIPTION                                                             *

								*       This kernel performs a 1D Periodic Orthogonal Wavelet               *

								*       decomposition.  This also performs the row decomposition in a       *

								*       2D wavelet transform.  An input signal x[n] is low pass and         *

								*       high pass filtered and decimated by two.  This results in a         *

								*       reference signal r1[n] which is the decimated output obtained       *

								*       by dropping the odd samples of the low pass filtered output and     *

								*       a detail signal d[n] obtained by dropping the odd samples of        *

								*       the high-pass output.  A circular convolution algorithm is          *

								*       implemented and hence the wavelet transform is periodic.  The       *

								*       reference signal and the detail signal are half the size of the     *

								*       original signal.  The reference signal may then be iterated         *

								*       again to perform another scale of multi-resolution analysis.        *

								*                                                                           *

								*   TECHNIQUES                                                              *

								*       The main idea in optimizing the code is to issue one set of         *

								*       reads to the x array and to perform low-pass and high pass          *

								*       filtering together and to perform the filtering operations          *

								*       together to maximize the number of multiplies.  The last 6          *

								*       elements of the low-pass filter and the first 6 elements of the     *

								*       high pass filter use the same input.  This is used to               *

								*       appropriately change the output pointer to the low pass filter      *

								*       after 6 iterations.  However for the first six iterations           *

								*       pointer wrap-around can occur and hence this creates a              *

								*       dependency.  Pre-reading those 6 values outside the array           *

								*       prevents the checks that introduce this dependency.  In addition    *

								*       the input data is read as word wide quantities and the low-pass     *

								*       and high-pass filter coefficients are stored in registers           *

								*       allowing for the input loop to be completely unrolled.  Thus        *

								*       the assembly code has only one loop.  A predication register is     *

								*       used to reset the low-pass output pointer after three               *

								*       iterations.  The merging of the loops in this fashion allows for    *

								*       the maximum number of multiplies with the minimum number of reads.  *

								*                                                                           *

								*   ASSUMPTIONS                                                             *

								*       This kernel assumes that the # of filter taps for the qmf and       *

								*       mqmf is 8.                                                          *

								*                                                                           *

								*       Both the filters are assumed to be double-word aligned and have     *

								*       8 taps.                                                             *

								*                                                                           *

								*       The input line is assumed to be word aligned so that LDWs           *

								*       may be performed.                                                   *

								*                                                                           *

								*       This code assumes that filter coefficients are maintained as        *

								*       shorts in Q15 format.                                               *

								*                                                                           *

								*       It also assumes that input data is an array of shorts (16 bit)      *

								*       (The input is assumed to be an array of shorts to allow for         *

								*       re-using this kernel to perform Multi Resolution Analysis as        *

								*       the output of this code will feedback again as input in the         *

								*       next stage.)                                                        *

								*                                                                           *

								*       Since the transform is a dyadic wavelet cols should be a power      *

								*       of 2. Cols must also be >=8.                                        *

								*                                                                           *

								*                                                                           *

								*   MEMORY NOTE                                                             *

								*       This code has no bank conflicts.                                    *

								*                                                                           *

								*       This code is ENDIAN Neutral.                                        *

								*                                                                           *

								*                                                                           *

								*   NOTES                                                                   *

								*       This code masks interrupts for nearly its entire duration.  As      *

								*       a result, the code is interrupt tolerant but not                    *

								*       interruptible.                                                      *

								*                                                                           *

								*       This code can implement the Daubechies D4 filterbank for            *

								*       analysis with 4 vanishing moments.  The length of the analyzing     *

								*       low-pass and high pass filters is 8 in that case.                   *

								*                                                                           *

								*   C CODE                                                                  *

								*                                                                           *

								*       This is the C equivalent of the assembly code without restrictions: *

								*       Note that the assembly code is hand optimized and restrictions      *

								*       apply as noted under "ASSUMPTIONS".                                 *

								*                                                                           *

								*           void IMG_wave_horz                                              *

								*           (                                                               *

								*               const short *restrict in_data,  /* Row of input pixels  */  *

								*               const short *restrict qmf,      /* Low-pass QMF filter  */  *

								*               const short *restrict mqmf,     /* High-pass QMF filter */  *

								*               short       *restrict out_data, /* Row of output data   */  *

								*               int                   cols      /* Length of input.     */  *

								*           )                                                               *

								*                                                                           *

								*           {                                                               *

								*               int    i, res, iters;                                       *

								*               int    j, sum, prod;                                        *

								*               short *xptr  = in_data;                                     *

								*               short *yptr  = out_data;                                    *

								*               short *x_end = &in_data[cols - 1];                          *

								*               short  xdata, hdata;                                        *

								*               short *xstart;                                              *

								*               short *filt_ptr;                                            *

								*               int    M = 8;                                               *

								*                                                                           *

								*               /* ------------------------------------------------- */     *

								*               /*  Set our loop trip count and starting x posn.     */     *

								*               /*  'xstart' is used in the high-pass filter loop.   */     *

								*               /* ------------------------------------------------- */     *

								*               iters  = cols;                                              *

								*               xstart = in_data + (cols - M)  + 2;                         *

								*                                                                           *

								*               /* ------------------------------------------------- */     *

								*               /*  Low pass filter.  Iterate for cols/2 iterations  */     *

								*               /*  generating cols/2 low pass sample points with    */     *

								*               /*  the low-pass quadrature mirror filter.           */     *

								*               /* ------------------------------------------------- */     *

								*               for (i = 0; i < iters; i += 2)                              *

								*               {                                                           *

								*                   /* --------------------------------------------- */     *

								*                   /*  Initialize our sum to the rounding value     */     *

								*                   /*  and reset our pointer.                       */     *

								*                   /* --------------------------------------------- */     *

								*                   sum  = Qr;                                              *

								*                   xptr = in_data + i;                                     *

								*                                                                           *

								*                   /* --------------------------------------------- */     *

								*                   /*  Iterate over the taps in our QMF.            */     *

								*                   /* --------------------------------------------- */     *

								*                   for (j = 0; j < M; j++)                                 *

								*                   {                                                       *

								*                       xdata = *xptr++;                                    *

								*                       hdata =  qmf[j];                                    *

								*                       prod  =  xdata * hdata;                             *

								*                       sum  += prod;                                       *

								*                       if (xptr > x_end) xptr = in_data;                   *

								*                   }                                                       *

								*                                                                           *

								*                   /* --------------------------------------------- */     *

								*                   /*  Adjust the Qpt of our sum and store result.  */     *

								*                   /* --------------------------------------------- */     *

								*                   res    = (sum >> Qpt);                                  *

								*                   *out_data++ = res;                                      *

								*               }                                                           *

								*                                                                           *

								*                                                                           *

								*               /* ------------------------------------------------- */     *

								*               /*  High pass filter.  Iterate for cols/2 iters      */     *

								*               /*  generating cols/2 high pass sample points with   */     *

								*               /*  the high-pass quadrature mirror filter.          */     *

								*               /* ------------------------------------------------- */     *

								*               for (i = 0; i < iters ; i+=2)                               *

								*               {                                                           *

								*                   /* --------------------------------------------- */     *

								*                   /*  Initialize our sum and filter pointer.       */     *

								*                   /* --------------------------------------------- */     *

								*                   sum  = Qr;                                              *

								*                   filt_ptr  = mqmf + (M - 1);                             *

								*                                                                           *

								*                   /* --------------------------------------------- */     *

								*                   /*  Set up our data pointer.  This is slightly   */     *

								*                   /*  more complicated due to how the data wraps   */     *

								*                   /*  around the edge of the buffer.               */     *

								*                   /* --------------------------------------------- */     *

								*                   xptr = xstart;                                          *

								*                   xstart += 2;                                            *

								*                   if (xstart > x_end) xstart = in_data;                   *

								*                                                                           *

								*                   /* --------------------------------------------- */     *

								*                   /*  Iterate over the taps in our QMF.            */     *

								*                   /* --------------------------------------------- */     *

								*                   for ( j = 0; j < M; j++)                                *

								*                   {                                                       *

								*                       xdata = *xptr++;                                    *

								*                       hdata = *filt_ptr--;                                *

								*                       prod  = xdata * hdata;                              *

								*                       if (xptr > x_end) xptr = in_data;                   *

								*                       sum  += prod;                                       *

								*                   }                                                       *

								*                                                                           *

								*                   /* --------------------------------------------- */     *

								*                   /*  Adjust the Qpt of our sum and store result.  */     *

								*                   /* --------------------------------------------- */     *

								*                   res = (sum >> Qpt);                                     *

								*                   *out_data++ =  res;                                     *

								*               }                                                           *

								*           }                                                               *

								*                                                                           *

								*   CYCLES                                                                  *

								*       cycles = cols * 2 + 25.                                             *

								*                                                                           *

								*       For cols = 256, cycles = 537.                                       *

								*       For cols = 512, cycles = 1049.                                      *

								*                                                                           *

								*   CODESIZE                                                                *

								*       360 bytes                                                           *

								*                                                                           *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *


								; Place the routine in its own subsection of .text and export the
								; C-callable entry point.
								        .sect ".text:_wave_horz"

								        .global _IMG_wave_horz

								_IMG_wave_horz:


								; Qr is loaded into A_qr by the setup code and added to every
								; accumulated sum before the ">> 15" that produces each output sample
								; (16384 = 1 << 14).
								Qr              .set     16384

								; M is the number of filter taps; it is only used to derive the
								; (cols - (M - 2)) starting offset in the first setup instruction.
								M               .set     8

								; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
								; Names used by the setup code that follows.
								;
								; A4, B4, A6, B6 and A8 are read by the setup code as the incoming
								; in_data, qmf, mqmf, out_data and cols arguments respectively
								; (A_iptr, B_qmf, A_filter, B_yptr_l1, A_ish_x_dim).
								;
								; Several names deliberately share one physical register, so their
								; live ranges must not overlap:
								;   A4: A_iptr      / A_h01        B4: B_qmf / B_l10
								;   A6: A_filter    / A_qr         A5: A_h23 / A_optr
								;   A8: A_ish_x_dim / A_yptr_h     A7: A_offset / A_x76
								;   A0: A_i         / A_qmf
								; B_yptr_l1 (B6) is assigned twice with the same register.

								        .asg            A4,         A_iptr

								        .asg            B4,         B_qmf

								        .asg            A6,         A_filter

								        .asg            B6,         B_yptr_l1

								        .asg            A8,         A_ish_x_dim

								        .asg            A6,         A_qr

								        .asg            B20,        B_iptr

								        .asg            B7,         B_h67

								        .asg            B16,        B_h45

								        .asg            A5,         A_h23

								        .asg            A4,         A_h01

								        .asg            B4,         B_l10

								        .asg            B5,         B_l32

								        .asg            A2,         A_l54

								        .asg            A3,         A_l76

								        .asg            B19,        B_x10

								        .asg            B18,        B_x32

								        .asg            A9,         A_x54

								        .asg            A8,         A_yptr_h

								        .asg            B17,        B_yptr_l0

								        .asg            B6,         B_yptr_l1

								        .asg            A0,         A_i

								        .asg            B0,         B_p

								        .asg            A17,        A_h32

								        .asg            A16,        A_h10

								        .asg            B9,         B_h76

								        .asg            B8,         B_h54

								        .asg            A7,         A_offset

								        .asg            A1,         A_xiptr

								        .asg            B2,         B_xiptr

								        .asg            A5,         A_optr

								        .asg            A7,         A_x76

								        .asg            B21,        B_prod_l10

								        .asg            B29,        B_csr

								        .asg            B28,        B_no_gie

								        .asg            A0,         A_qmf

								; ============================================================================


								; ------------------------------------------------------------------
								; Setup: save CSR and disable interrupts, load both 8-tap filters
								; into registers, pre-read the input words at the end of the row and
								; the first word at its start, and derive the three output pointers.
								; Each group of "||" lines is one execute packet; the register
								; aliases listed in the assignment table rely on this exact order.
								; ------------------------------------------------------------------

								; A_offset = cols - (M - 2) = cols - 6.  Save CSR.  Load the second
								; double word of the high-pass filter (taps 4..7).
								        SUB     .L1     A_ish_x_dim,    M - 2,        A_offset      ; x-M+2

								||      MVC     .S2     CSR,            B_csr

								||      LDDW    .D1T2   *A_filter[1],   B_h76:B_h54                 ; High


								; Load high-pass taps 0..3.  B_no_gie = saved CSR with bit 0 cleared.
								; Copy the input pointer to the B side for the streaming loads.
								        LDDW    .D1T1   *A_filter[0],   A_h32:A_h10                 ; High

								||      AND     .L2     B_csr,          -2,           B_no_gie

								||      MV      .S2X    A_iptr,         B_iptr                      ; iptr


								; A_xiptr = in_data + (cols - 6) halfwords.  Writing B_no_gie to CSR
								; disables interrupts.  The first word of the row is loaded into A_x76.
								; NOTE(review): A_x76 and A_offset are both A7; A_offset is read again
								; by the ADD .L2X four packets below, which appears to rely on the
								; load result not having landed yet -- do not reschedule without
								; re-checking the load delay slots.
								        ADDAH   .D1     A_iptr,         A_offset,     A_xiptr       ; iptr

								||      MVC     .S2     B_no_gie,       CSR                         ; Disabl

								||      LDW     .D2T1   *B_iptr++,      A_x76                       ; x76


								; B_xiptr = A_xiptr + 4 bytes (the next word).  Load the word at
								; A_xiptr into B_x10 and step A_xiptr on by two words.  Copy the
								; low-pass filter pointer to the A side (A_qmf is A0).
								        ADD     .L2X    A_xiptr,        4,            B_xiptr       ; xiptr

								||      LDW     .D1T2   *A_xiptr++[2],  B_x10                       ; x10

								||      MV      .S1X    B_qmf,          A_qmf


								; Load all eight low-pass taps: 4..7 on the A side, 0..3 on the B side.
								        LDDW    .D1T1   *A_qmf[1],      A_l76:A_l54                 ; Low

								||      LDDW    .D2T2   *B_qmf[0],      B_l32:B_l10                 ; Low


								; Load the remaining two pre-read input words and the rounding
								; constant (A_qr reuses A6, which held the high-pass filter pointer).
								        LDW     .D1T1   *A_xiptr++[2],  A_x54                       ; x54

								||      LDW     .D2T2   *B_xiptr++[2],  B_x32                       ; x32

								||      MVK     .S1     Qr,             A_qr                        ; A_qr


								; A_i = cols >> 1.  A_optr = out_data.  B_p = 3 selects B_yptr_l0 for
								; the low-pass stores while it is non-zero.  B_yptr_l0 = out_data plus
								; A_offset bytes, i.e. halfword index (cols - 6) / 2 of the output row.
								        SHRU    .S1     A_ish_x_dim,    1,            A_i           ; X>>1

								||      MV      .L1X    B_yptr_l1,      A_optr                      ; Copy

								||      MVK     .S2     3,              B_p                         ; switch

								||      ADD     .L2X    B_yptr_l1,      A_offset,     B_yptr_l0     ; optr-off


								; A_yptr_h = out_data + (cols >> 1) halfwords: where the high-pass
								; results are written.
								        ADDAH   .D1     A_optr,         A_i,          A_yptr_h      ; [x>>1]


								; Swap the two halfwords of each high-pass coefficient pair.
								; Presumably this provides the reversed tap order that the C reference
								; obtains with "*filt_ptr--" -- confirm against the DOTP2 operand
								; pairing in the kernel.
								        PACKLH2 .S1     A_h10,          A_h10,        A_h01         ; h0:h1

								||      PACKLH2 .L1     A_h32,          A_h32,        A_h23         ; h2:h3

								||      PACKLH2 .L2     B_h54,          B_h54,        B_h45         ; h4:h5

								||      PACKLH2 .S2     B_h76,          B_h76,        B_h67         ; h6:h7


								; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
								; Names used by the prolog, kernel and epilog.  The first group
								; repeats assignments already made before the setup code; the rest
								; name the partial products, partial sums and results.
								;
								; Names sharing one physical register:
								;   B24: B_sum_h    / B_res_hi      A16: A_tmp_l    / A_res_low
								;   A19: A_sum_l    / A_prod_l54    A17: A_prod_h54 / A_tmph1
								;   A18: A_prod_h76 / A_prod_l76 / A_tmpl1
								;   B21: B_prod_l10 / B_tmph0       B22: B_prod_h32 / B_tmpl0
								;   B23: B_prod_l32 / B_tmp_h
								;   B9 : B_prod_h10 (named B_h76 during setup)
								;   A1 : A_p        (named A_xiptr during setup)
								; B_return is B3, the register the epilog returns through.

								        .asg            A6,         A_qr

								        .asg            B20,        B_iptr

								        .asg            B7,         B_h67

								        .asg            B16,        B_h45

								        .asg            A5,         A_h23

								        .asg            A4,         A_h01

								        .asg            B4,         B_l10

								        .asg            B5,         B_l32

								        .asg            A2,         A_l54

								        .asg            A3,         A_l76

								        .asg            B19,        B_x10

								        .asg            B18,        B_x32

								        .asg            A9,         A_x54

								        .asg            A8,         A_yptr_h

								        .asg            B17,        B_yptr_l0

								        .asg            B6,         B_yptr_l1

								        .asg            A0,         A_i

								        .asg            B0,         B_p

								        .asg            B24,        B_sum_h

								        .asg            A19,        A_sum_l

								        .asg            A7,         A_x76

								        .asg            B9,         B_prod_h10

								        .asg            B22,        B_prod_h32

								        .asg            A17,        A_prod_h54

								        .asg            A18,        A_prod_h76

								        .asg            B21,        B_prod_l10

								        .asg            B23,        B_prod_l32

								        .asg            A19,        A_prod_l54

								        .asg            A18,        A_prod_l76

								        .asg            B21,        B_tmph0

								        .asg            A17,        A_tmph1

								        .asg            B22,        B_tmpl0

								        .asg            A18,        A_tmpl1

								        .asg            B23,        B_tmp_h

								        .asg            A16,        A_tmp_l

								        .asg            B24,        B_res_hi

								        .asg            A16,        A_res_low

								        .asg            B3,         B_return

								        .asg            A1,         A_p

								; ============================================================================

								; START:

								; ============================ PIPE LOOP PROLOG ==============================
								; Six execute packets that prime the software pipeline: they start the
								; first dot products, begin rotating the input-word registers and keep
								; the input loads streaming.  No stores are issued here.
								;
								; The first four packets each issue a branch whose target is a kernel
								; label plus a byte offset (L_3 + 8, L_4 + 12, L_1 + 4, L_2 + 4).
								; NOTE(review): with 4-byte instructions these offsets appear to enter
								; each kernel packet past its leading instruction(s), so that the
								; shifts/stores/sums of results not yet computed are skipped on the
								; first pass.  The order of instructions inside the kernel packets is
								; therefore significant -- confirm before editing either block.

								; PROLOG:


								; Load the next input word; A_p = 1 (A_p is only referenced again by
								; its own conditional decrement in L_2); first low-pass product.
								        LDW     .D2T1   *B_iptr++,    A_x76                     ; Load x76

								||      MVK     .S1     1,            A_p

								||      DOTP2   .M2     B_x10,        B_l10,        B_prod_l10    ; prod

								||      B       .S2     L_3  + 8


								        DOTP2   .M1     A_x76,        A_l76,        A_prod_l76  ; x6l6 + x7l7

								||      DOTP2   .M2     B_x32,        B_l32,        B_prod_l32  ; x2l2 + x3l3

								||      B       .S2     L_4  + 12


								        MV      .L2     B_x32,        B_x10                     ; x10 = x32

								||      DOTP2   .M1     A_x54,        A_l54,        A_prod_l54  ; x4l4 + x5l5

								||      DOTP2   .M2     B_x10,        B_h67,        B_prod_h10  ; x0h7 + x1h6

								||      B       .S2     L_1  + 4


								; A_i is reduced by 3 here.  NOTE(review): presumably this accounts
								; for the passes covered by the prolog/epilog and for the BDEC loop
								; test -- confirm before changing the trip count.
								        DOTP2   .M1     A_x54,        A_h23,        A_prod_h54  ; x5h2 + x4h3

								||      DOTP2   .M2     B_x10,        B_l10,        B_prod_l10  ; x0l0 + x1l1

								||      SUB     .L1     A_i,          3,            A_i

								||      B       .S2     L_2  + 4


								; Rotate the input-word registers and load the next word.
								        MV      .D1     A_x76,        A_x54                     ; x54 = x76

								||      MV      .L2X    A_x54,        B_x32                     ; x32 = x54

								||      DOTP2   .M1     A_x76,        A_h01,        A_prod_h76  ; x7h0 + x6h1

								||      DOTP2   .M2     B_x32,        B_h45,        B_prod_h32  ; x3h4 + x2h5

								||      LDW     .D2T1   *B_iptr++,    A_x76                     ; Load x76


								        ADD     .L2     B_prod_l10,   B_prod_l32,   B_tmpl0     ; l10 + l32

								||      DOTP2   .M1     A_x76,        A_l76,        A_prod_l76  ; x6l6 + x7l7

								||      DOTP2   .M2     B_x32,        B_l32,        B_prod_l32  ; x2l2 + x3l3


								; ============================ PIPE LOOP KERNEL ==============================
								; Four execute packets (L_1 .. L_4).  One pass loads one new input word
								; (two samples), issues the eight DOTP2 products (four low-pass, four
								; high-pass), stores one high-pass sample through A_yptr_h and one
								; low-pass sample through either B_yptr_l0 or B_yptr_l1.
								;
								; Results from different pipeline stages are in flight at once, and
								; many names alias the same register (see the assignment table), so
								; the placement of each instruction is significant.  The prolog also
								; branches to L_n + byte offsets, which ties it to the order of the
								; instructions inside each packet.

								LOOP:


								; L_1: finish the previous high-pass sample (>> 15), add the rounding
								; constant to the low-pass partial sum, rotate the A-side/B-side input
								; words, start two high-pass products and load the next input word.
								L_1:

								        SHR     .S2     B_sum_h,      15,           B_res_hi    ; >> 15

								||      ADD     .L1     A_qr,         A_tmp_l,      A_sum_l     ; += tmp_l

								||      ADD     .S1     A_prod_h54,   A_prod_h76,   A_tmph1     ; h54 + h76

								||      MV      .D1     A_x76,        A_x54                     ; x54 = x76

								||      MV      .L2X    A_x54,        B_x32                     ; x32 = x54

								||      DOTP2   .M1     A_x76,        A_h01,        A_prod_h76  ; x7h0 + x6h1

								||      DOTP2   .M2     B_x32,        B_h45,        B_prod_h32  ; x3h4 + x2h5

								||      LDW     .D2T1   *B_iptr++,    A_x76                     ; Load x76


								; L_2: store the high-pass sample, finish the low-pass sample (>> 15),
								; combine B-side partial products and start two low-pass products.
								; A_p is decremented while non-zero; no other instruction in this
								; file is predicated on it.
								L_2:

								        STH     .D1T2   B_res_hi,     *A_yptr_h++               ; *y_hp

								||      SHR     .S1     A_sum_l,      15,           A_res_low   ; >> 15

								||      ADD     .S2     B_prod_h10,   B_prod_h32,   B_tmph0     ; h10 + h32

								||      ADD     .L2     B_prod_l10,   B_prod_l32,   B_tmpl0     ; l10 + l32

								||      DOTP2   .M1     A_x76,        A_l76,        A_prod_l76  ; x6l6 + x7l7

								||      DOTP2   .M2     B_x32,        B_l32,        B_prod_l32  ; x2l2 + x3l3

								||[A_p] SUB     .L1     A_p,          1,            A_p


								; L_3: when B_p is zero the low-pass sample is stored through
								; B_yptr_l1.  BDEC tests and decrements the trip counter A_i and
								; branches back to LOOP.  B_x10 takes over the word from B_x32.
								L_3:

								  [!B_p]STH     .D2T1   A_res_low,    *B_yptr_l1++              ; Store *y_lp1

								||      ADD     .L2X    B_tmph0,      A_tmph1,      B_tmp_h     ; tmp_h

								||      BDEC    .S1     LOOP,         A_i                       ; if (i) B LOOP

								||      ADD     .D1     A_prod_l54,   A_prod_l76,   A_tmpl1     ; l54 + l76

								||      MV      .S2     B_x32,        B_x10                     ; x10 = x32

								||      DOTP2   .M1     A_x54,        A_l54,        A_prod_l54  ; x4l4 + x5l5

								||      DOTP2   .M2     B_x10,        B_h67,        B_prod_h10  ; x0h7 + x1h6


								; L_4: while B_p is non-zero (it is set to 3 in setup) the low-pass
								; sample is stored through B_yptr_l0 instead and B_p is decremented.
								; The rounding constant is added to the high-pass partial sum.
								L_4:

								  [ B_p]SUB     .L2     B_p,          1,            B_p         ; pred.for LP

								||[ B_p]STH     .D2T1   A_res_low,    *B_yptr_l0++              ; Store *y_lp0

								||      ADD     .S2X    A_qr,         B_tmp_h,      B_sum_h     ; += tmp_h

								||      ADD     .D1X    A_tmpl1,      B_tmpl0,      A_tmp_l     ; tmp_l

								||      DOTP2   .M1     A_x54,        A_h23,        A_prod_h54  ; x5h2 + x4h3

								||      DOTP2   .M2     B_x10,        B_l10,        B_prod_l10  ; x0l0 + x1l1


								; ============================ PIPE LOOP EPILOG ==============================
								; Drains the pipeline: completes and stores the results still in
								; flight (two high-pass samples through A_yptr_h, one low-pass sample
								; through B_yptr_l1), restores CSR and returns.
								;
								; RET is issued in the first packet; the five packets after it are the
								; only code that follows, so they execute in the branch delay slots.
								; Adding or removing a packet here changes what runs before the return.

								; EPILOG:

								; The high-pass result is written to A0 over the cross path (A0 is the
								; loop counter A_i, which is no longer needed).
								        SHR     .S1X    B_sum_h,      15,           A0          ; sum_h >> 15

								||      ADD     .L1     A_qr,         A_tmp_l,      A_sum_l     ; sum_l+= tmp_l

								||      ADD     .D1     A_prod_h54,   A_prod_h76,   A_tmph1     ; h54 + h76

								||      RET     .S2     B_return


								        STH     .D1T1   A0,           *A_yptr_h++               ; Store *y_hp

								||      SHR     .S1     A_sum_l,      15,           A_res_low   ; sum_l >> 15

								||      ADD     .L2     B_prod_h10,   B_prod_h32,   B_tmph0     ; h10 + h32


								        STH     .D2T1   A_res_low,    *B_yptr_l1++              ; Store *y_lp1

								||      ADD     .L2X    B_tmph0,      A_tmph1,      B_tmp_h     ; tmp_h


								        ADD     .S2X    A_qr,         B_tmp_h,      B_sum_h     ; sum_h += tmp_h


								        SHR     .S2     B_sum_h,      15,           B_res_hi    ; sum_h >> 15


								; Final store; writing the CSR value saved at entry back restores the
								; caller's interrupt-enable state.
								        STH     .D1T2   B_res_hi,     *A_yptr_h++               ; Store *y_hp

								||      MVC     .S2     B_csr,        CSR


								* ========================================================================= *

								*   End of file:  img_wave_horz.asm                                         *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *