c6416_sdk/imglib/wave_horz.asm

;* ======================================================================== *;
;*  TEXAS INSTRUMENTS, INC.                                                 *;
;*                                                                          *;
;*  IMGLIB  DSP Image/Video Processing Library                              *;
;*                                                                          *;
;*      Release:        Revision 1.04b                                      *;
;*      CVS Revision:   1.11    Sun Sep 29 03:32:31 2002 (UTC)              *;
;*      Snapshot date:  23-Oct-2003                                         *;
;*                                                                          *;
;*  This library contains proprietary intellectual property of Texas        *;
;*  Instruments, Inc.  The library and its source code are protected by     *;
;*  various copyrights, and portions may also be protected by patents or    *;
;*  other legal protections.                                                *;
;*                                                                          *;
;*  This software is licensed for use with Texas Instruments TMS320         *;
;*  family DSPs.  This license was provided to you prior to installing      *;
;*  the software.  You may review this license by consulting the file       *;
;*  TI_license.PDF which accompanies the files in this library.             *;
;* ------------------------------------------------------------------------ *;
;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;
;*                          All Rights Reserved.                            *;
;* ======================================================================== *;


;* ======================================================================== *;
;*  Assembler compatibility shim for assembling 4.30 and later code on      *;
;*  tools prior to 4.30.                                                    *;
;* ======================================================================== *;

        ; Capture the assembler version in the substitution symbol $asmver.
        ; Tools that do not define .ASSEMBLER_VERSION are treated as version 0.
        .if $isdefed(".ASSEMBLER_VERSION")
        .asg    .ASSEMBLER_VERSION, $asmver
        .else
        .asg    0,    $asmver
        .endif

        ; For assemblers older than 4.30, map the call/return mnemonics onto
        ; plain branches and turn the function-bracketing directives into
        ; empty substitutions so that the rest of the file still assembles.
        .if ($asmver < 430)

        .asg    B,    CALL     ; Function Call
        .asg    B,    RET      ; Return from a Function
        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.

        ; The BNOP-based aliases are only defined when targeting C64x.
        .if .TMS320C6400
        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call
        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return
        .asg    BNOP, CRNOP    ; C64x Fn call w/, Call/Ret chaining via BNOP.
        .endif

        .asg    , .asmfunc     ; .func equivalent for hand-assembly code
        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code

        .endif

;* ======================================================================== *;
;*  End of assembler compatibility shim.                                    *;
;* ======================================================================== *;


* ========================================================================= *
*   TEXAS INSTRUMENTS, INC.                                                 *
*                                                                           *
*   NAME                                                                    *
*       IMG_wave_horz : 1D Wavelet Transform                                *
*                                                                           *
*   REVISION DATE                                                           *
*       21-Jan-1999                                                         *
*                                                                           *
*   USAGE                                                                   *
*       This routine is C-callable and can be called as:                    *
*                                                                           *
*           void IMG_wave_horz                                              *
*           (                                                               *
*               const short *restrict in_data,  /* Row of input pixels  */  *
*               const short *restrict qmf,      /* Low-pass QMF filter  */  *
*               const short *restrict mqmf,     /* High-pass QMF filter */  *
*               short       *restrict out_data, /* Row of output data   */  *
*               int                   cols      /* Length of input.     */  *
*           );                                                              *
*                                                                           *
*   DESCRIPTION                                                             *
*       This kernel performs a 1D Periodic Orthogonal Wavelet               *
*       decomposition.  This also performs the row decomposition in a       *
*       2D wavelet transform.  An input signal x[n] is low pass and         *
*       high pass filtered and decimated by two.  This results in a         *
*       reference signal r1[n] which is the decimated output obtained       *
*       by dropping the odd samples of the low pass filtered output and     *
*       a detail signal d[n] obtained by dropping the odd samples of        *
*       the high-pass output.  A circular convolution algorithm is          *
*       implemented and hence the wavelet transform is periodic.  The       *
*       reference signal and the detail signal are half the size of the     *
*       original signal.  The reference signal may then be iterated         *
*       again to perform another scale of multi-resolution analysis.        *
*                                                                           *
*   TECHNIQUES                                                              *
*       The main idea in optimizing the code is to issue one set of         *
*       reads to the x array and to perform the low-pass and high-pass      *
*       filtering operations together, which maximizes the number of        *
*       multiplies.  The last 6 elements of the low-pass filter and the     *
*       first 6 elements of the high-pass filter use the same input.        *
*       This is used to appropriately change the output pointer to the      *
*       low-pass filter after 6 iterations.  However, for the first six     *
*       iterations pointer wrap-around can occur, and hence this            *
*       creates a dependency.  Pre-reading those 6 values outside the       *
*       array prevents the checks that introduce this dependency.  In       *
*       addition, the input data is read as word-wide quantities and        *
*       the low-pass and high-pass filter coefficients are stored in        *
*       registers, allowing the input loop to be completely unrolled.       *
*       Thus the assembly code has only one loop.  A predication            *
*       register is used to reset the low-pass output pointer after         *
*       three iterations.  The merging of the loops in this fashion         *
*       allows for the maximum number of multiplies with the minimum        *
*       number of reads.                                                    *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       This kernel assumes that the # of filter taps for the qmf and       *
*       mqmf is 8.                                                          *
*                                                                           *
*       Both the filters are assumed to be double-word aligned and have     *
*       8 taps.                                                             *
*                                                                           *
*       The input line is assumed to be word aligned so that LDWs           *
*       may be performed.                                                   *
*                                                                           *
*       This code assumes that filter coefficients are maintained as        *
*       shorts in Q15 format.                                               *
*                                                                           *
*       It also assumes that input data is an array of shorts (16 bit)      *
*       (The input is assumed to be an array of shorts to allow for         *
*       re-using this kernel to perform Multi Resolution Analysis as        *
*       the output of this code will feedback again as input in the         *
*       next stage.)                                                        *
*                                                                           *
*       Since the transform is a dyadic wavelet cols should be a power      *
*       of 2. Cols must also be >=8.                                        *
*                                                                           *
*                                                                           *
*   MEMORY NOTE                                                             *
*       This code has no bank conflicts.                                    *
*                                                                           *
*       This code is ENDIAN Neutral.                                        *
*                                                                           *
*                                                                           *
*   NOTES                                                                   *
*       This code masks interrupts for nearly its entire duration.  As      *
*       a result, the code is interrupt tolerant but not                    *
*       interruptible.                                                      *
*                                                                           *
*       This code can implement the Daubechies D4 filterbank for            *
*       analysis with 4 vanishing moments.  The length of the analyzing     *
*       low-pass and high pass filters is 8 in that case.                   *
*                                                                           *
*   C CODE                                                                  *
*                                                                           *
*       This is the C equivalent of the assembly code without restrictions: *
*       Note that the assembly code is hand optimized and restrictions      *
*       apply as noted under "ASSUMPTIONS".                                 *
*                                                                           *
*           void IMG_wave_horz                                              *
*           (                                                               *
*               const short *restrict in_data,  /* Row of input pixels  */  *
*               const short *restrict qmf,      /* Low-pass QMF filter  */  *
*               const short *restrict mqmf,     /* High-pass QMF filter */  *
*               short       *restrict out_data, /* Row of output data   */  *
*               int                   cols      /* Length of input.     */  *
*           )                                                               *
*                                                                           *
*           {                                                               *
*               int    i, res, iters;                                       *
*               int    j, sum, prod;                                        *
*               short *xptr  = in_data;                                     *
*               short *yptr  = out_data;                                    *
*               short *x_end = &in_data[cols - 1];                          *
*               short  xdata, hdata;                                        *
*               short *xstart;                                              *
*               short *filt_ptr;                                            *
*               int    M = 8;                                               *
*                                                                           *
*               /* ------------------------------------------------- */     *
*               /*  Set our loop trip count and starting x posn.     */     *
*               /*  'xstart' is used in the high-pass filter loop.   */     *
*               /* ------------------------------------------------- */     *
*               iters  = cols;                                              *
*               xstart = in_data + (cols - M)  + 2;                         *
*                                                                           *
*               /* ------------------------------------------------- */     *
*               /*  Low pass filter.  Iterate for cols/2 iterations  */     *
*               /*  generating cols/2 low pass sample points with    */     *
*               /*  the low-pass quadrature mirror filter.           */     *
*               /* ------------------------------------------------- */     *
*               for (i = 0; i < iters; i += 2)                              *
*               {                                                           *
*                   /* --------------------------------------------- */     *
*                   /*  Initialize our sum to the rounding value     */     *
*                   /*  and reset our pointer.                       */     *
*                   /* --------------------------------------------- */     *
*                   sum  = Qr;                                              *
*                   xptr = in_data + i;                                     *
*                                                                           *
*                   /* --------------------------------------------- */     *
*                   /*  Iterate over the taps in our QMF.            */     *
*                   /* --------------------------------------------- */     *
*                   for (j = 0; j < M; j++)                                 *
*                   {                                                       *
*                       xdata = *xptr++;                                    *
*                       hdata =  qmf[j];                                    *
*                       prod  =  xdata * hdata;                             *
*                       sum  += prod;                                       *
*                       if (xptr > x_end) xptr = in_data;                   *
*                   }                                                       *
*                                                                           *
*                   /* --------------------------------------------- */     *
*                   /*  Adjust the Qpt of our sum and store result.  */     *
*                   /* --------------------------------------------- */     *
*                   res    = (sum >> Qpt);                                  *
*                   *out_data++ = res;                                      *
*               }                                                           *
*                                                                           *
*                                                                           *
*               /* ------------------------------------------------- */     *
*               /*  High pass filter.  Iterate for cols/2 iters      */     *
*               /*  generating cols/2 high pass sample points with   */     *
*               /*  the high-pass quadrature mirror filter.          */     *
*               /* ------------------------------------------------- */     *
*               for (i = 0; i < iters ; i+=2)                               *
*               {                                                           *
*                   /* --------------------------------------------- */     *
*                   /*  Initialize our sum and filter pointer.       */     *
*                   /* --------------------------------------------- */     *
*                   sum  = Qr;                                              *
*                   filt_ptr  = mqmf + (M - 1);                             *
*                                                                           *
*                   /* --------------------------------------------- */     *
*                   /*  Set up our data pointer.  This is slightly   */     *
*                   /*  more complicated due to how the data wraps   */     *
*                   /*  around the edge of the buffer.               */     *
*                   /* --------------------------------------------- */     *
*                   xptr = xstart;                                          *
*                   xstart += 2;                                            *
*                   if (xstart > x_end) xstart = in_data;                   *
*                                                                           *
*                   /* --------------------------------------------- */     *
*                   /*  Iterate over the taps in our QMF.            */     *
*                   /* --------------------------------------------- */     *
*                   for ( j = 0; j < M; j++)                                *
*                   {                                                       *
*                       xdata = *xptr++;                                    *
*                       hdata = *filt_ptr--;                                *
*                       prod  = xdata * hdata;                              *
*                       if (xptr > x_end) xptr = in_data;                   *
*                       sum  += prod;                                       *
*                   }                                                       *
*                                                                           *
*                   /* --------------------------------------------- */     *
*                   /*  Adjust the Qpt of our sum and store result.  */     *
*                   /* --------------------------------------------- */     *
*                   res = (sum >> Qpt);                                     *
*                   *out_data++ =  res;                                     *
*               }                                                           *
*           }                                                               *
*                                                                           *
*   CYCLES                                                                  *
*       cycles = cols * 2 + 25.                                             *
*                                                                           *
*       For cols = 256, cycles = 537.                                       *
*       For cols = 512, cycles = 1049.                                      *
*                                                                           *
*   CODESIZE                                                                *
*       360 bytes                                                           *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *


        ; --------------------------------------------------------------------
        ;  _IMG_wave_horz -- entry point and setup code.
        ;
        ;  Incoming registers, as named by the .asg list below:
        ;      A4 = A_iptr       input row pointer
        ;      B4 = B_qmf        low-pass filter pointer
        ;      A6 = A_filter     high-pass filter pointer
        ;      B6 = B_yptr_l1    output row pointer
        ;      A8 = A_ish_x_dim  row length (cols)
        ;  NOTE(review): this matches the argument order of the C prototype
        ;  in the file header under the usual C6000 register-argument
        ;  convention -- confirm against the compiler documentation.
        ;
        ;  The setup saves CSR and clears its bit 0 to mask interrupts, loads
        ;  all sixteen filter taps into registers, pre-reads the three input
        ;  words that start at in_data[cols - (M - 2)], and derives the output
        ;  pointers and loop counters used by the pipelined loop.
        ;
        ;  Several physical registers are reused under different symbolic
        ;  names (A6: A_filter/A_qr, A7: A_offset/A_x76, A0: A_qmf/A_i,
        ;  A8: A_ish_x_dim/A_yptr_h, A5: A_optr/A_h23, A4: A_iptr/A_h01,
        ;  B4: B_qmf/B_l10).  The packet order below keeps each old value
        ;  alive until its last read, so do not reorder execute packets.
        ; --------------------------------------------------------------------
        .sect ".text:_wave_horz"
        .global _IMG_wave_horz
_IMG_wave_horz: 


Qr              .set     16384          ; Rounding constant added to each sum before the >> 15
M               .set     8              ; Number of filter taps
; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
        .asg            A4,         A_iptr
        .asg            B4,         B_qmf
        .asg            A6,         A_filter
        .asg            B6,         B_yptr_l1
        .asg            A8,         A_ish_x_dim
        .asg            A6,         A_qr
        .asg            B20,        B_iptr
        .asg            B7,         B_h67
        .asg            B16,        B_h45
        .asg            A5,         A_h23
        .asg            A4,         A_h01
        .asg            B4,         B_l10
        .asg            B5,         B_l32
        .asg            A2,         A_l54
        .asg            A3,         A_l76
        .asg            B19,        B_x10
        .asg            B18,        B_x32
        .asg            A9,         A_x54
        .asg            A8,         A_yptr_h
        .asg            B17,        B_yptr_l0
        .asg            B6,         B_yptr_l1
        .asg            A0,         A_i
        .asg            B0,         B_p
        .asg            A17,        A_h32
        .asg            A16,        A_h10
        .asg            B9,         B_h76
        .asg            B8,         B_h54
        .asg            A7,         A_offset
        .asg            A1,         A_xiptr
        .asg            B2,         B_xiptr
        .asg            A5,         A_optr
        .asg            A7,         A_x76
        .asg            B21,        B_prod_l10
        .asg            B29,        B_csr
        .asg            B28,        B_no_gie
        .asg            A0,         A_qmf
; ============================================================================

        ; offset = cols - (M - 2); save the caller's CSR; start loading the
        ; second double word of the high-pass filter (taps 4..7).
        SUB     .L1     A_ish_x_dim,    M - 2,        A_offset      ; x-M+2
||      MVC     .S2     CSR,            B_csr                       ; Save CSR
||      LDDW    .D1T2   *A_filter[1],   B_h76:B_h54                 ; High-pass taps 4..7

        ; Load high-pass taps 0..3; build a CSR image with bit 0 cleared;
        ; copy the input pointer to the B side for the in-loop loads.
        LDDW    .D1T1   *A_filter[0],   A_h32:A_h10                 ; High-pass taps 0..3
||      AND     .L2     B_csr,          -2,           B_no_gie      ; CSR with bit 0 cleared
||      MV      .S2X    A_iptr,         B_iptr                      ; B-side copy of iptr

        ; xiptr = iptr + offset halfwords; write the masked CSR; fetch the
        ; first word of the row.
        ADDAH   .D1     A_iptr,         A_offset,     A_xiptr       ; &iptr[offset]
||      MVC     .S2     B_no_gie,       CSR                         ; Disable interrupts
||      LDW     .D2T1   *B_iptr++,      A_x76                       ; x76

        ; B-side pointer starts one word after xiptr; load the word at xiptr
        ; and advance xiptr by two words; copy the low-pass filter pointer
        ; to the A side.
        ADD     .L2X    A_xiptr,        4,            B_xiptr       ; xiptr + 4 bytes
||      LDW     .D1T2   *A_xiptr++[2],  B_x10                       ; x10
||      MV      .S1X    B_qmf,          A_qmf                       ; A-side copy of qmf

        ; Load all eight low-pass taps as two double words.
        LDDW    .D1T1   *A_qmf[1],      A_l76:A_l54                 ; Low-pass taps 4..7
||      LDDW    .D2T2   *B_qmf[0],      B_l32:B_l10                 ; Low-pass taps 0..3

        ; Load the remaining two pre-read input words and set the rounding
        ; constant (A_qr reuses A6; both high-pass LDDWs were issued above).
        LDW     .D1T1   *A_xiptr++[2],  A_x54                       ; x54
||      LDW     .D2T2   *B_xiptr++[2],  B_x32                       ; x32
||      MVK     .S1     Qr,             A_qr                        ; A_qr

        ; A_i = cols >> 1; B_p = 3 selects the B_yptr_l0 stores in the loop
        ; until it reaches zero; B_yptr_l0 = output pointer + offset bytes.
        SHRU    .S1     A_ish_x_dim,    1,            A_i           ; X>>1
||      MV      .L1X    B_yptr_l1,      A_optr                      ; Copy of output pointer
||      MVK     .S2     3,              B_p                         ; switch
||      ADD     .L2X    B_yptr_l1,      A_offset,     B_yptr_l0     ; optr + offset (bytes)

        ; High-pass outputs are written starting (cols >> 1) halfwords into
        ; the output row.
        ADDAH   .D1     A_optr,         A_i,          A_yptr_h      ; [x>>1]

        ; Swap the two halfwords within each high-pass tap pair.
        PACKLH2 .S1     A_h10,          A_h10,        A_h01         ; h0:h1
||      PACKLH2 .L1     A_h32,          A_h32,        A_h23         ; h2:h3
||      PACKLH2 .L2     B_h54,          B_h54,        B_h45         ; h4:h5
||      PACKLH2 .S2     B_h76,          B_h76,        B_h67         ; h6:h7

; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
;  Names used by the pipelined loop.  Many physical registers carry several
;  symbolic names here (for example A18 is A_prod_h76, A_prod_l76 and A_tmpl1;
;  A16 is A_tmp_l and A_res_low; B24 is B_sum_h and B_res_hi), so the values
;  they name cannot be live at the same time.
        .asg            A6,         A_qr
        .asg            B20,        B_iptr
        .asg            B7,         B_h67
        .asg            B16,        B_h45
        .asg            A5,         A_h23
        .asg            A4,         A_h01
        .asg            B4,         B_l10
        .asg            B5,         B_l32
        .asg            A2,         A_l54
        .asg            A3,         A_l76
        .asg            B19,        B_x10
        .asg            B18,        B_x32
        .asg            A9,         A_x54
        .asg            A8,         A_yptr_h
        .asg            B17,        B_yptr_l0
        .asg            B6,         B_yptr_l1
        .asg            A0,         A_i
        .asg            B0,         B_p
        .asg            B24,        B_sum_h
        .asg            A19,        A_sum_l
        .asg            A7,         A_x76
        .asg            B9,         B_prod_h10
        .asg            B22,        B_prod_h32
        .asg            A17,        A_prod_h54
        .asg            A18,        A_prod_h76
        .asg            B21,        B_prod_l10
        .asg            B23,        B_prod_l32
        .asg            A19,        A_prod_l54
        .asg            A18,        A_prod_l76
        .asg            B21,        B_tmph0
        .asg            A17,        A_tmph1
        .asg            B22,        B_tmpl0
        .asg            A18,        A_tmpl1
        .asg            B23,        B_tmp_h
        .asg            A16,        A_tmp_l
        .asg            B24,        B_res_hi
        .asg            A16,        A_res_low
        .asg            B3,         B_return
        .asg            A1,         A_p
; ============================================================================
; START:
; ============================ PIPE LOOP PROLOG ==============================
; PROLOG:
;  Primes the pipeline with the first dot products and input loads.
;
;  The first four packets each issue a branch to an address *inside* a kernel
;  execute packet: L_3 + 8, L_4 + 12, L_1 + 4 and L_2 + 4.  The offsets are in
;  bytes and each instruction occupies 4 bytes, so the branches skip the
;  first 2, 3, 1 and 1 instructions of those packets respectively.  The
;  skipped instructions are the shift/store/accumulate steps at the head of
;  each packet -- presumably because no complete sum exists yet on the first
;  pass.  Adding or removing instructions in the kernel packets, or changing
;  their order, invalidates these offsets.

        ; Fetch the next input word; A_p = 1; first low-pass product.
        LDW     .D2T1   *B_iptr++,    A_x76                     ; Load x76
||      MVK     .S1     1,            A_p
||      DOTP2   .M2     B_x10,        B_l10,        B_prod_l10    ; prod
||      B       .S2     L_3  + 8

        DOTP2   .M1     A_x76,        A_l76,        A_prod_l76  ; x6l6 + x7l7
||      DOTP2   .M2     B_x32,        B_l32,        B_prod_l32  ; x2l2 + x3l3
||      B       .S2     L_4  + 12

        MV      .L2     B_x32,        B_x10                     ; x10 = x32
||      DOTP2   .M1     A_x54,        A_l54,        A_prod_l54  ; x4l4 + x5l5
||      DOTP2   .M2     B_x10,        B_h67,        B_prod_h10  ; x0h7 + x1h6
||      B       .S2     L_1  + 4

        ; The loop counter is reduced by 3 here, before the kernel's BDEC
        ; starts counting it down.
        DOTP2   .M1     A_x54,        A_h23,        A_prod_h54  ; x5h2 + x4h3
||      DOTP2   .M2     B_x10,        B_l10,        B_prod_l10  ; x0l0 + x1l1
||      SUB     .L1     A_i,          3,            A_i
||      B       .S2     L_2  + 4

        ; Slide the input window by one word and fetch the next word.
        MV      .D1     A_x76,        A_x54                     ; x54 = x76
||      MV      .L2X    A_x54,        B_x32                     ; x32 = x54
||      DOTP2   .M1     A_x76,        A_h01,        A_prod_h76  ; x7h0 + x6h1
||      DOTP2   .M2     B_x32,        B_h45,        B_prod_h32  ; x3h4 + x2h5
||      LDW     .D2T1   *B_iptr++,    A_x76                     ; Load x76

        ADD     .L2     B_prod_l10,   B_prod_l32,   B_tmpl0     ; l10 + l32
||      DOTP2   .M1     A_x76,        A_l76,        A_prod_l76  ; x6l6 + x7l7
||      DOTP2   .M2     B_x32,        B_l32,        B_prod_l32  ; x2l2 + x3l3

; ============================ PIPE LOOP KERNEL ==============================
;  Four execute packets per trip (L_1..L_4).  Each trip issues eight DOTP2s
;  (four low-pass and four high-pass tap pairs), loads one new input word,
;  and stores one high-pass and one low-pass halfword.  The prolog branches
;  to byte offsets inside these packets, so the position of every instruction
;  within its packet is significant.
LOOP:

L_1:
        ; Scale the rounded high-pass sum (>> 15); add the rounding constant
        ; to the low-pass sum; slide the input window by one word and fetch
        ; the next input word.
        SHR     .S2     B_sum_h,      15,           B_res_hi    ; >> 15
||      ADD     .L1     A_qr,         A_tmp_l,      A_sum_l     ; += tmp_l
||      ADD     .S1     A_prod_h54,   A_prod_h76,   A_tmph1     ; h54 + h76
||      MV      .D1     A_x76,        A_x54                     ; x54 = x76
||      MV      .L2X    A_x54,        B_x32                     ; x32 = x54
||      DOTP2   .M1     A_x76,        A_h01,        A_prod_h76  ; x7h0 + x6h1
||      DOTP2   .M2     B_x32,        B_h45,        B_prod_h32  ; x3h4 + x2h5
||      LDW     .D2T1   *B_iptr++,    A_x76                     ; Load x76

L_2:
        ; Store the high-pass result; scale the low-pass sum (>> 15).
        ; NOTE(review): in this section A_p is only read by its own
        ; predicated decrement; no other instruction is predicated on it.
        STH     .D1T2   B_res_hi,     *A_yptr_h++               ; *y_hp
||      SHR     .S1     A_sum_l,      15,           A_res_low   ; >> 15
||      ADD     .S2     B_prod_h10,   B_prod_h32,   B_tmph0     ; h10 + h32
||      ADD     .L2     B_prod_l10,   B_prod_l32,   B_tmpl0     ; l10 + l32
||      DOTP2   .M1     A_x76,        A_l76,        A_prod_l76  ; x6l6 + x7l7
||      DOTP2   .M2     B_x32,        B_l32,        B_prod_l32  ; x2l2 + x3l3
||[A_p] SUB     .L1     A_p,          1,            A_p

L_3:
        ; Once B_p has reached zero the low-pass result goes through
        ; B_yptr_l1; BDEC issues the loop branch and counts A_i down.
  [!B_p]STH     .D2T1   A_res_low,    *B_yptr_l1++              ; Store *y_lp1
||      ADD     .L2X    B_tmph0,      A_tmph1,      B_tmp_h     ; tmp_h
||      BDEC    .S1     LOOP,         A_i                       ; if (i) B LOOP
||      ADD     .D1     A_prod_l54,   A_prod_l76,   A_tmpl1     ; l54 + l76
||      MV      .S2     B_x32,        B_x10                     ; x10 = x32
||      DOTP2   .M1     A_x54,        A_l54,        A_prod_l54  ; x4l4 + x5l5
||      DOTP2   .M2     B_x10,        B_h67,        B_prod_h10  ; x0h7 + x1h6

L_4:
        ; While B_p is non-zero the low-pass result goes through B_yptr_l0
        ; and B_p is decremented; add the rounding constant to the
        ; high-pass sum.
  [ B_p]SUB     .L2     B_p,          1,            B_p         ; pred.for LP
||[ B_p]STH     .D2T1   A_res_low,    *B_yptr_l0++              ; Store *y_lp0
||      ADD     .S2X    A_qr,         B_tmp_h,      B_sum_h     ; += tmp_h
||      ADD     .D1X    A_tmpl1,      B_tmpl0,      A_tmp_l     ; tmp_l
||      DOTP2   .M1     A_x54,        A_h23,        A_prod_h54  ; x5h2 + x4h3
||      DOTP2   .M2     B_x10,        B_l10,        B_prod_l10  ; x0l0 + x1l1

; ============================ PIPE LOOP EPILOG ==============================
; EPILOG:
;  Drains the pipeline: finishes the sums still in flight and stores the
;  remaining results without issuing new loads or dot products.  The return
;  branch is issued in the first packet and the five packets after it occupy
;  its delay slots; the caller's CSR is restored in the last of them.
        ; A0 receives the scaled high-pass result on the A side so that the
        ; store in the next packet can use an A-side source register.
        SHR     .S1X    B_sum_h,      15,           A0          ; sum_h >> 15
||      ADD     .L1     A_qr,         A_tmp_l,      A_sum_l     ; sum_l+= tmp_l
||      ADD     .D1     A_prod_h54,   A_prod_h76,   A_tmph1     ; h54 + h76
||      RET     .S2     B_return

        STH     .D1T1   A0,           *A_yptr_h++               ; Store *y_hp
||      SHR     .S1     A_sum_l,      15,           A_res_low   ; sum_l >> 15
||      ADD     .L2     B_prod_h10,   B_prod_h32,   B_tmph0     ; h10 + h32

        ; The last low-pass store is unconditional and uses B_yptr_l1.
        STH     .D2T1   A_res_low,    *B_yptr_l1++              ; Store *y_lp1
||      ADD     .L2X    B_tmph0,      A_tmph1,      B_tmp_h     ; tmp_h

        ADD     .S2X    A_qr,         B_tmp_h,      B_sum_h     ; sum_h += tmp_h

        SHR     .S2     B_sum_h,      15,           B_res_hi    ; sum_h >> 15

        ; Final high-pass store; write the saved CSR value back.
        STH     .D1T2   B_res_hi,     *A_yptr_h++               ; Store *y_hp
||      MVC     .S2     B_csr,        CSR                       ; Restore saved CSR

* ========================================================================= *
*   End of file:  img_wave_horz.asm                                         *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *
add dsplib imglib Signed-off-by: surenyi <surenyi82@qq.com> 3 years ago			`;* ======================================================================== *;`
			`;* TEXAS INSTRUMENTS, INC. *;`
			`;* *;`
			`;* IMGLIB DSP Image/Video Processing Library *;`
			`;* *;`
			`;* Release: Revision 1.04b *;`
			`;* CVS Revision: 1.11 Sun Sep 29 03:32:31 2002 (UTC) *;`
			`;* Snapshot date: 23-Oct-2003 *;`
			`;* *;`
			`;* This library contains proprietary intellectual property of Texas *;`
			`;* Instruments, Inc. The library and its source code are protected by *;`
			`;* various copyrights, and portions may also be protected by patents or *;`
			`;* other legal protections. *;`
			`;* *;`
			`;* This software is licensed for use with Texas Instruments TMS320 *;`
			`;* family DSPs. This license was provided to you prior to installing *;`
			`;* the software. You may review this license by consulting the file *;`
			`;* TI_license.PDF which accompanies the files in this library. *;`
			`;* ------------------------------------------------------------------------ *;`
			`;* Copyright (C) 2003 Texas Instruments, Incorporated. *;`
			`;* All Rights Reserved. *;`
			`;* ======================================================================== *;`


			`;* ======================================================================== *;`
			`;* Assembler compatibility shim for assembling 4.30 and later code on *;`
			`;* tools prior to 4.30. *;`
			`;* ======================================================================== *;`

			`.if $isdefed(".ASSEMBLER_VERSION")`
			`.asg .ASSEMBLER_VERSION, $asmver`
			`.else`
			`.asg 0, $asmver`
			`.endif`

			`.if ($asmver < 430)`

			`.asg B, CALL ; Function call: CALL is substituted by a plain branch (B)`
			`.asg B, RET ; Function return: RET is substituted by a plain branch (B)`
			`.asg B, CALLRET ; Call / Ret chaining: CALLRET is also substituted by B`

			`.if .TMS320C6400`
			`.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call`
			`.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return`
			`.asg BNOP, CRNOP ; C64x Fn. call with Call/Ret chaining via BNOP.`
			`.endif`

			`.asg , .asmfunc ; substituted by nothing: .func equivalent for hand-assembly code`
			`.asg , .endasmfunc ; substituted by nothing: .endfunc equivalent for hand-assembly code`

			`.endif`

			`;* ======================================================================== *;`
			`;* End of assembler compatibility shim. *;`
			`;* ======================================================================== *;`


			`* ========================================================================= *`
			`* TEXAS INSTRUMENTS, INC. *`
			`* *`
			`* NAME *`
			`* IMG_wave_horz : 1D Wavelet Transform *`
			`* *`
			`* REVISION DATE *`
			`* 21-Jan-1999 *`
			`* *`
			`* USAGE *`
			`* This routine is C-callable and can be called as: *`
			`* *`
			`* void IMG_wave_horz *`
			`* ( *`
			`* const short *restrict in_data, /* Row of input pixels */ *`
			`* const short *restrict qmf, /* Low-pass QMF filter */ *`
			`* const short *restrict mqmf, /* High-pass QMF filter */ *`
			`* short *restrict out_data, /* Row of output data */ *`
			`* int cols /* Length of input. */ *`
			`* ); *`
			`* *`
			`* DESCRIPTION *`
			`* This kernel performs a 1D Periodic Orthogonal Wavelet *`
			`* decomposition. This also performs the row decomposition in a *`
			`* 2D wavelet transform. An input signal x[n] is low pass and *`
			`* high pass filtered and decimated by two. This results in a *`
			`* reference signal r1[n] which is the decimated output obtained *`
			`* by dropping the odd samples of the low pass filtered output and *`
			`* a detail signal d[n] obtained by dropping the odd samples of *`
			`* the high-pass output. A circular convolution algorithm is *`
			`* implemented and hence the wavelet transform is periodic. The *`
			`* reference signal and the detail signal are half the size of the *`
			`* original signal. The reference signal may then be iterated *`
			`* again to perform another scale of multi-resolution analysis. *`
			`* *`
			`* TECHNIQUES *`
			`* The main idea in optimizing the code is to issue one set of *`
			`* reads to the x array and to perform low-pass and high pass *`
			`* filtering together and to perform the filtering operations *`
			`* together to maximize the number of multiplies. The last 6 *`
			`* elements of the low-pass filter and the first 6 elements of the *`
			`* high pass filter use the same input. This is used to *`
			`* appropriately change the output pointer to the low pass filter *`
			`* after 6 iterations. However for the first six iterations *`
			`* pointer wrap-around can occur and hence this creates a *`
			`* dependency. Pre-reading those 6 values outside the array *`
			`* prevents the checks that introduce this dependency. In addition *`
			`* the input data is read as word wide quantities and the low-pass *`
			`* and high-pass filter coefficients are stored in registers *`
			`* allowing for the input loop to be completely unrolled. Thus *`
			`* the assembly code has only one loop. A predication register is *`
			`* used to reset the low-pass output pointer after three *`
			`* iterations. The merging of the loops in this fashion allows for *`
			`* the maximum number of multiplies with the minimum number of *`
			`* reads. *`
			`* *`
			`* ASSUMPTIONS *`
			`* This kernel assumes that the # of filter taps for the qmf and *`
			`* mqmf is 8. *`
			`* *`
			`* Both the filters are assumed to be double-word aligned and have *`
			`* 8 taps. *`
			`* *`
			`* The input line is assumed to be word aligned so that LDWs *`
			`* may be performed. *`
			`* *`
			`* This code assumes that filter coefficients are maintained as *`
			`* shorts in Q15 format. *`
			`* *`
			`* It also assumes that input data is an array of shorts (16 bit) *`
			`* (The input is assumed to be an array of shorts to allow for *`
			`* re-using this kernel to perform Multi Resolution Analysis as *`
			`* the output of this code will feedback again as input in the *`
			`* next stage.) *`
			`* *`
			`* Since the transform is a dyadic wavelet cols should be a power *`
			`* of 2. Cols must also be >=8. *`
			`* *`
			`* *`
			`* MEMORY NOTE *`
			`* This code has no bank conflicts. *`
			`* *`
			`* This code is ENDIAN Neutral. *`
			`* *`
			`* *`
			`* NOTES *`
			`* This code masks interrupts for nearly its entire duration. As *`
			`* a result, the code is interrupt tolerant but not *`
			`* interruptible. *`
			`* *`
			`* This code can implement the Daubechies D4 filterbank for *`
			`* analysis with 4 vanishing moments. The length of the analyzing *`
			`* low-pass and high pass filters is 8 in that case. *`
			`* *`
			`* C CODE *`
			`* *`
			`* This is the C equivalent of the assembly code without restrictions: *`
			`* Note that the assembly code is hand optimized and restrictions *`
			`* apply as noted under "ASSUMPTIONS". *`
			`* *`
			`* void IMG_wave_horz *`
			`* ( *`
			`* const short *restrict in_data, /* Row of input pixels */ *`
			`* const short *restrict qmf, /* Low-pass QMF filter */ *`
			`* const short *restrict mqmf, /* High-pass QMF filter */ *`
			`* short *restrict out_data, /* Row of output data */ *`
			`* int cols /* Length of input. */ *`
			`* ); *`
			`* *`
			`* { *`
			`* int i, res, iters; *`
			`* int j, sum, prod; *`
			`* short *xptr = in_data; *`
			`* short *yptr = out_data; *`
			`* short *x_end = &in_data[cols - 1]; *`
			`* short xdata, hdata; *`
			`* short *xstart; *`
			`* short *filt_ptr; *`
			`* int M = 8; *`
			`* *`
			`* /* ------------------------------------------------- */ *`
			`* /* Set our loop trip count and starting x posn. */ *`
			`* /* 'xstart' is used in the high-pass filter loop. */ *`
			`* /* ------------------------------------------------- */ *`
			`* iters = cols; *`
			`* xstart = in_data + (cols - M) + 2; *`
			`* *`
			`* /* ------------------------------------------------- */ *`
			`* /* Low pass filter. Iterate for cols/2 iterations */ *`
			`* /* generating cols/2 low pass sample points with */ *`
			`* /* the low-pass quadrature mirror filter. */ *`
			`* /* ------------------------------------------------- */ *`
			`* for (i = 0; i < iters; i += 2) *`
			`* { *`
			`* /* --------------------------------------------- */ *`
			`* /* Initialize our sum to the rounding value */ *`
			`* /* and reset our pointer. */ *`
			`* /* --------------------------------------------- */ *`
			`* sum = Qr; *`
			`* xptr = in_data + i; *`
			`* *`
			`* /* --------------------------------------------- */ *`
			`* /* Iterate over the taps in our QMF. */ *`
			`* /* --------------------------------------------- */ *`
			`* for (j = 0; j < M; j++) *`
			`* { *`
			`* xdata = *xptr++; *`
			`* hdata = qmf[j]; *`
			`* prod = xdata * hdata; *`
			`* sum += prod; *`
			`* if (xptr > x_end) xptr = in_data; *`
			`* } *`
			`* *`
			`* /* --------------------------------------------- */ *`
			`* /* Adjust the Qpt of our sum and store result. */ *`
			`* /* --------------------------------------------- */ *`
			`* res = (sum >> Qpt); *`
			`* *out_data++ = res; *`
			`* } *`
			`* *`
			`* *`
			`* /* ------------------------------------------------- */ *`
			`* /* High pass filter. Iterate for cols/2 iters */ *`
			`* /* generating cols/2 high pass sample points with */ *`
			`* /* the high-pass quadrature mirror filter. */ *`
			`* /* ------------------------------------------------- */ *`
			`* for (i = 0; i < iters ; i+=2) *`
			`* { *`
			`* /* --------------------------------------------- */ *`
			`* /* Initialize our sum and filter pointer. */ *`
			`* /* --------------------------------------------- */ *`
			`* sum = Qr; *`
			`* filt_ptr = mqmf + (M - 1); *`
			`* *`
			`* /* --------------------------------------------- */ *`
			`* /* Set up our data pointer. This is slightly */ *`
			`* /* more complicated due to how the data wraps */ *`
			`* /* around the edge of the buffer. */ *`
			`* /* --------------------------------------------- */ *`
			`* xptr = xstart; *`
			`* xstart += 2; *`
			`* if (xstart > x_end) xstart = in_data; *`
			`* *`
			`* /* --------------------------------------------- */ *`
			`* /* Iterate over the taps in our QMF. */ *`
			`* /* --------------------------------------------- */ *`
			`* for ( j = 0; j < M; j++) *`
			`* { *`
			`* xdata = *xptr++; *`
			`* hdata = *filt_ptr--; *`
			`* prod = xdata * hdata; *`
			`* if (xptr > x_end) xptr = in_data; *`
			`* sum += prod; *`
			`* } *`
			`* *`
			`* /* --------------------------------------------- */ *`
			`* /* Adjust the Qpt of our sum and store result. */ *`
			`* /* --------------------------------------------- */ *`
			`* res = (sum >> Qpt); *`
			`* *out_data++ = res; *`
			`* } *`
			`* } *`
			`* *`
			`* CYCLES *`
			`* cycles = cols * 2 + 25. *`
			`* *`
			`* For cols = 256, cycles = 537. *`
			`* For cols = 512, cycles = 1049. *`
			`* *`
			`* CODESIZE *`
			`* 360 bytes *`
			`* *`
			`* ------------------------------------------------------------------------- *`
			`* Copyright (c) 2003 Texas Instruments, Incorporated. *`
			`* All Rights Reserved. *`
			`* ========================================================================= *`


			`.sect ".text:_wave_horz"`
			`.global _IMG_wave_horz`
			`_IMG_wave_horz:`


			`; ----------------------------------------------------------------------------`
			`; Setup: save CSR and clear its bit 0 to mask interrupts, load both 8-tap`
			`; filters into registers, pre-read the first input words and derive the`
			`; output pointers used by the pipelined loop.`
			`; The five arguments (in_data, qmf, mqmf, out_data, cols) are taken from`
			`; A4, B4, A6, B6 and A8 -- presumably the C6000 C calling convention.`
			`; Several symbols below share a physical register; each later name takes over`
			`; only after the earlier value has been consumed.`
			`; ----------------------------------------------------------------------------`
			`Qr .set 16384 ; rounding constant (0.5 in Q15) added before each >> 15`
			`M .set 8 ; number of filter taps`
			`; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================`
			`.asg A4, A_iptr`
			`.asg B4, B_qmf`
			`.asg A6, A_filter`
			`.asg B6, B_yptr_l1`
			`.asg A8, A_ish_x_dim`
			`.asg A6, A_qr`
			`.asg B20, B_iptr`
			`.asg B7, B_h67`
			`.asg B16, B_h45`
			`.asg A5, A_h23`
			`.asg A4, A_h01`
			`.asg B4, B_l10`
			`.asg B5, B_l32`
			`.asg A2, A_l54`
			`.asg A3, A_l76`
			`.asg B19, B_x10`
			`.asg B18, B_x32`
			`.asg A9, A_x54`
			`.asg A8, A_yptr_h`
			`.asg B17, B_yptr_l0`
			`.asg B6, B_yptr_l1`
			`.asg A0, A_i`
			`.asg B0, B_p`
			`.asg A17, A_h32`
			`.asg A16, A_h10`
			`.asg B9, B_h76`
			`.asg B8, B_h54`
			`.asg A7, A_offset`
			`.asg A1, A_xiptr`
			`.asg B2, B_xiptr`
			`.asg A5, A_optr`
			`.asg A7, A_x76`
			`.asg B21, B_prod_l10`
			`.asg B29, B_csr`
			`.asg B28, B_no_gie`
			`.asg A0, A_qmf`
			`; ============================================================================`

			`SUB .L1 A_ish_x_dim, M - 2, A_offset ; offset = cols - M + 2`
			`|| MVC .S2 CSR, B_csr ; save caller's CSR`
			`|| LDDW .D1T2 *A_filter[1], B_h76:B_h54 ; High-pass taps 4..7`

			`LDDW .D1T1 *A_filter[0], A_h32:A_h10 ; High-pass taps 0..3`
			`|| AND .L2 B_csr, -2, B_no_gie ; CSR value with bit 0 cleared`
			`|| MV .S2X A_iptr, B_iptr ; B-side copy of iptr`

			`ADDAH .D1 A_iptr, A_offset, A_xiptr ; xiptr = &iptr[offset] (halfwords)`
			`|| MVC .S2 B_no_gie, CSR ; Disable interrupts`
			`|| LDW .D2T1 *B_iptr++, A_x76 ; x76`

			`ADD .L2X A_xiptr, 4, B_xiptr ; B-side xiptr, 4 bytes further on`
			`|| LDW .D1T2 *A_xiptr++[2], B_x10 ; x10`
			`|| MV .S1X B_qmf, A_qmf ; A-side copy of qmf pointer`

			`LDDW .D1T1 *A_qmf[1], A_l76:A_l54 ; Low-pass taps 4..7`
			`|| LDDW .D2T2 *B_qmf[0], B_l32:B_l10 ; Low-pass taps 0..3`

			`LDW .D1T1 *A_xiptr++[2], A_x54 ; x54`
			`|| LDW .D2T2 *B_xiptr++[2], B_x32 ; x32`
			`|| MVK .S1 Qr, A_qr ; A_qr = rounding constant`

			`SHRU .S1 A_ish_x_dim, 1, A_i ; i = cols >> 1`
			`|| MV .L1X B_yptr_l1, A_optr ; Copy of output pointer`
			`|| MVK .S2 3, B_p ; low-pass pointer switch count`
			`|| ADD .L2X B_yptr_l1, A_offset, B_yptr_l0 ; yptr_l0 = optr + offset bytes`

			`ADDAH .D1 A_optr, A_i, A_yptr_h ; yptr_h = &optr[cols >> 1]`

			`PACKLH2 .S1 A_h10, A_h10, A_h01 ; h0:h1`
			`|| PACKLH2 .L1 A_h32, A_h32, A_h23 ; h2:h3`
			`|| PACKLH2 .L2 B_h54, B_h54, B_h45 ; h4:h5`
			`|| PACKLH2 .S2 B_h76, B_h76, B_h67 ; h6:h7`


			`; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================`
			`.asg A6, A_qr`
			`.asg B20, B_iptr`
			`.asg B7, B_h67`
			`.asg B16, B_h45`
			`.asg A5, A_h23`
			`.asg A4, A_h01`
			`.asg B4, B_l10`
			`.asg B5, B_l32`
			`.asg A2, A_l54`
			`.asg A3, A_l76`
			`.asg B19, B_x10`
			`.asg B18, B_x32`
			`.asg A9, A_x54`
			`.asg A8, A_yptr_h`
			`.asg B17, B_yptr_l0`
			`.asg B6, B_yptr_l1`
			`.asg A0, A_i`
			`.asg B0, B_p`
			`.asg B24, B_sum_h`
			`.asg A19, A_sum_l`
			`.asg A7, A_x76`
			`.asg B9, B_prod_h10`
			`.asg B22, B_prod_h32`
			`.asg A17, A_prod_h54`
			`.asg A18, A_prod_h76`
			`.asg B21, B_prod_l10`
			`.asg B23, B_prod_l32`
			`.asg A19, A_prod_l54`
			`.asg A18, A_prod_l76`
			`.asg B21, B_tmph0`
			`.asg A17, A_tmph1`
			`.asg B22, B_tmpl0`
			`.asg A18, A_tmpl1`
			`.asg B23, B_tmp_h`
			`.asg A16, A_tmp_l`
			`.asg B24, B_res_hi`
			`.asg A16, A_res_low`
			`.asg B3, B_return`
			`.asg A1, A_p`
			`; ============================================================================`
			`; START:`
			`; ============================ PIPE LOOP PROLOG ==============================`
			`; PROLOG:`
			`; Primes the software pipeline: issues the first DOTP2 products, sets A_p to 1,`
			`; reduces the trip counter by 3 and issues four branches whose targets are`
			`; expressed relative to the kernel labels L_1..L_4.`

			`LDW .D2T1 *B_iptr++, A_x76 ; Load x76`
			`|| MVK .S1 1, A_p ; p = 1`
			`|| DOTP2 .M2 B_x10, B_l10, B_prod_l10 ; x0l0 + x1l1`
			`|| B .S2 L_3 + 8`

			`DOTP2 .M1 A_x76, A_l76, A_prod_l76 ; x6l6 + x7l7`
			`|| DOTP2 .M2 B_x32, B_l32, B_prod_l32 ; x2l2 + x3l3`
			`|| B .S2 L_4 + 12`

			`MV .L2 B_x32, B_x10 ; x10 = x32`
			`|| DOTP2 .M1 A_x54, A_l54, A_prod_l54 ; x4l4 + x5l5`
			`|| DOTP2 .M2 B_x10, B_h67, B_prod_h10 ; x0h7 + x1h6`
			`|| B .S2 L_1 + 4`

			`DOTP2 .M1 A_x54, A_h23, A_prod_h54 ; x5h2 + x4h3`
			`|| DOTP2 .M2 B_x10, B_l10, B_prod_l10 ; x0l0 + x1l1`
			`|| SUB .L1 A_i, 3, A_i ; i -= 3`
			`|| B .S2 L_2 + 4`

			`MV .D1 A_x76, A_x54 ; x54 = x76`
			`|| MV .L2X A_x54, B_x32 ; x32 = x54`
			`|| DOTP2 .M1 A_x76, A_h01, A_prod_h76 ; x7h0 + x6h1`
			`|| DOTP2 .M2 B_x32, B_h45, B_prod_h32 ; x3h4 + x2h5`
			`|| LDW .D2T1 *B_iptr++, A_x76 ; Load x76`

			`ADD .L2 B_prod_l10, B_prod_l32, B_tmpl0 ; l10 + l32`
			`|| DOTP2 .M1 A_x76, A_l76, A_prod_l76 ; x6l6 + x7l7`
			`|| DOTP2 .M2 B_x32, B_l32, B_prod_l32 ; x2l2 + x3l3`

			`; ============================ PIPE LOOP KERNEL ==============================`
			`; Four execute packets (L_1..L_4) per trip. Each trip stores one high-pass`
			`; result through A_yptr_h and one low-pass result: through B_yptr_l0 while`
			`; B_p is non-zero (B_p counts down to 0), through B_yptr_l1 afterwards.`
			`; Sums are rounded with A_qr and shifted right by 15 before being stored.`
			`LOOP:`

			`L_1:`
			`SHR .S2 B_sum_h, 15, B_res_hi ; res_hi = sum_h >> 15`
			`|| ADD .L1 A_qr, A_tmp_l, A_sum_l ; sum_l = Qr + tmp_l`
			`|| ADD .S1 A_prod_h54, A_prod_h76, A_tmph1 ; h54 + h76`
			`|| MV .D1 A_x76, A_x54 ; x54 = x76`
			`|| MV .L2X A_x54, B_x32 ; x32 = x54`
			`|| DOTP2 .M1 A_x76, A_h01, A_prod_h76 ; x7h0 + x6h1`
			`|| DOTP2 .M2 B_x32, B_h45, B_prod_h32 ; x3h4 + x2h5`
			`|| LDW .D2T1 *B_iptr++, A_x76 ; Load x76`

			`L_2:`
			`; NOTE(review): A_p is decremented in this packet, but no instruction in this`
			`; listing is predicated on it -- confirm against the upstream source whether`
			`; the y_hp store below should carry a [!A_p] predicate.`
			`STH .D1T2 B_res_hi, *A_yptr_h++ ; Store y_hp`
			`|| SHR .S1 A_sum_l, 15, A_res_low ; res_low = sum_l >> 15`
			`|| ADD .S2 B_prod_h10, B_prod_h32, B_tmph0 ; h10 + h32`
			`|| ADD .L2 B_prod_l10, B_prod_l32, B_tmpl0 ; l10 + l32`
			`|| DOTP2 .M1 A_x76, A_l76, A_prod_l76 ; x6l6 + x7l7`
			`|| DOTP2 .M2 B_x32, B_l32, B_prod_l32 ; x2l2 + x3l3`
			`||[A_p] SUB .L1 A_p, 1, A_p ; p counts down to 0`

			`L_3:`
			`[!B_p]STH .D2T1 A_res_low, *B_yptr_l1++ ; Store y_lp1`
			`|| ADD .L2X B_tmph0, A_tmph1, B_tmp_h ; tmp_h`
			`|| BDEC .S1 LOOP, A_i ; if (i >= 0) { i--; B LOOP }`
			`|| ADD .D1 A_prod_l54, A_prod_l76, A_tmpl1 ; l54 + l76`
			`|| MV .S2 B_x32, B_x10 ; x10 = x32`
			`|| DOTP2 .M1 A_x54, A_l54, A_prod_l54 ; x4l4 + x5l5`
			`|| DOTP2 .M2 B_x10, B_h67, B_prod_h10 ; x0h7 + x1h6`

			`L_4:`
			`[ B_p]SUB .L2 B_p, 1, B_p ; pred. for LP`
			`||[ B_p]STH .D2T1 A_res_low, *B_yptr_l0++ ; Store y_lp0`
			`|| ADD .S2X A_qr, B_tmp_h, B_sum_h ; sum_h = Qr + tmp_h`
			`|| ADD .D1X A_tmpl1, B_tmpl0, A_tmp_l ; tmp_l`
			`|| DOTP2 .M1 A_x54, A_h23, A_prod_h54 ; x5h2 + x4h3`
			`|| DOTP2 .M2 B_x10, B_l10, B_prod_l10 ; x0l0 + x1l1`

			`; ============================ PIPE LOOP EPILOG ==============================`
			`; EPILOG:`
			`; Drains the pipeline: stores the last two high-pass results and the last`
			`; low-pass result. RET is issued in the first packet; the five execute`
			`; packets after it occupy the branch delay slots, and the last one restores`
			`; the caller's CSR from B_csr.`
			`SHR .S1X B_sum_h, 15, A0 ; sum_h >> 15`
			`|| ADD .L1 A_qr, A_tmp_l, A_sum_l ; sum_l = Qr + tmp_l`
			`|| ADD .D1 A_prod_h54, A_prod_h76, A_tmph1 ; h54 + h76`
			`|| RET .S2 B_return ; return to caller`

			`STH .D1T1 A0, *A_yptr_h++ ; Store y_hp`
			`|| SHR .S1 A_sum_l, 15, A_res_low ; sum_l >> 15`
			`|| ADD .L2 B_prod_h10, B_prod_h32, B_tmph0 ; h10 + h32`

			`STH .D2T1 A_res_low, *B_yptr_l1++ ; Store y_lp1`
			`|| ADD .L2X B_tmph0, A_tmph1, B_tmp_h ; tmp_h`

			`ADD .S2X A_qr, B_tmp_h, B_sum_h ; sum_h = Qr + tmp_h`

			`SHR .S2 B_sum_h, 15, B_res_hi ; sum_h >> 15`

			`STH .D1T2 B_res_hi, *A_yptr_h++ ; Store y_hp`
			`|| MVC .S2 B_csr, CSR ; restore caller's CSR`

			`* ========================================================================= *`
			`* End of file: img_wave_horz.asm *`
			`* ------------------------------------------------------------------------- *`
			`* Copyright (c) 2003 Texas Instruments, Incorporated. *`
			`* All Rights Reserved. *`
			`* ========================================================================= *`