c6416_sdk/dsplib/fir_r4.asm

;* ======================================================================== *;
;*  TEXAS INSTRUMENTS, INC.                                                 *;
;*                                                                          *;
;*  DSPLIB  DSP Signal Processing Library                                   *;
;*                                                                          *;
;*      Release:        Revision 1.04b                                      *;
;*      CVS Revision:   1.7     Sun Sep 29 03:32:22 2002 (UTC)              *;
;*      Snapshot date:  23-Oct-2003                                         *;
;*                                                                          *;
;*  This library contains proprietary intellectual property of Texas        *;
;*  Instruments, Inc.  The library and its source code are protected by     *;
;*  various copyrights, and portions may also be protected by patents or    *;
;*  other legal protections.                                                *;
;*                                                                          *;
;*  This software is licensed for use with Texas Instruments TMS320         *;
;*  family DSPs.  This license was provided to you prior to installing      *;
;*  the software.  You may review this license by consulting the file       *;
;*  TI_license.PDF which accompanies the files in this library.             *;
;* ------------------------------------------------------------------------ *;
;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;
;*                          All Rights Reserved.                            *;
;* ======================================================================== *;


;* ======================================================================== *;
;*  Assembler compatibility shim for assembling 4.30 and later code on      *;
;*  tools prior to 4.30.                                                    *;
;* ======================================================================== *;

        ; Capture the assembler version in the substitution symbol $asmver.
        ; When .ASSEMBLER_VERSION is not defined, 0 is used instead, so the
        ; "< 430" test below also selects the compatibility definitions.
        .if $isdefed(".ASSEMBLER_VERSION")
        .asg    .ASSEMBLER_VERSION, $asmver
        .else
        .asg    0,    $asmver
        .endif

        ; For assemblers older than 4.30, map the call/return mnemonics onto
        ; plain branch instructions.  Within the code of _DSP_fir_r4, only RET
        ; is used (the conditional return in the jloop tail).
        .if ($asmver < 430)

        .asg    B,    CALL     ; Function Call
        .asg    B,    RET      ; Return from a Function
        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.

        ; BNOP-based variants are only defined when assembling for C64x.
        .if .TMS320C6400
        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call
        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return
        .asg    BNOP, CRNOP    ; C64x Fn call w/, Call/Ret chaining via BNOP.
        .endif

        ; An empty substitution string is assigned to each name below, so a
        ; use of .asmfunc / .endasmfunc expands to nothing on these tools.
        .asg    , .asmfunc     ; .func equivalent for hand-assembly code
        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code

        .endif

;* ======================================================================== *;
;*  End of assembler compatibility shim.                                    *;
;* ======================================================================== *;


* ========================================================================= *
*   TEXAS INSTRUMENTS, INC.                                                 *
*                                                                           *
*   NAME                                                                    *
*       DSP_fir_r4: FIR Filter (radix 4)                                    *
*                                                                           *
*                                                                           *
*   REVISION DATE                                                           *
*       10-Aug-2001                                                         *
*                                                                           *
*   USAGE                                                                   *
*       This routine is C-callable and can be called as:                    *
*                                                                           *
*       void DSP_fir_r4                                                     *
*       (                                                                   *
*           const short *restrict x,  /* Input array [nr+nh-1 elements] */  *
*           const short *restrict h,  /* Coeff array [nh elements]      */  *
*           short       *restrict r,  /* Output array [nr elements]     */  *
*           int nh,                   /* Number of coefficients.        */  *
*           int nr                    /* Number of output samples.      */  *
*       )                                                                   *
*                                                                           *
*   ARGUMENTS PASSED                                                        *
*       *x    ->      A4                                                    *
*       *h    ->      B4                                                    *
*       *r    ->      A6                                                    *
*       nh    ->      B6                                                    *
*       nr    ->      A8                                                    *
*                                                                           *
*   DESCRIPTION                                                             *
*       Computes a real FIR filter (direct-form) using coefficients         *
*       stored in vector h.  The real data input is stored in vector x.     *
*       The filter output result is stored in vector r.  Input data and     *
*       filter taps are 16-bit, with intermediate values kept at 32-bit     *
*       precision.  Filter taps are expected in Q15 format.                 *
*                                                                           *
*       The following is a natural C implementation with no restrictions.   *
*       This version has restrictions as noted in the ASSUMPTIONS below.    *
*                                                                           *
*       void DSP_fir_r4                                                     *
*       (                                                                   *
*           const short *restrict x,                                        *
*           const short *restrict h,                                        *
*           short       *restrict r,                                        *
*           int nh,                                                         *
*           int nr                                                          *
*       )                                                                   *
*       {                                                                   *
*           int i, j, sum;                                                  *
*                                                                           *
*           for (j = 0; j < nr; j++)                                        *
*           {                                                               *
*               sum = 0;                                                    *
*               for (i = 0; i < nh; i++)                                    *
*                   sum += x[i + j] * h[i];                                 *
*               r[j] = sum >> 15;                                           *
*           }                                                               *
*       }                                                                   *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       Number of taps:    'nh' >= 8, multiple of 4                         *
*       Number of samples: 'nr' >= 4, multiple of 4                         *
*                                                                           *
*   NOTES                                                                   *
*       This function blocks interrupts for its entire duration.  It is     *
*       interrupt tolerant, but not interruptible.                          *
*                                                                           *
*   MEMORY NOTE                                                             *
*       No memory bank hits under any conditions.                           *
*       This code is a LITTLE ENDIAN implementation.                        *
*                                                                           *
*   TECHNIQUES                                                              *
*       1.  Load double word instruction is used to simultaneously          *
*           load four values in a single clock cycle.                       *
*       2.  The inner loop is unrolled four times.                          *
*                                                                           *
*   CYCLES                                                                  *
*       (8 + nh) * nr/4 + 9                                                 *
*                                                                           *
*       For nh = 12 and nr = 12, cycles = 69                                *
*                                                                           *
*   CODESIZE                                                                *
*       308 bytes.                                                          *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *


               .sect    ".data:copyright_h"
; NOTE(review): the directive above selects the ".data:copyright_h" section,
; but nothing is emitted into it here -- only .asg directives follow before
; the ".text:_fir_r4" section is opened.

; ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== ;
; Substitution symbols naming the registers used by _DSP_fir_r4.  Several
; names alias the same physical register (A_sum0a/A_sum0b, A_sum1a/A_sum1b,
; B_sum2a/B_sum2b, B_sum3a/B_sum3b, B_h_ptr1/B_r10, B_r32/B_x32).
; The x[..] / h[..] indices in the comments are relative to the current
; group of four taps and four outputs.
        .asg   A0,      A_s        ; first-trip flag: 1 => accumulate ADDs skipped
        .asg   A1,      A_i        ; inner loop count, loaded with nh>>2
        .asg   A2,      A_rj       ; outer loop count j_cnt = ((nr+3)>>2) - 1
        .asg   A4,      A_x_ptr    ; input array pointer
        .asg   A6,      A_r_ptr    ; output array pointer
        .asg   A7,      A_nh_x     ; = -1 - (nh>>2): ADDAD offset resetting x_ptr
        .asg   A8,      A_nr       ; nr on entry, overwritten with nr+3
        .asg   A16,     A_h10      ; coef: h[1:0]
        .asg   A17,     A_h32      ; coef: h[3:2]
        .asg   A18,     A_x10      ; input: x[1:0]
        .asg   A19,     A_x32      ; input: x[3:2]
        .asg   A20,     A_x21      ; input: x[2:1]
        .asg   A21,     A_prod00_11; = x[0]*h[0] + x[1]*h[1]
        .asg   A22,     A_prod22_33; = x[2]*h[2] + x[3]*h[3]
        .asg   A23,     A_prod32_43; = x[3]*h[2] + x[4]*h[3]
        .asg   A24,     A_prod10_21; = x[1]*h[0] + x[2]*h[1]
        .asg   A26,     A_sum0a    ; = sum0b + x[2]*h[2] + x[3]*h[3]
        .asg   A26,     A_sum0b    ; = sum0a + x[0]*h[0] + x[1]*h[1]
        .asg   A27,     A_sum1a    ; = sum1b + x[1]*h[0] + x[2]*h[1]
        .asg   A27,     A_sum1b    ; = sum1a + x[3]*h[2] + x[4]*h[3]
        .asg   A25,     A_sum0     ; = sum0a >>15 for r[0]
        .asg   A28,     A_sum1     ; not referenced below (r[1] is formed in B_sum1)
        .asg   B0,      B_j        ; kernel trip countdown; at 0: reset ptrs, B B_iloop
        .asg   B4,      B_h_ptr    ; coef array pointer
        .asg   B5,      B_iloop    ; iloop label address
        .asg   B6,      B_nh       ; nh on entry, then nh>>2 (reload for A_i / B_j)
        .asg   B7,      B_nh_h     ; = -2 - (nh>>2): ADDAD offset resetting h_ptr
        .asg   B8,      B_h_ptr1   ; not referenced below (shares B8 with B_r10)
        .asg   B8,      B_r10      ; output: r[1:0]
        .asg   B9,      B_r32      ; output: r[3:2]
        .asg   B9,      B_x32      ; input: x[3:2]
        .asg   B16,     B_h10      ; coef: h[1:0]
        .asg   B17,     B_h32      ; coef: h[3:2]
        .asg   B18,     B_x54      ; input: x[5:4]
        .asg   B19,     B_x76      ; input: x[7:6]
        .asg   B20,     B_x43      ; input: x[4:3]
        .asg   B21,     B_x65      ; input: x[6:5]
        .asg   B22,     B_prod42_53; = x[4]*h[2] + x[5]*h[3]
        .asg   B23,     B_prod20_31; = x[2]*h[0] + x[3]*h[1]
        .asg   B24,     B_prod30_41; = x[3]*h[0] + x[4]*h[1]
        .asg   B25,     B_prod52_63; = x[5]*h[2] + x[6]*h[3]
        .asg   B26,     B_sum2a    ; = sum2b + x[2]*h[0] + x[3]*h[1]
        .asg   B26,     B_sum2b    ; = sum2a + x[4]*h[2] + x[5]*h[3]
        .asg   B27,     B_sum3a    ; = sum3b + x[3]*h[0] + x[4]*h[1]
        .asg   B27,     B_sum3b    ; = sum3a + x[5]*h[2] + x[6]*h[3]
        .asg   B28,     B_sum0     ; not referenced below
        .asg   B29,     B_sum1     ; = sum1a >>15 for r[1]
        .asg   B30,     B_sum2     ; = sum2a <<1; upper half is packed as r[2]
        .asg   B31,     B_sum3     ; = sum3a <<1; upper half is packed as r[3]

        .sect ".text:_fir_r4"
        .global _DSP_fir_r4
; ------------------------------------------------------------------------- ;
;  _DSP_fir_r4 -- 16-bit FIR filter, four outputs r[j..j+3] per outer pass.
;
;  Entry registers (see ARGUMENTS PASSED in the file header):
;      A4 = x,  B4 = h,  A6 = r,  B6 = nh,  A8 = nr.   Returns through B3.
;
;  Layout:
;    * Setup / pipe-up: derives the loop counts (nh>>2, ((nr+3)>>2)-1) and
;      the pointer-reset offsets, issues the first x and h loads, clears
;      the four accumulators and sets the first-trip flag A_s.
;    * "iloop" kernel: four execute packets issuing eight DOTP2s (four taps
;      applied to four outputs).  Each accumulate ADD sits in the same
;      packet as the DOTP2 that targets the same product register, so it
;      reads the product issued on the previous trip; A_s == 1 suppresses
;      the ADDs on the first trip of a pass.
;    * "jloop" tail: scales and packs the four sums, stores them with one
;      STNDW and re-primes the kernel (loads, counters, accumulators) for
;      the next pass, or returns after the last one.
;
;  The "@" and "p" marks at the start of some trailing comments are kept
;  from the original source (presumably software-pipeline stage / prolog
;  markers -- TODO confirm).
;
;  NOTE(review): this code is hand-scheduled.  Branch targets, load data
;  and DOTP2 results are consumed a fixed number of execute packets after
;  issue, so no instruction may be added, removed or moved without
;  re-deriving the whole schedule.
; ------------------------------------------------------------------------- ;
_DSP_fir_r4:
; ======================= SETUP / LOOP PIPE-UP CODE ======================= ;
; The branch to dint0 targets a label that straight-line execution reaches
; anyway; the original note says it "protects" the setup code (the file
; header states that interrupts are blocked for the whole routine).
        LDNDW   .D1T2   *++A_x_ptr,B_x76:B_x54    ; x_ptr += 1 dword, load x[7:4]
||      ADD     .L1     3,A_nr,A_nr               ; A_nr = nr + 3
||      SHR     .S2     B_nh,2,B_nh               ; B_nh = i_cnt = nh>>2
||      B       .S1     dint0                     ; protect setup code

        LDNDW   .D1T1   *-A_x_ptr[1],A_x32:A_x10  ; load x[3:0] (one dword below x_ptr)
||      SHR     .S1     A_nr,2,A_rj               ; j_cnt = (nr+3)>>2

        LDNDW   .D2T1   *B_h_ptr,A_h32:A_h10      ; A-side copy of h[3:0]
||      SUB     .L1     A_rj,1,A_rj               ; j_cnt -= 1

        LDNDW   .D2T2   *B_h_ptr++,B_h32:B_h10    ; B-side copy of h[3:0], h_ptr += 1 dword
||[A_rj]MV      .S2     B_nh,B_j                  ; more passes follow: B_j = nh>>2
||      MV      .S1X    B_nh,A_i                  ; i_cnt = nh>>2
||      SUB     .L2     -2,B_nh,B_nh_h            ; B_nh_h = -2 - (nh>>2): h_ptr reset offset

        LDNDW   .D1T2   *++A_x_ptr,B_x76:B_x54    ; x_ptr += 1 dword, load x[11:8]
||      MVK     .S1     1,A_s                     ; A_s = 1: no accumulate on first trip
||      ADDKPC  .S2     iloop,B_iloop,0           ; store iloop addr to reg
||      SUB     .L1X    -1,B_nh,A_nh_x            ; A_nh_x = -1 - (nh>>2): x_ptr reset offset

        LDNDW   .D1T1   *-A_x_ptr[1],A_x32:A_x10  ; load x[7:4] into the A side
||      B       .S1     iloop                     ; loop-back branch issued ahead of the kernel
||      ZERO    .L2     B_sum3a:B_sum2a           ; clear accumulators for r[3], r[2]

dint0:  LDNDW   .D2T1   *B_h_ptr,A_h32:A_h10      ; load h[7:4] (h_ptr already advanced)
||      PACKLH2 .S2X    B_x54,A_x32,B_x43         ; @ input: x[4:3]
||      ZERO    .L1     A_sum1a:A_sum0a           ; clear accumulators for r[1], r[0]
||[!A_rj]ADD    .L2     5,B_nh,B_j                ; last pass: B_j = (nh>>2) + 5

; ========================== "ILOOP" LOOP KERNEL ========================== ;
; B_j is decremented on every trip.  While it is 0, the [!B_j] instructions
; add the reset offsets to x_ptr / h_ptr and branch through B_iloop.
; On the last pass B_j starts at (nh>>2)+5 instead of nh>>2 -- presumably
; so that it never reaches 0 and the routine leaves through RET (confirm).
iloop:
        DOTP2   .M1     A_x10,A_h10,A_prod00_11     ; @ x[0]*h[0] + x[1]*h[1]
||[!A_s]ADD     .S1     A_sum0a,A_prod00_11,A_sum0b ; sum0 += x[0]*h[0] + x[1]*h[1]
||      DOTP2   .M2X    B_x54,A_h32,B_prod42_53     ; @ x[4]*h[2] + x[5]*h[3]
||[!A_s]ADD     .L2     B_sum2a,B_prod42_53,B_sum2b ; sum2 += x[4]*h[2] + x[5]*h[3]
||      PACKLH2 .S2     B_x76,B_x54,B_x65           ; @ input: x[6:5]
||      LDNDW   .D2T2   *B_h_ptr++,B_h32:B_h10      ; load coef: h32:h10, h_ptr += 1 dword
||      PACKLH2 .L1     A_x32,A_x10,A_x21           ; input: x[2:1]
||[!B_j]ADDAD   .D1     A_x_ptr,A_nh_x,A_x_ptr      ; reset the input pointer

        DOTP2   .M1     A_x32,A_h32,A_prod22_33     ; x[2]*h[2] + x[3]*h[3]
||[!A_s]ADD     .L1     A_sum0b,A_prod22_33,A_sum0a ; sum0 += x[2]*h[2] + x[3]*h[3]
||      DOTP2   .M2     B_x65,B_h32,B_prod52_63     ; x[5]*h[2] + x[6]*h[3]
||[!A_s]ADD     .S2     B_sum3a,B_prod52_63,B_sum3b ; sum3 += x[5]*h[2] + x[6]*h[3]
||      LDNDW   .D1T2   *++A_x_ptr,B_x76:B_x54      ; x_ptr += 1 dword, load input: x76:x54
||[!B_j]ADDAD   .D2     B_h_ptr,B_nh_h,B_h_ptr      ; reset the coef pointer
||[ A_i]SUB     .S1     A_i,1,A_i                   ; @ count down the inner loop
||      MV      .L2X    A_x32,B_x32                 ; B-side copy of x[3:2]

        DOTP2   .M2     B_x32,B_h10,B_prod20_31     ; x[2]*h[0] + x[3]*h[1]
||[!A_s]ADD     .L2     B_sum2b,B_prod20_31, B_sum2a; sum2 += x[2]*h[0] + x[3]*h[1]
||      DOTP2   .M1X    B_x43,A_h32,A_prod32_43     ; x[3]*h[2] + x[4]*h[3]
||[!A_s]ADD     .L1     A_sum1a,A_prod32_43,A_sum1b ; sum1 += x[3]*h[2] + x[4]*h[3]
||[ A_i]B       .S1     iloop                       ; inner loop branch
||[!B_j]B       .S2     B_iloop                     ; outer loop branch (via register)
||      LDNDW   .D1T1   *-A_x_ptr[1],A_x32:A_x10    ; load input: x[3:2]:x[1:0]
||      SUB     .D2     B_j,1,B_j                   ; count for outer loop

        DOTP2   .M1X    A_x21,B_h10,A_prod10_21     ; x[1]*h[0] + x[2]*h[1]
||[!A_s]ADD     .D1     A_sum1b,A_prod10_21,A_sum1a ; sum1 += x[1]*h[0] + x[2]*h[1]
||      DOTP2   .M2     B_x43,B_h10,B_prod30_41     ; x[3]*h[0] + x[4]*h[1]
||[!A_s]ADD     .L2     B_sum3b,B_prod30_41,B_sum3a ; sum3 += x[3]*h[0] + x[4]*h[1]
||      LDNDW   .D2T1   *B_h_ptr,A_h32:A_h10        ; load coef: h32:h10
||[ A_s]ZERO    .L1     A_s                         ; @ clear first-trip flag: ADDs enabled
||      PACKLH2 .S2X    B_x54,A_x32,B_x43           ; @ input: x[4:3]
; ========================= END OF "ILOOP" KERNEL ========================= ;

; "JLOOP" tail: r[0] and r[1] are formed with SHR ...,15 and packed with
; PACK2; r[2] and r[3] are formed by doubling the sum and packing the upper
; halves with PACKH2.  Instructions predicated on A_rj re-prime the kernel
; only when another pass follows; [!A_rj] RET leaves after the last pass.
        SHR     .S1     A_sum0a,15,A_sum0        ; = sum0a >>15 for r[0]
||      ADD     .L2     B_sum2a,B_sum2a,B_sum2   ; = sum2a <<1 for r[2]
||[A_rj]LDNDW   .D2T2   *B_h_ptr++,B_h32:B_h10   ; p load coef: h[3:2]:h[1:0]
||[!A_rj]RET    .S2     B3                       ; last pass done: return to caller

  [A_rj]LDNDW   .D1T2   *++A_x_ptr,B_x76:B_x54   ; p load input: x[7:6]:x[5:4]
||      SHR     .S2X    A_sum1a,15,B_sum1        ; = sum1a >>15 for r[1]
||      ADD     .D2     B_sum3a,B_sum3a,B_sum3   ; = sum3a <<1 for r[3]
||      MV      .S1X    B_nh,A_i                 ; reload inner loop count (nh>>2)
||      ZERO    .L2     B_sum3a:B_sum2a          ; p initialize B_sum3a,2a

        PACK2   .L2X    B_sum1,A_sum0,B_r10      ; r[1:0]
||      PACKH2  .S2     B_sum3,B_sum2,B_r32      ; r[3:2]
||[A_rj]LDNDW   .D2T1   *B_h_ptr,A_h32:A_h10     ; p load coef: h[3:2]:h[1:0]
||      MVD     .M1X    B_x76,A_x32              ; p copy B_x76 to A_x32 (input: x[3:2])
||      MVK     .D1     1,A_s                    ; p set first-trip flag for sum0a..3a
||[A_rj]B       .S1     iloop                    ; p loop-back branch for the next pass
||[A_rj]SUB     .L1     A_rj,1,A_rj              ; count down for outer loop

; The two B_j initialisations below are predicated on A_rj as updated by the
; count-down in the previous packet.
        STNDW   .D1     B_r32:B_r10,*A_r_ptr++   ; store output r[3:2]:r[1:0], advance r_ptr
||      MVD     .M1X    B_x54,A_x10              ; p copy B_x54 to A_x10 (input: x[1:0])
||[A_rj]MV      .S2     B_nh,B_j                 ; more passes follow: B_j = nh>>2
||[!A_rj]ADD    .D2     5,B_nh,B_j               ; next pass is the last: B_j = (nh>>2) + 5
||      ZERO    .L1     A_sum1a:A_sum0a          ; p initialize A_sum1a,0a
||      PACKLH2 .L2X    B_x54,A_x32,B_x43        ; @ input: x[4:3]
; ============================ END OF "JLOOP" ============================= ;
        NOP             2                        ; wait for the pending branch / return
; ============================= BRANCH OCCURS ============================= ;

* ========================================================================= *
*   End of file:  dsp_fir_r4.asm                                            *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *
add dsplib imglib Signed-off-by: surenyi <surenyi82@qq.com> 3 years ago			`;* ======================================================================== *;`
			`;* TEXAS INSTRUMENTS, INC. *;`
			`;* *;`
			`;* DSPLIB DSP Signal Processing Library *;`
			`;* *;`
			`;* Release: Revision 1.04b *;`
			`;* CVS Revision: 1.7 Sun Sep 29 03:32:22 2002 (UTC) *;`
			`;* Snapshot date: 23-Oct-2003 *;`
			`;* *;`
			`;* This library contains proprietary intellectual property of Texas *;`
			`;* Instruments, Inc. The library and its source code are protected by *;`
			`;* various copyrights, and portions may also be protected by patents or *;`
			`;* other legal protections. *;`
			`;* *;`
			`;* This software is licensed for use with Texas Instruments TMS320 *;`
			`;* family DSPs. This license was provided to you prior to installing *;`
			`;* the software. You may review this license by consulting the file *;`
			`;* TI_license.PDF which accompanies the files in this library. *;`
			`;* ------------------------------------------------------------------------ *;`
			`;* Copyright (C) 2003 Texas Instruments, Incorporated. *;`
			`;* All Rights Reserved. *;`
			`;* ======================================================================== *;`


			`;* ======================================================================== *;`
			`;* Assembler compatibility shim for assembling 4.30 and later code on *;`
			`;* tools prior to 4.30. *;`
			`;* ======================================================================== *;`

			`.if $isdefed(".ASSEMBLER_VERSION")`
			`.asg .ASSEMBLER_VERSION, $asmver`
			`.else`
			`.asg 0, $asmver`
			`.endif`

			`.if ($asmver < 430)`

			`.asg B, CALL ; Function Call`
			`.asg B, RET ; Return from a Function`
			`.asg B, CALLRET ; Function call with Call / Ret chaining.`

			`.if .TMS320C6400`
			`.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call`
			`.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return`
			`.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.`
			`.endif`

			`.asg , .asmfunc ; .func equivalent for hand-assembly code`
			`.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code`

			`.endif`

			`;* ======================================================================== *;`
			`;* End of assembler compatibility shim. *;`
			`;* ======================================================================== *;`


			`* ========================================================================= *`
			`* TEXAS INSTRUMENTS, INC. *`
			`* *`
			`* NAME *`
			`* DSP_fir_r4: FIR Filter (radix 4) *`
			`* *`
			`* *`
			`* REVISION DATE *`
			`* 10-Aug-2001 *`
			`* *`
			`* USAGE *`
			`* This routine is C-callable and can be called as: *`
			`* *`
			`* void DSP_fir_r4 *`
			`* ( *`
			`* const short restrict x, / Input array [nr+nh-1 elements] / `
			`* const short restrict h, / Coeff array [nh elements] / `
			`* short restrict r, / Output array [nr elements] / `
			`* int nh, /* Number of coefficients. / `
			`* int nr /* Number of output samples. / `
			`* ) *`
			`* *`
			`* ARGUMENTS PASSED *`
			`* x -> A4 `
			`* h -> B4 `
			`* r -> A6 `
			`* nh -> B6 *`
			`* nr -> A8 *`
			`* *`
			`* DESCRIPTION *`
			`* Computes a real FIR filter (direct-form) using coefficients *`
			`* stored in vector h. The real data input is stored in vector x. *`
			`* The filter output result is stored in vector r. Input data and *`
			`* filter taps are 16-bit, with intermediate values kept at 32-bit *`
			`* precision. Filter taps are expected in Q15 format. *`
			`* *`
			`* The following is a natural C implementation with no restrictions. *`
			`* This version has restrictions as noted in the ASSUMPTIONS below. *`
			`* *`
			`* void DSP_fir_r4 *`
			`* ( *`
			`* const short restrict x, `
			`* const short restrict h, `
			`* short restrict r, `
			`* int nh, *`
			`* int nr *`
			`* ) *`
			`* { *`
			`* int i, j, sum; *`
			`* *`
			`* for (j = 0; j < nr; j++) *`
			`* { *`
			`* sum = 0; *`
			`* for (i = 0; i < nh; i++) *`
			`* sum += x[i + j] * h[i]; *`
			`* r[j] = sum >> 15; *`
			`* } *`
			`* } *`
			`* *`
			`* ASSUMPTIONS *`
			`* Number of taps: 'nh' >= 8, multiple of 4 *`
			`* Number of samples: 'nr' >= 4, multiple of 4 *`
			`* *`
			`* NOTES *`
			`* This function blocks interrupts for its entire duration. It is *`
			`* interrupt tolerant, but not interruptible. *`
			`* *`
			`* MEMORY NOTE *`
			`* No memory bank hits under any conditions. *`
			`* This code is a LITTLE ENDIAN implementation *`
			`* *`
			`* TECHNIQUES *`
			`* 1. Load double word instruction is used to simultaneously *`
			`* load four values in a single clock cycle. *`
			`* 2. The inner loop is unrolled four times *`
			`* *`
			`* CYCLES *`
			`* (8 + nh) * nr/4 + 9 *`
			`* *`
			`* For nh = 12 and nr = 12, cycles = 69 *`
			`* *`
			`* CODESIZE *`
			`* 308 bytes. *`
			`* ------------------------------------------------------------------------- *`
			`* Copyright (c) 2003 Texas Instruments, Incorporated. *`
			`* All Rights Reserved. *`
			`* ========================================================================= *`


			`.sect ".data:copyright_h"`

			`; ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== ;`
			`.asg A0, A_s ; flag for sum0a,1a,2a,3a: = 1`
			`.asg A1, A_i ; inner loop count = 2`
			`.asg A2, A_rj ; outer loop cont j_cnt = (nr+3)>>2`
			`.asg A4, A_x_ptr ; input array pointer`
			`.asg A6, A_r_ptr ; output array pointer`
			`.asg A7, A_nh_x ; = (nh+3)>>2+1`
			`.asg A8, A_nr ; = (nr+3)`
			`.asg A16, A_h10 ; coef: h[1:0]`
			`.asg A17, A_h32 ; coef: h[3:2]`
			`.asg A18, A_x10 ; input: x[1:0]`
			`.asg A19, A_x32 ; input: x[3:2]`
			`.asg A20, A_x21 ; input: x[2:1]`
			`.asg A21, A_prod00_11; = x[0]h[0] + x[1]h[1]`
			`.asg A22, A_prod22_33; = x[2]h[2] + x[3]h[3]`
			`.asg A23, A_prod32_43; = x[3]h[2] + x[4]h[3]`
			`.asg A24, A_prod10_21; = x[1]h[0] + x[2]h[1]`
			`.asg A26, A_sum0a ; = sum0b + x[2]h[2] + x[3]h[3]`
			`.asg A26, A_sum0b ; = sum0a + x[0]h[0] + x[1]h[1]`
			`.asg A27, A_sum1a ; = sum1b + x[1]h[0] + x[2]h[1]`
			`.asg A27, A_sum1b ; = sum1a + x[3]h[2] + x[4]h[3]`
			`.asg A25, A_sum0 ; = sum0a >>15 for r[0]`
			`.asg A28, A_sum1 ; = sum1a >>15 for r[1]`
			`.asg B0, B_j ; outer loop flag; if 0,branch to iloop`
			`.asg B4, B_h_ptr ; coef array pointer`
			`.asg B5, B_iloop ; iloop label address`
			`.asg B6, B_nh ; initial value for B_j, if A_rj != 0`
			`.asg B7, B_nh_h ; addr offset for h_ptr: = (nh+3)>>2+1`
			`.asg B8, B_h_ptr1 ; temp coef array ptr: point to h[nh]`
			`.asg B8, B_r10 ; output: r[1:0]`
			`.asg B9, B_r32 ; output: r[3:2]`
			`.asg B9, B_x32 ; input: x[3:2]`
			`.asg B16, B_h10 ; coef: h[1:0]`
			`.asg B17, B_h32 ; coef: h[3:2]`
			`.asg B18, B_x54 ; input: x[5:4]`
			`.asg B19, B_x76 ; input: x[7:6]`
			`.asg B20, B_x43 ; input: x[4:3]`
			`.asg B21, B_x65 ; input: x[6:5]`
			`.asg B22, B_prod42_53; = x[4]h[2] + x[5]h[3]`
			`.asg B23, B_prod20_31; = x[2]h[0] + x[3]h[1]`
			`.asg B24, B_prod30_41; = x[3]h[0] + x[4]h[1]`
			`.asg B25, B_prod52_63; = x[5]h[2] + x[6]h[3]`
			`.asg B26, B_sum2a ; = sum2b + x[2]h[0] + x[3]h[1]`
			`.asg B26, B_sum2b ; = sum2a + x[4]h[2] + x[5]h[3]`
			`.asg B27, B_sum3a ; = sum3b + x[3]h[0] + x[4]h[1]`
			`.asg B27, B_sum3b ; = sum3a + x[5]h[2] + x[6]h[3]`
			`.asg B28, B_sum0 ; = A_sum0`
			`.asg B29, B_sum1 ; = A_sum1`
			`.asg B30, B_sum2 ; = sum2a >>15 for r[2]`
			`.asg B31, B_sum3 ; = sum3a >>15 for r[3]`

			`.sect ".text:_fir_r4"`
			`.global _DSP_fir_r4`
			`_DSP_fir_r4:`
			`; ======================= SETUP / LOOP PIPE-UP CODE ======================= ;`
			`LDNDW .D1T2 *++A_x_ptr,B_x76:B_x54 ; load B_x76:x54`
			`\|\| ADD .L1 3,A_nr,A_nr ; nr + 3`
			`\|\| SHR .S2 B_nh,2,B_nh ; i_cnt = nh>>2`
			`\|\| B .S1 dint0 ; protect setup code`

			`LDNDW .D1T1 *-A_x_ptr[1],A_x32:A_x10 ; load A_x32:x10`
			`\|\| SHR .S1 A_nr,2,A_rj ; j_cnt = (nr+3)>>2`

			`LDNDW .D2T1 *B_h_ptr,A_h32:A_h10 ; load A_h32:h10`
			`\|\| SUB .L1 A_rj,1,A_rj ; j_cnt -= 1`

			`LDNDW .D2T2 *B_h_ptr++,B_h32:B_h10 ; load B_h32:h10`
			`\|\|[A_rj]MV .S2 B_nh,B_j ; jloop cnt`
			`\|\| MV .S1X B_nh,A_i ; i_cnt = nh>>2`
			`\|\| SUB .L2 -2,B_nh,B_nh_h ; addr offset for h_ptr`

			`LDNDW .D1T2 *++A_x_ptr,B_x76:B_x54 ; load B_x1110:x98`
			`\|\| MVK .S1 1,A_s ; sync for sum0a,1a,2a,3a`
			`\|\| ADDKPC .S2 iloop,B_iloop,0 ; store iloop addr to reg`
			`\|\| SUB .L1X -1,B_nh,A_nh_x ; addr offset for x_ptr`

			`LDNDW .D1T1 *-A_x_ptr[1],A_x32:A_x10 ; load A_x76:x54`
			`\|\| B .S1 iloop ;`
			`\|\| ZERO .L2 B_sum3a:B_sum2a`

			`dint0: LDNDW .D2T1 *B_h_ptr,A_h32:A_h10 ; load A_h76:h54`
			`\|\| PACKLH2 .S2X B_x54,A_x32,B_x43 ; @ input: x[4:3]`
			`\|\| ZERO .L1 A_sum1a:A_sum0a`
			`\|\|[!A_rj]ADD .L2 5,B_nh,B_j ; if last jloop, will not`

			`; ========================== "ILOOP" LOOP KERNEL ========================== ;`
			`iloop:`
			`DOTP2 .M1 A_x10,A_h10,A_prod00_11 ; @ x[0]h[0] + x[1]h[1]`
			`\|\|[!A_s]ADD .S1 A_sum0a,A_prod00_11,A_sum0b ; x[0]h[0] + x[1]h[1]`
			`\|\| DOTP2 .M2X B_x54,A_h32,B_prod42_53 ; @ x[4]h[2] + x[5]h[3]`
			`\|\|[!A_s]ADD .L2 B_sum2a,B_prod42_53,B_sum2b ; x[4]h[2] + x[5]h[3]`
			`\|\| PACKLH2 .S2 B_x76,B_x54,B_x65 ; @ input: x[6:5]`
			`\|\| LDNDW .D2T2 *B_h_ptr++,B_h32:B_h10 ; load coef: h32:h10`
			`\|\| PACKLH2 .L1 A_x32,A_x10,A_x21 ; input: x[2:1]`
			`\|\|[!B_j]ADDAD .D1 A_x_ptr,A_nh_x,A_x_ptr ; reset the input pointer`

			`DOTP2 .M1 A_x32,A_h32,A_prod22_33 ; x[2]h[2] + x[3]h[3]`
			`\|\|[!A_s]ADD .L1 A_sum0b,A_prod22_33,A_sum0a ; x[2]h[2] + x[3]h[3]`
			`\|\| DOTP2 .M2 B_x65,B_h32,B_prod52_63 ; x[5]h[2] + x[6]h[3]`
			`\|\|[!A_s]ADD .S2 B_sum3a,B_prod52_63,B_sum3b ; x[5]h[2] + x[6]h[3]`
			`\|\| LDNDW .D1T2 *++A_x_ptr,B_x76:B_x54 ; load intput: x76:x54`
			`\|\|[!B_j]ADDAD .D2 B_h_ptr,B_nh_h,B_h_ptr ; reset the coef pointer`
			`\|\|[ A_i]SUB .S1 A_i,1,A_i ; @`
			`\|\| MV .L2X A_x32,B_x32 ;`

			`DOTP2 .M2 B_x32,B_h10,B_prod20_31 ; x[2]h[0] + x[3]h[1]`
			`\|\|[!A_s]ADD .L2 B_sum2b,B_prod20_31, B_sum2a; x[2]h[0] + x[3]h[1]`
			`\|\| DOTP2 .M1X B_x43,A_h32,A_prod32_43 ; x[3]h[2] + x[4]h[3]`
			`\|\|[!A_s]ADD .L1 A_sum1a,A_prod32_43,A_sum1b ; x[3]h[2] + x[4]h[3]`
			`\|\|[ A_i]B .S1 iloop ; inner loop brach`
			`\|\|[!B_j]B .S2 B_iloop ; outer loop brach`
			`\|\| LDNDW .D1T1 *-A_x_ptr[1],A_x32:A_x10 ; load input: x[32]:x[10]`
			`\|\| SUB .D2 B_j,1,B_j ; count for outer loop`

			`DOTP2 .M1X A_x21,B_h10,A_prod10_21 ; x[1]h[0] + x[2]h[1]`
			`\|\|[!A_s]ADD .D1 A_sum1b,A_prod10_21,A_sum1a ; x[1]h[0] + x[2]h[1]`
			`\|\| DOTP2 .M2 B_x43,B_h10,B_prod30_41 ; x[3]h[0] + x[4]h[1]`
			`\|\|[!A_s]ADD .L2 B_sum3b,B_prod30_41,B_sum3a ; x[3]h[0] + x[4]h[1]`
			`\|\| LDNDW .D2T1 *B_h_ptr,A_h32:A_h10 ; load coef: h32:h10`
			`\|\|[ A_s]ZERO .L1 A_s ; @flag for sum0a,1a,2a,3a`
			`\|\| PACKLH2 .S2X B_x54,A_x32,B_x43 ; @ input: x[4:3]`
			`; ========================= END OF "ILOOP" KERNEL ========================= ;`

			`SHR .S1 A_sum0a,15,A_sum0 ; = sum0a >>15 for r[0]`
			`\|\| ADD .L2 B_sum2a,B_sum2a,B_sum2 ; = sum2a <<1 for r[2]`
			`\|\|[A_rj]LDNDW .D2T2 *B_h_ptr++,B_h32:B_h10 ; p load intput: x[3:2]:x[1:0]`
			`\|\|[!A_rj]RET .S2 B3`

			`[A_rj]LDNDW .D1T2 *++A_x_ptr,B_x76:B_x54 ; p load intput: x[7:6]:x[5:4]`
			`\|\| SHR .S2X A_sum1a,15,B_sum1 ; = sum1a >>15 for r[1]`
			`\|\| ADD .D2 B_sum3a,B_sum3a,B_sum3 ; = sum3a <<1 for r[3]`
			`\|\| MV .S1X B_nh,A_i ; initialize inner loop count`
			`\|\| ZERO .L2 B_sum3a:B_sum2a ; p initialize B_sum3a,2a`

			`PACK2 .L2X B_sum1,A_sum0,B_r10 ; r[1:0]`
			`\|\| PACKH2 .S2 B_sum3,B_sum2,B_r32 ; r[3:2]`
			`\|\|[A_rj]LDNDW .D2T1 *B_h_ptr,A_h32:A_h10 ; p load coef: h[3:2]:h[1:0]`
			`\|\| MVD .M1X B_x76,A_x32 ; p load intput: x[3:2]`
			`\|\| MVK .D1 1,A_s ; p flag for accum sum0a..3a`
			`\|\|[A_rj]B .S1 iloop ; p branch to outer loop`
			`\|\|[A_rj]SUB .L1 A_rj,1,A_rj ; count down for outer loop`

			`STNDW .D1 B_r32:B_r10,*A_r_ptr++ ; p store output r[3:2]:r[1:0]`
			`\|\| MVD .M1X B_x54,A_x10 ; p load intput: x[1:0]`
			`\|\|[A_rj]MV .S2 B_nh,B_j ; initialize outer loop count`
			`\|\|[!A_rj]ADD .D2 5,B_nh,B_j ; initialize outer loop count`
			`\|\| ZERO .L1 A_sum1a:A_sum0a ; p initialize B_sum1a,0a`
			`\|\| PACKLH2 .L2X B_x54,A_x32,B_x43 ; @ input: x[4:3]`
			`; ============================ END OF "JLOOP" ============================= ;`
			`NOP 2`
			`; ============================= BRANCH OCCURS ============================= ;`

			`* ========================================================================= *`
			`* End of file: dsp_fir_r4.asm *`
			`* ------------------------------------------------------------------------- *`
			`* Copyright (c) 2003 Texas Instruments, Incorporated. *`
			`* All Rights Reserved. *`
			`* ========================================================================= *`