c6416_sdk/dsplib/fir_r8.asm


								;* ======================================================================== *;

								;*  TEXAS INSTRUMENTS, INC.                                                 *;

								;*                                                                          *;

								;*  DSPLIB  DSP Signal Processing Library                                   *;

								;*                                                                          *;

								;*      Release:        Revision 1.04b                                      *;

								;*      CVS Revision:   1.5     Sun Sep 29 03:32:22 2002 (UTC)              *;

								;*      Snapshot date:  23-Oct-2003                                         *;

								;*                                                                          *;

								;*  This library contains proprietary intellectual property of Texas        *;

								;*  Instruments, Inc.  The library and its source code are protected by     *;

								;*  various copyrights, and portions may also be protected by patents or    *;

								;*  other legal protections.                                                *;

								;*                                                                          *;

								;*  This software is licensed for use with Texas Instruments TMS320         *;

								;*  family DSPs.  This license was provided to you prior to installing      *;

								;*  the software.  You may review this license by consulting the file       *;

								;*  TI_license.PDF which accompanies the files in this library.             *;

								;* ------------------------------------------------------------------------ *;

								;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;

								;*                          All Rights Reserved.                            *;

								;* ======================================================================== *;


								;* ======================================================================== *;

								;*  Assembler compatibility shim for assembling 4.30 and later code on      *;

								;*  tools prior to 4.30.                                                    *;

								;* ======================================================================== *;


								; Capture the assembler version in $asmver; 0 when the tools do not
								; define .ASSEMBLER_VERSION.
								        .if $isdefed(".ASSEMBLER_VERSION")

								        .asg    .ASSEMBLER_VERSION, $asmver

								        .else

								        .asg    0,    $asmver

								        .endif


								; When $asmver is below 430, alias the call/return mnemonics used by
								; newer code to plain branches and define .asmfunc/.endasmfunc as empty.
								        .if ($asmver < 430)


								        .asg    B,    CALL     ; Function Call
								
								        .asg    B,    RET      ; Return from a Function

								        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.


								; BNOP-based aliases are only defined when .TMS320C6400 is set.
								        .if .TMS320C6400

								        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call

								        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return

								        .asg    BNOP, CRNOP    ; C64x Fn call w/ Call/Ret chaining via BNOP.

								        .endif


								        .asg    , .asmfunc     ; .func equivalent for hand-assembly code

								        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code


								        .endif


								;* ======================================================================== *;

								;*  End of assembler compatibility shim.                                    *;

								;* ======================================================================== *;


								* ========================================================================= *

								*   TEXAS INSTRUMENTS, INC.                                                 *

								*                                                                           *

								*   NAME                                                                    *

								*       DSP_fir_r8: FIR Filter (radix 8)                                    *

								*                                                                           *

								*   REVISION DATE                                                           *

								*       10-Aug-2001                                                         *

								*                                                                           *

								*   USAGE                                                                   *

								*       This routine is C-callable and can be called as:                    *

								*                                                                           *

								*       void DSP_fir_r8                                                     *

								*       (                                                                   *

								*           const short *restrict x,  /* Input array [nr+nh-1 elements] */  *

								*           const short *restrict h,  /* Coeff array [nh elements]      */  *

								*           short       *restrict r,  /* Output array [nr elements]     */  *

								*           int nh,                   /* Number of coefficients.        */  *

								*           int nr                    /* Number of output samples.      */  *

								*       )                                                                   *

								*                                                                           *

								*   ARGUMENTS PASSED                                                        *

								*       *x    ->      A4                                                    *

								*       *h    ->      B4                                                    *

								*       *r    ->      A6                                                    *

								*       nh    ->      B6                                                    *

								*       nr    ->      A8                                                    *

								*                                                                           *

								*   DESCRIPTION                                                             *

								*       Computes a real FIR filter (direct-form) using coefficients         *

								*       stored in vector h.  The real data input is stored in vector x.     *

								*       The filter output result is stored in vector r.  Input data and     *

								*       filter taps are 16-bit, with intermediate values kept at 32-bit     *

								*       precision.  Filter taps are expected in Q15 format.                 *

								*                                                                           *

								*       The following is a natural C implementation with no restrictions.   *

								*       This version has restrictions as noted in the ASSUMPTIONS below.    *

								*                                                                           *

								*       void DSP_fir_r8                                                     *

								*       (                                                                   *

								*           const short *restrict x,                                        *

								*           const short *restrict h,                                        *

								*           short       *restrict r,                                        *

								*           int nh,                                                         *

								*           int nr                                                          *

								*       )                                                                   *

								*       {                                                                   *

								*           int i, j, sum;                                                  *

								*                                                                           *

								*           for (j = 0; j < nr; j++)                                        *

								*           {                                                               *

								*               sum = 0;                                                    *

								*               for (i = 0; i < nh; i++)                                    *

								*                   sum += x[i + j] * h[i];                                 *

								*               r[j] = sum >> 15;                                           *

								*           }                                                               *

								*       }                                                                   *

								*                                                                           *

								*   ASSUMPTIONS                                                             *

								*       Number of taps:    'nh' >= 8, multiple of 8                         *

								*       Number of samples: 'nr' >= 4, multiple of 4                         *

								*       Array 'r' is word aligned.                                          *

								*                                                                           *

								*   NOTES                                                                   *

								*       This function blocks interrupts for its entire duration.  It is     *

								*       interrupt tolerant, but not interruptible.                          *

								*                                                                           *

								*   MEMORY NOTE                                                             *

								*       No memory bank hits under any conditions.                           *

								*       This code is a LITTLE ENDIAN implementation.                        *

								*                                                                           *

								*   TECHNIQUES                                                              *

								*       1.  Load double word instruction is used to simultaneously load     *

								*           four values in a single clock cycle.                            *

								*                                                                           *

								*       2.  The inner loop is unrolled four times and will always           *

								*           compute a multiple of 4 of nr.                                  *

								*                                                                           *

								*       3.  The outer loop is conditionally executed in parallel with the   *

								*           inner loop.  This allows for a zero overhead outer loop.        *

								*                                                                           *

								*   CYCLES                                                                  *

								*       nh * nr/4 + 17                                                      *

								*                                                                           *

								*       For nh = 32 and nr = 36, cycles = 305.                              *

								*                                                                           *

								*   CODESIZE                                                                *

								*       336 bytes.                                                          *

								*                                                                           *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *


								; ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== ;

								; Symbolic names for the registers used by _DSP_fir_r8.  Several names
								; deliberately share one physical register (e.g. A26 holds A_prod0_32,
								; A_prod0_54 and A_prod0_76 at different points of the schedule), so a
								; register cannot be reassigned without re-checking the kernel timing.
								; Packed-halfword operands are written as "hi:lo".
								        .asg   A0,      A_i        ; inner loop count; when 0, x/h ptrs are reset

								        .asg   A1,      A_i1       ; delayed i_cnt

								        .asg   A2,      A_s        ; flag: 1 on first pass, skips the 1st sum adds

								        .asg   A4,      A_x_ptr    ; input array pointer

								        .asg   A5,      A_r2       ; output: r[2]

								        .asg   A6,      A_r_ptr    ; output ptr that init. point to r[0]

								        .asg   A8,      A_nr       ; number of output samples

								        .asg   A9,      A_x_offset ; offset to reset input ptr for next loop

								        .asg   A16,     A_x54      ; input: x[5:4]

								        .asg   A17,     A_x76      ; input: x[7:6]

								        .asg   A20,     A_h10      ; input: h[1:0]

								        .asg   A21,     A_h32      ; input: h[3:2]

								        .asg   A22,     A_h54      ; input: h[5:4]

								        .asg   A23,     A_h76      ; input: h[7:6]

								        .asg   A24,     A_sum0     ; sum0

								        .asg   A25,     A_sum2     ; sum2

								        .asg   A26,     A_prod0_32 ; = x[3]*h[3] + x[2]*h[2]

								        .asg   A26,     A_prod0_54 ; = x[5]*h[5] + x[4]*h[4]

								        .asg   A26,     A_prod0_76 ; = x[7]*h[7] + x[6]*h[6]

								        .asg   A27,     A_prod2_32 ; = x[5]*h[3] + x[4]*h[2]

								        .asg   A27,     A_prod2_54 ; = x[7]*h[5] + x[6]*h[4]

								        .asg   A27,     A_prod2_76 ; = x[9]*h[7] + x[8]*h[6]

								        .asg   A28,     A_prod0_10 ; = x[1]*h[1] + x[0]*h[0]

								        .asg   A29,     A_prod2_10 ; = x[3]*h[1] + x[2]*h[0]

								        .asg   A28,     A_sum0a    ; partial sum0 (prod0_10 + prod0_32)

								        .asg   A29,     A_sum2a    ; partial sum2 (prod2_10 + prod2_32)

								        .asg   A30,     A_r0       ; output: r[0]

								        .asg   A31,     A_r1       ; output: r[1]

								        .asg   A31,     A_r10      ; output: r[1:0] (same register as A_r1)

								        .asg   B0,      B_j        ; outer loop count

								        .asg   B1,      B_i2       ; 2nd delayed i_cnt

								        .asg   B4,      B_h_ptr    ; coef array pointer

								        .asg   B5,      B_x18      ; input: x[1]:x[8] (packed)

								        .asg   B6,      B_nh       ; number of coefficients

								        .asg   B7,      B_r_ptr    ; output ptr that init. point to r[2]

								        .asg   B8,      B_x3a      ; input: x[3]:x[10] (packed)

								        .asg   B9,      B_h_offset ; offset to reset coef ptr for next loop

								        .asg   B16,     B_x10      ; input: x[1:0]

								        .asg   B17,     B_x32      ; input: x[3:2]

								        .asg   B18,     B_x98      ; input: x[9:8]

								        .asg   B19,     B_xba      ; input: x[11:10]

								        .asg   B20,     B_h0X      ; coef: h[0]:h[-1] (only h[0] is used)

								        .asg   B21,     B_h21      ; coef: h[2:1]

								        .asg   B22,     B_h43      ; coef: h[4:3]

								        .asg   B23,     B_h65      ; coef: h[6:5]

								        .asg   B24,     B_sum1     ; sum1

								        .asg   B25,     B_sum3     ; sum3

								        .asg   B26,     B_prod1_21 ; = x[3]*h[2] + x[2]*h[1]

								        .asg   B26,     B_prod1_65 ; = x[7]*h[6] + x[6]*h[5]

								        .asg   B26,     B_prod1_07 ; = x[1]*h[0] + x[8]*h[7]

								        .asg   B27,     B_prod3_21 ; = x[5]*h[2] + x[4]*h[1]

								        .asg   B27,     B_prod3_65 ; = x[9]*h[6] + x[8]*h[5]

								        .asg   B27,     B_prod3_07 ; = x[3]*h[0] + x[10]*h[7]

								        .asg   B28,     B_prod1_43 ; = x[5]*h[4] + x[4]*h[3]

								        .asg   B28,     B_sum1a    ; partial sum1 (prod1_21 + prod1_43)

								        .asg   B29,     B_prod3_43 ; = x[7]*h[4] + x[6]*h[3]

								        .asg   B29,     B_sum3a    ; partial sum3 (prod3_21 + prod3_43)

								        .asg   B30,     B_h07      ; coef: h[0]:h[7] (packed)

								        .asg   B31,     B_r3       ; output: r[3]

								        .asg   B31,     B_r32      ; output: r[3:2] (same register as B_r3)


								; ------------------------------------------------------------------------- ;
								;  _DSP_fir_r8: radix-8 FIR filter, four outputs per outer iteration.
								;
								;  In:   A4 = x, B4 = h, A6 = r, B6 = nh, A8 = nr.   Returns through B3.
								;
								;  One pass through the eight execute packets of "jloop" multiplies eight
								;  coefficients against the input window and accumulates into four running
								;  sums (sum0..sum3, one per output sample).
								;
								;    B_j   passes remaining, initialised to (((nr+3)>>2) * nh) >> 3.
								;    A_i   coefficients remaining for the current group of four outputs;
								;          when it reaches 0 the x and h pointers are rewound with SUBAH
								;          and A_i is reloaded with nh.
								;    A_i1, B_i2  copies of A_i delayed by one and two passes; B_i2 == 0
								;          gates the output stores and the clearing of the sums.
								;    A_s   1 until the first pass through the kernel, so that adds of
								;          products that have not been computed yet are skipped.
								;
								;  NOTE(review): both "B dint0" branches appear to land on the packet that
								;  would be reached by fall-through anyway; presumably they only keep a
								;  branch in flight so that interrupts stay blocked (see NOTES in the file
								;  header) -- confirm before changing them.
								; ------------------------------------------------------------------------- ;
								        .sect ".text:_fir_r8"

								        .global _DSP_fir_r8

								_DSP_fir_r8:


								; ======================= SETUP / LOOP PIPE-UP CODE ======================= ;

								        LDNDW   .D1T2 *A_x_ptr++,B_x32:B_x10     ; load input: x[3:2]:x[1:0]

								||      ADD     .L1   3,A_nr,A_nr                ; nr + 3


								        LDNDW   .D2T1 *B_h_ptr++,A_h32:A_h10     ; load coef: h[3:2]:h[1:0]


								        LDNDW   .D2T2 *-B_h_ptr(10),B_h21:B_h0X  ; load coef: h[2:1]:h[0:-1] (h[-1] unused)

								||      SHR     .S2X  A_nr,2,B_j                 ; j_cnt = (nr+3)>>2


								        LDNDW   .D1T1 *A_x_ptr++,A_x76:A_x54     ; load input: x[7:6]:x[5:4]

								||      MPY     .M2   B_j,B_nh,B_j               ; j_cnt = ((nr+3)>>2) * nh

								||      ADD     .L1X  -4,B_nh,A_x_offset         ; x_offset = nh - 4 (halfwords, for SUBAH)


								        LDNDW   .D2T2 *-B_h_ptr(2),B_h65:B_h43   ; load coef: h[6:5]:h[4:3]

								||      ADD     .S1X  -8,B_nh,A_i                ; i_cnt = nh - 8

								||      ZERO    .L1   A_sum2:A_sum0              ; sum0 = sum2 = 0

								||      MVK     .D1   1,A_s                      ; flag for adding 1st sum


								        LDNDW   .D1T2 *A_x_ptr,B_xba:B_x98       ; load input: x[11:10]:x[9:8]

								||      MV      .L2   B_nh,B_h_offset            ; h_offset = nh (halfwords, for SUBAH)

								||      SHR     .S2   B_j,3,B_j                  ; j_cnt = (((nr+3)>>2) * nh) >> 3

								||      B       .S1   dint0                      ; branch to dint0 (see NOTE above)


								        LDNDW   .D2T1 *B_h_ptr++,A_h76:A_h54     ; load coef: h[7:6]:h[5:4]

								||[!A_i]SUBAH   .D1   A_x_ptr,A_x_offset,A_x_ptr ; reset x_ptr

								||      ZERO    .L2   B_sum3:B_sum1              ; sum1 = sum3 = 0


								        ADD     .L2X  4,A_r_ptr,B_r_ptr          ; B_r_ptr = r_ptr + 4 bytes = &r[2]

								||      MVK     .S1   1,A_i1                     ; delayed i_cnt starts non-zero


								; ========================== "JLOOP" LOOP KERNEL ========================== ;

								jloop:

								        LDNDW   .D1T2 *A_x_ptr++,B_x32:B_x10     ; load input: x[3:2]:x[1:0]

								||[!A_i]SUBAH   .D2   B_h_ptr,B_h_offset,B_h_ptr ; reset h_ptr

								||      DOTP2   .M1X  A_h32,B_x32,A_prod0_32     ; x[3]*h[3] + x[2]*h[2]

								||      DOTP2   .M2X  B_h21,A_x54,B_prod3_21     ; x[5]*h[2] + x[4]*h[1]

								||[!A_s]ADD     .L1   A_sum0,A_prod0_54,A_sum0   ; sum0 += A_prod0_54

								||[!A_s]ADD     .L2   B_sum1,B_prod1_65,B_sum1   ; sum1 += B_prod1_65

								||[!A_s]ADD     .S1   A_sum2,A_sum2a,A_sum2      ; sum2 += A_prod2_32,2_10

								||[!A_s]ADD     .S2   B_sum3,B_sum3a,B_sum3      ; sum3 += B_prod3_43,3_21


								        LDNDW   .D2T1 *B_h_ptr++,A_h32:A_h10     ; load coef: h[3:2]:h[1:0]

								||      DOTP2   .M1   A_x54,A_h32,A_prod2_32     ; x[5]*h[3] + x[4]*h[2]

								||      DOTP2   .M2   B_x32,B_h21,B_prod1_21     ; x[3]*h[2] + x[2]*h[1]

								||[!A_s]ADD     .L1   A_sum2,A_prod2_54,A_sum2   ; sum2 += A_prod2_54

								||[!A_s]ADD     .L2   B_sum3,B_prod3_65,B_sum3   ; sum3 += B_prod3_65

								||[!A_i]MV      .S1X  B_nh,A_i                   ; i_cnt = nh (reload when it reached 0)

								||      MV      .D1   A_i,A_i1                   ; delayed i_cnt

								||      MV      .S2X  A_i1,B_i2                  ; 2nd delayed i_cnt


								        LDNDW   .D2T2 *-B_h_ptr(10),B_h21:B_h0X  ; load coef: h[2:1]:h[0:-1]

								||      DOTP2   .M1X  A_h10,B_x10,A_prod0_10     ; x[1]*h[1] + x[0]*h[0]

								||      DOTP2   .M2X  B_h43,A_x54,B_prod1_43     ; x[5]*h[4] + x[4]*h[3]

								||[!A_s]ADD     .L1   A_sum0,A_prod0_76,A_sum0   ; sum0 += A_prod0_76

								||[!A_s]ADD     .L2   B_sum1,B_prod1_07,B_sum1   ; sum1 += B_prod1_07

								||      SUB     .D1   A_i,8,A_i                  ; i_cnt -=8

								||[ B_j]B       .S1   jloop                      ; outer loop branch

								||[!B_j]RET     .S2   B3                         ; return to calling program


								dint0:  DOTP2   .M1X  A_h10,B_x32,A_prod2_10     ; x[3]*h[1] + x[2]*h[0]

								||      DOTP2   .M2X  B_h43,A_x76,B_prod3_43     ; x[7]*h[4] + x[6]*h[3]

								||[ B_j]LDNDW   .D1T1 *A_x_ptr++,A_x76:A_x54     ; load input: x[7:6]:x[5:4]

								||[!A_s]ADD     .L1   A_sum2,A_prod2_76,A_sum2   ; sum2 += A_prod2_76

								||[!A_s]ADD     .D2   B_sum3,B_prod3_07,B_sum3   ; sum3 += B_prod3_07

								||      SHR     .S1   A_sum0,15,A_r0             ; r[0] = sum0 >>15

								||      PACKHL2 .L2   B_x32,B_xba,B_x3a          ; input: x[3]:x[10]

								||      PACKHL2 .S2   B_x10,B_x98,B_x18          ; input: x[1]:x[8]


								  [ B_j]LDNDW   .D2T2 *-B_h_ptr(2),B_h65:B_h43   ; load coef: h[6:5]:h[4:3]

								||      DOTP2   .M1   A_x54,A_h54,A_prod0_54     ; x[5]*h[5] + x[4]*h[4]

								||      DOTP2   .M2X  B_h65,A_x76,B_prod1_65     ; x[7]*h[6] + x[6]*h[5]

								||      SHR     .S1X  B_sum1,15,A_r1             ; r[1] = sum1 >>15

								||      ADD     .L1   A_sum2,A_sum2,A_r2         ; A_r2 = sum2 << 1; high half = sum2 >>15

								||      ADD     .S2   B_sum3,B_sum3,B_r3         ; B_r3 = sum3 << 1; high half = sum3 >>15

								||[ A_s]ZERO    .D1   A_s                        ; start to add the sum

								||[!B_i2]ZERO   .L2   B_sum3:B_sum1              ; clear sum1/sum3 when B_i2 == 0


								  [ B_j]LDNDW   .D1T2 *A_x_ptr,B_xba:B_x98       ; load input: x[11:10]:x[9:8]

								||      DOTP2   .M1   A_x76,A_h54,A_prod2_54     ; x[7]*h[5] + x[6]*h[4]

								||      DOTP2   .M2   B_x98,B_h65,B_prod3_65     ; x[9]*h[6] + x[8]*h[5]

								||[!B_i2]ZERO   .L1   A_sum2:A_sum0              ; clear sum0/sum2 when B_i2 == 0

								||      PACKH2  .L2X  B_h0X,A_h76,B_h07          ; coef: h[0]:h[7] (high halves)

								||[ B_j]SUB     .D2   B_j,1,B_j                  ; j_cnt -=1

								||[ A_s]B       .S2   dint0                      ; branch to dint0 while A_s is set


								  [ B_j]LDNDW   .D2T1 *B_h_ptr++,A_h76:A_h54     ; load coef: h[7:6]:h[5:4]

								||      DOTP2   .M1   A_x76,A_h76,A_prod0_76     ; x[7]*h[7] + x[6]*h[6]

								||      DOTP2   .M2   B_x18,B_h07,B_prod1_07     ; x[1]*h[0] + x[8]*h[7]

								||      ADD     .L1   A_prod0_10,A_prod0_32,A_sum0a   ; sum0a = A_prod0_10+0_32

								||      ADD     .L2   B_prod1_21,B_prod1_43,B_sum1a   ; sum1a = B_prod1_21+1_43

								||[!A_i]SUBAH   .D1   A_x_ptr,A_x_offset,A_x_ptr ; reset x_ptr

								||      PACK2   .S1   A_r1,A_r0,A_r10            ; r[1:0]

								||      PACKH2  .S2X  B_r3,A_r2,B_r32            ; r[3:2]


								  [!B_i2]STW    .D1T1 A_r10,*A_r_ptr++[2]        ; store output r[1:0]

								||[!B_i2]STW    .D2T2 B_r32,*B_r_ptr++[2]        ; store output r[3:2]

								||      DOTP2   .M1X  A_h76,B_x98,A_prod2_76     ; x[9]*h[7] + x[8]*h[6]

								||      DOTP2   .M2   B_x3a,B_h07,B_prod3_07     ; x[3]*h[0] + x[10]*h[7]

								||      ADD     .L1   A_prod2_10,A_prod2_32,A_sum2a   ; sum2a = A_prod2_10+2_32

								||      ADD     .L2   B_prod3_21,B_prod3_43,B_sum3a   ; sum3a = B_prod3_21+3_43

								||      ADD     .S1   A_sum0,A_sum0a,A_sum0      ; sum0 += A_prod0_10,0_32

								||      ADD     .S2   B_sum1,B_sum1a,B_sum1      ; sum1 += B_prod1_21,1_43


								; ============================ END OF "JLOOP" ============================= ;

								; ============================= BRANCH OCCURS ============================= ;


								* ========================================================================= *

								*   End of file:  dsp_fir_r8.asm                                            *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *