;* ======================================================================== *;
;*  TEXAS INSTRUMENTS, INC.                                                 *;
;*                                                                          *;
;*  DSPLIB  DSP Signal Processing Library                                   *;
;*                                                                          *;
;*      Release:        Revision 1.04b                                      *;
;*      CVS Revision:   1.5     Sun Sep 29 03:32:22 2002 (UTC)              *;
;*      Snapshot date:  23-Oct-2003                                         *;
;*                                                                          *;
;*  This library contains proprietary intellectual property of Texas        *;
;*  Instruments, Inc.  The library and its source code are protected by     *;
;*  various copyrights, and portions may also be protected by patents or    *;
;*  other legal protections.                                                *;
;*                                                                          *;
;*  This software is licensed for use with Texas Instruments TMS320         *;
;*  family DSPs.  This license was provided to you prior to installing      *;
;*  the software.  You may review this license by consulting the file       *;
;*  TI_license.PDF which accompanies the files in this library.             *;
;* ------------------------------------------------------------------------ *;
;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;
;*                          All Rights Reserved.                            *;
;* ======================================================================== *;


;* ======================================================================== *;
;*  Assembler compatibility shim for assembling 4.30 and later code on      *;
;*  tools prior to 4.30.                                                    *;
;* ======================================================================== *;

        ; Record the assembler version in the substitution symbol $asmver.
        ; When .ASSEMBLER_VERSION is not defined at all, $asmver is set to 0
        ; so that the "< 430" test below is true and the shim is enabled.
        .if $isdefed(".ASSEMBLER_VERSION")
        .asg    .ASSEMBLER_VERSION, $asmver
        .else
        .asg    0,    $asmver
        .endif

        ; For tools older than 4.30 (see banner above), map the newer
        ; call/return mnemonics onto plain branch instructions.
        .if ($asmver < 430)

        .asg    B,    CALL     ; Function Call
        .asg    B,    RET      ; Return from a Function
        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.

        ; BNOP-based variants are only defined when targeting the C6400.
        .if .TMS320C6400
        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call
        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return
        .asg    BNOP, CRNOP    ; C64x Fn. call w/ Call/Ret chaining via BNOP.
        .endif

        ; Both directives are bound to an empty string, so they expand to
        ; nothing when used on these older tools.
        .asg    , .asmfunc     ; .func equivalent for hand-assembly code
        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code

        .endif

;* ======================================================================== *;
;*  End of assembler compatibility shim.                                    *;
;* ======================================================================== *;


* ========================================================================= *
*   TEXAS INSTRUMENTS, INC.                                                 *
*                                                                           *
*   NAME                                                                    *
*       DSP_fir_r8: FIR Filter (radix 8)                                    *
*                                                                           *
*   REVISION DATE                                                           *
*       10-Aug-2001                                                         *
*                                                                           *
*   USAGE                                                                   *
*       This routine is C-callable and can be called as:                    *
*                                                                           *
*       void DSP_fir_r8                                                     *
*       (                                                                   *
*           const short *restrict x,  /* Input array [nr+nh-1 elements] */  *
*           const short *restrict h,  /* Coeff array [nh elements]      */  *
*           short       *restrict r,  /* Output array [nr elements]     */  *
*           int nh,                   /* Number of coefficients.        */  *
*           int nr                    /* Number of output samples.      */  *
*       )                                                                   *
*                                                                           *
*   ARGUMENTS PASSED                                                        *
*       *x    ->      A4                                                    *
*       *h    ->      B4                                                    *
*       *r    ->      A6                                                    *
*       nh    ->      B6                                                    *
*       nr    ->      A8                                                    *
*                                                                           *
*   DESCRIPTION                                                             *
*       Computes a real FIR filter (direct-form) using coefficients         *
*       stored in vector h.  The real data input is stored in vector x.     *
*       The filter output result is stored in vector r.  Input data and     *
*       filter taps are 16-bit, with intermediate values kept at 32-bit     *
*       precision.  Filter taps are expected in Q15 format.                 *
*                                                                           *
*       The following is a natural C implementation with no restrictions.   *
*       This version has restrictions as noted in the ASSUMPTIONS below.    *
*                                                                           *
*       void DSP_fir_r8                                                     *
*       (                                                                   *
*           const short *restrict x,                                        *
*           const short *restrict h,                                        *
*           short       *restrict r,                                        *
*           int nh,                                                         *
*           int nr                                                          *
*       )                                                                   *
*       {                                                                   *
*           int i, j, sum;                                                  *
*                                                                           *
*           for (j = 0; j < nr; j++)                                        *
*           {                                                               *
*               sum = 0;                                                    *
*               for (i = 0; i < nh; i++)                                    *
*                   sum += x[i + j] * h[i];                                 *
*               r[j] = sum >> 15;                                           *
*           }                                                               *
*       }                                                                   *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       Number of taps:    'nh' >= 8, multiple of 8                         *
*       Number of samples: 'nr' >= 4, multiple of 4                         *
*       Array 'r' is word aligned.                                          *
*                                                                           *
*   NOTES                                                                   *
*       This function blocks interrupts for its entire duration.  It is     *
*       interrupt tolerant, but not interruptible.                          *
*                                                                           *
*   MEMORY NOTE                                                             *
*       No memory bank hits under any conditions.                           *
*       This code is a LITTLE ENDIAN implementation.                        *
*                                                                           *
*   TECHNIQUES                                                              *
*       1.  Load double word instruction is used to simultaneously load     *
*           four values in a single clock cycle.                            *
*                                                                           *
*       2.  The inner loop is unrolled four times and will always           *
*           compute a multiple of 4 of nr.                                  *
*                                                                           *
*       3.  The outer loop is conditionally executed in parallel with the   *
*           inner loop.  This allows for a zero overhead outer loop.        *
*                                                                           *
*   CYCLES                                                                  *
*       nh * nr/4 + 17                                                      *
*                                                                           *
*       For nh = 32 and nr = 36, cycles = 305.                              *
*                                                                           *
*   CODESIZE                                                                *
*       336 bytes.                                                          *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *

; ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== ;
; Notation: "x[a:b]" / "h[a:b]" means a register holding element a in its
; upper halfword and element b in its lower halfword.
;
; Several symbols below are deliberately bound to the same physical register
; (e.g. A26, A27, A28, A29, A31, B26..B29, B31).  The kernel schedule relies
; on each value being consumed before the register is written again, so
; these aliases must not be changed independently of the code.
        .asg   A0,      A_i        ; inner tap count; 0 => rewind ptrs, reload nh
        .asg   A1,      A_i1       ; delayed i_cnt
        .asg   A2,      A_s        ; pipe-up flag: non-zero => skip sum adds
        .asg   A4,      A_x_ptr    ; input array pointer
        .asg   A5,      A_r2       ; output: r[2]
        .asg   A6,      A_r_ptr    ; output ptr that init. point to r[0]
        .asg   A8,      A_nr       ; number of output samples
        .asg   A9,      A_x_offset ; halfwords to rewind x_ptr (nh - 4)
        .asg   A16,     A_x54      ; input: x[5:4]
        .asg   A17,     A_x76      ; input: x[7:6]
        .asg   A20,     A_h10      ; input: h[1:0]
        .asg   A21,     A_h32      ; input: h[3:2]
        .asg   A22,     A_h54      ; input: h[5:4]
        .asg   A23,     A_h76      ; input: h[7:6]
        .asg   A24,     A_sum0     ; sum0
        .asg   A25,     A_sum2     ; sum2
        .asg   A26,     A_prod0_32 ; = x[3]*h[3] + x[2]*h[2]
        .asg   A26,     A_prod0_54 ; = x[5]*h[5] + x[4]*h[4]
        .asg   A26,     A_prod0_76 ; = x[7]*h[7] + x[6]*h[6]
        .asg   A27,     A_prod2_32 ; = x[5]*h[3] + x[4]*h[2]
        .asg   A27,     A_prod2_54 ; = x[7]*h[5] + x[6]*h[4]
        .asg   A27,     A_prod2_76 ; = x[9]*h[7] + x[8]*h[6]
        .asg   A28,     A_prod0_10 ; = x[1]*h[1] + x[0]*h[0]
        .asg   A29,     A_prod2_10 ; = x[3]*h[1] + x[2]*h[0]
        .asg   A28,     A_sum0a    ; partial sum0: prod0_10 + prod0_32
        .asg   A29,     A_sum2a    ; partial sum2: prod2_10 + prod2_32
        .asg   A30,     A_r0       ; output: r[0]
        .asg   A31,     A_r1       ; output: r[1]
        .asg   A31,     A_r10      ; output: r[1:0]
        .asg   B0,      B_j        ; kernel pass count; loop while non-zero
        .asg   B1,      B_i2       ; 2nd delayed i_cnt; 0 => store, clear sums
        .asg   B4,      B_h_ptr    ; coef array pointer
        .asg   B5,      B_x18      ; input: x[1:8]
        .asg   B6,      B_nh       ; number of coefficients
        .asg   B7,      B_r_ptr    ; output ptr that init. point to r[2]
        .asg   B8,      B_x3a      ; input: x[3:10]
        .asg   B9,      B_h_offset ; halfwords to rewind h_ptr (nh)
        .asg   B16,     B_x10      ; input: x[1:0]
        .asg   B17,     B_x32      ; input: x[3:2]
        .asg   B18,     B_x98      ; input: x[9:8]
        .asg   B19,     B_xba      ; input: x[11:10]
        .asg   B20,     B_h0X      ; coef: h[0:-1] (lower half is not used)
        .asg   B21,     B_h21      ; coef: h[2:1]
        .asg   B22,     B_h43      ; coef: h[4:3]
        .asg   B23,     B_h65      ; coef: h[6:5]
        .asg   B24,     B_sum1     ; sum1
        .asg   B25,     B_sum3     ; sum3
        .asg   B26,     B_prod1_21 ; = x[3]*h[2] + x[2]*h[1]
        .asg   B26,     B_prod1_65 ; = x[7]*h[6] + x[6]*h[5]
        .asg   B26,     B_prod1_07 ; = x[1]*h[0] + x[8]*h[7]
        .asg   B27,     B_prod3_21 ; = x[5]*h[2] + x[4]*h[1]
        .asg   B27,     B_prod3_65 ; = x[9]*h[6] + x[8]*h[5]
        .asg   B27,     B_prod3_07 ; = x[3]*h[0] + x[a]*h[7]
        .asg   B28,     B_prod1_43 ; = x[5]*h[4] + x[4]*h[3]
        .asg   B28,     B_sum1a    ; partial sum1: prod1_21 + prod1_43
        .asg   B29,     B_prod3_43 ; = x[7]*h[4] + x[6]*h[3]
        .asg   B29,     B_sum3a    ; partial sum3: prod3_21 + prod3_43
        .asg   B30,     B_h07      ; input: h[0:7]
        .asg   B31,     B_r3       ; output: r[3]
        .asg   B31,     B_r32      ; output: r[3:2]

        .sect ".text:_fir_r8"
        .global _DSP_fir_r8

; ------------------------------------------------------------------------- ;
;  _DSP_fir_r8 -- FIR filter, 4 outputs x 8 taps per kernel pass.
;
;  In:   A4 = x, B4 = h, A6 = r, B6 = nh, A8 = nr   (see banner above)
;  Out:  r[] written with STW, two halfword results per store.
;  Exit: returns through B3 once the pass counter B_j reaches zero.
;
;  Structure
;    * The kernel at "jloop" is 8 execute packets long.  One pass consumes
;      8 coefficients (h[0..7] relative to the current h_ptr) for four
;      adjacent outputs, accumulated in sum0..sum3.
;    * B_j = (((nr + 3) >> 2) * nh) >> 3 counts kernel passes for the whole
;      call; there is no separate outer loop.
;    * A_i counts down by 8 per pass.  When it reaches 0 the x and h
;      pointers are rewound with SUBAH and A_i is reloaded with nh.
;      A_i1 and B_i2 are copies of A_i delayed by one and two passes;
;      B_i2 == 0 triggers the stores to r[] and the clearing of the sums.
;    * A_s is 1 during the first pass so that the [!A_s] accumulate
;      instructions are skipped until products are available.
;
;  NOTE(review): the "h[2:1]:h[0:-1]" loads address h_ptr - 10 bytes right
;  after h_ptr was advanced by one doubleword, i.e. they start one halfword
;  below the current coefficient group (below h[0] on the first pass).
;  Only the upper half of B_h0X is consumed (PACKH2 into B_h07), so the
;  extra halfword appears to be read but ignored -- confirm this is
;  acceptable for the memory map in use.
; ------------------------------------------------------------------------- ;
_DSP_fir_r8:

; ======================= SETUP / LOOP PIPE-UP CODE ======================= ;
        LDNDW   .D1T2 *A_x_ptr++,B_x32:B_x10     ; load input: x[3:2]:x[1:0]
||      ADD     .L1   3,A_nr,A_nr                ; nr + 3 (round up for >>2)

        LDNDW   .D2T1 *B_h_ptr++,A_h32:A_h10     ; load coef: h[3:2]:h[1:0]

        LDNDW   .D2T2 *-B_h_ptr(10),B_h21:B_h0X  ; load coef: h[2:1]:h[0:-1]
||      SHR     .S2X  A_nr,2,B_j                 ; j_cnt = (nr+3)>>2

        LDNDW   .D1T1 *A_x_ptr++,A_x76:A_x54     ; load input: x[7:6]:x[5:4]
||      MPY     .M2   B_j,B_nh,B_j               ; j_cnt *= nh (16x16 multiply)
||      ADD     .L1X  -4,B_nh,A_x_offset         ; x_offset = nh - 4 halfwords

        LDNDW   .D2T2 *-B_h_ptr(2),B_h65:B_h43   ; load coef: h[6:5]:h[4:3]
||      ADD     .S1X  -8,B_nh,A_i                ; i_cnt = nh - 8
||      ZERO    .L1   A_sum2:A_sum0              ; sum2 = sum0 = 0
||      MVK     .D1   1,A_s                      ; flag for adding 1st sum

; NOTE(review): with five branch delay slots this branch reaches dint0 on
; the same cycle that straight-line fall-through would.  It is presumably
; here only so that a branch is always pending, which is what keeps
; interrupts blocked (see NOTES in the banner) -- confirm before removing.
        LDNDW   .D1T2 *A_x_ptr,B_xba:B_x98       ; load input: x[11:10]:x[9:8]
||      MV      .L2   B_nh,B_h_offset            ; h_offset = nh halfwords
||      SHR     .S2   B_j,3,B_j                  ; j_cnt >>= 3: one per 8 taps
||      B       .S1   dint0                      ; lands on dint0 (see note)

        LDNDW   .D2T1 *B_h_ptr++,A_h76:A_h54     ; load coef: h[7:6]:h[5:4]
||[!A_i]SUBAH   .D1   A_x_ptr,A_x_offset,A_x_ptr ; reset x_ptr (only if nh==8)
||      ZERO    .L2   B_sum3:B_sum1              ; sum3 = sum1 = 0

        ADD     .L2X  4,A_r_ptr,B_r_ptr          ; B_r_ptr = r_ptr+4 -> r[2]
||      MVK     .S1   1,A_i1                     ; delayed i_cnt != 0: no store


; ========================== "JLOOP" LOOP KERNEL ========================== ;
; Eight execute packets per pass.  The loop branch is issued in the third
; packet and, after its five delay slots, takes effect after the eighth.
; The [B_j] loads are skipped once the pass counter has reached zero.
jloop:
        LDNDW   .D1T2 *A_x_ptr++,B_x32:B_x10     ; load input: x[3:2]:x[1:0]
||[!A_i]SUBAH   .D2   B_h_ptr,B_h_offset,B_h_ptr ; reset h_ptr
||      DOTP2   .M1X  A_h32,B_x32,A_prod0_32     ; x[3]*h[3] + x[2]*h[2]
||      DOTP2   .M2X  B_h21,A_x54,B_prod3_21     ; x[5]*h[2] + x[4]*h[1]
||[!A_s]ADD     .L1   A_sum0,A_prod0_54,A_sum0   ; sum0 += A_prod0_54
||[!A_s]ADD     .L2   B_sum1,B_prod1_65,B_sum1   ; sum1 += B_prod1_65
||[!A_s]ADD     .S1   A_sum2,A_sum2a,A_sum2      ; sum2 += A_prod2_32,2_10
||[!A_s]ADD     .S2   B_sum3,B_sum3a,B_sum3      ; sum3 += B_prod3_43,3_21

        LDNDW   .D2T1 *B_h_ptr++,A_h32:A_h10     ; load coef: h[3:2]:h[1:0]
||      DOTP2   .M1   A_x54,A_h32,A_prod2_32     ; x[5]*h[3] + x[4]*h[2]
||      DOTP2   .M2   B_x32,B_h21,B_prod1_21     ; x[3]*h[2] + x[2]*h[1]
||[!A_s]ADD     .L1   A_sum2,A_prod2_54,A_sum2   ; sum2 += A_prod2_54
||[!A_s]ADD     .L2   B_sum3,B_prod3_65,B_sum3   ; sum3 += B_prod3_65
||[!A_i]MV      .S1X  B_nh,A_i                   ; i_cnt = nh (reload)
||      MV      .D1   A_i,A_i1                   ; delayed i_cnt
||      MV      .S2X  A_i1,B_i2                  ; 2nd delayed i_cnt

        LDNDW   .D2T2 *-B_h_ptr(10),B_h21:B_h0X  ; load coef: h[2:1]:h[0:-1]
||      DOTP2   .M1X  A_h10,B_x10,A_prod0_10     ; x[1]*h[1] + x[0]*h[0]
||      DOTP2   .M2X  B_h43,A_x54,B_prod1_43     ; x[5]*h[4] + x[4]*h[3]
||[!A_s]ADD     .L1   A_sum0,A_prod0_76,A_sum0   ; sum0 += A_prod0_76
||[!A_s]ADD     .L2   B_sum1,B_prod1_07,B_sum1   ; sum1 += B_prod1_07
||      SUB     .D1   A_i,8,A_i                  ; i_cnt -=8
||[ B_j]B       .S1   jloop                      ; loop while passes remain
||[!B_j]RET     .S2   B3                         ; return to calling program

dint0:  DOTP2   .M1X  A_h10,B_x32,A_prod2_10     ; x[3]*h[1] + x[2]*h[0]
||      DOTP2   .M2X  B_h43,A_x76,B_prod3_43     ; x[7]*h[4] + x[6]*h[3]
||[ B_j]LDNDW   .D1T1 *A_x_ptr++,A_x76:A_x54     ; load input: x[7:6]:x[5:4]
||[!A_s]ADD     .L1   A_sum2,A_prod2_76,A_sum2   ; sum2 += A_prod2_76
||[!A_s]ADD     .D2   B_sum3,B_prod3_07,B_sum3   ; sum3 += B_prod3_07
||      SHR     .S1   A_sum0,15,A_r0             ; r[0] = sum0 >>15
||      PACKHL2 .L2   B_x32,B_xba,B_x3a          ; input: x[3]:x[10]
||      PACKHL2 .S2   B_x10,B_x98,B_x18          ; input: x[1]:x[8]

  [ B_j]LDNDW   .D2T2 *-B_h_ptr(2),B_h65:B_h43   ; load coef: h[6:5]:h[4:3]
||      DOTP2   .M1   A_x54,A_h54,A_prod0_54     ; x[5]*h[5] + x[4]*h[4]
||      DOTP2   .M2X  B_h65,A_x76,B_prod1_65     ; x[7]*h[6] + x[6]*h[5]
||      SHR     .S1X  B_sum1,15,A_r1             ; r[1] = sum1 >>15
||      ADD     .L1   A_sum2,A_sum2,A_r2         ; sum2<<1; hi half = sum2>>15
||      ADD     .S2   B_sum3,B_sum3,B_r3         ; sum3<<1; hi half = sum3>>15
||[ A_s]ZERO    .D1   A_s                        ; start to add the sum
||[!B_i2]ZERO   .L2   B_sum3:B_sum1              ; new group: clear sum3,sum1

; NOTE(review): A_s is cleared in the packet just above and is never set
; again, so this branch looks never-taken; like the pipe-up branch, its
; target coincides with fall-through.  Presumably kept so a branch is
; always in flight (interrupt blocking) -- confirm before removing.
  [ B_j]LDNDW   .D1T2 *A_x_ptr,B_xba:B_x98       ; load input: x[11:10]:x[9:8]
||      DOTP2   .M1   A_x76,A_h54,A_prod2_54     ; x[7]*h[5] + x[6]*h[4]
||      DOTP2   .M2   B_x98,B_h65,B_prod3_65     ; x[9]*h[6] + x[8]*h[5]
||[!B_i2]ZERO   .L1   A_sum2:A_sum0              ; new group: clear sum2,sum0
||      PACKH2  .L2X  B_h0X,A_h76,B_h07          ; coef: h[0:7]
||[ B_j]SUB     .D2   B_j,1,B_j                  ; j_cnt -=1
||[ A_s]B       .S2   dint0                      ; see note above

  [ B_j]LDNDW   .D2T1 *B_h_ptr++,A_h76:A_h54     ; load coef: h[7:6]:h[5:4]
||      DOTP2   .M1   A_x76,A_h76,A_prod0_76     ; x[7]*h[7] + x[6]*h[6]
||      DOTP2   .M2   B_x18,B_h07,B_prod1_07     ; x[1]*h[0] + x[8]*h[7]
||      ADD     .L1   A_prod0_10,A_prod0_32,A_sum0a   ; sum0a = A_prod0_10+0_32
||      ADD     .L2   B_prod1_21,B_prod1_43,B_sum1a   ; sum1a = B_prod1_21+1_43
||[!A_i]SUBAH   .D1   A_x_ptr,A_x_offset,A_x_ptr ; reset x_ptr
||      PACK2   .S1   A_r1,A_r0,A_r10            ; r[1:0]
||      PACKH2  .S2X  B_r3,A_r2,B_r32            ; r[3:2] from upper halves

  [!B_i2]STW    .D1T1 A_r10,*A_r_ptr++[2]        ; store output r[1:0]
||[!B_i2]STW    .D2T2 B_r32,*B_r_ptr++[2]        ; store output r[3:2]
||      DOTP2   .M1X  A_h76,B_x98,A_prod2_76     ; x[9]*h[7] + x[8]*h[6]
||      DOTP2   .M2   B_x3a,B_h07,B_prod3_07     ; x[3]*h[0] + x[10]*h[7]
||      ADD     .L1   A_prod2_10,A_prod2_32,A_sum2a   ; sum2a = A_prod2_10+2_32
||      ADD     .L2   B_prod3_21,B_prod3_43,B_sum3a   ; sum3a = B_prod3_21+3_43
||      ADD     .S1   A_sum0,A_sum0a,A_sum0      ; sum0 += A_prod0_10,0_32
||      ADD     .S2   B_sum1,B_sum1a,B_sum1      ; sum1 += B_prod1_21,1_43

; ============================ END OF "JLOOP" ============================= ;
; ============================= BRANCH OCCURS ============================= ;

* ========================================================================= *
*   End of file:  dsp_fir_r8.asm                                            *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *