c6416_sdk/dsplib/fir_gen.asm


								;* ======================================================================== *;

								;*  TEXAS INSTRUMENTS, INC.                                                 *;

								;*                                                                          *;

								;*  DSPLIB  DSP Signal Processing Library                                   *;

								;*                                                                          *;

								;*      Release:        Revision 1.04b                                      *;

								;*      CVS Revision:   1.14    Thu Jun 12 17:41:04 2003 (UTC)              *;

								;*      Snapshot date:  23-Oct-2003                                         *;

								;*                                                                          *;

								;*  This library contains proprietary intellectual property of Texas        *;

								;*  Instruments, Inc.  The library and its source code are protected by     *;

								;*  various copyrights, and portions may also be protected by patents or    *;

								;*  other legal protections.                                                *;

								;*                                                                          *;

								;*  This software is licensed for use with Texas Instruments TMS320         *;

								;*  family DSPs.  This license was provided to you prior to installing      *;

								;*  the software.  You may review this license by consulting the file       *;

								;*  TI_license.PDF which accompanies the files in this library.             *;

								;* ------------------------------------------------------------------------ *;

								;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;

								;*                          All Rights Reserved.                            *;

								;* ======================================================================== *;


								;* ======================================================================== *;

								;*  Assembler compatibility shim for assembling 4.30 and later code on      *;

								;*  tools prior to 4.30.                                                    *;

								;* ======================================================================== *;


								        ; Record the assembler version in $asmver; fall back to 0 when

								        ; the assembler does not define .ASSEMBLER_VERSION.

								        .if $isdefed(".ASSEMBLER_VERSION")

								        .asg    .ASSEMBLER_VERSION, $asmver

								        .else

								        .asg    0,    $asmver

								        .endif


								        ; For tools older than 4.30, substitute plain B/BNOP for the

								        ; call/return mnemonics and make .asmfunc/.endasmfunc expand

								        ; to nothing.

								        .if ($asmver < 430)


								        .asg    B,    CALL     ; Function Call

								        .asg    B,    RET      ; Return from a Function

								        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.


								        .if .TMS320C6400

								        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call

								        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return

								        .asg    BNOP, CRNOP    ; C64x Fn call w/ Call/Ret chaining via BNOP.

								        .endif


								        .asg    , .asmfunc     ; .func equivalent for hand-assembly code

								        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code


								        .endif


								;* ======================================================================== *;

								;*  End of assembler compatibility shim.                                    *;

								;* ======================================================================== *;


								* ========================================================================= *

								*                                                                           *

								*   TEXAS INSTRUMENTS, INC.                                                 *

								*                                                                           *

								*   NAME                                                                    *

								*       DSP_fir_gen: FIR Filter (general purpose)                           *

								*                                                                           *

								*   USAGE                                                                   *

								*       This routine is C-callable and can be called as:                    *

								*                                                                           *

								*       void DSP_fir_gen                                                    *

								*       (                                                                   *

								*           const short *restrict x,  /* Input ('nr + nh - 1' samples) */   *

								*           const short *restrict h,  /* Filter coefficients (nh taps) */   *

								*           short       *restrict r,  /* Output array ('nr' samples)   */   *

								*           int                   nh, /* Length of filter (nh >= 5)    */   *

								*           int                   nr  /* Length of output (nr >= 1)    */   *

								*       );                                                                  *

								*                                                                           *

								*   C CODE                                                                  *

								*                                                                           *

								*       This is the C equivalent of the assembly code. Note that the        *

								*       assembly code is hand optimized and restrictions may apply.         *

								*                                                                           *

								*       void DSP_fir_gen                                                    *

								*       (                                                                   *

								*           const short *restrict x,  /* Input ('nr + nh - 1' samples) */   *

								*           const short *restrict h,  /* Filter coefficients (nh taps) */   *

								*           short       *restrict r,  /* Output array ('nr' samples)   */   *

								*           int                   nh, /* Length of filter (nh >= 5)    */   *

								*           int                   nr  /* Length of output (nr >= 1)    */   *

								*       )                                                                   *

								*       {                                                                   *

								*           int i, j, sum;                                                  *

								*                                                                           *

								*           for (j = 0; j < nr; j++)                                        *

								*           {                                                               *

								*               sum = 0;                                                    *

								*               for (i = 0; i < nh; i++)                                    *

								*                   sum += x[i + j] * h[i];                                 *

								*                                                                           *

								*               r[j] = sum >> 15;                                           *

								*           }                                                               *

								*       }                                                                   *

								*                                                                           *

								*   DESCRIPTION                                                             *

								*       Computes a real FIR filter (direct-form) using coefficients         *

								*       stored in vector h. The real data input is stored in vector x.      *

								*       The filter output result is stored in vector r. This FIR            *

								*       assumes the number of filter coefficients is greater than or        *

								*       equal to 5. It operates on 16-bit data with a 32-bit                *

								*       accumulate. This routine has no memory hits regardless of where     *

								*       x, h, and r arrays are located in memory. The filter is nr          *

								*       output samples and nh coefficients. The assembly routine            *

								*       performs 4 output samples at a time.                                *

								*                                                                           *

								*   TECHNIQUES                                                              *

								*       1. Load double word instruction is used to simultaneously load      *

								*          four values in a single clock cycle.                             *

								*                                                                           *

								*       2. The inner loop is unrolled four times and will always            *

								*          compute a multiple of 4 of nh and nr. If nh % 4 != 0, the        *

								*          code will fill in 0s to make nh a multiple of 4. If nr % 4       *

								*          != 0, the code will still perform a multiple of 4 outputs.       *

								*                                                                           *

								*       3. Both the inner and outer loops are software pipelined.           *

								*                                                                           *

								*       4. This code yields best performance when ratio of outer            *

								*          loop to inner loop is less than or equal to 4.                   *

								*                                                                           *

								*   ASSUMPTIONS                                                             *

								*       1. Little Endian is assumed for LDNDW.                              *

								*       2. nh >= 5.                                                         *

								*       3. nr multiple of 4.                                                *

								*       4. Output array r[] must be word-aligned                            *

								*                                                                           *

								*   MEMORY NOTE                                                             *

								*       No memory bank hits under any conditions.                           *

								*       Little Endian operation is assumed.                                 *

								*                                                                           *

								*   CYCLES                                                                  *

								*       [11 + 4 * ceil(nh/4)] * nr/4 + 15                                   *

								*                                                                           *

								*   CODESIZE                                                                *

								*       544  bytes                                                          *

								*                                                                           *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *


								        .sect ".text:_fir_gen"

								        .global _DSP_fir_gen

								_DSP_fir_gen:

								; Entry point of DSP_fir_gen(x, h, r, nh, nr).  The aliases below name

								; the incoming arguments as A4 = x, B4 = h, A6 = r, B6 = nh, A8 = nr.

								;

								; Several names share one physical register, so only one of them is

								; meaningful at a time:

								;   A4 : A_x   / A_sum0        B6 : B_nh / B_x54

								;   B0 : B_j   / B_2           B1 : B_m  / B_1


								* ===================== SYMBOLIC REGISTER ASSIGNMENTS =====================*

								        .asg            A4,         A_x        ; Input data pointer

								        .asg            A27,        A_xptr     ; Input load pointer

								        .asg            B4,         B_hptr     ; Filter pointer

								        .asg            A6,         A_rptr     ; Output pointer

								        .asg            B6,         B_nh       ; # Filter taps

								        .asg            A8,         A_nr       ; # Output samples

								        .asg            A7,         A_ptr_x    ; Input read pointer

								        .asg            A22,        A_h10      ; Packed coefficient h10

								        .asg            A23,        A_h32      ; Packed coefficient h32

								        .asg            B22,        B_h10      ; Packed coefficient h10

								        .asg            B23,        B_h32      ; Packed coefficient h32

								        .asg            A4,         A_sum0     ; Accum. for sample# 0

								        .asg            A5,         A_sum1     ; Accum. for sample# 1

								        .asg            B8,         B_sum2     ; Accum. for sample# 2

								        .asg            B9,         B_sum3     ; Accum. for sample# 3

								        .asg            B6,         B_x54      ; Input samples x54

								        .asg            B7,         B_x76      ; Input samples x76

								        .asg            A3,         A_hptr     ; Filter pointer

								        .asg            A0,         A_i        ; Index variable i

								        .asg            B24,        B_h32_n    ; Special filter

								        .asg            B25,        B_h10_n    ; words h32, h10

								        .asg            B26,        B_optr     ; Output twin pointers

								        .asg            A26,        A_optr     ; Output twin pointers

								        .asg            B27,        B_ptr      ; Coefficient pointer, starts at &h[ofs]

								        .asg            A28,        A_ofs      ; Byte offset added to xptr

								        .asg            B28,        B_nh_l     ; (nh + 3) >> 2

								        .asg            B0,         B_j        ; Outer loop trip cnt.

								;---------------------------------------------------------------------------

								        .asg            B1,         B_m        ; nh & 3, or 4 when that is 0

								        .asg            B2,         B_3        ; Flag: m == 3

								        .asg            B0,         B_2        ; Flag: m == 2 (same register as B_j)

								        .asg            B1,         B_1        ; Flag: m == 1 (same register as B_m)

								        .asg            B5,         B_ofs      ; nh - m

								        .asg            A16,        A_nr_l     ; (nr + 3) >> 2

								        .asg            A9,         A_it_i     ; Not referenced by the code in this routine

								        .asg            A29,        A_nh_l     ; A-side copy of B_nh_l

								        .asg            B30,        B_csr      ; CSR

								        .asg            B31,        B_no_gie   ; NO GIE

								* ========================================================================= *


								        AND     .D2     3,          B_nh,       B_m       ; mask = nh & 3

								||      MV      .L1     A_x,        A_xptr                ; xptr = x

								||      MVC     .S2     CSR,        B_csr                 ; CSR


								  [!B_m]MVK     .L2     4,          B_m                   ; mask = 4


								        SUB     .S2     B_nh,       B_m,        B_ofs     ; ofs = nh - m

								||      AND     .L2     B_csr,      -2,         B_no_gie  ; NO GIE


								        ADDAH   .D2     B_hptr,     B_ofs,      B_ptr     ; ptr = &h[ofs]

								||      MVC     .S2     B_no_gie,   CSR                   ; Interr. masked


								;-- Interrupts masked here


								        LDNDW   .D2T2   *B_ptr--,   B_h32:B_h10           ; Load h32:h10

								||      ADD     .L1     A_nr,       3,          A_nr_l    ; nr + 3

								||      ADD     .S2     B_nh,       3,          B_nh_l    ; nh + 3


								        SHRU    .S1     A_nr_l,     2,          A_nr_l    ; nr_l >> 2

								||      CMPEQ   .L2     3,          B_m,        B_3       ; m == 3

								||      SHRU    .S2     B_nh_l,     2,          B_nh_l    ; nr_h >> 2


								        ADD     .D2X    A_rptr,     4,          B_optr    ; optr = &r[4]

								||      MV      .L1X    B_nh_l,     A_nh_l                ; copy.

								||      CMPEQ   .L2     1,          B_m,        B_1       ; m == 1


								        ADD     .D1X    B_ofs,      4,          A_ofs     ; ofs + 4


								        ADD     .S1     A_ofs,      A_ofs,      A_ofs     ; ADDAH


								        ADD     .L1     A_xptr,     A_ofs,      A_ptr_x   ; xptr + ofs

								||      CMPEQ   .L2     2,          B_m,        B_2       ; m == 2

								||[ B_3]CLR     .S2     B_h32,      16, 31,     B_h32     ; h32 = 00XX


								        LDNDW   .D1T2   *A_ptr_x--, B_x76:B_x54           ; x76:x54

								||      ADD     .L1     A_rptr,     0,          A_optr    ; optr= r

								||[ B_1]CLR     .S2     B_h10,      16, 31,     B_h10     ; h10 = 00XX

								||[ B_1]ZERO    .L2     B_h32                             ; h32 = 0

								||[ B_2]ZERO    .D2     B_h32                             ; h32 = 0


								; Register names used by the loops.  Many names map to the same

								; physical register (e.g. A20 = A_sum0_s / A_x10, B17 = B_r32 / B_x43,

								; A16 = A_prod0_32 / A_prod2_10); presumably their live ranges do not

								; overlap in the schedule -- confirm before moving any instruction.

								* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *

								        .asg            A7,         A_ptr_x             ; Input data ptr

								        .asg            A22,        A_h10               ; Packed h10

								        .asg            A23,        A_h32               ; Packed h32

								        .asg            B22,        B_h10               ; Packed h10

								        .asg            B23,        B_h32               ; Packed h32

								        .asg            A4,         A_sum0              ; Accumulators

								        .asg            A5,         A_sum1              ; for 4 o/p

								        .asg            B8,         B_sum2              ; samples

								        .asg            B9,         B_sum3              ;

								        .asg            B6,         B_x54               ; Packed inputs x5:x4

								        .asg            B7,         B_x76               ; Packed inputs x7:x6

								        .asg            A3,         A_hptr              ; Filter ptr

								        .asg            A0,         A_i                 ; Inner loop counter

								        .asg            B17,        B_r32               ; Packed outputs r3:r2

								        .asg            A20,        A_sum0_s            ; sum0 >> 15

								        .asg            A17,        A_sum1_s            ; sum1 >> 15

								        .asg            A21,        A_x32               ; Packed inputs x3:x2

								        .asg            A20,        A_x10               ; Packed inputs x1:x0

								        .asg            A19,        A_x21               ; Packed inputs x2:x1

								        .asg            B17,        B_x43               ; Packed inputs x4:x3

								        .asg            B5,         B_x65               ; Packed inputs x6:x5

								        .asg            A17,        A_prod0_10          ; x10 . h10

								        .asg            A16,        A_prod0_32          ; x32 . h32

								        .asg            A16,        A_prod2_10          ; x32 . h10

								        .asg            A9,         A_prod2_32          ; x54 . h32

								        .asg            B16,        B_prod1_10          ; x21 . h10

								        .asg            B20,        B_prod1_32          ; x43 . h32

								        .asg            B20,        B_prod3_10          ; x43 . h10

								        .asg            B19,        B_prod3_32          ; x65 . h32

								        .asg            A19,        A_sum0_0            ; sum0 + prod0_10

								        .asg            A18,        A_sum2_0            ; prod2_10 + prod2_32

								        .asg            B16,        B_sum1_0            ; prod1_10 + prod1_32

								        .asg            B18,        B_sum3_0            ; sum3 + prod3_10

								        .asg            B18,        B_sum2_s            ; sum2 >> 15

								        .asg            B19,        B_sum3_s            ; sum3 >> 15

								        .asg            A18,        A_r10               ; Packed outputs r1:r0

								        .asg            B3,         B_return            ; Return address

								* ========================================================================= *

								* =========================== PIPE LOOP PROLOG ============================ *

								; First pass set-up: fetch the next four inputs (x32:x10), copy the

								; coefficient pointer to the A side, seed the inner counter from nh_l,

								; and save the masked tail coefficients in B_h10_n/B_h32_n so they can

								; be restored for every later pass.

								        LDNDW   .D1T1   *A_ptr_x--, A_x32:A_x10                 ;[ 1,1]

								||      MV      .L1X    B_ptr,      A_hptr                      ;[20,0]

								||      MV      .S1     A_nh_l,     A_i                         ;[19,0]

								||      MV      .L2     B_h10,      B_h10_n                     ;[15,0]

								||      MV      .S2     B_h32,      B_h32_n                     ;[15,0]


								; A_i = nh_l - 1 (inner trip count); B_j = nr_l (outer trip count).

								        SUB     .S1     A_i,        1,          A_i             ;[21,0]

								||      MV      .D1X    B_h32_n,    A_h32                       ;[21,0]

								||      ZERO    .L1     A_prod0_10                              ;[P ,0]

								||      SUB     .D2X    A_nr_l,     0,          B_j             ;[13,0]


								; Clear the partial sums/products that the kernel adds before any

								; DOTP2 result has been produced (the MPY-by-0 forms also yield 0),

								; load the tail coefficients into both sides, and advance the x

								; offset by 8 bytes for the following pass.

								        MPY     .M2     B_sum3_0,   0,          B_sum3_0        ;[P, 0]

								||      ZERO    .S2     B_prod3_32                              ;[P, 0]

								||      ZERO    .D2     B_sum1_0                                ;[P, 0]

								||      ZERO    .L1     A_prod0_32                              ;[P, 0]

								||      MPY     .M1     A_sum2_0,   0,          A_sum2_0        ;[13,3]

								||      MV      .S1X    B_h10_n,    A_h10                       ;[22,0]

								||      MV      .L2     B_h10_n,    B_h10                       ;[22,0]

								||      ADD     .D1     A_ofs,      8,          A_ofs           ;[22,0]


								; Outer-loop entry.  Every instruction in this packet is predicated on

								; B_j, so the packet does nothing when it is reached with B_j == 0.

								; NOTE(review): the outer BDEC in the epilog appears to land here once

								; more on the final pass before the RET completes -- confirm against

								; the branch delay-slot timing.

								; NOTE(review): the ZERO below has a trailing comma after its operand;

								; confirm the assembler accepts it.

								LOOPJ:

								   [B_j]LDNDW   .D1T1   *A_hptr,    A_h32:A_h10                 ;[ 4,1]

								|| [B_j]ZERO    .S2     B_prod3_10,                             ;[13,3]

								|| [B_j]MPY     .M2     B_prod1_10, 0,          B_prod1_10      ;[ P,0]

								|| [B_j]ADD     .S1     A_sum0,     A_prod0_10, A_sum0_0        ;[12,3]

								|| [B_j]MPY     .M1     A_prod0_10, 0,          A_prod0_10      ;[ P,0]

								|| [B_j]MV      .D2     B_h32_n,    B_h32                       ;[22,0]

								|| [B_j]ZERO    .L1     A_sum1:A_sum0                           ;[22,0]

								|| [B_j]ZERO    .L2     B_sum3:B_sum2                           ;[22,0]


								; First inner-loop branch, first x54 . h32 product, next input load.

								 [A_i]  BDEC    .S1     LOOPI,      A_i                         ;[17,1]

								||      DOTP2   .M1X    B_x54,      A_h32,      A_prod2_32      ;[ 5,1]

								||      LDNDW   .D1T1   *A_ptr_x--, A_x32:A_x10                 ;[ 1,2]

								||      ZERO    .L1     A_prod2_10                              ;[ P,0]


								; Build the overlapping input pairs x65, x43 and x21 from the loaded

								; words, start x10 . h10, and load the next coefficients to the B side.

								        MV      .D2X    A_x32,      B_x76                       ;[ 6,1]

								||      LDNDW   .D1T2   *A_hptr--,  B_h32:B_h10                 ;[ 6,1]

								||      DOTP2   .M1     A_x10,      A_h10,      A_prod0_10      ;[ 6,1]

								||      PACKLH2 .S2     B_x76,      B_x54,      B_x65           ;[ 6,1]

								||      PACKLH2 .L2X    B_x54,      A_x32,      B_x43           ;[ 6,1]

								||      PACKLH2 .S1     A_x32,      A_x10,      A_x21           ;[ 6,1]

								||      ZERO    .L1     A_prod2_32                              ;[11,3]


								;

								* =========================== PIPE LOOP KERNEL ============================ *

								; Four-packet inner loop.  Each trip consumes four taps for four

								; outputs using eight DOTP2 products:

								;   sum0 += x10.h10 + x32.h32      sum1 += x21.h10 + x43.h32

								;   sum2 += x32.h10 + x54.h32      sum3 += x43.h10 + x65.h32

								; The shifts by 15 and the PACK2s run on every trip; only the values

								; present when the loop exits are stored (the STWs are in the epilog).

								; The ;[a,b] tags are the original schedule annotations -- presumably

								; [cycle, iteration] of the software pipeline; confirm before editing.

								LOOPI:

								; Packet 1: finish sum1 and sum3, form the sum2 increment, shift

								; sum0/sum2, start x43.h32 and x32.h10.

								        SHR     .S2     B_sum2,     15,         B_sum2_s        ;[15,2]

								||      SHR     .S1     A_sum0,     15,         A_sum0_s        ;[15,2]

								||      ADD     .L2     B_sum3_0,   B_prod3_32, B_sum3          ;[15,2]

								||      ADD     .D1X    A_sum1,     B_sum1_0,   A_sum1          ;[15,2]

								||      ADD     .L1     A_prod2_10, A_prod2_32, A_sum2_0        ;[11,3]

								||      MV      .D2X    A_x10,      B_x54                       ;[ 7,4]

								||      DOTP2   .M2     B_x43,      B_h32,      B_prod1_32      ;[ 7,4]

								||      DOTP2   .M1     A_x32,      A_h10,      A_prod2_10      ;[ 7,4]


								; Packet 2: shift sum1/sum3, form the sum1 increment and the first half

								; of sum0, start x21.h10 and x32.h32, load coefficients to the A side.

								        SHR     .S2     B_sum3,     15,         B_sum3_s        ;[16,2]

								||      SHR     .S1     A_sum1,     15,         A_sum1_s        ;[16,2]

								||      ADD     .D2     B_prod1_10, B_prod1_32, B_sum1_0        ;[12,3]

								||      ADD     .L1     A_sum0,     A_prod0_10, A_sum0_0        ;[12,3]

								||      DOTP2   .M2X    A_x21,      B_h10,      B_prod1_10      ;[ 8,4]

								||      DOTP2   .M1     A_x32,      A_h32,      A_prod0_32      ;[ 8,4]

								||      LDNDW   .D1T1   *A_hptr,    A_h32:A_h10                 ;[ 4,5]


								; Packet 3: loop branch, pack the shifted sums into r10/r32, update

								; sum2 and the first half of sum3, start x43.h10 and x54.h32, load the

								; next four inputs.

								  [ A_i]BDEC    .S1     LOOPI,      A_i                         ;[17,2]

								||      PACK2   .L1     A_sum1_s,   A_sum0_s,   A_r10           ;[17,2]

								||      PACK2   .L2     B_sum3_s,   B_sum2_s,   B_r32           ;[17,2]

								||      ADD     .S2     B_sum3,     B_prod3_10, B_sum3_0        ;[13,3]

								||      ADD     .D2X    B_sum2,     A_sum2_0,   B_sum2          ;[13,3]

								||      DOTP2   .M2     B_x43,      B_h10,      B_prod3_10      ;[ 9,4]

								||      DOTP2   .M1X    B_x54,      A_h32,      A_prod2_32      ;[ 5,5]

								||      LDNDW   .D1T1   *A_ptr_x--, A_x32:A_x10                 ;[ 1,6]


								; Packet 4: finish sum0, start x65.h32 and x10.h10, rebuild the

								; overlapping pairs x65/x43/x21, load coefficients to the B side and

								; step the coefficient pointer back.

								        ADD     .L1     A_sum0_0,   A_prod0_32, A_sum0          ;[14,3]

								||      DOTP2   .M2     B_x65,      B_h32,      B_prod3_32      ;[10,4]

								||      MV      .D2X    A_x32,      B_x76                       ;[ 6,5]

								||      LDNDW   .D1T2   *A_hptr--,  B_h32:B_h10                 ;[ 6,5]

								||      DOTP2   .M1     A_x10,      A_h10,      A_prod0_10      ;[ 6,5]

								||      PACKLH2 .S2     B_x76,      B_x54,      B_x65           ;[ 6,5]

								||      PACKLH2 .L2X    B_x54,      A_x32,      B_x43           ;[ 6,5]

								||      PACKLH2 .S1     A_x32,      A_x10,      A_x21           ;[ 6,5]


								* =========================== PIPE LOOP EPILOG ============================ *

								; Drains the inner-loop pipeline (the remaining adds, shifts and packs

								; for the last taps) and interleaves the set-up for the next outer

								; pass: outer branch, pointer/counter reloads, re-zeroing of the

								; partial sums, and finally the store of the four packed outputs.


								        SHR     .S2     B_sum2,     15,         B_sum2_s        ;[15,5]

								||      SHR     .S1     A_sum0,     15,         A_sum0_s        ;[15,5]

								||      ADD     .L2     B_sum3_0,   B_prod3_32, B_sum3          ;[15,5]

								||      ADD     .D1X    A_sum1,     B_sum1_0,   A_sum1          ;[15,5]

								||      ADD     .L1     A_prod2_10, A_prod2_32, A_sum2_0        ;[11,6]


								        SHR     .S2     B_sum3,     15,         B_sum3_s        ;[16,5]

								||      SHR     .S1     A_sum1,     15,         A_sum1_s        ;[16,5]

								||      ADD     .D2     B_prod1_10, B_prod1_32, B_sum1_0        ;[12,6]

								||      ADD     .L1     A_sum0,     A_prod0_10, A_sum0_0        ;[12,6]


								; Outer-loop branch (only while B_j != 0).  Also recompute the input

								; read pointer as xptr + ofs and restore the tail coefficient h10 on

								; the A side for the next pass.

								        PACK2   .L1     A_sum1_s,   A_sum0_s,   A_r10           ;[17,5]

								||      PACK2   .L2     B_sum3_s,   B_sum2_s,   B_r32           ;[17,5]

								||      ADD     .D2X    B_sum2,     A_sum2_0,   B_sum2          ;[13,6]

								||[B_j] BDEC    .S2     LOOPJ,      B_j                         ;[ 3,0]

								||      ADD     .D1     A_xptr,     A_ofs,      A_ptr_x         ;[17,0]

								||      MV      .S1X    B_h10_n,    A_h10                       ;[22,0]


								; Issue the function return once B_j has reached zero; preload

								; x76:x54 and restore the tail coefficient h32 for the next pass.

								        ADD     .L1     A_sum0_0,   A_prod0_32, A_sum0          ;[14,6]

								||      ADD     .D2     B_sum3,     B_prod3_10, B_sum3_0        ;[13,6]

								||      LDNDW   .D1T2   *A_ptr_x--, B_x76:B_x54                 ;[18,0]

								||      MV      .S1X    B_h32_n,    A_h32                       ;[21,0]

								||[!B_j]RET     .S2     B_return


								; Final shifts of sum0/sum2 and final sum1/sum3; A_i = nh_l - 1 again.

								        SHR     .S2     B_sum2,     15,         B_sum2_s        ;[15,6]

								||      SHR     .S1     A_sum0,     15,         A_sum0_s        ;[15,6]

								||      ADD     .L2     B_sum3_0,   B_prod3_32, B_sum3          ;[15,6]

								||      ADD     .D1X    A_sum1,     B_sum1_0,   A_sum1          ;[15,6]

								||      SUB     .L1     A_nh_l,     1,          A_i             ;[19,0]


								; Final shifts of sum1/sum3; if another pass follows, load its first

								; x32:x10; reset the coefficient pointer and the B-side h10.

								        SHR     .S2     B_sum3,     15,         B_sum3_s        ;[16,6]

								||      SHR     .S1     A_sum1,     15,         A_sum1_s        ;[16,6]

								||[B_j] LDNDW   .D1T1   *A_ptr_x--, A_x32:A_x10                 ;[ 1,1]

								||      MV      .L1X    B_ptr,      A_hptr                      ;[20,0]

								||      MV      .L2     B_h10_n,    B_h10                       ;[22,0]

								||      MPY     .M1     A_prod0_32, 0,          A_prod0_32      ;


								; Pack the four finished outputs, clear the partial sums for the next

								; pass and move the x offset on by 8 bytes.

								        PACK2   .L1     A_sum1_s,   A_sum0_s,   A_r10           ;[17,6]

								||      PACK2   .L2     B_sum3_s,   B_sum2_s,   B_r32           ;[17,6]

								||      ZERO    .S1     A_prod0_10                              ;[ P,0]

								||      MPY     .M2     B_sum3_0,   0,          B_sum3_0        ;[ P,0]

								||      ZERO    .S2     B_prod3_32                              ;[ P,0]

								||      ZERO    .D2     B_sum1_0                                ;[ P,0]

								||      MPY     .M1     A_sum2_0,   0,          A_sum2_0        ;[13,3]

								||      ADD     .D1     A_ofs,      8,          A_ofs           ;[22,0]


								; Store r3:r2 and r1:r0 as two words; each output pointer then moves

								; on by two words.  On the last pass (B_j == 0) the saved CSR is

								; written back, undoing the GIE clear done at entry.

								        STW     .D2T2   B_r32,      *B_optr++[2]                ;[ 8,0]

								||      STW     .D1T1   A_r10,      *A_optr++[2]                ;[ 8,0]

								||[!B_j]MVC     .S2     B_csr,      CSR                         ;[ E,0]


								        ;==== Branch occurs