You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
409 lines
24 KiB
409 lines
24 KiB
;* ======================================================================== *;
|
|
;* TEXAS INSTRUMENTS, INC. *;
|
|
;* *;
|
|
;* DSPLIB DSP Signal Processing Library *;
|
|
;* *;
|
|
;* Release: Revision 1.04b *;
|
|
;* CVS Revision: 1.14 Thu Jun 12 17:41:04 2003 (UTC) *;
|
|
;* Snapshot date: 23-Oct-2003 *;
|
|
;* *;
|
|
;* This library contains proprietary intellectual property of Texas *;
|
|
;* Instruments, Inc. The library and its source code are protected by *;
|
|
;* various copyrights, and portions may also be protected by patents or *;
|
|
;* other legal protections. *;
|
|
;* *;
|
|
;* This software is licensed for use with Texas Instruments TMS320 *;
|
|
;* family DSPs. This license was provided to you prior to installing *;
|
|
;* the software. You may review this license by consulting the file *;
|
|
;* TI_license.PDF which accompanies the files in this library. *;
|
|
;* ------------------------------------------------------------------------ *;
|
|
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
|
|
;* All Rights Reserved. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
;* ======================================================================== *;
|
|
;* Assembler compatibility shim for assembling 4.30 and later code on *;
|
|
;* tools prior to 4.30. *;
|
|
;* ======================================================================== *;
|
|
|
|
; Record the assembler version in $asmver. Assemblers that do not define
; .ASSEMBLER_VERSION are treated as version 0.
.if $isdefed(".ASSEMBLER_VERSION")
|
|
.asg .ASSEMBLER_VERSION, $asmver
|
|
.else
|
|
.asg 0, $asmver
|
|
.endif
|
|
|
|
; For tools older than 4.30: alias the newer call/return mnemonics to the
; plain branch instructions and make .asmfunc/.endasmfunc expand to nothing.
.if ($asmver < 430)
|
|
|
|
.asg B, CALL ; Function Call
|
|
.asg B, RET ; Return from a Function
|
|
.asg B, CALLRET ; Function call with Call / Ret chaining.
|
|
|
|
; BNOP-based aliases are only defined when assembling for C64x.
.if .TMS320C6400
|
|
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
|
|
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
|
|
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
|
|
.endif
|
|
|
|
.asg , .asmfunc ; .func equivalent for hand-assembly code
|
|
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
|
|
|
|
.endif
|
|
|
|
;* ======================================================================== *;
|
|
;* End of assembler compatibility shim. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
* ========================================================================= *
|
|
* *
|
|
* TEXAS INSTRUMENTS, INC. *
|
|
* *
|
|
* NAME *
|
|
* DSP_fir_gen: FIR Filter (general purpose) *
|
|
* *
|
|
* USAGE *
|
|
* This routine is C-callable and can be called as: *
|
|
* *
|
|
* void DSP_fir_gen *
|
|
* ( *
|
|
* const short *restrict x, /* Input ('nr + nh - 1' samples) */ *
|
|
* const short *restrict h, /* Filter coefficients (nh taps) */ *
|
|
* short *restrict r, /* Output array ('nr' samples) */ *
|
|
* int nh, /* Length of filter (nh >= 5) */ *
|
|
* int nr /* Length of output (nr >= 4, multiple of 4) */ *
|
|
* ); *
|
|
* *
|
|
* C CODE *
|
|
* *
|
|
* This is the C equivalent of the assembly code. Note that the *
|
|
* assembly code is hand optimized and restrictions may apply. *
|
|
* *
|
|
* void DSP_fir_gen *
|
|
* ( *
|
|
* const short *restrict x, /* Input ('nr + nh - 1' samples) */ *
|
|
* const short *restrict h, /* Filter coefficients (nh taps) */ *
|
|
* short *restrict r, /* Output array ('nr' samples) */ *
|
|
* int nh, /* Length of filter (nh >= 5) */ *
|
|
* int nr /* Length of output (nr >= 1) */ *
|
|
* ) *
|
|
* { *
|
|
* int i, j, sum; *
|
|
* *
|
|
* for (j = 0; j < nr; j++) *
|
|
* { *
|
|
* sum = 0; *
|
|
* for (i = 0; i < nh; i++) *
|
|
* sum += x[i + j] * h[i]; *
|
|
* *
|
|
* r[j] = sum >> 15; *
|
|
* } *
|
|
* } *
|
|
* *
|
|
* DESCRIPTION *
|
|
* Computes a real FIR filter (direct-form) using coefficients *
|
|
* stored in vector h. The real data input is stored in vector x. *
|
|
* The filter output result is stored in vector r. This FIR *
|
|
* assumes the number of filter coefficients is greater than or *
|
|
* equal to 5. It operates on 16-bit data with a 32-bit *
|
|
* accumulate. This routine has no memory hits regardless of where *
|
|
* x, h, and r arrays are located in memory. The filter is nr *
|
|
* output samples and nh coefficients. The assembly routine *
|
|
* performs 4 output samples at a time. *
|
|
* *
|
|
* TECHNIQUES *
|
|
* 1. Load double word instruction is used to simultaneously load *
|
|
* four values in a single clock cycle. *
|
|
* *
|
|
* 2. The inner loop is unrolled four times and will always *
|
|
* compute a multiple of 4 of nh and nr. If nh % 4 != 0, the *
|
|
* code will fill in 0s to make nh a multiple of 4. If nr % 4 *
|
|
* != 0, the code will still perform a multiple of 4 outputs. *
|
|
* *
|
|
* 3. Both the inner and outer loops are software pipelined. *
|
|
* *
|
|
* 4. This code yields best performance when ratio of outer *
|
|
* loop to inner loop is less than or equal to 4. *
|
|
* *
|
|
* ASSUMPTIONS *
|
|
* 1. Little Endian is assumed for LDNDW. *
|
|
* 2. nh >= 5. *
|
|
* 3. nr multiple of 4. *
|
|
* 4. Output array r[] must be word-aligned *
|
|
* *
|
|
* MEMORY NOTE *
|
|
* No memory bank hits under any conditions. *
|
|
* Little Endian operation is assumed. *
|
|
* *
|
|
* CYCLES *
|
|
* [11 + 4 * ceil(nh/4)] * nr/4 + 15 *
|
|
* *
|
|
* CODESIZE *
|
|
* 544 bytes *
|
|
* *
|
|
* ------------------------------------------------------------------------- *
|
|
* Copyright (c) 2003 Texas Instruments, Incorporated. *
|
|
* All Rights Reserved. *
|
|
* ========================================================================= *
|
|
|
|
; Entry point. The routine lives in its own code subsection and exports the
; label _DSP_fir_gen.
.sect ".text:_fir_gen"
|
|
.global _DSP_fir_gen
|
|
_DSP_fir_gen:
|
|
|
|
* ===================== SYMBOLIC REGISTER ASSIGNMENTS =====================*
; Several symbols below name the same physical register, so writing one
; destroys the other: A4 is A_x and A_sum0, B6 is B_nh and B_x54, B0 is B_j
; and B_2, and B1 is B_m and B_1.
|
|
.asg A4, A_x ; Input data pointer
|
|
.asg A27, A_xptr ; Input load pointer (copy of A_x)
|
|
.asg B4, B_hptr ; Filter pointer
|
|
.asg A6, A_rptr ; Output pointer
|
|
.asg B6, B_nh ; # Filter taps
|
|
.asg A8, A_nr ; # Output samples
|
|
.asg A7, A_ptr_x ; Input read pointer
|
|
.asg A22, A_h10 ; Packed coefficient h10
|
|
.asg A23, A_h32 ; Packed coefficient h32
|
|
.asg B22, B_h10 ; Packed coefficient h10
|
|
.asg B23, B_h32 ; Packed coefficient h32
|
|
.asg A4, A_sum0 ; Accum. for sample# 0
|
|
.asg A5, A_sum1 ; Accum. for sample# 1
|
|
.asg B8, B_sum2 ; Accum. for sample# 2
|
|
.asg B9, B_sum3 ; Accum. for sample# 3
|
|
.asg B6, B_x54 ; Input samples x54
|
|
.asg B7, B_x76 ; Input samples x76
|
|
.asg A3, A_hptr ; Filter pointer
|
|
.asg A0, A_i ; Index variable i
|
|
.asg B24, B_h32_n ; Special filter
|
|
.asg B25, B_h10_n ; words h32, h10 (saved copies after zero fill)
|
|
.asg B26, B_optr ; Output twin pointers (starts at r + 4 bytes)
|
|
.asg A26, A_optr ; Output twin pointers (starts at r)
|
|
.asg B27, B_ptr ; Coefficient pointer, initialised to &h[ofs]
|
|
.asg A28, A_ofs ; Byte offset added to A_xptr to form A_ptr_x
|
|
.asg B28, B_nh_l ; (nh + 3) >> 2
|
|
.asg B0, B_j ; Outer loop trip cnt.
|
|
;---------------------------------------------------------------------------
|
|
.asg B1, B_m ; nh & 3, replaced by 4 when that is 0
|
|
.asg B2, B_3 ; Flag: m == 3
|
|
.asg B0, B_2 ; Flag: m == 2 (same register as B_j)
|
|
.asg B1, B_1 ; Flag: m == 1 (same register as B_m)
|
|
.asg B5, B_ofs ; nh - m
|
|
.asg A16, A_nr_l ; (nr + 3) >> 2
|
|
.asg A9, A_it_i ; Not referenced in the code visible here
|
|
.asg A29, A_nh_l ; A-side copy of B_nh_l
|
|
.asg B30, B_csr ; CSR (value saved on entry)
|
|
.asg B31, B_no_gie ; NO GIE (B_csr AND -2)
|
|
* ========================================================================= *
|
|
|
|
; ---- Setup ----------------------------------------------------------------
; Saves CSR and masks interrupts, then splits nh into full groups of four
; taps plus a final group of m = 1..4 taps (m = nh & 3, or 4 when that is 0).
; The final group is loaded as one double word from &h[nh - m], and the
; 4 - m coefficient slots that lie past h[nh - 1] are zeroed. The loop counts
; (nr + 3) >> 2 and (nh + 3) >> 2 and the x / r pointers are also set up.
;
; B_m and B_1 are the same register (B1), so producing the m == 1 flag
; overwrites m. The m == 2 flag is therefore computed BEFORE the m == 1
; flag. In the opposite order B_2 would be compared against the 0/1 flag
; value, could never be set, and two out-of-range coefficients would stay in
; h32 whenever nh % 4 == 2.
AND .D2 3, B_nh, B_m ; mask = nh & 3
|
|
|| MV .L1 A_x, A_xptr ; xptr = x
|
|
|| MVC .S2 CSR, B_csr ; Save CSR
|
|
|
|
[!B_m]MVK .L2 4, B_m ; mask = 4
|
|
|
|
SUB .S2 B_nh, B_m, B_ofs ; ofs = nh - m
|
|
|| AND .L2 B_csr, -2, B_no_gie ; NO GIE (CSR with bit 0 cleared)
|
|
|
|
ADDAH .D2 B_hptr, B_ofs, B_ptr ; ptr = &h[ofs]
|
|
|| MVC .S2 B_no_gie, CSR ; Interr. masked
|
|
|
|
;-- Interrupts masked here
|
|
|
|
; NOTE(review): this double-word load starts at &h[nh - m], so for m < 4 it
; also reads the 4 - m halfwords that follow h[nh - 1]; they are zeroed below.
LDNDW .D2T2 *B_ptr--, B_h32:B_h10 ; Load h32:h10
|
|
|| ADD .L1 A_nr, 3, A_nr_l ; nr + 3
|
|
|| ADD .S2 B_nh, 3, B_nh_l ; nh + 3
|
|
|
|
SHRU .S1 A_nr_l, 2, A_nr_l ; nr_l = (nr + 3) >> 2
|
|
|| CMPEQ .L2 3, B_m, B_3 ; m == 3
|
|
|| SHRU .S2 B_nh_l, 2, B_nh_l ; nh_l = (nh + 3) >> 2
|
|
|
|
ADD .D2X A_rptr, 4, B_optr ; optr = r + 4 bytes (&r[2])
|
|
|| MV .L1X B_nh_l, A_nh_l ; copy.
|
|
|| CMPEQ .L2 2, B_m, B_2 ; m == 2 (read m while B1 still holds it)
|
|
|
|
ADD .D1X B_ofs, 4, A_ofs ; ofs + 4
|
|
|
|
ADD .S1 A_ofs, A_ofs, A_ofs ; ofs *= 2: halfword index -> byte offset
|
|
|
|
ADD .L1 A_xptr, A_ofs, A_ptr_x ; ptr_x = xptr + ofs = &x[nh - m + 4]
|
|
|| CMPEQ .L2 1, B_m, B_1 ; m == 1 (last use of m: B_1 overwrites B1)
|
|
||[ B_3]CLR .S2 B_h32, 16, 31, B_h32 ; h32 = 00XX
|
|
|
|
LDNDW .D1T2 *A_ptr_x--, B_x76:B_x54 ; x76:x54
|
|
|| ADD .L1 A_rptr, 0, A_optr ; optr= r
|
|
||[ B_1]CLR .S2 B_h10, 16, 31, B_h10 ; h10 = 00XX
|
|
||[ B_1]ZERO .L2 B_h32 ; h32 = 0
|
|
||[ B_2]ZERO .D2 B_h32 ; h32 = 0
|
|
|
|
|
|
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
; Register map for the loop code. Many symbols share one physical register
; (e.g. A20 is A_sum0_s and A_x10, A16 is A_prod0_32 and A_prod2_10, B17 is
; B_r32 and B_x43).
; NOTE(review): the schedule presumably keeps their live ranges apart --
; confirm before reordering any instruction.
|
|
.asg A7, A_ptr_x ; Input data ptr
|
|
.asg A22, A_h10 ; Packed h10
|
|
.asg A23, A_h32 ; Packed h32
|
|
.asg B22, B_h10 ; Packed h10
|
|
.asg B23, B_h32 ; Packed h32
|
|
.asg A4, A_sum0 ; Accumulators
|
|
.asg A5, A_sum1 ; for 4 o/p
|
|
.asg B8, B_sum2 ; samples
|
|
.asg B9, B_sum3 ;
|
|
.asg B6, B_x54 ; Input sample pair (B-side copy of A_x10)
|
|
.asg B7, B_x76 ; Input sample pair (B-side copy of A_x32)
|
|
.asg A3, A_hptr ; Filter ptr
|
|
.asg A0, A_i ; Inner loop counter used by BDEC
|
|
.asg B17, B_r32 ; PACK2 of B_sum3_s, B_sum2_s (stored via B_optr)
|
|
.asg A20, A_sum0_s ; A_sum0 >> 15
|
|
.asg A17, A_sum1_s ; A_sum1 >> 15
|
|
.asg A21, A_x32 ; Upper word of the x double-word load
|
|
.asg A20, A_x10 ; Lower word of the x double-word load
|
|
.asg A19, A_x21 ; PACKLH2 of A_x32, A_x10
|
|
.asg B17, B_x43 ; PACKLH2 of B_x54, A_x32
|
|
.asg B5, B_x65 ; PACKLH2 of B_x76, B_x54
|
|
.asg A17, A_prod0_10 ; DOTP2 A_x10, A_h10
|
|
.asg A16, A_prod0_32 ; DOTP2 A_x32, A_h32
|
|
.asg A16, A_prod2_10 ; DOTP2 A_x32, A_h10
|
|
.asg A9, A_prod2_32 ; DOTP2 B_x54, A_h32
|
|
.asg B16, B_prod1_10 ; DOTP2 A_x21, B_h10
|
|
.asg B20, B_prod1_32 ; DOTP2 B_x43, B_h32
|
|
.asg B20, B_prod3_10 ; DOTP2 B_x43, B_h10
|
|
.asg B19, B_prod3_32 ; DOTP2 B_x65, B_h32
|
|
.asg A19, A_sum0_0 ; A_sum0 + A_prod0_10
|
|
.asg A18, A_sum2_0 ; A_prod2_10 + A_prod2_32
|
|
.asg B16, B_sum1_0 ; B_prod1_10 + B_prod1_32
|
|
.asg B18, B_sum3_0 ; B_sum3 + B_prod3_10
|
|
.asg B18, B_sum2_s ; B_sum2 >> 15
|
|
.asg B19, B_sum3_s ; B_sum3 >> 15
|
|
.asg A18, A_r10 ; PACK2 of A_sum1_s, A_sum0_s (stored via A_optr)
|
|
.asg B3, B_return ; Return address
|
|
* ========================================================================= *
|
|
* =========================== PIPE LOOP PROLOG ============================ *
; One-time setup before the first pass of the outer loop:
;   - issue the first x load and point A_hptr at the coefficient pointer;
;   - save the zero-filled final coefficient words in B_h10_n / B_h32_n and
;     copy them to the A side;
;   - set the inner count A_i = A_nh_l - 1 and the outer count B_j = A_nr_l
;     (the SUB with 0 is a cross-path copy);
;   - clear the partial sums and products; "MPY reg, 0, reg" clears a
;     register on an .M unit;
;   - advance A_ofs by 8 bytes for the next group of four outputs.
|
|
LDNDW .D1T1 *A_ptr_x--, A_x32:A_x10 ;[ 1,1]
|
|
|| MV .L1X B_ptr, A_hptr ;[20,0]
|
|
|| MV .S1 A_nh_l, A_i ;[19,0]
|
|
|| MV .L2 B_h10, B_h10_n ;[15,0]
|
|
|| MV .S2 B_h32, B_h32_n ;[15,0]
|
|
|
|
SUB .S1 A_i, 1, A_i ;[21,0]
|
|
|| MV .D1X B_h32_n, A_h32 ;[21,0]
|
|
|| ZERO .L1 A_prod0_10 ;[P ,0]
|
|
|| SUB .D2X A_nr_l, 0, B_j ;[13,0]
|
|
|
|
MPY .M2 B_sum3_0, 0, B_sum3_0 ;[P, 0]
|
|
|| ZERO .S2 B_prod3_32 ;[P, 0]
|
|
|| ZERO .D2 B_sum1_0 ;[P, 0]
|
|
|| ZERO .L1 A_prod0_32 ;[P, 0]
|
|
|| MPY .M1 A_sum2_0, 0, A_sum2_0 ;[13,3]
|
|
|| MV .S1X B_h10_n, A_h10 ;[22,0]
|
|
|| MV .L2 B_h10_n, B_h10 ;[22,0]
|
|
|| ADD .D1 A_ofs, 8, A_ofs ;[22,0]
|
|
|
|
; Head of the outer loop: one pass per group of four outputs.
; Every instruction in the first execute packet is predicated on [B_j]. It
; clears the four accumulators, clears product registers, reloads B_h32 from
; the saved B_h32_n and starts the next coefficient load. The next two
; packets start the inner loop (BDEC to LOOPI) and issue its first loads,
; DOTP2s and sample repacking.
; NOTE(review): "ZERO .S2 B_prod3_10," below carries a trailing comma after
; its operand -- confirm the assembler accepts it.
LOOPJ:
|
|
[B_j]LDNDW .D1T1 *A_hptr, A_h32:A_h10 ;[ 4,1]
|
|
|| [B_j]ZERO .S2 B_prod3_10, ;[13,3]
|
|
|| [B_j]MPY .M2 B_prod1_10, 0, B_prod1_10 ;[ P,0]
|
|
|| [B_j]ADD .S1 A_sum0, A_prod0_10, A_sum0_0 ;[12,3]
|
|
|| [B_j]MPY .M1 A_prod0_10, 0, A_prod0_10 ;[ P,0]
|
|
|| [B_j]MV .D2 B_h32_n, B_h32 ;[22,0]
|
|
|| [B_j]ZERO .L1 A_sum1:A_sum0 ;[22,0]
|
|
|| [B_j]ZERO .L2 B_sum3:B_sum2 ;[22,0]
|
|
|
|
[A_i] BDEC .S1 LOOPI, A_i ;[17,1]
|
|
|| DOTP2 .M1X B_x54, A_h32, A_prod2_32 ;[ 5,1]
|
|
|| LDNDW .D1T1 *A_ptr_x--, A_x32:A_x10 ;[ 1,2]
|
|
|| ZERO .L1 A_prod2_10 ;[ P,0]
|
|
|
|
MV .D2X A_x32, B_x76 ;[ 6,1]
|
|
|| LDNDW .D1T2 *A_hptr--, B_h32:B_h10 ;[ 6,1]
|
|
|| DOTP2 .M1 A_x10, A_h10, A_prod0_10 ;[ 6,1]
|
|
|| PACKLH2 .S2 B_x76, B_x54, B_x65 ;[ 6,1]
|
|
|| PACKLH2 .L2X B_x54, A_x32, B_x43 ;[ 6,1]
|
|
|| PACKLH2 .S1 A_x32, A_x10, A_x21 ;[ 6,1]
|
|
|| ZERO .L1 A_prod2_32 ;[11,3]
|
|
|
|
;
|
|
* =========================== PIPE LOOP KERNEL ============================ *
; Inner-loop kernel: four execute packets per pass, repeated by BDEC on A_i.
; Each pass
;   - loads one double word of coefficients twice from the same A_hptr
;     address (into A_h32:A_h10, then into B_h32:B_h10 with post-decrement);
;   - loads one double word of input from A_ptr_x (post-decrement) and
;     builds the shifted sample pairs x21, x43, x65 with PACKLH2;
;   - issues eight DOTP2s, two per output (prodN_10 and prodN_32), and adds
;     them into A_sum0, A_sum1, B_sum2, B_sum3.
; The >> 15 shifts and PACK2s are issued on every pass; only the epilog
; stores A_r10 / B_r32.
|
|
LOOPI:
|
|
SHR .S2 B_sum2, 15, B_sum2_s ;[15,2]
|
|
|| SHR .S1 A_sum0, 15, A_sum0_s ;[15,2]
|
|
|| ADD .L2 B_sum3_0, B_prod3_32, B_sum3 ;[15,2]
|
|
|| ADD .D1X A_sum1, B_sum1_0, A_sum1 ;[15,2]
|
|
|| ADD .L1 A_prod2_10, A_prod2_32, A_sum2_0 ;[11,3]
|
|
|| MV .D2X A_x10, B_x54 ;[ 7,4]
|
|
|| DOTP2 .M2 B_x43, B_h32, B_prod1_32 ;[ 7,4]
|
|
|| DOTP2 .M1 A_x32, A_h10, A_prod2_10 ;[ 7,4]
|
|
|
|
SHR .S2 B_sum3, 15, B_sum3_s ;[16,2]
|
|
|| SHR .S1 A_sum1, 15, A_sum1_s ;[16,2]
|
|
|| ADD .D2 B_prod1_10, B_prod1_32, B_sum1_0 ;[12,3]
|
|
|| ADD .L1 A_sum0, A_prod0_10, A_sum0_0 ;[12,3]
|
|
|| DOTP2 .M2X A_x21, B_h10, B_prod1_10 ;[ 8,4]
|
|
|| DOTP2 .M1 A_x32, A_h32, A_prod0_32 ;[ 8,4]
|
|
|| LDNDW .D1T1 *A_hptr, A_h32:A_h10 ;[ 4,5]
|
|
|
|
[ A_i]BDEC .S1 LOOPI, A_i ;[17,2]
|
|
|| PACK2 .L1 A_sum1_s, A_sum0_s, A_r10 ;[17,2]
|
|
|| PACK2 .L2 B_sum3_s, B_sum2_s, B_r32 ;[17,2]
|
|
|| ADD .S2 B_sum3, B_prod3_10, B_sum3_0 ;[13,3]
|
|
|| ADD .D2X B_sum2, A_sum2_0, B_sum2 ;[13,3]
|
|
|| DOTP2 .M2 B_x43, B_h10, B_prod3_10 ;[ 9,4]
|
|
|| DOTP2 .M1X B_x54, A_h32, A_prod2_32 ;[ 5,5]
|
|
|| LDNDW .D1T1 *A_ptr_x--, A_x32:A_x10 ;[ 1,6]
|
|
|
|
ADD .L1 A_sum0_0, A_prod0_32, A_sum0 ;[14,3]
|
|
|| DOTP2 .M2 B_x65, B_h32, B_prod3_32 ;[10,4]
|
|
|| MV .D2X A_x32, B_x76 ;[ 6,5]
|
|
|| LDNDW .D1T2 *A_hptr--, B_h32:B_h10 ;[ 6,5]
|
|
|| DOTP2 .M1 A_x10, A_h10, A_prod0_10 ;[ 6,5]
|
|
|| PACKLH2 .S2 B_x76, B_x54, B_x65 ;[ 6,5]
|
|
|| PACKLH2 .L2X B_x54, A_x32, B_x43 ;[ 6,5]
|
|
|| PACKLH2 .S1 A_x32, A_x10, A_x21 ;[ 6,5]
|
|
|
|
* =========================== PIPE LOOP EPILOG ============================ *
; Drains the inner-loop pipeline for the current group of four outputs,
; packs the four results (each accumulator >> 15) into A_r10 and B_r32 and
; stores them. In the same packets it re-initialises pointers, counters and
; partial sums for the next group.
;   - [B_j] BDEC branches back to LOOPJ and decrements B_j; [!B_j] RET
;     returns through B_return once B_j has reached zero.
;   - A_ptr_x is rebuilt as A_xptr + A_ofs; A_ofs then advances by 8 bytes
;     (four 16-bit samples) for the following group.
;   - A_i is reset to A_nh_l - 1 and A_hptr to B_ptr; A_h10, A_h32 and B_h10
;     are reloaded from the saved B_h10_n / B_h32_n.
;   - A_optr and B_optr are post-incremented by [2] words on each store.
;   - [!B_j] MVC writes the saved B_csr back to CSR on the final pass.
; NOTE(review): RET issues one execute packet after BDEC. With five branch
; delay slots its last delay slot would be the first packet at LOOPJ, which
; is fully predicated on [B_j] -- confirm against the CPU reference guide
; before changing the packet count here.
|
|
|
|
SHR .S2 B_sum2, 15, B_sum2_s ;[15,5]
|
|
|| SHR .S1 A_sum0, 15, A_sum0_s ;[15,5]
|
|
|| ADD .L2 B_sum3_0, B_prod3_32, B_sum3 ;[15,5]
|
|
|| ADD .D1X A_sum1, B_sum1_0, A_sum1 ;[15,5]
|
|
|| ADD .L1 A_prod2_10, A_prod2_32, A_sum2_0 ;[11,6]
|
|
|
|
SHR .S2 B_sum3, 15, B_sum3_s ;[16,5]
|
|
|| SHR .S1 A_sum1, 15, A_sum1_s ;[16,5]
|
|
|| ADD .D2 B_prod1_10, B_prod1_32, B_sum1_0 ;[12,6]
|
|
|| ADD .L1 A_sum0, A_prod0_10, A_sum0_0 ;[12,6]
|
|
|
|
PACK2 .L1 A_sum1_s, A_sum0_s, A_r10 ;[17,5]
|
|
|| PACK2 .L2 B_sum3_s, B_sum2_s, B_r32 ;[17,5]
|
|
|| ADD .D2X B_sum2, A_sum2_0, B_sum2 ;[13,6]
|
|
||[B_j] BDEC .S2 LOOPJ, B_j ;[ 3,0]
|
|
|| ADD .D1 A_xptr, A_ofs, A_ptr_x ;[17,0]
|
|
|| MV .S1X B_h10_n, A_h10 ;[22,0]
|
|
|
|
ADD .L1 A_sum0_0, A_prod0_32, A_sum0 ;[14,6]
|
|
|| ADD .D2 B_sum3, B_prod3_10, B_sum3_0 ;[13,6]
|
|
|| LDNDW .D1T2 *A_ptr_x--, B_x76:B_x54 ;[18,0]
|
|
|| MV .S1X B_h32_n, A_h32 ;[21,0]
|
|
||[!B_j]RET .S2 B_return
|
|
|
|
SHR .S2 B_sum2, 15, B_sum2_s ;[15,6]
|
|
|| SHR .S1 A_sum0, 15, A_sum0_s ;[15,6]
|
|
|| ADD .L2 B_sum3_0, B_prod3_32, B_sum3 ;[15,6]
|
|
|| ADD .D1X A_sum1, B_sum1_0, A_sum1 ;[15,6]
|
|
|| SUB .L1 A_nh_l, 1, A_i ;[19,0]
|
|
|
|
SHR .S2 B_sum3, 15, B_sum3_s ;[16,6]
|
|
|| SHR .S1 A_sum1, 15, A_sum1_s ;[16,6]
|
|
||[B_j] LDNDW .D1T1 *A_ptr_x--, A_x32:A_x10 ;[ 1,1]
|
|
|| MV .L1X B_ptr, A_hptr ;[20,0]
|
|
|| MV .L2 B_h10_n, B_h10 ;[22,0]
|
|
|| MPY .M1 A_prod0_32, 0, A_prod0_32 ; clear A_prod0_32 for the next group
|
|
|
|
PACK2 .L1 A_sum1_s, A_sum0_s, A_r10 ;[17,6]
|
|
|| PACK2 .L2 B_sum3_s, B_sum2_s, B_r32 ;[17,6]
|
|
|| ZERO .S1 A_prod0_10 ;[ P,0]
|
|
|| MPY .M2 B_sum3_0, 0, B_sum3_0 ;[ P,0]
|
|
|| ZERO .S2 B_prod3_32 ;[ P,0]
|
|
|| ZERO .D2 B_sum1_0 ;[ P,0]
|
|
|| MPY .M1 A_sum2_0, 0, A_sum2_0 ;[13,3]
|
|
|| ADD .D1 A_ofs, 8, A_ofs ;[22,0]
|
|
|
|
STW .D2T2 B_r32, *B_optr++[2] ;[ 8,0]
|
|
|| STW .D1T1 A_r10, *A_optr++[2] ;[ 8,0]
|
|
||[!B_j]MVC .S2 B_csr, CSR ;[ E,0]
|
|
|
|
;==== Branch occurs
|
|
|
|
|