You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

409 lines
24 KiB

;* ======================================================================== *;
;* TEXAS INSTRUMENTS, INC. *;
;* *;
;* DSPLIB DSP Signal Processing Library *;
;* *;
;* Release: Revision 1.04b *;
;* CVS Revision: 1.14 Thu Jun 12 17:41:04 2003 (UTC) *;
;* Snapshot date: 23-Oct-2003 *;
;* *;
;* This library contains proprietary intellectual property of Texas *;
;* Instruments, Inc. The library and its source code are protected by *;
;* various copyrights, and portions may also be protected by patents or *;
;* other legal protections. *;
;* *;
;* This software is licensed for use with Texas Instruments TMS320 *;
;* family DSPs. This license was provided to you prior to installing *;
;* the software. You may review this license by consulting the file *;
;* TI_license.PDF which accompanies the files in this library. *;
;* ------------------------------------------------------------------------ *;
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
;* All Rights Reserved. *;
;* ======================================================================== *;
;* ======================================================================== *;
;* Assembler compatibility shim for assembling 4.30 and later code on *;
;* tools prior to 4.30. *;
;* ======================================================================== *;
; Record the assembler version in the substitution symbol $asmver.  Tools
; that do not define .ASSEMBLER_VERSION are treated as version 0.
.if $isdefed(".ASSEMBLER_VERSION")
.asg .ASSEMBLER_VERSION, $asmver
.else
.asg 0, $asmver
.endif
; For tools older than 4.30, map the call/return mnemonics onto plain
; branches (B, or BNOP when .TMS320C6400 is set) and make .asmfunc and
; .endasmfunc substitute to an empty string.
.if ($asmver < 430)
.asg B, CALL ; Function Call
.asg B, RET ; Return from a Function
.asg B, CALLRET ; Function call with Call / Ret chaining.
.if .TMS320C6400
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
.asg BNOP, CRNOP ; C64x Fn. call w/ Call / Ret chaining via BNOP.
.endif
.asg , .asmfunc ; .func equivalent for hand-assembly code
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
.endif
;* ======================================================================== *;
;* End of assembler compatibility shim. *;
;* ======================================================================== *;
* ========================================================================= *
* *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* DSP_fir_gen: FIR Filter (general purpose) *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void DSP_fir_gen *
* ( *
* const short *restrict x, /* Input ('nr + nh - 1' samples) */ *
* const short *restrict h, /* Filter coefficients (nh taps) */ *
* short *restrict r, /* Output array ('nr' samples) */ *
* int nh, /* Length of filter (nh >= 5) */ *
* int nr /* Length of output (nr >= 1) */ *
* ); *
* *
* C CODE *
* *
* This is the C equivalent of the assembly code. Note that the *
* assembly code is hand optimized and restrictions may apply. *
* *
* void DSP_fir_gen *
* ( *
* const short *restrict x, /* Input ('nr + nh - 1' samples) */ *
* const short *restrict h, /* Filter coefficients (nh taps) */ *
* short *restrict r, /* Output array ('nr' samples) */ *
* int nh, /* Length of filter (nh >= 5) */ *
* int nr /* Length of output (nr >= 1) */ *
* ) *
* { *
* int i, j, sum; *
* *
* for (j = 0; j < nr; j++) *
* { *
* sum = 0; *
* for (i = 0; i < nh; i++) *
* sum += x[i + j] * h[i]; *
* *
* r[j] = sum >> 15; *
* } *
* } *
* *
* DESCRIPTION *
* Computes a real FIR filter (direct-form) using coefficients *
* stored in vector h. The real data input is stored in vector x. *
* The filter output result is stored in vector r. This FIR *
* assumes the number of filter coefficients is greater than or *
* equal to 5. It operates on 16-bit data with a 32-bit *
* accumulate. This routine has no memory bank hits regardless of where *
* x, h, and r arrays are located in memory. The filter is nr *
* output samples and nh coefficients. The assembly routine *
* performs 4 output samples at a time. *
* *
* TECHNIQUES *
* 1. Load double word instruction is used to simultaneously load *
* four values in a single clock cycle. *
* *
* 2. The inner loop is unrolled four times and will always *
* compute a multiple of 4 of nh and nr. If nh % 4 != 0, the *
* code will fill in 0s to make nh a multiple of 4. If nr % 4 *
* != 0, the code will still perform a multiple of 4 outputs. *
* *
* 3. Both the inner and outer loops are software pipelined. *
* *
* 4. This code yields best performance when ratio of outer *
* loop to inner loop is less than or equal to 4. *
* *
* ASSUMPTIONS *
* 1. Little Endian is assumed for LDNDW. *
* 2. nh >= 5. *
* 3. nr multiple of 4. *
* 4. Output array r[] must be word-aligned *
* *
* MEMORY NOTE *
* No memory bank hits under any conditions. *
* Little Endian operation is assumed. *
* *
* CYCLES *
* [11 + 4 * ceil(nh/4)] * nr/4 + 15 *
* *
* CODESIZE *
* 544 bytes *
* *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
; Entry point of DSP_fir_gen, placed in section ".text:_fir_gen".
; The table below names the registers used by the setup code.  Arguments
; are read from A4 (x), B4 (h), A6 (r), B6 (nh) and A8 (nr).
; Several symbols share one physical register and therefore cannot be live
; at the same time: A_x/A_sum0 (A4), B_nh/B_x54 (B6), B_j/B_2 (B0) and
; B_m/B_1 (B1).  Writing one of a pair destroys the other.
.sect ".text:_fir_gen"
.global _DSP_fir_gen
_DSP_fir_gen:
* ===================== SYMBOLIC REGISTER ASSIGNMENTS =====================*
.asg A4, A_x ; Input data pointer (argument x)
.asg A27, A_xptr ; Copy of x kept for the whole routine
.asg B4, B_hptr ; Filter pointer (argument h)
.asg A6, A_rptr ; Output pointer (argument r)
.asg B6, B_nh ; # Filter taps (argument nh)
.asg A8, A_nr ; # Output samples (argument nr)
.asg A7, A_ptr_x ; Input read pointer (xptr + ofs, post-decremented)
.asg A22, A_h10 ; Packed coefficient h10
.asg A23, A_h32 ; Packed coefficient h32
.asg B22, B_h10 ; Packed coefficient h10
.asg B23, B_h32 ; Packed coefficient h32
.asg A4, A_sum0 ; Accum. for sample# 0 (reuses A4 after x is copied)
.asg A5, A_sum1 ; Accum. for sample# 1
.asg B8, B_sum2 ; Accum. for sample# 2
.asg B9, B_sum3 ; Accum. for sample# 3
.asg B6, B_x54 ; Input samples x54 (reuses B6 after nh is consumed)
.asg B7, B_x76 ; Input samples x76
.asg A3, A_hptr ; Filter pointer used inside the loops
.asg A0, A_i ; Index variable i (inner loop trip count)
.asg B24, B_h32_n ; Saved copies of the masked last-group
.asg B25, B_h10_n ; coefficient words h32, h10
.asg B26, B_optr ; Output twin pointers (starts at r + 4 bytes)
.asg A26, A_optr ; Output twin pointers (starts at r)
.asg B27, B_ptr ; Filter load pointer, seeded with &h[ofs]
.asg A28, A_ofs ; Byte offset added to xptr to form A_ptr_x
.asg B28, B_nh_l ; (nh + 3) >> 2
.asg B0, B_j ; Outer loop trip cnt.
;---------------------------------------------------------------------------
.asg B1, B_m ; m = nh & 3, replaced by 4 when that is 0
.asg B2, B_3 ; Flag: m == 3
.asg B0, B_2 ; Flag: m == 2 (same register as B_j)
.asg B1, B_1 ; Flag: m == 1 (same register as B_m)
.asg B5, B_ofs ; ofs = nh - m
.asg A16, A_nr_l ; (nr + 3) >> 2
.asg A9, A_it_i ; Not referenced in the visible code
.asg A29, A_nh_l ; A-side copy of B_nh_l
.asg B30, B_csr ; CSR value saved at entry
.asg B31, B_no_gie ; Saved CSR with bit 0 cleared (NO GIE)
* ========================================================================= *
; ---------------------------------------------------------------------------
; Setup.  Saves CSR and writes it back with bit 0 cleared (interrupts
; masked), computes the rounded-up group counts (nr + 3) >> 2 and
; (nh + 3) >> 2, initialises the two output pointers, and loads the last
; group of four coefficients:
;   m   = nh & 3, or 4 when nh is a multiple of 4
;   ofs = nh - m, so h32:h10 is loaded from &h[ofs]
; Halfwords of that load which lie beyond h[nh-1] are then cleared
; according to m (m == 3: upper half of h32; m == 2: all of h32;
; m == 1: upper half of h10 and all of h32).
;
; B_1 and B_m are both assigned to B1, so writing B_1 destroys m.  The
; m == 2 test is therefore issued BEFORE the m == 1 test.  In the opposite
; order B_2 would be compared against the 0/1 flag, could never be set, and
; h32 would keep the two halfwords past the end of h[] when nh % 4 == 2.
; ---------------------------------------------------------------------------
AND .D2 3, B_nh, B_m ; mask = nh & 3
|| MV .L1 A_x, A_xptr ; xptr = x
|| MVC .S2 CSR, B_csr ; save CSR
[!B_m]MVK .L2 4, B_m ; mask = 4 when nh & 3 == 0
SUB .S2 B_nh, B_m, B_ofs ; ofs = nh - m
|| AND .L2 B_csr, -2, B_no_gie ; clear bit 0 (NO GIE)
ADDAH .D2 B_hptr, B_ofs, B_ptr ; ptr = &h[ofs]
|| MVC .S2 B_no_gie, CSR ; Interr. masked
;-- Interrupts masked here
LDNDW .D2T2 *B_ptr--, B_h32:B_h10 ; Load h32:h10, then ptr -= 8 bytes
|| ADD .L1 A_nr, 3, A_nr_l ; nr + 3
|| ADD .S2 B_nh, 3, B_nh_l ; nh + 3
SHRU .S1 A_nr_l, 2, A_nr_l ; nr_l >> 2
|| CMPEQ .L2 3, B_m, B_3 ; m == 3
|| SHRU .S2 B_nh_l, 2, B_nh_l ; nh_l >> 2
ADD .D2X A_rptr, 4, B_optr ; optr = &r[2] (r + 4 bytes)
|| MV .L1X B_nh_l, A_nh_l ; copy.
|| CMPEQ .L2 2, B_m, B_2 ; m == 2 (read m before B_1 overwrites B1)
ADD .D1X B_ofs, 4, A_ofs ; ofs + 4
ADD .S1 A_ofs, A_ofs, A_ofs ; ADDAH: halfword index -> byte offset
ADD .L1 A_xptr, A_ofs, A_ptr_x ; xptr + ofs
|| CMPEQ .L2 1, B_m, B_1 ; m == 1 (last read of m; B_1 shares B1)
||[ B_3]CLR .S2 B_h32, 16, 31, B_h32 ; h32 = 00XX
LDNDW .D1T2 *A_ptr_x--, B_x76:B_x54 ; x76:x54
|| ADD .L1 A_rptr, 0, A_optr ; optr= r
||[ B_1]CLR .S2 B_h10, 16, 31, B_h10 ; h10 = 00XX
||[ B_1]ZERO .L2 B_h32 ; h32 = 0
||[ B_2]ZERO .D2 B_h32 ; h32 = 0
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
; Register names used by the pipelined loops.  Many symbols share a
; physical register (for example A_sum0_s/A_x10 on A20, A_sum1_s/A_prod0_10
; on A17, B_r32/B_x43 on B17, A_r10/A_sum2_0 on A18), so the instruction
; order in the loops below must not be changed without re-checking
; register lifetimes.
.asg A7, A_ptr_x ; Input data ptr
.asg A22, A_h10 ; Packed h10
.asg A23, A_h32 ; Packed h32
.asg B22, B_h10 ; Packed h10
.asg B23, B_h32 ; Packed h32
.asg A4, A_sum0 ; Accumulators
.asg A5, A_sum1 ; for 4 o/p
.asg B8, B_sum2 ; samples
.asg B9, B_sum3 ;
.asg B6, B_x54 ; Input pair x54 (B-side copy of previous A_x10)
.asg B7, B_x76 ; Input pair x76 (B-side copy of previous A_x32)
.asg A3, A_hptr ; Filter ptr
.asg A0, A_i ; Inner loop trip count
.asg B17, B_r32 ; PACK2 of sum3_s, sum2_s (stored via B_optr)
.asg A20, A_sum0_s ; sum0 >> 15
.asg A17, A_sum1_s ; sum1 >> 15
.asg A21, A_x32 ; Upper word of the x double-word load
.asg A20, A_x10 ; Lower word of the x double-word load
.asg A19, A_x21 ; PACKLH2 of x32, x10
.asg B17, B_x43 ; PACKLH2 of x54, x32
.asg B5, B_x65 ; PACKLH2 of x76, x54
.asg A17, A_prod0_10 ; DOTP2 x10 . h10
.asg A16, A_prod0_32 ; DOTP2 x32 . h32
.asg A16, A_prod2_10 ; DOTP2 x32 . h10
.asg A9, A_prod2_32 ; DOTP2 x54 . h32
.asg B16, B_prod1_10 ; DOTP2 x21 . h10
.asg B20, B_prod1_32 ; DOTP2 x43 . h32
.asg B20, B_prod3_10 ; DOTP2 x43 . h10
.asg B19, B_prod3_32 ; DOTP2 x65 . h32
.asg A19, A_sum0_0 ; sum0 + prod0_10
.asg A18, A_sum2_0 ; prod2_10 + prod2_32
.asg B16, B_sum1_0 ; prod1_10 + prod1_32
.asg B18, B_sum3_0 ; sum3 + prod3_10
.asg B18, B_sum2_s ; sum2 >> 15
.asg B19, B_sum3_s ; sum3 >> 15
.asg A18, A_r10 ; PACK2 of sum1_s, sum0_s (stored via A_optr)
.asg B3, B_return ; Return address
* ========================================================================= *
* =========================== PIPE LOOP PROLOG ============================ *
; One-time pipeline priming before the first pass through LOOPJ:
;   - issues the next x double-word load (x32:x10),
;   - copies the filter load pointer to the A side and sets
;     A_i = A_nh_l - 1 (inner trip count) and B_j = A_nr_l (outer count),
;   - saves the masked last-group coefficients in B_h32_n:B_h10_n and
;     copies them to the working A/B coefficient registers,
;   - clears the product / partial-sum registers (ZERO, or MPY by 0 where
;     the .M unit is used for the clear),
;   - advances A_ofs by 8 bytes for the next outer iteration.
; The trailing ;[a,b] tags are the original schedule annotations.
LDNDW .D1T1 *A_ptr_x--, A_x32:A_x10 ;[ 1,1]
|| MV .L1X B_ptr, A_hptr ;[20,0]
|| MV .S1 A_nh_l, A_i ;[19,0]
|| MV .L2 B_h10, B_h10_n ;[15,0]
|| MV .S2 B_h32, B_h32_n ;[15,0]
SUB .S1 A_i, 1, A_i ;[21,0]
|| MV .D1X B_h32_n, A_h32 ;[21,0]
|| ZERO .L1 A_prod0_10 ;[P ,0]
|| SUB .D2X A_nr_l, 0, B_j ;[13,0]
MPY .M2 B_sum3_0, 0, B_sum3_0 ;[P, 0]
|| ZERO .S2 B_prod3_32 ;[P, 0]
|| ZERO .D2 B_sum1_0 ;[P, 0]
|| ZERO .L1 A_prod0_32 ;[P, 0]
|| MPY .M1 A_sum2_0, 0, A_sum2_0 ;[13,3]
|| MV .S1X B_h10_n, A_h10 ;[22,0]
|| MV .L2 B_h10_n, B_h10 ;[22,0]
|| ADD .D1 A_ofs, 8, A_ofs ;[22,0]
; Outer-loop entry.  The first execute packet is fully predicated on B_j,
; so it does nothing when B_j is 0.  When B_j is non-zero it clears the
; four accumulators and several product registers, restores B_h32 from
; B_h32_n and starts the first A-side coefficient load.  The next two
; packets issue the first inner-loop BDEC and the first DOTP2 / PACKLH2
; operations ahead of the kernel.
LOOPJ:
[B_j]LDNDW .D1T1 *A_hptr, A_h32:A_h10 ;[ 4,1]
|| [B_j]ZERO .S2 B_prod3_10, ;[13,3]
|| [B_j]MPY .M2 B_prod1_10, 0, B_prod1_10 ;[ P,0]
|| [B_j]ADD .S1 A_sum0, A_prod0_10, A_sum0_0 ;[12,3]
|| [B_j]MPY .M1 A_prod0_10, 0, A_prod0_10 ;[ P,0]
|| [B_j]MV .D2 B_h32_n, B_h32 ;[22,0]
|| [B_j]ZERO .L1 A_sum1:A_sum0 ;[22,0]
|| [B_j]ZERO .L2 B_sum3:B_sum2 ;[22,0]
[A_i] BDEC .S1 LOOPI, A_i ;[17,1]
|| DOTP2 .M1X B_x54, A_h32, A_prod2_32 ;[ 5,1]
|| LDNDW .D1T1 *A_ptr_x--, A_x32:A_x10 ;[ 1,2]
|| ZERO .L1 A_prod2_10 ;[ P,0]
MV .D2X A_x32, B_x76 ;[ 6,1]
|| LDNDW .D1T2 *A_hptr--, B_h32:B_h10 ;[ 6,1]
|| DOTP2 .M1 A_x10, A_h10, A_prod0_10 ;[ 6,1]
|| PACKLH2 .S2 B_x76, B_x54, B_x65 ;[ 6,1]
|| PACKLH2 .L2X B_x54, A_x32, B_x43 ;[ 6,1]
|| PACKLH2 .S1 A_x32, A_x10, A_x21 ;[ 6,1]
|| ZERO .L1 A_prod2_32 ;[11,3]
;
* =========================== PIPE LOOP KERNEL ============================ *
; Inner-loop kernel: four execute packets per pass.  Each pass issues
; eight DOTP2s (two 16x16 products each) covering four coefficients for
; four outputs:
;   sum0 += x10.h10 + x32.h32      sum1 += x21.h10 + x43.h32
;   sum2 += x32.h10 + x54.h32      sum3 += x43.h10 + x65.h32
; A_hptr and A_ptr_x are post-decremented, so coefficients and input are
; consumed from the end of the filter towards h[0].  The loads, products,
; adds and the SHR/PACK2 output formatting in one pass belong to different
; pipeline stages, as shown by the ;[a,b] schedule tags.
LOOPI:
SHR .S2 B_sum2, 15, B_sum2_s ;[15,2]
|| SHR .S1 A_sum0, 15, A_sum0_s ;[15,2]
|| ADD .L2 B_sum3_0, B_prod3_32, B_sum3 ;[15,2]
|| ADD .D1X A_sum1, B_sum1_0, A_sum1 ;[15,2]
|| ADD .L1 A_prod2_10, A_prod2_32, A_sum2_0 ;[11,3]
|| MV .D2X A_x10, B_x54 ;[ 7,4]
|| DOTP2 .M2 B_x43, B_h32, B_prod1_32 ;[ 7,4]
|| DOTP2 .M1 A_x32, A_h10, A_prod2_10 ;[ 7,4]
SHR .S2 B_sum3, 15, B_sum3_s ;[16,2]
|| SHR .S1 A_sum1, 15, A_sum1_s ;[16,2]
|| ADD .D2 B_prod1_10, B_prod1_32, B_sum1_0 ;[12,3]
|| ADD .L1 A_sum0, A_prod0_10, A_sum0_0 ;[12,3]
|| DOTP2 .M2X A_x21, B_h10, B_prod1_10 ;[ 8,4]
|| DOTP2 .M1 A_x32, A_h32, A_prod0_32 ;[ 8,4]
|| LDNDW .D1T1 *A_hptr, A_h32:A_h10 ;[ 4,5]
[ A_i]BDEC .S1 LOOPI, A_i ;[17,2]
|| PACK2 .L1 A_sum1_s, A_sum0_s, A_r10 ;[17,2]
|| PACK2 .L2 B_sum3_s, B_sum2_s, B_r32 ;[17,2]
|| ADD .S2 B_sum3, B_prod3_10, B_sum3_0 ;[13,3]
|| ADD .D2X B_sum2, A_sum2_0, B_sum2 ;[13,3]
|| DOTP2 .M2 B_x43, B_h10, B_prod3_10 ;[ 9,4]
|| DOTP2 .M1X B_x54, A_h32, A_prod2_32 ;[ 5,5]
|| LDNDW .D1T1 *A_ptr_x--, A_x32:A_x10 ;[ 1,6]
ADD .L1 A_sum0_0, A_prod0_32, A_sum0 ;[14,3]
|| DOTP2 .M2 B_x65, B_h32, B_prod3_32 ;[10,4]
|| MV .D2X A_x32, B_x76 ;[ 6,5]
|| LDNDW .D1T2 *A_hptr--, B_h32:B_h10 ;[ 6,5]
|| DOTP2 .M1 A_x10, A_h10, A_prod0_10 ;[ 6,5]
|| PACKLH2 .S2 B_x76, B_x54, B_x65 ;[ 6,5]
|| PACKLH2 .L2X B_x54, A_x32, B_x43 ;[ 6,5]
|| PACKLH2 .S1 A_x32, A_x10, A_x21 ;[ 6,5]
* =========================== PIPE LOOP EPILOG ============================ *
; Drains the inner-loop pipeline and, in the same packets, prepares the
; next outer iteration:
;   - finishes the four sums, shifts each right by 15 and packs them into
;     A_r10 (samples 1:0) and B_r32 (samples 3:2),
;   - issues BDEC to LOOPJ while B_j is non-zero; the packets that follow
;     fill its delay slots and the branch takes effect after the final
;     STW packet (see the "Branch occurs" marker),
;   - issues RET one packet after the BDEC when B_j is 0,
;     NOTE(review): this presumably sees B_j after the BDEC decrement, so
;     on the last pass the fully predicated first packet of LOOPJ runs as
;     a no-op before the return takes effect - confirm against the ISA,
;   - reloads A_ptr_x = xptr + A_ofs, A_hptr = B_ptr, A_i = A_nh_l - 1 and
;     the saved coefficient words, clears the partial-sum registers and
;     advances A_ofs by 8 bytes,
;   - stores the two packed words through A_optr / B_optr, each
;     post-incremented by two words, and restores the saved CSR when
;     B_j is 0.
SHR .S2 B_sum2, 15, B_sum2_s ;[15,5]
|| SHR .S1 A_sum0, 15, A_sum0_s ;[15,5]
|| ADD .L2 B_sum3_0, B_prod3_32, B_sum3 ;[15,5]
|| ADD .D1X A_sum1, B_sum1_0, A_sum1 ;[15,5]
|| ADD .L1 A_prod2_10, A_prod2_32, A_sum2_0 ;[11,6]
SHR .S2 B_sum3, 15, B_sum3_s ;[16,5]
|| SHR .S1 A_sum1, 15, A_sum1_s ;[16,5]
|| ADD .D2 B_prod1_10, B_prod1_32, B_sum1_0 ;[12,6]
|| ADD .L1 A_sum0, A_prod0_10, A_sum0_0 ;[12,6]
PACK2 .L1 A_sum1_s, A_sum0_s, A_r10 ;[17,5]
|| PACK2 .L2 B_sum3_s, B_sum2_s, B_r32 ;[17,5]
|| ADD .D2X B_sum2, A_sum2_0, B_sum2 ;[13,6]
||[B_j] BDEC .S2 LOOPJ, B_j ;[ 3,0]
|| ADD .D1 A_xptr, A_ofs, A_ptr_x ;[17,0]
|| MV .S1X B_h10_n, A_h10 ;[22,0]
ADD .L1 A_sum0_0, A_prod0_32, A_sum0 ;[14,6]
|| ADD .D2 B_sum3, B_prod3_10, B_sum3_0 ;[13,6]
|| LDNDW .D1T2 *A_ptr_x--, B_x76:B_x54 ;[18,0]
|| MV .S1X B_h32_n, A_h32 ;[21,0]
||[!B_j]RET .S2 B_return ; return when no outer passes remain
SHR .S2 B_sum2, 15, B_sum2_s ;[15,6]
|| SHR .S1 A_sum0, 15, A_sum0_s ;[15,6]
|| ADD .L2 B_sum3_0, B_prod3_32, B_sum3 ;[15,6]
|| ADD .D1X A_sum1, B_sum1_0, A_sum1 ;[15,6]
|| SUB .L1 A_nh_l, 1, A_i ;[19,0]
SHR .S2 B_sum3, 15, B_sum3_s ;[16,6]
|| SHR .S1 A_sum1, 15, A_sum1_s ;[16,6]
||[B_j] LDNDW .D1T1 *A_ptr_x--, A_x32:A_x10 ;[ 1,1]
|| MV .L1X B_ptr, A_hptr ;[20,0]
|| MV .L2 B_h10_n, B_h10 ;[22,0]
|| MPY .M1 A_prod0_32, 0, A_prod0_32 ; clear prod0_32 via the .M unit
PACK2 .L1 A_sum1_s, A_sum0_s, A_r10 ;[17,6]
|| PACK2 .L2 B_sum3_s, B_sum2_s, B_r32 ;[17,6]
|| ZERO .S1 A_prod0_10 ;[ P,0]
|| MPY .M2 B_sum3_0, 0, B_sum3_0 ;[ P,0]
|| ZERO .S2 B_prod3_32 ;[ P,0]
|| ZERO .D2 B_sum1_0 ;[ P,0]
|| MPY .M1 A_sum2_0, 0, A_sum2_0 ;[13,3]
|| ADD .D1 A_ofs, 8, A_ofs ;[22,0]
STW .D2T2 B_r32, *B_optr++[2] ;[ 8,0]
|| STW .D1T1 A_r10, *A_optr++[2] ;[ 8,0]
||[!B_j]MVC .S2 B_csr, CSR ;[ E,0]
;==== Branch occurs