You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

330 lines
20 KiB

;* ======================================================================== *;
;* TEXAS INSTRUMENTS, INC. *;
;* *;
;* DSPLIB DSP Signal Processing Library *;
;* *;
;* Release: Revision 1.04b *;
;* CVS Revision: 1.5 Sun Sep 29 03:32:22 2002 (UTC) *;
;* Snapshot date: 23-Oct-2003 *;
;* *;
;* This library contains proprietary intellectual property of Texas *;
;* Instruments, Inc. The library and its source code are protected by *;
;* various copyrights, and portions may also be protected by patents or *;
;* other legal protections. *;
;* *;
;* This software is licensed for use with Texas Instruments TMS320 *;
;* family DSPs. This license was provided to you prior to installing *;
;* the software. You may review this license by consulting the file *;
;* TI_license.PDF which accompanies the files in this library. *;
;* ------------------------------------------------------------------------ *;
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
;* All Rights Reserved. *;
;* ======================================================================== *;
;* ======================================================================== *;
;* Assembler compatibility shim for assembling 4.30 and later code on *;
;* tools prior to 4.30. *;
;* ======================================================================== *;
; Capture the assembler version in $asmver. When the assembler does not
; define .ASSEMBLER_VERSION at all, $asmver is set to 0.
.if $isdefed(".ASSEMBLER_VERSION")
.asg .ASSEMBLER_VERSION, $asmver
.else
.asg 0, $asmver
.endif
; For $asmver below 430, map the newer call/return mnemonics onto plain
; branch instructions and make the function-marker directives expand to
; nothing, so the code below assembles unchanged.
.if ($asmver < 430)
.asg B, CALL ; Function call
.asg B, RET ; Return from a function
.asg B, CALLRET ; Function call with call/return chaining
; The BNOP-based aliases are only defined when .TMS320C6400 is true.
.if .TMS320C6400
.asg BNOP, CALLNOP ; C64x BNOP as a function call
.asg BNOP, RETNOP ; C64x BNOP as a function return
.asg BNOP, CRNOP ; C64x function call with call/return chaining via BNOP
.endif
.asg , .asmfunc ; .func equivalent for hand-assembly code (empty here)
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code (empty here)
.endif
;* ======================================================================== *;
;* End of assembler compatibility shim. *;
;* ======================================================================== *;
* ========================================================================= *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* DSP_fir_r8: FIR Filter (radix 8) *
* *
* REVISION DATE *
* 10-Aug-2001 *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void DSP_fir_r8 *
* ( *
* const short *restrict x, /* Input array [nr+nh-1 elements] */ *
* const short *restrict h, /* Coeff array [nh elements] */ *
* short *restrict r, /* Output array [nr elements] */ *
* int nh, /* Number of coefficients. */ *
* int nr /* Number of output samples. */ *
* ) *
* *
* ARGUMENTS PASSED *
* *x -> A4 *
* *h -> B4 *
* *r -> A6 *
* nh -> B6 *
* nr -> A8 *
* *
* DESCRIPTION *
* Computes a real FIR filter (direct-form) using coefficients *
* stored in vector h. The real data input is stored in vector x. *
* The filter output result is stored in vector r. Input data and *
* filter taps are 16-bit, with intermediate values kept at 32-bit *
* precision. Filter taps are expected in Q15 format. *
* *
* The following is a natural C implementation with no restrictions. *
* This version has restrictions as noted in the ASSUMPTIONS below. *
* *
* void DSP_fir_r8 *
* ( *
* const short *restrict x, *
* const short *restrict h, *
* short *restrict r, *
* int nh, *
* int nr *
* ) *
* { *
* int i, j, sum; *
* *
* for (j = 0; j < nr; j++) *
* { *
* sum = 0; *
* for (i = 0; i < nh; i++) *
* sum += x[i + j] * h[i]; *
* r[j] = sum >> 15; *
* } *
* } *
* *
* ASSUMPTIONS *
* Number of taps: 'nh' >= 8, multiple of 8 *
* Number of samples: 'nr' >= 4, multiple of 4 *
* Array 'r' is word aligned. *
* *
* NOTES *
* This function blocks interrupts for its entire duration. It is *
* interrupt tolerant, but not interruptible. *
* *
* MEMORY NOTE *
* No memory bank hits under any conditions. *
* This code is a LITTLE ENDIAN implementation. *
* *
* TECHNIQUES *
* 1. Load double word instruction is used to simultaneously load *
* four values in a single clock cycle. *
* *
* 2. The inner loop is unrolled four times, so the routine always *
* computes a multiple of 4 output samples (nr is rounded up). *
* *
* 3. The outer loop is conditionally executed in parallel with the *
* inner loop. This allows for a zero overhead outer loop. *
* *
* CYCLES *
* nh * nr/4 + 17 *
* *
* For nh = 32 and nr = 36, cycles = 305. *
* *
* CODESIZE *
* 336 bytes. *
* *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
; ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== ;
; Aliases are grouped by role rather than by register number. Several
; names deliberately share one physical register (for example A26, A27,
; B26, B27, A28/A29, B28/B29, A31, B31).
;
; ---- Pointers, sizes and rewind offsets --------------------------------- ;
.asg A4, A_x_ptr ; x: input sample pointer
.asg B4, B_h_ptr ; h: coefficient pointer
.asg A6, A_r_ptr ; output pointer, starts at r[0]
.asg B7, B_r_ptr ; output pointer, starts at r[2]
.asg B6, B_nh ; nh: number of coefficients
.asg A8, A_nr ; nr: number of output samples
.asg A9, A_x_offset ; halfwords subtracted from A_x_ptr on reset
.asg B9, B_h_offset ; halfwords subtracted from B_h_ptr on reset
; ---- Loop control -------------------------------------------------------- ;
.asg A0, A_i ; inner count; 0 => reset pointers, reload
.asg A1, A_i1 ; A_i delayed by one copy
.asg B1, B_i2 ; A_i1 delayed again; 0 => store + clear sums
.asg B0, B_j ; remaining kernel passes
.asg A2, A_s ; nonzero => accumulation adds are skipped
; ---- Input samples ------------------------------------------------------- ;
.asg B16, B_x10 ; x[1]:x[0]
.asg B17, B_x32 ; x[3]:x[2]
.asg A16, A_x54 ; x[5]:x[4]
.asg A17, A_x76 ; x[7]:x[6]
.asg B18, B_x98 ; x[9]:x[8]
.asg B19, B_xba ; x[11]:x[10]
.asg B5, B_x18 ; x[1]:x[8] (repacked)
.asg B8, B_x3a ; x[3]:x[10] (repacked)
; ---- Coefficients -------------------------------------------------------- ;
.asg A20, A_h10 ; h[1]:h[0]
.asg A21, A_h32 ; h[3]:h[2]
.asg A22, A_h54 ; h[5]:h[4]
.asg A23, A_h76 ; h[7]:h[6]
.asg B20, B_h0X ; h[0]:h[-1]
.asg B21, B_h21 ; h[2]:h[1]
.asg B22, B_h43 ; h[4]:h[3]
.asg B23, B_h65 ; h[6]:h[5]
.asg B30, B_h07 ; h[0]:h[7] (repacked)
; ---- Dot products feeding sum0 ------------------------------------------- ;
.asg A28, A_prod0_10 ; x[1]*h[1] + x[0]*h[0]
.asg A26, A_prod0_32 ; x[3]*h[3] + x[2]*h[2]
.asg A26, A_prod0_54 ; x[5]*h[5] + x[4]*h[4]
.asg A26, A_prod0_76 ; x[7]*h[7] + x[6]*h[6]
; ---- Dot products feeding sum2 ------------------------------------------- ;
.asg A29, A_prod2_10 ; x[3]*h[1] + x[2]*h[0]
.asg A27, A_prod2_32 ; x[5]*h[3] + x[4]*h[2]
.asg A27, A_prod2_54 ; x[7]*h[5] + x[6]*h[4]
.asg A27, A_prod2_76 ; x[9]*h[7] + x[8]*h[6]
; ---- Dot products feeding sum1 ------------------------------------------- ;
.asg B26, B_prod1_21 ; x[3]*h[2] + x[2]*h[1]
.asg B28, B_prod1_43 ; x[5]*h[4] + x[4]*h[3]
.asg B26, B_prod1_65 ; x[7]*h[6] + x[6]*h[5]
.asg B26, B_prod1_07 ; x[1]*h[0] + x[8]*h[7]
; ---- Dot products feeding sum3 ------------------------------------------- ;
.asg B27, B_prod3_21 ; x[5]*h[2] + x[4]*h[1]
.asg B29, B_prod3_43 ; x[7]*h[4] + x[6]*h[3]
.asg B27, B_prod3_65 ; x[9]*h[6] + x[8]*h[5]
.asg B27, B_prod3_07 ; x[3]*h[0] + x[10]*h[7]
; ---- Partial sums and accumulators --------------------------------------- ;
.asg A28, A_sum0a ; partial sum for sum0
.asg A29, A_sum2a ; partial sum for sum2
.asg B28, B_sum1a ; partial sum for sum1
.asg B29, B_sum3a ; partial sum for sum3
.asg A24, A_sum0 ; accumulator sum0
.asg A25, A_sum2 ; accumulator sum2
.asg B24, B_sum1 ; accumulator sum1
.asg B25, B_sum3 ; accumulator sum3
; ---- Outputs ------------------------------------------------------------- ;
.asg A30, A_r0 ; r[0]
.asg A31, A_r1 ; r[1]
.asg A31, A_r10 ; packed r[1]:r[0]
.asg A5, A_r2 ; r[2]
.asg B31, B_r3 ; r[3]
.asg B31, B_r32 ; packed r[3]:r[2]
; ------------------------------------------------------------------------- ;
; _DSP_fir_r8: FIR filter. Each kernel pass consumes eight coefficients and
; accumulates four output sums (sum0..sum3) side by side.
;   In : A4 = x, B4 = h, A6 = r, B6 = nh, A8 = nr
;   Out: packed 16-bit results stored via A_r_ptr / B_r_ptr; returns via B3
; The code is hand-scheduled: the order of instructions and their parallel
; (||) grouping are significant and must not be rearranged.
; ------------------------------------------------------------------------- ;
.sect ".text:_fir_r8"
.global _DSP_fir_r8
_DSP_fir_r8:
; ======================= SETUP / LOOP PIPE-UP CODE ======================= ;
LDNDW .D1T2 *A_x_ptr++,B_x32:B_x10 ; load input: x[3:2]:x[1:0]
|| ADD .L1 3,A_nr,A_nr ; A_nr = nr + 3 (round-up bias for >>2)
LDNDW .D2T1 *B_h_ptr++,A_h32:A_h10 ; load coef: h[3:2]:h[1:0]
LDNDW .D2T2 *-B_h_ptr(10),B_h21:B_h0X ; load coef: h[2:1]:h[0:-1]
|| SHR .S2X A_nr,2,B_j ; B_j = (nr+3)>>2 = count of 4-output groups
LDNDW .D1T1 *A_x_ptr++,A_x76:A_x54 ; load input: x[7:6]:x[5:4]
|| MPY .M2 B_j,B_nh,B_j ; B_j = groups * nh
|| ADD .L1X -4,B_nh,A_x_offset ; A_x_offset = nh - 4: halfwords to rewind x_ptr
LDNDW .D2T2 *-B_h_ptr(2),B_h65:B_h43 ; load coef: h[6:5]:h[4:3]
|| ADD .S1X -8,B_nh,A_i ; A_i = nh - 8: inner (tap) countdown
|| ZERO .L1 A_sum2:A_sum0 ; clear sum2:sum0
|| MVK .D1 1,A_s ; A_s = 1: [!A_s] accumulates stay off for now
LDNDW .D1T2 *A_x_ptr,B_xba:B_x98 ; load input: x[11:10]:x[9:8]
|| MV .L2 B_nh,B_h_offset ; B_h_offset = nh: halfwords to rewind h_ptr
|| SHR .S2 B_j,3,B_j ; B_j = (groups * nh) >> 3: kernel pass count
|| B .S1 dint0 ; branch to dint0 (label inside the kernel)
LDNDW .D2T1 *B_h_ptr++,A_h76:A_h54 ; load coef: h[7:6]:h[5:4]
||[!A_i]SUBAH .D1 A_x_ptr,A_x_offset,A_x_ptr ; A_i == 0: rewind x_ptr
|| ZERO .L2 B_sum3:B_sum1 ; clear sum3:sum1
ADD .L2X 4,A_r_ptr,B_r_ptr ; B_r_ptr = r_ptr + 4 bytes (-> r[2])
|| MVK .S1 1,A_i1 ; A_i1 = 1: delayed inner count starts nonzero
; ========================== "JLOOP" LOOP KERNEL ========================== ;
jloop:
LDNDW .D1T2 *A_x_ptr++,B_x32:B_x10 ; load input: x[3:2]:x[1:0]
||[!A_i]SUBAH .D2 B_h_ptr,B_h_offset,B_h_ptr ; A_i == 0: rewind h_ptr
|| DOTP2 .M1X A_h32,B_x32,A_prod0_32 ; x[3]*h[3] + x[2]*h[2]
|| DOTP2 .M2X B_h21,A_x54,B_prod3_21 ; x[5]*h[2] + x[4]*h[1]
||[!A_s]ADD .L1 A_sum0,A_prod0_54,A_sum0 ; sum0 += A_prod0_54
||[!A_s]ADD .L2 B_sum1,B_prod1_65,B_sum1 ; sum1 += B_prod1_65
||[!A_s]ADD .S1 A_sum2,A_sum2a,A_sum2 ; sum2 += sum2a (prod2_10 + prod2_32)
||[!A_s]ADD .S2 B_sum3,B_sum3a,B_sum3 ; sum3 += sum3a (prod3_21 + prod3_43)
LDNDW .D2T1 *B_h_ptr++,A_h32:A_h10 ; load coef: h[3:2]:h[1:0]
|| DOTP2 .M1 A_x54,A_h32,A_prod2_32 ; x[5]*h[3] + x[4]*h[2]
|| DOTP2 .M2 B_x32,B_h21,B_prod1_21 ; x[3]*h[2] + x[2]*h[1]
||[!A_s]ADD .L1 A_sum2,A_prod2_54,A_sum2 ; sum2 += A_prod2_54
||[!A_s]ADD .L2 B_sum3,B_prod3_65,B_sum3 ; sum3 += B_prod3_65
||[!A_i]MV .S1X B_nh,A_i ; A_i == 0: reload inner count with nh
|| MV .D1 A_i,A_i1 ; delayed i_cnt
|| MV .S2X A_i1,B_i2 ; 2nd delayed i_cnt
LDNDW .D2T2 *-B_h_ptr(10),B_h21:B_h0X ; load coef: h[2:1]:h[0:-1]
|| DOTP2 .M1X A_h10,B_x10,A_prod0_10 ; x[1]*h[1] + x[0]*h[0]
|| DOTP2 .M2X B_h43,A_x54,B_prod1_43 ; x[5]*h[4] + x[4]*h[3]
||[!A_s]ADD .L1 A_sum0,A_prod0_76,A_sum0 ; sum0 += A_prod0_76
||[!A_s]ADD .L2 B_sum1,B_prod1_07,B_sum1 ; sum1 += B_prod1_07
|| SUB .D1 A_i,8,A_i ; i_cnt -= 8
||[ B_j]B .S1 jloop ; passes remain: branch back to jloop
||[!B_j]RET .S2 B3 ; no passes remain: return to calling program
dint0: DOTP2 .M1X A_h10,B_x32,A_prod2_10 ; x[3]*h[1] + x[2]*h[0]
|| DOTP2 .M2X B_h43,A_x76,B_prod3_43 ; x[7]*h[4] + x[6]*h[3]
||[ B_j]LDNDW .D1T1 *A_x_ptr++,A_x76:A_x54 ; load input: x[7:6]:x[5:4]
||[!A_s]ADD .L1 A_sum2,A_prod2_76,A_sum2 ; sum2 += A_prod2_76
||[!A_s]ADD .D2 B_sum3,B_prod3_07,B_sum3 ; sum3 += B_prod3_07
|| SHR .S1 A_sum0,15,A_r0 ; r[0] = sum0 >>15
|| PACKHL2 .L2 B_x32,B_xba,B_x3a ; B_x3a = x[3]:x[10] (hi of x32, lo of xba)
|| PACKHL2 .S2 B_x10,B_x98,B_x18 ; B_x18 = x[1]:x[8] (hi of x10, lo of x98)
[ B_j]LDNDW .D2T2 *-B_h_ptr(2),B_h65:B_h43 ; load coef: h[6:5]:h[4:3]
|| DOTP2 .M1 A_x54,A_h54,A_prod0_54 ; x[5]*h[5] + x[4]*h[4]
|| DOTP2 .M2X B_h65,A_x76,B_prod1_65 ; x[7]*h[6] + x[6]*h[5]
|| SHR .S1X B_sum1,15,A_r1 ; r[1] = sum1 >>15
|| ADD .L1 A_sum2,A_sum2,A_r2 ; A_r2 = sum2 * 2; its upper half is sum2 >>15
|| ADD .S2 B_sum3,B_sum3,B_r3 ; B_r3 = sum3 * 2; its upper half is sum3 >>15
||[ A_s]ZERO .D1 A_s ; clear A_s so the [!A_s] accumulates run
||[!B_i2]ZERO .L2 B_sum3:B_sum1 ; B_i2 == 0: clear sum3:sum1
[ B_j]LDNDW .D1T2 *A_x_ptr,B_xba:B_x98 ; load input: x[11:10]:x[9:8]
|| DOTP2 .M1 A_x76,A_h54,A_prod2_54 ; x[7]*h[5] + x[6]*h[4]
|| DOTP2 .M2 B_x98,B_h65,B_prod3_65 ; x[9]*h[6] + x[8]*h[5]
||[!B_i2]ZERO .L1 A_sum2:A_sum0 ; B_i2 == 0: clear sum2:sum0
|| PACKH2 .L2X B_h0X,A_h76,B_h07 ; B_h07 = h[0]:h[7] (upper halves)
||[ B_j]SUB .D2 B_j,1,B_j ; j_cnt -= 1 (only while nonzero)
||[ A_s]B .S2 dint0 ; branch to dint0, taken only while A_s != 0
[ B_j]LDNDW .D2T1 *B_h_ptr++,A_h76:A_h54 ; load coef: h[7:6]:h[5:4]
|| DOTP2 .M1 A_x76,A_h76,A_prod0_76 ; x[7]*h[7] + x[6]*h[6]
|| DOTP2 .M2 B_x18,B_h07,B_prod1_07 ; x[1]*h[0] + x[8]*h[7]
|| ADD .L1 A_prod0_10,A_prod0_32,A_sum0a ; sum0a = A_prod0_10+0_32
|| ADD .L2 B_prod1_21,B_prod1_43,B_sum1a ; sum1a = B_prod1_21+1_43
||[!A_i]SUBAH .D1 A_x_ptr,A_x_offset,A_x_ptr ; A_i == 0: rewind x_ptr
|| PACK2 .S1 A_r1,A_r0,A_r10 ; r[1:0] from lower halves of A_r1, A_r0
|| PACKH2 .S2X B_r3,A_r2,B_r32 ; r[3:2] from upper halves of B_r3, A_r2
[!B_i2]STW .D1T1 A_r10,*A_r_ptr++[2] ; B_i2 == 0: store output r[1:0]
||[!B_i2]STW .D2T2 B_r32,*B_r_ptr++[2] ; B_i2 == 0: store output r[3:2]
|| DOTP2 .M1X A_h76,B_x98,A_prod2_76 ; x[9]*h[7] + x[8]*h[6]
|| DOTP2 .M2 B_x3a,B_h07,B_prod3_07 ; x[3]*h[0] + x[10]*h[7]
|| ADD .L1 A_prod2_10,A_prod2_32,A_sum2a ; sum2a = A_prod2_10+2_32
|| ADD .L2 B_prod3_21,B_prod3_43,B_sum3a ; sum3a = B_prod3_21+3_43
|| ADD .S1 A_sum0,A_sum0a,A_sum0 ; sum0 += sum0a (prod0_10 + prod0_32)
|| ADD .S2 B_sum1,B_sum1a,B_sum1 ; sum1 += sum1a (prod1_21 + prod1_43)
; ============================ END OF "JLOOP" ============================= ;
; ============================= BRANCH OCCURS ============================= ;
* ========================================================================= *
* End of file: dsp_fir_r8.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *