You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

307 lines
19 KiB

;* ======================================================================== *;
;* TEXAS INSTRUMENTS, INC. *;
;* *;
;* DSPLIB DSP Signal Processing Library *;
;* *;
;* Release: Revision 1.04b *;
;* CVS Revision: 1.7 Sun Sep 29 03:32:22 2002 (UTC) *;
;* Snapshot date: 23-Oct-2003 *;
;* *;
;* This library contains proprietary intellectual property of Texas *;
;* Instruments, Inc. The library and its source code are protected by *;
;* various copyrights, and portions may also be protected by patents or *;
;* other legal protections. *;
;* *;
;* This software is licensed for use with Texas Instruments TMS320 *;
;* family DSPs. This license was provided to you prior to installing *;
;* the software. You may review this license by consulting the file *;
;* TI_license.PDF which accompanies the files in this library. *;
;* ------------------------------------------------------------------------ *;
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
;* All Rights Reserved. *;
;* ======================================================================== *;
;* ======================================================================== *;
;* Assembler compatibility shim for assembling 4.30 and later code on *;
;* tools prior to 4.30. *;
;* ======================================================================== *;
; Resolve the assembler version into the substitution symbol $asmver.
; When .ASSEMBLER_VERSION is not defined, $asmver is set to 0, so the
; "< 430" test below is true and the fallback aliases are installed.
.if $isdefed(".ASSEMBLER_VERSION")
.asg .ASSEMBLER_VERSION, $asmver
.else
.asg 0, $asmver
.endif
; Only when $asmver < 430: alias the call/return mnemonics to plain branch
; mnemonics and make .asmfunc / .endasmfunc expand to an empty string.
.if ($asmver < 430)
.asg B, CALL ; Function Call
.asg B, RET ; Return from a Function
.asg B, CALLRET ; Function call with Call / Ret chaining.
; The BNOP-based aliases are only defined when .TMS320C6400 is true.
.if .TMS320C6400
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
.asg BNOP, CRNOP ; C64x Fn call w/ Call/Ret chaining via BNOP.
.endif
.asg , .asmfunc ; .func equivalent for hand-assembly code
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
.endif
;* ======================================================================== *;
;* End of assembler compatibility shim. *;
;* ======================================================================== *;
* ========================================================================= *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* DSP_fir_r4: FIR Filter (radix 4) *
* *
* *
* REVISION DATE *
* 10-Aug-2001 *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void DSP_fir_r4 *
* ( *
* const short *restrict x, /* Input array [nr+nh-1 elements] */ *
* const short *restrict h, /* Coeff array [nh elements] */ *
* short *restrict r, /* Output array [nr elements] */ *
* int nh, /* Number of coefficients. */ *
* int nr /* Number of output samples. */ *
* ) *
* *
* ARGUMENTS PASSED *
* *x -> A4 *
* *h -> B4 *
* *r -> A6 *
* nh -> B6 *
* nr -> A8 *
* *
* DESCRIPTION *
* Computes a real FIR filter (direct-form) using coefficients *
* stored in vector h. The real data input is stored in vector x. *
* The filter output result is stored in vector r. Input data and *
* filter taps are 16-bit, with intermediate values kept at 32-bit *
* precision. Filter taps are expected in Q15 format. *
* *
* The following is a natural C implementation with no restrictions. *
* This version has restrictions as noted in the ASSUMPTIONS below. *
* *
* void DSP_fir_r4 *
* ( *
* const short *restrict x, *
* const short *restrict h, *
* short *restrict r, *
* int nh, *
* int nr *
* ) *
* { *
* int i, j, sum; *
* *
* for (j = 0; j < nr; j++) *
* { *
* sum = 0; *
* for (i = 0; i < nh; i++) *
* sum += x[i + j] * h[i]; *
* r[j] = sum >> 15; *
* } *
* } *
* *
* ASSUMPTIONS *
* Number of taps: 'nh' >= 8, multiple of 4 *
* Number of samples: 'nr' >= 4, multiple of 4 *
* *
* NOTES *
* This function blocks interrupts for its entire duration. It is *
* interrupt tolerant, but not interruptible. *
* *
* MEMORY NOTE *
* No memory bank hits under any conditions. *
* This code is a LITTLE ENDIAN implementation *
* *
* TECHNIQUES *
* 1. Load double word instruction is used to simultaneously *
* load four values in a single clock cycle. *
* 2. The inner loop is unrolled four times *
* *
* CYCLES *
* (8 + nh) * nr/4 + 9 *
* *
* For nh = 12 and nr = 12, cycles = 69 *
* *
* CODESIZE *
* 308 bytes. *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
; Selects section ".data:copyright_h".  Only .asg directives follow before
; the next .sect, so nothing visible here is placed into this section.
.sect ".data:copyright_h"
; ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== ;
; Naming: the A_ / B_ prefix gives the register file.  Several names share
; one physical register on purpose (A_sum0a/A_sum0b, A_sum1a/A_sum1b,
; B_sum2a/B_sum2b, B_sum3a/B_sum3b, B_h_ptr1/B_r10, B_r32/B_x32).
; Index ranges such as x[1:0] name the two 16-bit values packed in a register.
.asg A0, A_s ; first-pass flag: set to 1; while nonzero the [!A_s] ADDs are skipped
.asg A1, A_i ; inner loop count, loaded with nh>>2
.asg A2, A_rj ; outer loop count j_cnt = ((nr+3)>>2) - 1, counted down
.asg A4, A_x_ptr ; input array pointer
.asg A6, A_r_ptr ; output array pointer
.asg A7, A_nh_x ; ADDAD offset that resets x_ptr: = -1 - (nh>>2)
.asg A8, A_nr ; nr on entry, then nr+3
.asg A16, A_h10 ; coef: h[1:0]
.asg A17, A_h32 ; coef: h[3:2]
.asg A18, A_x10 ; input: x[1:0]
.asg A19, A_x32 ; input: x[3:2]
.asg A20, A_x21 ; input: x[2:1]
.asg A21, A_prod00_11; = x[0]*h[0] + x[1]*h[1]
.asg A22, A_prod22_33; = x[2]*h[2] + x[3]*h[3]
.asg A23, A_prod32_43; = x[3]*h[2] + x[4]*h[3]
.asg A24, A_prod10_21; = x[1]*h[0] + x[2]*h[1]
.asg A26, A_sum0a ; = sum0b + x[2]*h[2] + x[3]*h[3]
.asg A26, A_sum0b ; = sum0a + x[0]*h[0] + x[1]*h[1]
.asg A27, A_sum1a ; = sum1b + x[1]*h[0] + x[2]*h[1]
.asg A27, A_sum1b ; = sum1a + x[3]*h[2] + x[4]*h[3]
.asg A25, A_sum0 ; = sum0a >>15 for r[0]
.asg A28, A_sum1 ; = sum1a >>15 for r[1] (not referenced by the code; B_sum1 is used)
.asg B0, B_j ; outer loop countdown; when 0 the [!B_j] pointer resets and branch fire
.asg B4, B_h_ptr ; coef array pointer
.asg B5, B_iloop ; iloop label address
.asg B6, B_nh ; nh on entry, then nh>>2; initial value for B_j if A_rj != 0
.asg B7, B_nh_h ; ADDAD offset that resets h_ptr: = -2 - (nh>>2)
.asg B8, B_h_ptr1 ; temp coef array ptr: point to h[nh] (not referenced by the code)
.asg B8, B_r10 ; output: r[1:0]
.asg B9, B_r32 ; output: r[3:2]
.asg B9, B_x32 ; input: x[3:2] (B-side copy of A_x32)
.asg B16, B_h10 ; coef: h[1:0]
.asg B17, B_h32 ; coef: h[3:2]
.asg B18, B_x54 ; input: x[5:4]
.asg B19, B_x76 ; input: x[7:6]
.asg B20, B_x43 ; input: x[4:3]
.asg B21, B_x65 ; input: x[6:5]
.asg B22, B_prod42_53; = x[4]*h[2] + x[5]*h[3]
.asg B23, B_prod20_31; = x[2]*h[0] + x[3]*h[1]
.asg B24, B_prod30_41; = x[3]*h[0] + x[4]*h[1]
.asg B25, B_prod52_63; = x[5]*h[2] + x[6]*h[3]
.asg B26, B_sum2a ; = sum2b + x[2]*h[0] + x[3]*h[1]
.asg B26, B_sum2b ; = sum2a + x[4]*h[2] + x[5]*h[3]
.asg B27, B_sum3a ; = sum3b + x[3]*h[0] + x[4]*h[1]
.asg B27, B_sum3b ; = sum3a + x[5]*h[2] + x[6]*h[3]
.asg B28, B_sum0 ; = A_sum0 (not referenced by the code)
.asg B29, B_sum1 ; = sum1a >>15 for r[1]
.asg B30, B_sum2 ; = sum2a <<1; PACKH2 takes its high half as r[2]
.asg B31, B_sum3 ; = sum3a <<1; PACKH2 takes its high half as r[3]
; ------------------------------------------------------------------------- ;
;  _DSP_fir_r4 : global entry point, assembled into ".text:_fir_r4".
;
;  Registers read on entry (see the symbolic register assignments):
;      A4 = x (A_x_ptr)    B4 = h (B_h_ptr)    A6 = r (A_r_ptr)
;      B6 = nh (B_nh)      A8 = nr (A_nr)
;  Exit: branches to the address in B3 once A_rj has counted down to 0.
;
;  Layout of the code below:
;    * setup   - 7 execute packets: pre-loads x and h with LDNDW, derives
;                the loop counters and pointer-reset offsets, clears the
;                four accumulators.
;    * iloop   - 4 execute packets containing eight DOTP2; the products are
;                accumulated into A_sum0a, A_sum1a, B_sum2a and B_sum3a.
;    * epilogue- 4 execute packets: scales the four sums, packs them into
;                B_r32:B_r10, stores them with one STNDW and re-initialises
;                counters/accumulators for the next group of 4 outputs.
;
;  NOTE(review): each [!A_s]ADD reads the product register that a DOTP2 in
;  the same packet also writes; this presumably relies on the DOTP2 result
;  arriving later, so the ADD sees the previous pass's product.  A_s = 1
;  suppresses the ADDs until the first products exist -- confirm against
;  the C64x instruction latencies before rescheduling anything here.
;  NOTE(review): the unconditional "B iloop" in the setup and the
;  "[A_rj]B iloop" in the epilogue appear to act as the loop-back branch
;  of the first kernel pass of a group, while "[!B_j]B B_iloop" inside the
;  kernel re-enters iloop after the epilogue -- TODO confirm.
; ------------------------------------------------------------------------- ;
.sect ".text:_fir_r4"
.global _DSP_fir_r4
_DSP_fir_r4:
; ======================= SETUP / LOOP PIPE-UP CODE ======================= ;
; The branch to dint0 targets a packet that is reached sequentially anyway;
; its stated purpose is to "protect" the setup (see NOTES in the file header).
LDNDW .D1T2 *++A_x_ptr,B_x76:B_x54 ; load B_x76:x54 (x_ptr pre-incremented)
|| ADD .L1 3,A_nr,A_nr ; nr + 3
|| SHR .S2 B_nh,2,B_nh ; B_nh = nh>>2
|| B .S1 dint0 ; protect setup code
LDNDW .D1T1 *-A_x_ptr[1],A_x32:A_x10 ; load A_x32:x10 (doubleword before x_ptr)
|| SHR .S1 A_nr,2,A_rj ; j_cnt = (nr+3)>>2
LDNDW .D2T1 *B_h_ptr,A_h32:A_h10 ; load A_h32:h10
|| SUB .L1 A_rj,1,A_rj ; j_cnt -= 1
LDNDW .D2T2 *B_h_ptr++,B_h32:B_h10 ; load B_h32:h10, then advance h_ptr
||[A_rj]MV .S2 B_nh,B_j ; jloop cnt = nh>>2 when more groups follow
|| MV .S1X B_nh,A_i ; i_cnt = nh>>2
|| SUB .L2 -2,B_nh,B_nh_h ; h_ptr reset offset = -2 - (nh>>2)
LDNDW .D1T2 *++A_x_ptr,B_x76:B_x54 ; load B_x1110:x98
|| MVK .S1 1,A_s ; A_s = 1: skip the accumulate ADDs at first
|| ADDKPC .S2 iloop,B_iloop,0 ; store iloop addr to reg
|| SUB .L1X -1,B_nh,A_nh_x ; x_ptr reset offset = -1 - (nh>>2)
LDNDW .D1T1 *-A_x_ptr[1],A_x32:A_x10 ; load A_x76:x54
|| B .S1 iloop ; unconditional branch to the kernel label
|| ZERO .L2 B_sum3a:B_sum2a ; clear the accumulators for r[3], r[2]
dint0: LDNDW .D2T1 *B_h_ptr,A_h32:A_h10 ; load A_h76:h54
|| PACKLH2 .S2X B_x54,A_x32,B_x43 ; @ input: x[4:3]
|| ZERO .L1 A_sum1a:A_sum0a ; clear the accumulators for r[1], r[0]
||[!A_rj]ADD .L2 5,B_nh,B_j ; if last jloop, B_j = (nh>>2) + 5
; ========================== "ILOOP" LOOP KERNEL ========================== ;
; Kernel packet 1 of 4.
iloop:
DOTP2 .M1 A_x10,A_h10,A_prod00_11 ; @ x[0]*h[0] + x[1]*h[1]
||[!A_s]ADD .S1 A_sum0a,A_prod00_11,A_sum0b ; x[0]*h[0] + x[1]*h[1]
|| DOTP2 .M2X B_x54,A_h32,B_prod42_53 ; @ x[4]*h[2] + x[5]*h[3]
||[!A_s]ADD .L2 B_sum2a,B_prod42_53,B_sum2b ; x[4]*h[2] + x[5]*h[3]
|| PACKLH2 .S2 B_x76,B_x54,B_x65 ; @ input: x[6:5]
|| LDNDW .D2T2 *B_h_ptr++,B_h32:B_h10 ; load coef: h32:h10
|| PACKLH2 .L1 A_x32,A_x10,A_x21 ; input: x[2:1]
||[!B_j]ADDAD .D1 A_x_ptr,A_nh_x,A_x_ptr ; reset the input pointer
; Kernel packet 2 of 4.
DOTP2 .M1 A_x32,A_h32,A_prod22_33 ; x[2]*h[2] + x[3]*h[3]
||[!A_s]ADD .L1 A_sum0b,A_prod22_33,A_sum0a ; x[2]*h[2] + x[3]*h[3]
|| DOTP2 .M2 B_x65,B_h32,B_prod52_63 ; x[5]*h[2] + x[6]*h[3]
||[!A_s]ADD .S2 B_sum3a,B_prod52_63,B_sum3b ; x[5]*h[2] + x[6]*h[3]
|| LDNDW .D1T2 *++A_x_ptr,B_x76:B_x54 ; load input: x76:x54
||[!B_j]ADDAD .D2 B_h_ptr,B_nh_h,B_h_ptr ; reset the coef pointer
||[ A_i]SUB .S1 A_i,1,A_i ; @ inner loop count down (stops at 0)
|| MV .L2X A_x32,B_x32 ; B-side copy of x[3:2]
; Kernel packet 3 of 4: both loop branches are issued here.
DOTP2 .M2 B_x32,B_h10,B_prod20_31 ; x[2]*h[0] + x[3]*h[1]
||[!A_s]ADD .L2 B_sum2b,B_prod20_31, B_sum2a; x[2]*h[0] + x[3]*h[1]
|| DOTP2 .M1X B_x43,A_h32,A_prod32_43 ; x[3]*h[2] + x[4]*h[3]
||[!A_s]ADD .L1 A_sum1a,A_prod32_43,A_sum1b ; x[3]*h[2] + x[4]*h[3]
||[ A_i]B .S1 iloop ; inner loop branch
||[!B_j]B .S2 B_iloop ; outer loop branch (register target = iloop)
|| LDNDW .D1T1 *-A_x_ptr[1],A_x32:A_x10 ; load input: x32:x10
|| SUB .D2 B_j,1,B_j ; count for outer loop
; Kernel packet 4 of 4: clears A_s so later passes accumulate.
DOTP2 .M1X A_x21,B_h10,A_prod10_21 ; x[1]*h[0] + x[2]*h[1]
||[!A_s]ADD .D1 A_sum1b,A_prod10_21,A_sum1a ; x[1]*h[0] + x[2]*h[1]
|| DOTP2 .M2 B_x43,B_h10,B_prod30_41 ; x[3]*h[0] + x[4]*h[1]
||[!A_s]ADD .L2 B_sum3b,B_prod30_41,B_sum3a ; x[3]*h[0] + x[4]*h[1]
|| LDNDW .D2T1 *B_h_ptr,A_h32:A_h10 ; load coef: h32:h10
||[ A_s]ZERO .L1 A_s ; @flag for sum0a,1a,2a,3a
|| PACKLH2 .S2X B_x54,A_x32,B_x43 ; @ input: x[4:3]
; ========================= END OF "ILOOP" KERNEL ========================= ;
; Group epilogue, packet 1 of 4.  Lines marked "p" prepare the next group.
; r[0]/r[1] use (sum >> 15) with the low halves packed by PACK2; r[2]/r[3]
; use (sum << 1) with the high halves packed by PACKH2.
SHR .S1 A_sum0a,15,A_sum0 ; = sum0a >>15 for r[0]
|| ADD .L2 B_sum2a,B_sum2a,B_sum2 ; = sum2a <<1 for r[2]
||[A_rj]LDNDW .D2T2 *B_h_ptr++,B_h32:B_h10 ; p load coef: B_h32:h10 via B_h_ptr
||[!A_rj]RET .S2 B3 ; no groups left: branch to B3
; Group epilogue, packet 2 of 4.
[A_rj]LDNDW .D1T2 *++A_x_ptr,B_x76:B_x54 ; p load input: x[7:6]:x[5:4]
|| SHR .S2X A_sum1a,15,B_sum1 ; = sum1a >>15 for r[1]
|| ADD .D2 B_sum3a,B_sum3a,B_sum3 ; = sum3a <<1 for r[3]
|| MV .S1X B_nh,A_i ; initialize inner loop count
|| ZERO .L2 B_sum3a:B_sum2a ; p initialize B_sum3a,2a
; Group epilogue, packet 3 of 4.
PACK2 .L2X B_sum1,A_sum0,B_r10 ; r[1:0]
|| PACKH2 .S2 B_sum3,B_sum2,B_r32 ; r[3:2]
||[A_rj]LDNDW .D2T1 *B_h_ptr,A_h32:A_h10 ; p load coef: h[3:2]:h[1:0]
|| MVD .M1X B_x76,A_x32 ; p copy B_x76 to A_x32: input x[3:2]
|| MVK .D1 1,A_s ; p flag for accum sum0a..3a
||[A_rj]B .S1 iloop ; p branch to iloop while groups remain
||[A_rj]SUB .L1 A_rj,1,A_rj ; count down for outer loop
; Group epilogue, packet 4 of 4: the only store, four 16-bit results.
STNDW .D1 B_r32:B_r10,*A_r_ptr++ ; p store output r[3:2]:r[1:0]
|| MVD .M1X B_x54,A_x10 ; p copy B_x54 to A_x10: input x[1:0]
||[A_rj]MV .S2 B_nh,B_j ; initialize outer loop count: nh>>2
||[!A_rj]ADD .D2 5,B_nh,B_j ; last group: outer loop count = (nh>>2) + 5
|| ZERO .L1 A_sum1a:A_sum0a ; p initialize A_sum1a,0a
|| PACKLH2 .L2X B_x54,A_x32,B_x43 ; @ input: x[4:3]
; ============================ END OF "JLOOP" ============================= ;
NOP 2
; ============================= BRANCH OCCURS ============================= ;
* ========================================================================= *
* End of file: dsp_fir_r4.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *