You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

393 lines
22 KiB

;* ======================================================================== *;
;* TEXAS INSTRUMENTS, INC. *;
;* *;
;* DSPLIB DSP Signal Processing Library *;
;* *;
;* Release: Revision 1.04b *;
;* CVS Revision: 1.5 Mon Dec 9 17:23:52 2002 (UTC) *;
;* Snapshot date: 23-Oct-2003 *;
;* *;
;* This library contains proprietary intellectual property of Texas *;
;* Instruments, Inc. The library and its source code are protected by *;
;* various copyrights, and portions may also be protected by patents or *;
;* other legal protections. *;
;* *;
;* This software is licensed for use with Texas Instruments TMS320 *;
;* family DSPs. This license was provided to you prior to installing *;
;* the software. You may review this license by consulting the file *;
;* TI_license.PDF which accompanies the files in this library. *;
;* ------------------------------------------------------------------------ *;
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
;* All Rights Reserved. *;
;* ======================================================================== *;
;* ======================================================================== *;
;* Assembler compatibility shim for assembling 4.30 and later code on *;
;* tools prior to 4.30. *;
;* ======================================================================== *;
.if $isdefed(".ASSEMBLER_VERSION")
.asg .ASSEMBLER_VERSION, $asmver
.else
.asg 0, $asmver
.endif
.if ($asmver < 430)
.asg B, CALL ; Function Call
.asg B, RET ; Return from a Function
.asg B, CALLRET ; Function call with Call / Ret chaining.
.if .TMS320C6400
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
.endif
.asg , .asmfunc ; .func equivalent for hand-assembly code
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
.endif
;* ======================================================================== *;
;* End of assembler compatibility shim. *;
;* ======================================================================== *;
* ========================================================================= *
* *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* DSP_radix2 -- In-place Radix-2 FFT (Little Endian) *
* *
* REVISION DATE *
* 09-Dec-2002 *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void DSP_radix2(int n, short *restrict xy, *
* const short *restrict w); *
* *
* n -- FFT size (input) *
* xy[] -- input and output sequences (dim-n) (input/output) *
* w[] -- FFT coefficients (dim-n/2) (input) *
* *
* DESCRIPTION *
* This routine is used to compute FFT of a complex sequece of size *
* n, a power of 2, with "decimation-in-frequency decomposition" *
* method, ie, the output is in bit-reversed order. Each complex *
* value is with interleaved 16-bit real and imaginary parts. To *
* prevent overflow, input samples may have to be scaled by 1/n. *
* *
* void DSP_radix2(int n, short *restrict xy, *
* const short *restrict w) *
* { *
* short n1,n2,ie,ia,i,j,k,l; *
* short xt,yt,c,s; *
* *
* n2 = n; *
* ie = 1; *
* for (k=n; k > 1; k = (k >> 1) ) *
* { *
* n1 = n2; *
* n2 = n2>>1; *
* ia = 0; *
* for (j=0; j < n2; j++) *
* { *
* c = w[2*ia]; *
* s = w[2*ia+1]; *
* ia = ia + ie; *
* for (i=j; i < n; i += n1) *
* { *
* l = i + n2; *
* xt = xy[2*l] - xy[2*i]; *
* xy[2*i] = xy[2*i] + xy[2*l]; *
* yt = xy[2*l+1] - xy[2*i+1]; *
* xy[2*i+1] = xy[2*i+1] + xy[2*l+1]; *
* xy[2*l] = (c*xt + s*yt)>>15; *
* xy[2*l+1] = (c*yt - s*xt)>>15; *
* } *
* } *
* ie = ie<<1; *
* } *
* } *
* *
* ASSUMPTIONS *
* 16 <= n <= 32768 *
* Both input xy and coefficient w must be aligned on word boundary. *
* w coef stored ordered is k*(-cos[0*delta]), k*(-sin[0*delta]), *
* k*(-cos[1*delta]), ... where delta = 2*PI/N, k = 32767 *
* Assembly code is written for processor in Little Endian mode *
* Input xy and coefficients w are 16 bit data. *
* *
* MEMORY NOTE *
* Align xy and w on different word boundaries to minimize *
* memory bank hits. *
* *
* TECHNIQUES *
* 1. Loading input xy as well as coefficient w in word. *
* 2. Both loops j and i shown in the C code are placed in the *
* INNERLOOP of the assembly code. *
* *
* CYCLES *
* cycles = log2(N) * (4*N/2+7) + 34 + N/4. *
* *
* (The N/4 term is due to bank conflicts that occur when xy and w *
* are aligned as suggested above, under "MEMORY NOTE.") *
* *
* For N = 256, cycles = 4250. *
* *
* CODESIZE *
* 800 bytes *
* *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
.sect ".text:_radix2"
.global _DSP_radix2
_DSP_radix2:
STW .D2T1 A15, *B15--[12] ; push A15 to SP[12]
|| SUB .L1X B15, 4, A15
STW .D1T1 A10, *A15--[10] ; push A10 to SP[11]
|| STW .D2T2 B10, *+B15[10] ; push B10 to SP[10]
|| MVC .S2 CSR, B0
STW .D1T1 A11, *+A15[8] ; push A11 to SP[ 9]
|| STW .D2T2 B11, *+B15[8] ; push B11 to SP[ 8]
|| AND .S2 B0, -2, B1
;-
STW .D1T1 A12, *+A15[6] ; push A12 to SP[ 7]
|| STW .D2T2 B12, *+B15[6] ; push B12 to SP[ 6]
|| MVC .S2 B1, CSR
;== Interrupts disabled here ===
STW .D1T1 A13, *+A15[4] ; push A13 to SP[ 5]
|| STW .D2T2 B13, *+B15[4] ; push B13 to SP[ 4]
STW .D1T1 A14, *+A15[2] ; push A14 to SP[ 3]
|| STW .D2T2 B14, *+B15[2] ; push B14 to SP[ 2]
|| MV .L1X B4,A9 ; &XY
;-
LMBD .L1 1,A4,A1 ; outer loop count calculation
|| MV .L2X A4,B13 ; &N
|| STW .D1T2 B0, *+A15[0] ; push CSR to SP[ 1]
MVK .S1 1,A2 ; IE = 1
|| MV .D2 B13,B10 ; XY index setup
|| MV .L1 A4,A7 ; XY index setup
|| SHL .S2 B13,2,B14 ; calculating reset offset
|| MV .L2X A6,B12 ; permanent ptr for W[0]
;-
SHR .S2 B13,1,B13 ; used for loop count
|| SUB .D2 B14,4,B14 ; calculating reset offset
|| SUB .L1X B14,4,A8 ; calculating reset offset
ADDAH .D1 A9,A7,A3 ; setup ptr for X[i+N2] & Y[i+N2]
|| MV .L2X A9,B9 ; setup ptr for X[i] & Y[i]
|| SUB .D2 B13,4,B13 ; inner loop count
|| MVK .S2 31,B7 ; outer loop count calculation
;-
MV .S2 B9,B4 ; setup store ptr for X[i] & Y[i]
|| MV .L2 B9,B11 ; permanent ptr for X[0] & Y[0]
|| SUB .L1X B7,A1,A1 ; outer loop conter
LDW .D2 *B9++[B10],B7 ; X[i] & Y[i]
|| MV .L2X A2,B2 ; reset twiddle factor counter
|| LDW .D1 *A6++[A2],A5 ; CS = W[IA] & W[IA+1]
|| SHL .S1 A7,1,A0 ; calculating reset offset
;-
[ B2] SUB B2,1,B2 ; decrement twiddle factor counter
|| LDW .D1 *A3++[A7],A13 ; X[i+N2] & Y[i+N2]
[!B2] SUB B9,B14,B9 ; reset load X[i] & Y[i] ptrs
[!B2] SUB A3,A8,A3 ; reset load X[i+N2] & Y[i+N2] ptrs
OUTLOOP:
LDW .D2 *B9++[B10],B7 ;* X[i] & Y[i]
||[!B2] MV .L2X A2,B2 ;* reset twiddle factor counter
||[!B2] LDW .D1 *A6++[A2],A5 ;* CS = W[IA] & W[IA+1]
|| MPY .M2 B2,1,B0 ; move to next iteration
;-
[ B2] SUB B2,1,B2 ;* decrement twiddle factor counter
|| LDW .D1 *A3++[A7],A13 ;* X[i+N2] & Y[i+N2]
SUB2 .S1X A13,B7,A9 ; XYT = X[i]-X[i+N2] & Y[i]-Y[i+N2]
|| MV A5,A10 ; move to other file
|| ADD2 .S2X B7,A13,B7 ; X[i]+X[i+N2] & Y[i]+Y[i+N2]
||[!B2] SUB B9,B14,B9 ;* reset load X[i] & Y[i] ptrs
MPYHL .M1 A9,A5,A14 ; (C)*(YT)
|| STW .D2 B7,*B4++[B10] ; XY[i] = X[i]+X[i+N2] & Y[i]+Y[i+N2]
;-
MPYLH .M1 A9,A5,A15 ; (S)*XT
|| LDW .D2 *B9++[B10],B7 ;** X[i] & Y[i]
||[!B2] MV .L2X A2,B2 ;** reset twiddle factor counter
||[!B2] LDW .D1 *A6++[A2],A5 ;** CS = W[IA] & W[IA+1]
|| SUB .L1X B4,A0,A4 ; set store X[i+N2] ptr
|| MPY .M2 B2,1,B0 ;* move to next iteration
||[!B2] SUB A3,A8,A3 ;* reset load X[i+N2] & Y[i+N2] ptrs
;-
MPY .M1 A9,A10,A13 ; (C)*XT
||[!B0] SUB B4,B14,B4 ; reset store X[i] & Y[i] ptrs
||[ B2] SUB B2,1,B2 ;** decrement twiddle factor counter
|| LDW .D1 *A3++[A7],A13 ;** X[i+N2] & Y[i+N2]
MPYH .M1 A9,A10,A14 ; (S)*(YT)
|| SUB2 .S1X A13,B7,A9 ;* XYT = X[i]-X[i+N2] & Y[i]-Y[i+N2]
|| ADD2 .S2X B7,A13,B7 ;* X[i]+X[i+N2] & Y[i]+Y[i+N2]
||[!B2] SUB B9,B14,B9 ;** reset load X[i] & Y[i] ptrs
;-
SUB .S1 A14,A15,A15 ; Y[i+N2] = C*YT - S*XT
|| MPYHL .M1 A9,A5,A14 ;* (C)*(YT)
|| STW .D2 B7,*B4++[B10] ;* XY[i] = X[i]+X[i+N2] & Y[i]+Y[i+N2]
||[!B2] SUB .D1 A3,A8,A3 ;** reset load X[i+N2] & Y[i+N2] ptrs
|| MV .L2X A4,B6 ; set store X[i+N2] ptr
|| MV .S2 B13,B1 ; set inner loop counter
|| MV .L1 A5,A10 ;* move to other file
B .S2 LOOP ; for (i=j; i < N; i += N1)
|| ADD .S1 A14,A13,A13 ; X[i+N2] = C*XT + S*YT
|| MPYLH .M1 A9,A5,A15 ;* (S)*XT
|| LDW .D2 *B9++[B10],B7 ;*** X[i] & Y[i]
||[!B2] MV .L2X A2,B2 ;*** reset twiddle factor counter
||[!B2] LDW .D1 *A6++[A2],A5 ;*** CS = W[IA] & W[IA+1]
|| SUB .L1X B4,A0,A4 ;* set store X[i+N2] ptr
|| MPY .M2 B2,1,B0 ;** move to next iteration
SHR .S2X A13,15,B5 ; scale X[i+N2]
|| SHR .S1 A15,15,A11 ; scale Y[i+N2]
|| MPY .M1 A9,A10,A13 ;* (C)*XT
||[!B0] SUB .L2 B4,B14,B4 ;* reset store X[i] & Y[i] ptrs
||[ B2] SUB .D2 B2,1,B2 ;*** decrement twiddle factor counter
|| LDW .D1 *A3++[A7],A13 ;*** X[i+N2] & Y[i+N2]
|| ADD .L1X B6,2,A12 ; set store Y[i+N2] ptr
LOOP:
STH .D2 B5,*B6 ; X[i+N2] = C*YT + S*XT
|| MPYH .M1 A9,A10,A14 ;* (S)*(YT)
|| SUB2 .S1X A13,B7,A9 ;** XYT = X[i]-X[i+N2] & Y[i]-Y[i+N2]
|| MV .L1 A5,A10 ;** move to other file
|| ADD2 .S2X B7,A13,B7 ;** X[i]+X[i+N2] & Y[i]+Y[i+N2]
||[!B2] SUB .L2 B9,B14,B9 ;*** reset load X[i] & Y[i] ptrs
SUB .L1 A14,A15,A15 ;* Y[i+N2] = C*YT - S*XT
|| MV .L2X A4,B6 ;* set store X[i+N2] ptr
||[ B1] SUB .S2 B1,1,B1 ;** dec i lp cntr
|| MPYHL .M1 A9,A5,A14 ;** (C)*(YT)
|| STW .D2 B7,*B4++[B10] ;** XY[i] = X[i]+X[i+N2] & Y[i]+Y[i+N2]
|| STH .D1 A11,*A12 ; Y[i+N2] = C*YT - S*XT
||[!B2] SUB .S1 A3,A8,A3 ;*** reset load X[i+N2] & Y[i+N2] ptrs
[ B1] B .S2 LOOP ;* for (i=j; i < N; i += N1)
|| ADD .S1 A13,A14,A13 ;* X[i+N2] = C*XT + S*YT
|| MPYLH .M1 A9,A5,A15 ;** (S)*XT
|| SUB .L1X B4,A0,A4 ;** set store X[i+N2] ptr
|| LDW .D2 *B9++[B10],B7 ;**** X[i] & Y[i]
||[!B2] MV .L2X A2,B2 ;**** reset twiddle factor counter
||[!B2] LDW .D1 *A6++[A2],A5 ;**** CS = W[IA] & W[IA+1]
|| MPY .M2 B2,1,B0 ;*** move to next iteration
SHR .S2X A13,15,B5 ;* scale X[i+N2]
|| SHR .S1 A15,15,A11 ;* scale Y[i+N2]
|| ADD .L1X B6,2,A12 ;* set store Y[i+N2] ptr
||[!B0] SUB .L2 B4,B14,B4 ;** reset store X[i] & Y[i] ptrs
|| MPY .M1 A9,A10,A13 ;** (C)*XT
||[ B2] SUB .D2 B2,1,B2 ;**** decrement twiddle factor counter
|| LDW .D1 *A3++[A7],A13 ;**** X[i+N2] & Y[i+N2]
; - - - - - - - - - - - - - - - - - - - ; LOOP ENDS HERE
;-
STH .D2 B5,*B6 ; X[i+N2] = C*YT + S*XT
|| STH .D1 A11,*A12 ; Y[i+N2] = C*YT - S*XT
|| MPYH .M1 A9,A10,A14 ;* (S)*(YT)
|| SUB2 .S1X A13,B7,A9 ;** XYT = X[i]-X[i+N2] & Y[i]-Y[i+N2]
|| MV .L1 A5,A10 ;** move to other file
|| ADD2 .S2X B7,A13,B7 ;** X[i]+X[i+N2] & Y[i]+Y[i+N2]
|| MV .L2 B11,B9 ; set up load ptr to X[0] & Y[0]
;-
SUB .D1 A14,A15,A15 ;* Y[i+N2] = C*YT - S*XT
|| MV .L2X A4,B6 ;* set store X[i+N2] ptr
|| MPYHL .M1 A9,A5,A14 ;** (C)*(YT)
|| STW .D2 B7,*B4++[B10] ;** XY[i] = X[i]+X[i+N2] & Y[i]+Y[i+N2]
|| SHL .S1 A2,1,A2 ; IE << 1
|| SUB .L1 A1,1,A1 ; decrement outer loop counter
;-
ADD .D1 A14,A13,A13 ;* X[i+N2] = C*XT + S*YT
|| MPYLH .M1 A9,A5,A15 ;** (S)*XT
|| SUB .L1X B4,A0,A4 ;** set store X[i+N2] ptr
|| SHR .S1 A15,15,A11 ;* scale Y[i+N2]
SHR .S2X A13,15,B5 ;* scale X[i+N2]
|| ADD .L1X B6,2,A12 ;* set store Y[i+N2] ptr
|| MPY .M1 A9,A10,A13 ;** (C)*XT
||[ A1] B .S1 OUTLOOP ; branch to next stage
;-
STH .D2 B5,*B6 ; X[i+N2] = C*YT + S*XT
|| STH .D1 A11,*A12 ; Y[i+N2] = C*YT - S*XT
|| MPYH .M1 A9,A10,A10 ;* (S)*(YT)
|| SHR .S2 B10,1,B10 ; load index >> 1
|| SHR .S1 A0,1,A0 ; N2 >> 1
|| MV .L1X B12,A6 ; reset W ptr
|| MV .L2X A2,B2 ; reset twiddle factor counter
;-
MV .L2X A4,B6 ;* set store X[i+N2] ptr
|| LDW .D2 *B9++[B10],B7 ; X[i] & Y[i]
|| ADD .L1X A7,B9,A3 ; set up load ptr to X[N2] & Y[N2]
|| LDW .D1 *A6++[A2],A5 ; CS = W[IA] & W[IA+1]
|| SHR .S1 A7,1,A7 ; load index >> 1
||[B2] SUB .S2 B2,1,B2 ; decrement twiddle factor counter
;-
ADD .L1 A10,A13,A13 ;* X[i+N2] = C*XT + S*YT
|| SUB .S1 A14,A15,A15 ;* Y[i+N2] = C*YT - S*XT
||[ A1] LDW .D1 *A3++[A7],A13 ; X[i+N2] & Y[i+N2]
||[!B2] SUB .L2 B9,B14,B9 ; reset load X[i] & Y[i] ptrs
SHR .S2X A13,15,B5 ;* scale X[i+N2]
|| SHR .S1 A15,15,A11 ;* scale Y[i+N2]
|| ADD .L1X B6,2,A12 ;* set store Y[i+N2] ptr
||[!B2] SUB A3,A8,A3 ; reset load X[i+N2] & Y[i+N2] ptrs
;-
STH .D2 B5,*B6 ; X[i+N2] = C*YT + S*XT
|| STH .D1 A11,*A12 ; Y[i+N2] = C*YT - S*XT
|| SHL .S1 A7,1,A0 ; calculating reset offset
|| MV .S2 B11,B4 ; set up store ptr to X[0] & Y[0]
; - - - - - - - - - - - - - - - - - - - ; OUTLOOP ENDS HERE
LDW .D2T2 *+B15[1], B0 ; pop CSR from SP[ 1]
|| MV B15, A15
LDW .D1T1 *+A15[11], A10 ; pop A10 from SP[11]
|| LDW .D2T2 *+B15[10], B10 ; pop B10 from SP[10]
;-
LDW .D1T1 *+A15[9], A11 ; pop A11 from SP[ 9]
|| LDW .D2T2 *+B15[8], B11 ; pop B11 from SP[ 8]
LDW .D1T1 *+A15[7], A12 ; pop A12 from SP[ 7]
|| LDW .D2T2 *+B15[6], B12 ; pop B12 from SP[ 6]
LDW .D1T1 *+A15[5], A13 ; pop A13 from SP[ 5]
|| LDW .D2T2 *+B15[4], B13 ; pop B13 from SP[ 4]
;-
LDW .D2T2 *+B15[2], B14 ; pop B14 from SP[ 2]
|| LDW .D1T1 *+A15[3], A14 ; pop A14 from SP[ 3]
|| RET B3
LDW .D2T1 *++B15[12],A15 ; pop A15 from SP[12]
MVC B0, CSR
NOP 3
;-
* ========================================================================= *
* End of file: dsp_radix2.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *