You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1026 lines
71 KiB
1026 lines
71 KiB
;* ======================================================================== *;
|
|
;* TEXAS INSTRUMENTS, INC. *;
|
|
;* *;
|
|
;* DSPLIB DSP Signal Processing Library *;
|
|
;* *;
|
|
;* Release: Revision 1.04b *;
|
|
;* CVS Revision: 1.14 Sun Sep 29 03:32:20 2002 (UTC) *;
|
|
;* Snapshot date: 23-Oct-2003 *;
|
|
;* *;
|
|
;* This library contains proprietary intellectual property of Texas *;
|
|
;* Instruments, Inc. The library and its source code are protected by *;
|
|
;* various copyrights, and portions may also be protected by patents or *;
|
|
;* other legal protections. *;
|
|
;* *;
|
|
;* This software is licensed for use with Texas Instruments TMS320 *;
|
|
;* family DSPs. This license was provided to you prior to installing *;
|
|
;* the software. You may review this license by consulting the file *;
|
|
;* TI_license.PDF which accompanies the files in this library. *;
|
|
;* ------------------------------------------------------------------------ *;
|
|
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
|
|
;* All Rights Reserved. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
;* ======================================================================== *;
|
|
;* Assembler compatibility shim for assembling 4.30 and later code on *;
|
|
;* tools prior to 4.30. *;
|
|
;* ======================================================================== *;
|
|
|
|
.if $isdefed(".ASSEMBLER_VERSION")
|
|
.asg .ASSEMBLER_VERSION, $asmver
|
|
.else
|
|
.asg 0, $asmver
|
|
.endif
|
|
|
|
.if ($asmver < 430)
|
|
|
|
.asg B, CALL ; Function Call
|
|
.asg B, RET ; Return from a Function
|
|
.asg B, CALLRET ; Function call with Call / Ret chaining.
|
|
|
|
.if .TMS320C6400
|
|
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
|
|
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
|
|
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
|
|
.endif
|
|
|
|
.asg , .asmfunc ; .func equivalent for hand-assembly code
|
|
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
|
|
|
|
.endif
|
|
|
|
;* ======================================================================== *;
|
|
;* End of assembler compatibility shim. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
* ========================================================================= *
|
|
* *
|
|
* TEXAS INSTRUMENTS, INC. *
|
|
* *
|
|
* NAME *
|
|
* DSP_fft *
|
|
* *
|
|
* *
|
|
* REVISION DATE *
|
|
* 16-Oct-2000 *
|
|
* *
|
|
* USAGE *
|
|
* This routine is C-callable and can be called as: *
|
|
* *
|
|
* void DSP_fft(const short *w, int nsamp, short *x, short *y); *
|
|
* *
|
|
* nsamp = length of DSP_fft in complex samples *
|
|
* x = pointer to complex data input, time domain *
|
|
* w = pointer to complex twiddle factors *
|
|
* y = pointer to complex data output, frequency domain *
|
|
* *
|
|
* DESCRIPTION *
|
|
* This code performs a Radix-4 FFT with digit reversal. The code *
|
|
* uses a special ordering of twiddle factors and memory accesses *
|
|
* to improve performance in the presence of cache. It operates *
|
|
* largely in-place, but the final digit-reversed output is written *
|
|
* out-of-place. *
|
|
* *
|
|
* This code requires a special sequence of twiddle factors stored *
|
|
* in Q.15 fixed-point format. The following C code illustrates *
|
|
* one way to generate the desired twiddle-factor array: *
|
|
* *
|
|
* #include <math.h> *
|
|
* *
|
|
* #ifndef PI *
|
|
* # define PI (3.14159265358979323846) *
|
|
* #endif *
|
|
* *
|
|
* short d2s(double d) *
|
|
* { *
|
|
* d = floor(0.5 + d); /* Explicit rounding to integer */ *
|
|
* if (d >= 32767.0) return 32767; *
|
|
* if (d <= -32768.0) return -32768; *
|
|
* return (short)d; *
|
|
* } *
|
|
* *
|
|
* void gen_twiddle(short *w, int n) *
|
|
* { *
|
|
* double M = 32767.5; *
|
|
* int i, j, k; *
|
|
* *
|
|
* for (j = 1, k = 0; j < n >> 2; j = j << 2) *
|
|
* { *
|
|
* for (i = 0; i < n >> 2; i += j << 1) *
|
|
* { *
|
|
* w[k + 11] = d2s(M * cos(6.0 * PI * (i + j) / n)); *
|
|
* w[k + 10] = d2s(M * sin(6.0 * PI * (i + j) / n)); *
|
|
* w[k + 9] = d2s(M * cos(6.0 * PI * (i ) / n)); *
|
|
* w[k + 8] = d2s(M * sin(6.0 * PI * (i ) / n)); *
|
|
* *
|
|
* w[k + 7] = d2s(M * cos(4.0 * PI * (i + j) / n)); *
|
|
* w[k + 6] = d2s(M * sin(4.0 * PI * (i + j) / n)); *
|
|
* w[k + 5] = d2s(M * cos(4.0 * PI * (i ) / n)); *
|
|
* w[k + 4] = d2s(M * sin(4.0 * PI * (i ) / n)); *
|
|
* *
|
|
* w[k + 3] = d2s(M * cos(2.0 * PI * (i + j) / n)); *
|
|
* w[k + 2] = d2s(M * sin(2.0 * PI * (i + j) / n)); *
|
|
* w[k + 1] = d2s(M * cos(2.0 * PI * (i ) / n)); *
|
|
* w[k + 0] = d2s(M * sin(2.0 * PI * (i ) / n)); *
|
|
* *
|
|
* k += 12; *
|
|
* } *
|
|
* } *
|
|
* w[2*n - 1] = w[2*n - 3] = w[2*n - 5] = 32767; *
|
|
* w[2*n - 2] = w[2*n - 4] = w[2*n - 6] = 0; *
|
|
* } *
|
|
* *
|
|
* ASSUMPTIONS *
|
|
* n must be a power of 4 and n >= 16 & n < 32768. *
|
|
* FFT data x are aligned on a double word boundary, in real/imag *
|
|
* pairs, FFT twiddle factors w are also aligned on a double word *
|
|
* boundary in real/imaginary pairs. *
|
|
* *
|
|
* Input FFT coeffs. are in signed Q.15 format. *
|
|
* The memory Configuration is LITTLE ENDIAN. *
|
|
* The complex data will be returned in natural order. This code is *
|
|
* uninteruptable, interupts are disabled on entry to the function and *
|
|
* re-enabled on exit. *
|
|
* *
|
|
* MEMORY NOTE *
|
|
* No bank conflict stalls occur in this code. *
|
|
* *
|
|
* TECHNIQUES *
|
|
* A special sequence of coefficients. are used (as generated above) *
|
|
* to produce the DSP_fft. This collapses the inner 2 loops in the *
|
|
* taditional Burrus and Parks implementation Fortran Code. *
|
|
* *
|
|
* The following C code represents an implementation of the Cooley *
|
|
* Tukey radix 4 DIF FFT. It accepts the inputs in normal order and *
|
|
* produces the outputs in digit reversed order. The natural C code *
|
|
* shown in this file on the other hand, accepts the inputs in nor- *
|
|
* mal order and produces the outputs in normal order. *
|
|
* *
|
|
* Several transformations have been applied to the original Cooley *
|
|
* Tukey code to produce the natural C code description shown here. *
|
|
* In order to understand these it would first be educational to *
|
|
* understand some of the issues involved in the conventional Cooley *
|
|
* Tukey FFT code. *
|
|
* *
|
|
* void radix4(int n, short x[], short wn[]) *
|
|
* { *
|
|
* int n1, n2, ie, ia1, ia2, ia3; *
|
|
* int i0, i1, i2, i3, i, j, k; *
|
|
* short co1, co2, co3, si1, si2, si3; *
|
|
* short xt0, yt0, xt1, yt1, xt2, yt2; *
|
|
* short xh0, xh1, xh20, xh21, xl0, xl1,xl20,xl21; *
|
|
* *
|
|
* n2 = n; *
|
|
* ie = 1; *
|
|
* for (k = n; k > 1; k >>= 2) *
|
|
* { *
|
|
* n1 = n2; *
|
|
* n2 >>= 2; *
|
|
* ia1 = 0; *
|
|
* *
|
|
* for (j = 0; j < n2; j++) *
|
|
* { *
|
|
* ia2 = ia1 + ia1; *
|
|
* ia3 = ia2 + ia1; *
|
|
* *
|
|
* co1 = wn[2 * ia1 ]; *
|
|
* si1 = wn[2 * ia1 + 1]; *
|
|
* co2 = wn[2 * ia2 ]; *
|
|
* si2 = wn[2 * ia2 + 1]; *
|
|
* co3 = wn[2 * ia3 ]; *
|
|
* si3 = wn[2 * ia3 + 1]; *
|
|
* ia1 = ia1 + ie; *
|
|
* *
|
|
* for (i0 = j; i0< n; i0 += n1) *
|
|
* { *
|
|
* i1 = i0 + n2; *
|
|
* i2 = i1 + n2; *
|
|
* i3 = i2 + n2; *
|
|
* *
|
|
* *
|
|
* xh0 = x[2 * i0 ] + x[2 * i2 ]; *
|
|
* xh1 = x[2 * i0 + 1] + x[2 * i2 + 1]; *
|
|
* xl0 = x[2 * i0 ] - x[2 * i2 ]; *
|
|
* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; *
|
|
* *
|
|
* xh20 = x[2 * i1 ] + x[2 * i3 ]; *
|
|
* xh21 = x[2 * i1 + 1] + x[2 * i3 + 1]; *
|
|
* xl20 = x[2 * i1 ] - x[2 * i3 ]; *
|
|
* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; *
|
|
* *
|
|
* x[2 * i0 ] = xh0 + xh20; *
|
|
* x[2 * i0 + 1] = xh1 + xh21; *
|
|
* *
|
|
* xt0 = xh0 - xh20; *
|
|
* yt0 = xh1 - xh21; *
|
|
* xt1 = xl0 + xl21; *
|
|
* yt2 = xl1 + xl20; *
|
|
* xt2 = xl0 - xl21; *
|
|
* yt1 = xl1 - xl20; *
|
|
* *
|
|
* x[2 * i1 ] = (xt1 * co1 + yt1 * si1) >> 15; *
|
|
* x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15; *
|
|
* x[2 * i2 ] = (xt0 * co2 + yt0 * si2) >> 15; *
|
|
* x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15; *
|
|
* x[2 * i3 ] = (xt2 * co3 + yt2 * si3) >> 15; *
|
|
* x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15; *
|
|
* } *
|
|
* } *
|
|
* *
|
|
* ie <<= 2; *
|
|
* } *
|
|
* } *
|
|
* *
|
|
* The conventional Cooley Tukey FFT, is written using three loops. *
|
|
* The outermost loop "k" cycles through the stages. There are log *
|
|
* N to the base 4 stages in all. The loop "j" cycles through the *
|
|
* groups of butterflies with different twiddle factors, loop "i" *
|
|
* reuses the twiddle factors for the different butterflies within *
|
|
* a stage. It is interesting to note the following: *
|
|
* *
|
|
*---------------------------------------------------------------------------*
|
|
* Stage# #Groups # Butterflies with common #Groups*Bflys *
|
|
* twiddle factors *
|
|
*---------------------------------------------------------------------------*
|
|
* 1 N/4 1 N/4 *
|
|
* 2 N/16 4 N/4 *
|
|
* .. *
|
|
* logN 1 N/4 N/4 *
|
|
*---------------------------------------------------------------------------*
|
|
* *
|
|
* The following statements can be made based on above observations: *
|
|
* *
|
|
* a) Inner loop "i0" iterates a veriable number of times. In *
|
|
* particular the number of iterations quadruples every time from *
|
|
* 1..N/4. Hence software pipelining a loop that iterates a vraiable *
|
|
* number of times is not profitable. *
|
|
* *
|
|
* b) Outer loop "j" iterates a variable number of times as well. *
|
|
* However the number of iterations is quartered every time from *
|
|
* N/4 ..1. Hence the behaviour in (a) and (b) are exactly opposite *
|
|
* to each other. *
|
|
* *
|
|
* c) If the two loops "i" and "j" are colaesced together then they *
|
|
* will iterate for a fixed number of times namely N/4. This allows *
|
|
* us to combine the "i" and "j" loops into 1 loop. Optimized impl- *
|
|
* ementations will make use of this fact. *
|
|
* *
|
|
* In addition the Cooley Tukey FFT accesses three twiddle factors *
|
|
* per iteration of the inner loop, as the butterflies that re-use *
|
|
* twiddle factors are lumped together. This leads to accessing the *
|
|
* twiddle factor array at three points each sepearted by "ie". Note *
|
|
* that "ie" is initially 1, and is quadrupled with every iteration. *
|
|
* Therfore these three twiddle factors are not even contiguous in *
|
|
* the array. *
|
|
* *
|
|
* In order to vectorize the FFT, it is desirable to access twiddle *
|
|
* factor array using double word wide loads and fetch the twiddle *
|
|
* factors needed. In order to do this a modified twiddle factor *
|
|
* array is created, in which the factors WN/4, WN/2, W3N/4 are *
|
|
* arranged to be contiguous. This eliminates the seperation between *
|
|
* twiddle factors within a butterfly. However this implies that as *
|
|
* the loop is traversed from one stage to another, that we maintain *
|
|
* a redundant version of the twiddle factor array. Hence the size *
|
|
* of the twiddle factor array increases as compared to the normal *
|
|
* Cooley Tukey FFT. The modified twiddle factor array is of size *
|
|
* "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4" *
|
|
* where N is the number of complex points to be transformed. The *
|
|
* routine that generates the modified twiddle factor array was *
|
|
* presented earlier. With the above transformation of the FFT, *
|
|
* both the input data and the twiddle factor array can be accessed *
|
|
* using double-word wide loads to enable packed data processing. *
|
|
* *
|
|
* The final stage is optimised to remove the multiplication as *
|
|
* w0 = 1. This stage also performs digit reversal on the data, *
|
|
* so the final output is in natural order. *
|
|
* *
|
|
* The DSP_fft() code shown here performs the bulk of the computation *
|
|
* in place. However, because digit-reversal cannot be performed *
|
|
* in-place, the final result is written to a separate array, y[]. *
|
|
* *
|
|
* There is one slight break in the flow of packed processing that *
|
|
* needs to be comprehended. The real part of the complex number is *
|
|
* in the lower half, and the imaginary part is in the upper half. *
|
|
* The flow breaks in case of "xl0" and "xl1" because in this case *
|
|
* the real part needs to be combined with the imaginary part because *
|
|
* of the multiplication by "j". This requires a packed quantity like *
|
|
* "xl21xl20" to be rotated as "xl20xl21" so that it can be combined *
|
|
* using add2's and sub2's. Hence the natural version of C code *
|
|
* shown below is transformed using packed data processing as shown: *
|
|
* *
|
|
* xl0 = x[2 * i0 ] - x[2 * i2 ]; *
|
|
* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; *
|
|
* xl20 = x[2 * i1 ] - x[2 * i3 ]; *
|
|
* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; *
|
|
* *
|
|
* xt1 = xl0 + xl21; *
|
|
* yt2 = xl1 + xl20; *
|
|
* xt2 = xl0 - xl21; *
|
|
* yt1 = xl1 - xl20; *
|
|
* *
|
|
* xl1_xl0 = _sub2(x21_x20, x21_x20) *
|
|
* xl21_xl20 = _sub2(x32_x22, x23_x22) *
|
|
* xl20_xl21 = _rotl(xl21_xl20, 16) *
|
|
* *
|
|
* yt2_xt1 = _add2(xl1_xl0, xl20_xl21) *
|
|
* yt1_xt2 = _sub2(xl1_xl0, xl20_xl21) *
|
|
* *
|
|
* Also notice that xt1, yt1 endup on seperate words, these need to *
|
|
* be packed together to take advantage of the packed twiddle fact *
|
|
* ors that have been loaded. In order for this to be achieved they *
|
|
* are re-aligned as follows: *
|
|
* *
|
|
* yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1) *
|
|
* yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2) *
|
|
* *
|
|
* The packed words "yt1_xt1" allows the loaded"sc" twiddle factor *
|
|
* to be used for the complex multiplies. The real part os the *
|
|
* complex multiply is implemented using _dotp2. The imaginary *
|
|
* part of the complex multiply is implemented using _dotpn2 *
|
|
* after the twiddle factors are swizzled within the half word. *
|
|
* *
|
|
* (X + jY) ( C + j S) = (XC + YS) + j (YC - XS). *
|
|
* *
|
|
* The actual twiddle factors for the FFT are cosine, - sine. The *
|
|
* twiddle factors stored in the table are csine and sine, hence *
|
|
* the sign of the "sine" term is comprehended during multipli- *
|
|
* cation as shown above. *
|
|
* *
|
|
* CYCLES *
|
|
* cycles = 1.25*nsamp*log4(nsamp) - 0.5*nsamp + 23*log4(nsamp) - 1 *
|
|
* *
|
|
* For nsamp = 1024, cycles = 6002 *
|
|
* For nsamp = 256, cycles = 1243 *
|
|
* For nsamp = 64, cycles = 276 *
|
|
* *
|
|
* CODESIZE *
|
|
* 988 bytes *
|
|
* *
|
|
* C CODE *
|
|
* This is the C equivalent of the assembly code without restrictions: *
|
|
* Note that the assembly code is hand optimized and restrictions may *
|
|
* apply. *
|
|
* *
|
|
* void DSP_fft(short *ptr_w, int n, short *ptr_x, short *ptr_y) *
|
|
* { *
|
|
* int i, j, l1, l2, h2, predj, tw_offset, stride, fft_jmp; *
|
|
* short xt0_0, yt0_0, xt1_0, yt1_0, xt2_0, yt2_0; *
|
|
* short xt0_1, yt0_1, xt1_1, yt1_1, xt2_1, yt2_1; *
|
|
* short xh0_0, xh1_0, xh20_0, xh21_0, xl0_0, xl1_0, xl20_0, xl21_0; *
|
|
* short xh0_1, xh1_1, xh20_1, xh21_1, xl0_1, xl1_1, xl20_1, xl21_1; *
|
|
* short x_0, x_1, x_2, x_3, x_l1_0, x_l1_1, x_l1_2, x_l1_3, x_l2_0: *
|
|
* short x_10, x_11, x_12, x_13, x_14, x_15, x_16, x_17, x_l2_1, x_h2_3; *
|
|
* short x_4, x_5, x_6, x_7, x_l2_2, x_l2_3, x_h2_0, x_h2_1, x_h2_2; *
|
|
* short si10, si20, si30, co10, co20, co30; *
|
|
* short si11, si21, si31, co11, co21, co31; *
|
|
* short * x, *w, * x2, * x0; *
|
|
* short * y0, * y1, * y2, *y3; *
|
|
* *
|
|
* stride = n; -* n is the number of complex samples *- *
|
|
* tw_offset = 0; *
|
|
* while (stride > 4) /* for all strides > 4 */ *
|
|
* { *
|
|
* j = 0; *
|
|
* fft_jmp = stride + (stride>>1); *
|
|
* h2 = stride>>1; /* n/4 */ *
|
|
* l1 = stride; /* n/2 */ *
|
|
* l2 = stride + (stride>>1); /* 3n/4 */ *
|
|
* x = ptr_x; *
|
|
* w = ptr_w + tw_offset; *
|
|
* tw_offset += fft_jmp; *
|
|
* stride = stride>>2; *
|
|
* *
|
|
* for (i = 0; i < n>>1; i += 4) *
|
|
* { *
|
|
* co10 = w[j+1]; si10 = w[j+0]; /* W */ *
|
|
* co11 = w[j+3]; si11 = w[j+2]; *
|
|
* co20 = w[j+5]; si20 = w[j+4]; /* W^2 */ *
|
|
* co21 = w[j+7]; si21 = w[j+6]; *
|
|
* co30 = w[j+9]; si30 = w[j+8]; /* W^3 */ *
|
|
* co31 = w[j+11]; si31 = w[j+10]; *
|
|
* *
|
|
* x_0 = x[0]; x_1 = x[1]; /* perform 2 parallel */ *
|
|
* x_2 = x[2]; x_3 = x[3]; /* radix4 butterflies */ *
|
|
* *
|
|
* x_l1_0 = x[l1 ]; x_l1_1 = x[l1+1]; *
|
|
* x_l1_2 = x[l1+2]; x_l1_3 = x[l1+3]; *
|
|
* *
|
|
* x_l2_0 = x[l2 ]; x_l2_1 = x[l2+1]; *
|
|
* x_l2_2 = x[l2+2]; x_l2_3 = x[l2+3]; *
|
|
* *
|
|
* x_h2_0 = x[h2 ]; x_h2_1 = x[h2+1]; *
|
|
* x_h2_2 = x[h2+2]; x_h2_3 = x[h2+3]; *
|
|
* *
|
|
* xh0_0 = x_0 + x_l1_0; xh1_0 = x_1 + x_l1_1; *
|
|
* xh0_1 = x_2 + x_l1_2; xh1_1 = x_3 + x_l1_3; *
|
|
* *
|
|
* xl0_0 = x_0 - x_l1_0; xl1_0 = x_1 - x_l1_1; *
|
|
* xl0_1 = x_2 - x_l1_2; xl1_1 = x_3 - x_l1_3; *
|
|
* *
|
|
* xh20_0 = x_h2_0 + x_l2_0; xh21_0 = x_h2_1 + x_l2_1; *
|
|
* xh20_1 = x_h2_2 + x_l2_2; xh21_1 = x_h2_3 + x_l2_3; *
|
|
* *
|
|
* xl20_0 = x_h2_0 - x_l2_0; xl21_0 = x_h2_1 - x_l2_1; *
|
|
* xl20_1 = x_h2_2 - x_l2_2; xl21_1 = x_h2_3 - x_l2_3; *
|
|
* *
|
|
* x0 = x; *
|
|
* x2 = x0; /* copy pointers for output*/ *
|
|
* *
|
|
* j += 12; *
|
|
* x += 4; *
|
|
* predj = (j - fft_jmp); /* check if reached end of */ *
|
|
* if (!predj) x += fft_jmp;/* current twiddle factor section */ *
|
|
* if (!predj) j = 0; *
|
|
* *
|
|
* x0[0] = xh0_0 + xh20_0; x0[1] = xh1_0 + xh21_0; *
|
|
* x0[2] = xh0_1 + xh20_1; x0[3] = xh1_1 + xh21_1; *
|
|
* *
|
|
* xt0_0 = xh0_0 - xh20_0; yt0_0 = xh1_0 - xh21_0; *
|
|
* xt0_1 = xh0_1 - xh20_1; yt0_1 = xh1_1 - xh21_1; *
|
|
* *
|
|
* xt1_0 = xl0_0 + xl21_0; yt2_0 = xl1_0 + xl20_0; *
|
|
* xt2_0 = xl0_0 - xl21_0; yt1_0 = xl1_0 - xl20_0; *
|
|
* xt1_1 = xl0_1 + xl21_1; yt2_1 = xl1_1 + xl20_1; *
|
|
* xt2_1 = xl0_1 - xl21_1; yt1_1 = xl1_1 - xl20_1; *
|
|
* *
|
|
* x2[h2 ] = (si10 * yt1_0 + co10 * xt1_0) >> 15; *
|
|
* x2[h2+1] = (co10 * yt1_0 - si10 * xt1_0) >> 15; *
|
|
* *
|
|
* x2[h2+2] = (si11 * yt1_1 + co11 * xt1_1) >> 15; *
|
|
* x2[h2+3] = (co11 * yt1_1 - si11 * xt1_1) >> 15; *
|
|
* *
|
|
* x2[l1 ] = (si20 * yt0_0 + co20 * xt0_0) >> 15; *
|
|
* x2[l1+1] = (co20 * yt0_0 - si20 * xt0_0) >> 15; *
|
|
* *
|
|
* x2[l1+2] = (si21 * yt0_1 + co21 * xt0_1) >> 15; *
|
|
* x2[l1+3] = (co21 * yt0_1 - si21 * xt0_1) >> 15; *
|
|
* *
|
|
* x2[l2 ] = (si30 * yt2_0 + co30 * xt2_0) >> 15; *
|
|
* x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0) >> 15; *
|
|
* *
|
|
* x2[l2+2] = (si31 * yt2_1 + co31 * xt2_1) >> 15; *
|
|
* x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1) >> 15; *
|
|
* } *
|
|
* }-* end while *- *
|
|
* *
|
|
* y0 = ptr_y; *
|
|
* y1 = y0 + (int)(n>>1); *
|
|
* y2 = y1 + (int)(n>>1); *
|
|
* y3 = y2 + (int)(n>>1); *
|
|
* x0 = ptr_x; *
|
|
* x2 = ptr_x + (int)(n>>1); *
|
|
* l1 = _norm(n) + 2; *
|
|
* j = 0; *
|
|
* for (i = 0; i < n; i += 8) *
|
|
* { *
|
|
* h2 = _deal(j); *
|
|
* h2 = _bitr(h2); *
|
|
* h2 = _rotl(h2, 16); *
|
|
* h2 = _shfl(h2); *
|
|
* h2 >>= l1; *
|
|
* *
|
|
* x_0 = x0[0]; x_1 = x0[1]; *
|
|
* x_2 = x0[2]; x_3 = x0[3]; *
|
|
* x_4 = x0[4]; x_5 = x0[5]; *
|
|
* x_6 = x0[6]; x_7 = x0[7]; *
|
|
* x0 += 8; *
|
|
* *
|
|
* xh0_0 = x_0 + x_4; xh1_0 = x_1 + x_5; *
|
|
* xl0_0 = x_0 - x_4; xl1_0 = x_1 - x_5; *
|
|
* xh20_0 = x_2 + x_6; xh21_0 = x_3 + x_7; *
|
|
* xl20_0 = x_2 - x_6; xl21_0 = x_3 - x_7; *
|
|
* *
|
|
* xt0_0 = xh0_0 - xh20_0; *
|
|
* yt0_0 = xh1_0 - xh21_0; *
|
|
* xt1_0 = xl0_0 + xl21_0; *
|
|
* yt2_0 = xl1_0 + xl20_0; *
|
|
* xt2_0 = xl0_0 - xl21_0; *
|
|
* yt1_0 = xl1_0 - xl20_0; *
|
|
* *
|
|
* y0[2*h2 ] = xh0_0 + xh20_0; *
|
|
* y0[2*h2+1] = xh1_0 + xh21_0; *
|
|
* y1[2*h2 ] = xt1_0; *
|
|
* y1[2*h2+1] = yt1_0; *
|
|
* y2[2*h2 ] = xt0_0; *
|
|
* y2[2*h2+1] = yt0_0; *
|
|
* y3[2*h2 ] = xt2_0; *
|
|
* y3[2*h2+1] = yt2_0; *
|
|
* *
|
|
* x_10 = x2[0]; x_11 = x2[1]; *
|
|
* x_12 = x2[2]; x_13 = x2[3]; *
|
|
* x_14 = x2[4]; x_15 = x2[5]; *
|
|
* x_16 = x2[6]; x_17 = x2[7]; *
|
|
* x2 += 8; *
|
|
* *
|
|
* xh0_1 = x_10 + x_14; xh1_1 = x_11 + x_15; *
|
|
* xl0_1 = x_10 - x_14; xl1_1 = x_11 - x_15; *
|
|
* xh20_1 = x_12 + x_16; xh21_1 = x_13 + x_17; *
|
|
* xl20_1 = x_12 - x_16; xl21_1 = x_13 - x_17; *
|
|
* *
|
|
* xt0_1 = xh0_1 - xh20_1; *
|
|
* yt0_1 = xh1_1 - xh21_1; *
|
|
* xt1_1 = xl0_1 + xl21_1; *
|
|
* yt2_1 = xl1_1 + xl20_1; *
|
|
* xt2_1 = xl0_1 - xl21_1; *
|
|
* yt1_1 = xl1_1 - xl20_1; *
|
|
* *
|
|
* y0[2*h2+2] = xh0_1 + xh20_1; *
|
|
* y0[2*h2+3] = xh1_1 + xh21_1; *
|
|
* y1[2*h2+2] = xt1_1; *
|
|
* y1[2*h2+3] = yt1_1; *
|
|
* y2[2*h2+2] = xt0_1; *
|
|
* y2[2*h2+3] = yt0_1; *
|
|
* y3[2*h2+2] = xt2_1; *
|
|
* y3[2*h2+3] = yt2_1; *
|
|
* *
|
|
* j += 4; *
|
|
* if (j == n>>2) *
|
|
* { *
|
|
* j += n>>2; *
|
|
* x0 += (int) n>>1; *
|
|
* x2 += (int) n>>1; *
|
|
* } *
|
|
* } *
|
|
* } *
|
|
* ------------------------------------------------------------------------- *
|
|
* Copyright (c) 2003 Texas Instruments, Incorporated. *
|
|
* All Rights Reserved. *
|
|
* ========================================================================= *
|
|
|
|
.sect ".text:_fft"
|
|
.global _DSP_fft
|
|
_DSP_fft:
|
|
*================== SYMBOLIC REGISTER ASSIGNMENTS: SETUP ======================*
|
|
.asg B15, B_SP ;Stack pointer, B datapath
|
|
.asg A31, A_SP ;Copy of stack pointer
|
|
.asg A4, A_ptr_w ;poiter to twiddle factors
|
|
.asg B4, B_n ;number of points in transform
|
|
.asg A6, A_ptr_x ;pointer to timne domain data
|
|
.asg B6, B_ptr_y ;pointer to frequency domain
|
|
.asg B12, B_stride ;offset to n/2 complex pt
|
|
.asg B5, B_stride_1 ;offset to n/4 complex pt
|
|
.asg A12, A_tw_offset ;increment to twiddle table
|
|
.asg A3, A_w0 ;pointer to twiddle factor
|
|
.asg B5, B_fft_jmp_1 ;jump factor for time data
|
|
.asg A20, A_zero ;cont = 0
|
|
.asg B16, B_w0 ;pointer to twiddle factor
|
|
.asg A21, A_w1 ;twiddle factor + 2 dword
|
|
.asg B17, B_w2 ;twiddle factor + 4 dword
|
|
.asg B11, B_j ;counter for twiddle factors
|
|
.asg A19, A_j ;counter for twiddle factors
|
|
.asg B10, B_x ;pointer to time domaindata
|
|
.asg B18, B_h2 ;offset to n/4 time points
|
|
.asg B19, B_l1 ;offset to n/2 time points
|
|
.asg B20, B_l2 ;offset to 3n/4 time points
|
|
.asg A22, A_fft_jmp ;jump for twiddle factors
|
|
.asg B21, B_fft_jmp ;jump for twiddle factors
|
|
.asg A0, A_p0 ;prolog collapse variable
|
|
.asg A2, A_i ;1st loop counter
|
|
.asg A25, A_h2 ;offset to n/4 freq. points
|
|
.asg A23, A_l1 ;offset to n/2 freq. points
|
|
.asg A24, A_l2 ;offset to 3n/4 freq. points
|
|
.asg A31, A_co11_si11 ;W^n+1
|
|
.asg A30, A_co10_si10 ;W^n
|
|
.asg B29, B_co21_si21 ;W^2n+1
|
|
.asg B28, B_co20_si20 ;W^2n
|
|
.asg B31, B_co31_si31 ;W^3n+1
|
|
.asg B30, B_co30_si30 ;W^3n
|
|
.asg A29, A_x_3_x_2 ;data into butterflies
|
|
.asg A28, A_x_1_x_0 ;data into butterflies
|
|
.asg A27, A_xh2_3_xh2_2 ;data itno butterflies
|
|
.asg A26, A_xh2_1_xh2_0 ;data into butterflies
|
|
.asg B23, B_xl1_3_xl1_2 ;data into butterflies
|
|
.asg B22, B_xl1_1_xl1_0 ;data into butterflies
|
|
.asg B25, B_xl2_3_xl2_2 ;data into butterflies
|
|
.asg B24, B_xl2_1_xl2_0 ;data into butterflies
|
|
.asg B22, B_xh1_0_xh0_0 ;data into butterflies
|
|
.asg B8, B_xh1_1_xh0_1 ;data into butterflies
|
|
.asg A7, A_xl1_0_xl0_0 ;data into butterflies
|
|
.asg A16, A_xl1_1_xl0_1 ;data into butterflies
|
|
.asg B26, B_xh21_0_xh20_0 ;data into butterflies
|
|
.asg B5, B_xh21_1_xh20_1 ;data into butterflies
|
|
.asg A3, A_xl21_0_xl20_0 ;data into butterflies
|
|
.asg A7, A_xl21_1_xl20_1 ;data into butterflies
|
|
.asg A18, A_x_ ;copy of freq. pointer
|
|
.asg A10, A_x__ ;copy of freq. pointer
|
|
.asg A1, A_ifj ;jump condition for DSP_fft
|
|
.asg B26, B_x_1_x_0 ;data out of butterflies
|
|
.asg B27, B_x_3_x_2 ;data out of butterflies
|
|
.asg B7, B_yt0_0_xt0_0 ;data out of butterflies
|
|
.asg B8, B_yt0_1_xt0_1 ;data out of butterflies
|
|
.asg A3, A_xl20_0_xl21_0 ;data out of butterflies
|
|
.asg A5, A_xl20_1_xl21_1 ;data out of butterflies
|
|
.asg A8, A_yt2_0_xt1_0 ;data out of butterflies
|
|
.asg A11, A_yt2_1_xt1_1 ;data out of butterflies
|
|
.asg A9, A_yt1_0_xt2_0 ;data out of butterflies
|
|
.asg A16, A_yt1_1_xt2_1 ;data out of butterflies
|
|
.asg A7, A_xt1_0_yt1_0 ;data out of butterflies
|
|
.asg A5, A_xt1_1_yt1_1 ;data out of butterflies
|
|
.asg A8, A_xt2_0_yt2_0 ;data out of butterflies
|
|
.asg A28, A_xt2_1_yt2_1 ;data out of butterflies
|
|
.asg B26, B_xt0_0_yt0_0 ;data out of butterflies
|
|
.asg B9, B_xt0_1_yt0_1 ;data out of butterflies
|
|
.asg A1, A_yt1_0_xt1_0 ;data out of butterflies
|
|
.asg A5, A_yt1_1_xt1_1 ;data out of butterflies
|
|
.asg A3, A_yt2_0_xt2_0 ;data out of butterflies
|
|
.asg A3, A_yt2_1_xt2_1 ;data out of butterflies
|
|
.asg A26, A_x_h2_0 ;frequency >> 15
|
|
.asg A27, A_x_h2_1 ;frequency >> 15
|
|
.asg A17, A_x_h2_2 ;frequency >> 15
|
|
.asg A29, A_x_h2_3 ;frequency >> 15
|
|
.asg B0, B_x_l1_0 ;frequency >> 15
|
|
.asg B2, B_x_l1_1 ;frequency >> 15
|
|
.asg B23, B_x_l1_2 ;frequency >> 15
|
|
.asg B5, B_x_l1_3 ;frequency >> 15
|
|
.asg B1, B_x_l2_0 ;frequency >> 15
|
|
.asg B27, B_x_l2_1 ;frequency >> 15
|
|
.asg B2, B_x_l2_2 ;frequency >> 15
|
|
.asg B7, B_x_l2_3 ;frequency >> 15
|
|
.asg A28, A_xh2_1_0 ;packed frequencies
|
|
.asg A29, A_xh2_3_2 ;packed frequencies
|
|
.asg B22, B_xl1_1_0 ;packed frequencies
|
|
.asg B23, B_xl1_3_2 ;packed frequencies
|
|
.asg B24, B_xl2_1_0 ;packed frequencies
|
|
.asg B25, B_xl2_3_2 ;packed frequencies
|
|
.asg B0, B_whl ;outer while loop condition
|
|
.asg B13, B_csr ;CSR before function call
|
|
.asg B10, B_no_gie ;CSR before with GIE bit off
|
|
*==============================================================================*
|
|
MV.L1X B_SP, A_SP ;make cpy of SP
|
|
|| STW.D2T2 B12, *B_SP--[8] ;Save B12
|
|
|| SHRU.S2 B_n, 3, B_h2 ;h2=n/8
|
|
|
|
STW.D2T1 A12, *+B_SP[1] ;Save A12 SP[1]
|
|
|| MV.L2 B_n, B_stride ;stride = n
|
|
|| STW.D1T2 B13, *-A_SP[6] ;Save B13 SP[2]
|
|
|| MVC.S2 CSR, B_csr ;save CSR
|
|
|| B.S1 LOOP_WHILE_B ; no ints!
|
|
;-
|
|
STW.D2T2 B11, *+B_SP[3] ;Save B11 SP[3]
|
|
|| STW.D1T1 A11, *-A_SP[4] ;Save A11 SP[4]
|
|
|| ZERO.L1 A_tw_offset ;tw_offset=0
|
|
|
|
STW.D2T2 B10, *+B_SP[5] ;Save B10 SP[5]
|
|
|| STW.D1T1 A10, *-A_SP[2] ;Save A10 SP[6]
|
|
|| AND.S2 B_csr, -2, B_no_gie ;kill gie bit
|
|
;-
|
|
MV.L2X A_ptr_x, B_x ;x= ptr_x
|
|
|| ADDAH.D2 B_h2, B_h2, B_l2 ;l2=3*h2
|
|
|| MVC.S2 B_no_gie, CSR ;disable intrpt.
|
|
|
|
LDDW.D2T2 *B_x[B_l2], B_xl2_3_xl2_2:B_xl2_1_xl2_0 ;x[l2+0..3]
|
|
|| SHRU.S1X B_n, 3, A_i ;i=n>>3
|
|
|| SHL.S2 B_stride, 1, B_stride_1 ;stride/2
|
|
|
|
*============================= PIPE LOOP PROLOG ===============================*
|
|
LOOP_WHILE:
|
|
LDDW.D2T1 *B_x[B_h2], A_xh2_3_xh2_2:A_xh2_1_xh2_0 ;x[h2+0..3]
|
|
|| MV.L1X B_l2, A_l2 ;l2 copy
|
|
|| SHRU.S2 B_stride, 2, B_l1 ;fft_jmp=
|
|
|| ADD.L2 B_stride, B_stride_1, B_fft_jmp ;1.5*stride
|
|
|
|
; -- remainder relocated below
|
|
|
|
*============================= PIPE LOOP KERNEL ===============================*
|
|
LOOP_Y:
|
|
ADD.S2 B_x_l1_1, B_x_l1_1, B_x_l1_1 ;x[l1+1]>>15
|
|
|| DOTP2.M2X B_co31_si31, A_xt2_1_yt2_1, B_x_l2_2 ;Im{W^3n+1*x2}
|
|
|| ADD.S1 A_x_h2_1, A_x_h2_1, A_x_h2_1 ;x[h2+1]>>15
|
|
|| ROTL.M1 A_xt1_0_yt1_0, 16, A_yt1_0_xt1_0;xt1<=>yt1
|
|
|| PACKLH2.L1 A_yt2_1_xt1_1, A_yt1_1_xt2_1, A_xt1_1_yt1_1;repack data
|
|
|| ADD.D1 A_j, 3, A_j ;j+=3
|
|
||[!A_ifj]ZERO.L2 B_j ;j=0 if at end
|
|
|| LDDW.D2T2 *B_x[B_l1], B_xl1_3_xl1_2:B_xl1_1_xl1_0 ;x[l1+0..3]
|
|
|
|
PACKH2.S2 B_x_l1_1, B_x_l1_0, B_xl1_1_0 ;repack data
|
|
|| PACKH2.L1 A_x_h2_1, A_x_h2_0, A_xh2_1_0 ;repack data
|
|
|| ADD2.L2 B_xh21_0_xh20_0, B_xh1_0_xh0_0, B_x_1_x_0 ;xh2x+xhx
|
|
|| ROTL.M2 B_yt0_0_xt0_0, 16, B_xt0_0_yt0_0;repack data
|
|
|| DOTP2.M1 A_xt1_1_yt1_1, A_co11_si11, A_x_h2_2 ;Im{W^n+1*X1}
|
|
|| PACKLH2.S1 A_xt1_1_yt1_1, A_xt1_1_yt1_1, A_yt1_1_xt1_1;repack data
|
|
||[!A_ifj]ZERO.D1 A_j ;j=0?
|
|
|| LDDW.D2T1 *B_x[0], A_x_3_x_2:A_x_1_x_0 ;x[0..3]
|
|
|
|
ADD.S2 B_x_l2_0, B_x_l2_0, B_x_l2_0 ;x[l2]>>15
|
|
|| ADD.L2 B_x_l1_3, B_x_l1_3, B_x_l1_3 ;x[l1+3]>>15
|
|
|| ADD.D2 B_x_l1_2, B_x_l1_2, B_x_l1_2 ;x[l1+2]>>15
|
|
|| DOTPN2.M2X B_co31_si31, A_yt2_1_xt2_1, B_x_l2_3 ;Re{W^3n+1*X2}
|
|
|| STDW.D1T2 B_x_3_x_2:B_x_1_x_0, *A_x_[0] ;write x[0..3]
|
|
|| DOTPN2.M1 A_yt1_0_xt1_0, A_co10_si10, A_x_h2_1 ;Re{X1*W^n}
|
|
|| SUB2.S1X A_xh2_1_xh2_0, B_xl2_1_xl2_0, A_xl21_0_xl20_0;x[h2]-x[l2]
|
|
|| SUB.L1 A_fft_jmp, A_j, A_ifj ;(fft_jmp==j)?
|
|
|
|
PACKH2.S2 B_x_l1_3, B_x_l1_2, B_xl1_3_2 ;repack data
|
|
|| ADD.S1 A_x_h2_3, A_x_h2_3, A_x_h2_3 ;x[h2+3]>>15
|
|
|| DOTP2.M2 B_xt0_0_yt0_0, B_co20_si20, B_x_l1_0 ;Im{W^2n*X0}
|
|
|| DOTP2.M1 A_xt1_0_yt1_0, A_co10_si10, A_x_h2_0 ;Im{W^n*X1}
|
|
|| LDDW.D1T2 *A_w1[A_j], B_co21_si21:B_co20_si20 ;W^2n+1,W^2n
|
|
|| LDDW.D2T1 *B_w0[B_j], A_co11_si11:A_co10_si10 ;W^n+1,W^n
|
|
|| ADD2.L2X B_xl2_1_xl2_0, A_xh2_1_xh2_0, B_xh21_0_xh20_0;xl2+xh2
|
|
|| SUB2.L1X A_xh2_3_xh2_2, B_xl2_3_xl2_2, A_xl21_1_xl20_1;xh2-xl2
|
|
|
|
[ A_i]BDEC.S1 LOOP_Y, A_i ;} end for
|
|
|| ADD.D2 B_x_l2_1, B_x_l2_1, B_x_l2_1 ;x[l2+1]>>15
|
|
||[!A_p0]STDW.D1T2 B_xl1_3_2:B_xl1_1_0, *A_x__[A_l1] ;save x[l1+0..3]
|
|
|| ADD.L2 B_x_l2_2, B_x_l2_2, B_x_l2_2 ;x[l2+2]>>15
|
|
|| PACKH2.L1 A_x_h2_3, A_x_h2_2, A_xh2_3_2 ;repack data
|
|
|| DOTPN2.M2 B_yt0_1_xt0_1, B_co21_si21, B_x_l1_3 ;Im{W^2n+1*X0}
|
|
||[!A_ifj]ADD.S2 B_x, B_fft_jmp, B_x ;x+=fft_fmp?
|
|
|| MVD.M1X B_x, A_x_ ;x_=x
|
|
LOOP_Y5:
|
|
PACKH2.S2 B_x_l2_1, B_x_l2_0, B_xl2_1_0 ;repack data
|
|
||[!A_p0]STDW.D1T1 A_xh2_3_2:A_xh2_1_0, *A_x__[A_h2] ;save x[h2+0..3]
|
|
|| PACKLH2.S1 A_yt1_0_xt2_0, A_yt2_0_xt1_0, A_xt2_0_yt2_0;repack datat
|
|
|| ADD.L1 A_x_h2_2, A_x_h2_2, A_x_h2_2 ;x[h2+2]>>15
|
|
|| DOTPN2.M1 A_yt1_1_xt1_1, A_co11_si11, A_x_h2_3 ;Re{W^n+1*X1}
|
|
|| DOTPN2.M2 B_yt0_0_xt0_0, B_co20_si20, B_x_l1_1 ;Re{W^2n*X0}
|
|
|| ADD2.D2X B_xl2_3_xl2_2, A_xh2_3_xh2_2, B_xh21_1_xh20_1;xl2+xh2
|
|
|| ADD.L2 B_x, 8, B_x ;x+=8
|
|
|
|
ADD.L2 B_x_l2_3, B_x_l2_3, B_x_l2_3 ;x[l2+3]>>15
|
|
|| ROTL.M1 A_xt2_0_yt2_0, 16, A_yt2_0_xt2_0;repack data
|
|
|| DOTP2.M2 B_xt0_1_yt0_1, B_co21_si21, B_x_l1_2 ;Im{W^2n+1*X0}
|
|
|| ADD2.S2X B_xl1_3_xl1_2, A_x_3_x_2, B_xh1_1_xh0_1;x[l1]+x[0]
|
|
|| PACKLH2.L1 A_xl21_1_xl20_1,A_xl21_1_xl20_1,A_xl20_1_xl21_1;repack data
|
|
|| PACKLH2.S1 A_xl21_0_xl20_0,A_xl21_0_xl20_0,A_xl20_0_xl21_0;repack data
|
|
|| SUB2.D1X A_x_1_x_0, B_xl1_1_xl1_0, A_xl1_0_xl0_0;x[0]-x[l1]
|
|
|| LDDW.D2T2 *B_x[B_l2], B_xl2_3_xl2_2:B_xl2_1_xl2_0 ;x[l2+0..3]
|
|
|
|
DOTP2.M2X B_co30_si30, A_xt2_0_yt2_0, B_x_l2_0 ;Im{W^3n*X3}
|
|
|| ADD.D1 A_x_h2_0, A_x_h2_0, A_x_h2_0 ;x[h2]>>15
|
|
|| MVD.M1 A_x_, A_x__ ;x__=x_
|
|
|| SUB2.L1 A_xl1_0_xl0_0, A_xl20_0_xl21_0, A_yt1_0_xt2_0;xl1-xl20
|
|
|| ADD2.S1 A_xl1_0_xl0_0, A_xl20_0_xl21_0, A_yt2_0_xt1_0;xl1+xl20
|
|
|| ADD2.L2 B_xh21_1_xh20_1, B_xh1_1_xh0_1, B_x_3_x_2 ;xh21+xh1
|
|
|| SUB2.S2 B_xh1_1_xh0_1, B_xh21_1_xh20_1, B_yt0_1_xt0_1;xh1-xh21
|
|
|| LDDW.D2T1 *B_x[B_h2], A_xh2_3_xh2_2:A_xh2_1_xh2_0 ;x[h2+0..3]
|
|
LOOP_Y8:
|
|
PACKH2.L2 B_x_l2_3, B_x_l2_2, B_xl2_3_2 ;repack data
|
|
|| MVD.M1 A_zero, A_p0 ;disable prolog
|
|
|| PACKLH2.S1 A_yt1_1_xt2_1, A_yt2_1_xt1_1, A_xt2_1_yt2_1;reformat
|
|
|| ROTL.M2 B_yt0_1_xt0_1, 16, B_xt0_1_yt0_1;repack data
|
|
|| PACKLH2.L1 A_yt2_0_xt1_0, A_yt1_0_xt2_0, A_xt1_0_yt1_0;repack data
|
|
|| SUB2.D1X A_x_3_x_2, B_xl1_3_xl1_2, A_xl1_1_xl0_1;x[0]-x[l1]
|
|
|| ADD2.S2X B_xl1_1_xl1_0, A_x_1_x_0, B_xh1_0_xh0_0;x[0]+x[l1]
|
|
|| LDDW.D2T2 *B_w2[B_j], B_co31_si31:B_co30_si30 ;W^3n+1,W^3n
|
|
|
|
[!A_p0]STDW.D1T2 B_xl2_3_2:B_xl2_1_0, *A_x__[A_l2] ;x[l2+0..3]
|
|
|| DOTPN2.M2X B_co30_si30, A_yt2_0_xt2_0, B_x_l2_1 ;Re{W^3n*X2}
|
|
|| ADD.L2 B_x_l1_0, B_x_l1_0, B_x_l1_0 ;x[l1]>>15
|
|
|| ROTL.M1 A_xt2_1_yt2_1, 16, A_yt2_1_xt2_1;repack data
|
|
|| SUB2.S2 B_xh1_0_xh0_0, B_xh21_0_xh20_0, B_yt0_0_xt0_0;xh1-xh21
|
|
|| ADD.D2 B_j, 3, B_j ;j+=3
|
|
|| ADD2.L1 A_xl1_1_xl0_1, A_xl20_1_xl21_1, A_yt2_1_xt1_1;xl1+xl20
|
|
|| SUB2.S1 A_xl1_1_xl0_1, A_xl20_1_xl21_1, A_yt1_1_xt2_1;xl1-xl20
|
|
*============================= PIPE LOOP EPILOG ===============================*
|
|
ADD.D2 B_x_l1_1, B_x_l1_1, B_x_l1_1 ;x[l1+1]>>15
|
|
|| DOTP2.M2X B_co31_si31, A_xt2_1_yt2_1, B_x_l2_2 ;Im{W^3n+1*X2}
|
|
|| ADD.S1 A_x_h2_1, A_x_h2_1, A_x_h2_1 ;x[h2+1]>>15
|
|
|| SHRU.S2 B_stride, 3, B_h2 ;h2=stride/8
|
|
|
|
PACKH2.S2 B_x_l1_1, B_x_l1_0, B_xl1_1_0 ;repack data
|
|
|| PACKH2.L1 A_x_h2_1, A_x_h2_0, A_xh2_1_0 ;repack data
|
|
|| MV.L2X A_ptr_x, B_x ;x= ptr_x
|
|
|| ADDAH.D2 B_h2, B_h2, B_l2 ;l2=3*h2
|
|
;-
|
|
ADD.S2 B_x_l2_0, B_x_l2_0, B_x_l2_0 ;x[l2]>>15
|
|
|| ADD.L2 B_x_l1_3, B_x_l1_3, B_x_l1_3 ;x[l1+3]>>15
|
|
|| ADD.D2 B_x_l1_2, B_x_l1_2, B_x_l1_2 ;x[l1+2]>>15
|
|
|| DOTPN2.M2X B_co31_si31, A_yt2_1_xt2_1, B_x_l2_3 ;Re{W^3n+1*X2}
|
|
|
|
PACKH2.S2 B_x_l1_3, B_x_l1_2, B_xl1_3_2 ;repack data
|
|
|| ADD.S1 A_x_h2_3, A_x_h2_3, A_x_h2_3 ;x[h2+3]>>15
|
|
|| CMPLTU.L2 4, B_stride, B_whl ;stride>4 ?
|
|
|| MV.L1X B_SP, A_SP ;cp B_SP
|
|
;-
|
|
ADD.D2 B_x_l2_1, B_x_l2_1, B_x_l2_1 ;x[l2+1]>>15
|
|
|| STDW.D1T2 B_xl1_3_2:B_xl1_1_0, *A_x__[A_l1] ;save x[l1+0..3]
|
|
|| ADD.L2 B_x_l2_2, B_x_l2_2, B_x_l2_2 ;x[l2+2]>>15
|
|
|| PACKH2.L1 A_x_h2_3, A_x_h2_2, A_xh2_3_2 ;repack data
|
|
||[B_whl]B.S1 LOOP_WHILE ;}end while
|
|
|
|
PACKH2.S2 B_x_l2_1, B_x_l2_0, B_xl2_1_0 ;repack data
|
|
|| STDW.D1T1 A_xh2_3_2:A_xh2_1_0, *A_x__[A_h2] ;save x[h2+0..3]
|
|
||[B_whl]B.S1 LOOP_WHILE_B ;}end while
|
|
|
|
ADD.L2 B_x_l2_3, B_x_l2_3, B_x_l2_3 ;x[l2+3]>>15
|
|
;-
|
|
PACKH2.L2 B_x_l2_3, B_x_l2_2, B_xl2_3_2 ;repack data
|
|
|| SHL.S2 B_stride, 1, B_stride_1 ;2*stride
|
|
|| SHRU.S1X B_n, 3, A_i ;i=n>>3
|
|
*======================= SYMBOLIC REGISTER ASSIGNMENTS ========================*
|
|
.asg B4, B_n ;number of complex points
|
|
.asg A6, A_ptr_x ;pointer to n-1th transformed data
|
|
.asg B6, B_ptr_y ;pointer to frequency domain
|
|
.asg A6, A_x0 ;even samples
|
|
.asg B16, B_x0 ;even samples
|
|
.asg A24, A_x1 ;odd samples
|
|
.asg B17, B_x1 ;odd samples
|
|
.asg A25, A_i ;loop count
|
|
.asg B1, B_j ;index to input
|
|
.asg B18, B_nm2 ;n-2
|
|
.asg A26, A_n_2 ;n/4
|
|
.asg A2, A_p0 ;prolog collapse counter
|
|
.asg A27, A_n ;number of complex points
|
|
.asg B19, B_y0 ;y[i]
|
|
.asg B20, B_y1 ;y[i+n/4]
|
|
.asg B21, B_y2 ;y[i+n/2]
|
|
.asg B22, B_y3 ;y[i+3*n/4]
|
|
.asg B23, B_l1 ;shift for digit reverse
|
|
.asg A19, A_h0 ;partial digit reverse
|
|
.asg A17, A_h1 ;partial digit reverse
|
|
.asg B28, B_h2 ;partial digit reverse
|
|
.asg B24, B_h3 ;partial digit reverse
|
|
.asg B8, B_h4 ;digit reversed index
|
|
.asg A28, A_x1x0 ;x[i+1],x[i+0]
|
|
.asg A29, A_x3x2 ;x[i+3],x[i+2]
|
|
.asg B24, B_x5x4 ;x[i+5],x[i+4]
|
|
.asg B25, B_x7x6 ;x[i+7],x[i+6]
|
|
.asg A30, A_x9x8 ;x[i+9],x[i+8]
|
|
.asg A31, A_xbxa ;x[i+11],x[i+10]
|
|
.asg B28, B_xdxc ;x[i+13],x[i+12]
|
|
.asg B29, B_xfxe ;x[i+15],x[i+14]
|
|
.asg B26, B_xh1_0_xh0_0 ;stageI outputs from butterflies
|
|
.asg B28, B_xh21_0_xh20_0;stageI outputs from butterflies
|
|
.asg A18, A_xl1_0_xl0_0 ;stageI outputs from butterflies
|
|
.asg A20, A_xl21_0_xl20_0;stageI outputs from butterflies
|
|
.asg A19, A_xl0_0_xl1_0 ;stageI outputs from butterflies
|
|
.asg B26, B_yt0_0_xt0_0 ;stageI outputs from butterflies
|
|
.asg A20, A_xt1_0_yt2_0 ;stageI outputs from butterflies
|
|
.asg A30, A_xt2_0_yt1_0 ;stageI outputs from butterflies
|
|
.asg B9, B_xh1_1_xh0_1 ;stageII outputs from butterflies
|
|
.asg B27, B_xh21_1_xh20_1;stageII outputs from butterflies
|
|
.asg A30, A_xl1_1_xl0_1 ;stageII outputs from butterflies
|
|
.asg A22, A_xl21_1_xl20_1;stageII outputs from butterflies
|
|
.asg A19, A_xl0_1_xl1_1 ;stageII outputs from butterflies
|
|
.asg B27, B_yt0_1_xt0_1 ;stageII outputs from butterflies
|
|
.asg A21, A_xt1_1_yt2_1 ;stageII outputs from butterflies
|
|
.asg A22, A_xt2_1_yt1_1 ;stageII outputs from butterflies
|
|
.asg B30, B_y1y0 ;complex frequencies
|
|
.asg A30, A_y3y2 ;complex frequencies
|
|
.asg A28, A_y7y6 ;complex frequencies
|
|
.asg B31, B_y9y8 ;complex frequencies
|
|
.asg A31, A_ybya ;complex frequencies
|
|
.asg A29, A_yfye ;complex frequencies
|
|
.asg B2, B_ifi ;jump condition for digit reversal
|
|
*============================PIPE LOOP PROLOG =================================*
|
|
SHRU.S1X B_n, 2, A_n_2 ;n/4
|
|
|| STDW.D1T2 B_xl2_3_2:B_xl2_1_0, *A_x__[A_l2] ;x[l2+...3]
|
|
;-
|
|
SHRU.S2 B_n, 3, B_j ;j=n/8
|
|
|| ADD.S1X A_ptr_x, B_n, A_x1 ;n/4 words
|
|
|| LDW.D1T1 *+A_SP[4], A11 ;Restore A11
|
|
||[B_whl]LDDW.D2T2 *B_x[B_l2], B_xl2_3_xl2_2:B_xl2_1_xl2_0 ;x[l2]+0..3]
|
|
|
|
ADD.L2 B_ptr_y, B_n, B_y1 ;y1=y+n/4
|
|
|| SUB.S2X 4, A_n_2, B_nm2 ;4 - n/4
|
|
|| LDW.D1T1 *+A_SP[6], A10 ;Restore A10
|
|
|| LDW.D2T2 *+B_SP[3], B11 ;Restore B11
|
|
;-
|
|
MVK.S1 0xFFFF8000, A_p0 ;kill prolog
|
|
|| NORM.L2 B_n, B_l1 ;_norm(n)+3
|
|
|| ADD.D2 B_y1, B_n, B_y2 ;y2=y1+n/4
|
|
|| ADD.S2X A_x1, 8, B_x1 ;ptr_x+n/2
|
|
|
|
ADD.S2 B_y2, B_n, B_y3 ;y3=y2+n/4
|
|
|| ADD.L2X A_ptr_x, 8, B_x0 ;x = ptr_x
|
|
|| MV.L1X B_n, A_n ;copy n
|
|
|| LDW.D2T2 *+B_SP[5], B10 ;Restore B10
|
|
;-
|
|
MVK.L2 1, B_ifi ;if(j==n>>2){
|
|
|| MVK.L1 4, A_i ;j += 4;
|
|
|| LDDW.D2T2 *B_x1++[2], B_xfxe:B_xdxc ;x[15..12]
|
|
|| LDDW.D1T1 *A_x0++[2], A_x3x2:A_x1x0 ;x[3..0]
|
|
|| MPYSU.M1 0, A_h0, A_h0 ;h2=_deal(j)
|
|
|| ADD.S2 B_l1, 3, B_l1 ;rev dwords
|
|
|| ROTL.M2 B_ptr_y, 0, B_y0 ;y0= y
|
|
*============================= PIPE LOOP KERNEL2===============================*
|
|
LOOP_Z:
|
|
[ B_j]BDEC.S2 LOOP_Z, B_j ;}end for
|
|
|| SUB2.L1 A_xl0_0_xl1_0, A_xl21_0_xl20_0, A_xt2_0_yt1_0 ;xl0-xl21
|
|
|| ADD2.S1 A_xl0_0_xl1_0, A_xl21_0_xl20_0, A_xt1_0_yt2_0 ;xl0+xl21
|
|
|| ROTL.M1 A_xl1_1_xl0_1, 16, A_xl0_1_xl1_1 ;repack data
|
|
|| ADD2.L2X B_x7x6, A_x3x2, B_xh21_0_xh20_0;x7+x3
|
|
|| SHFL.M2 B_h2, B_h3 ;h2=_shfl(h2)
|
|
|| LDDW.D1T1 *A_x1++[2], A_xbxa:A_x9x8 ;x[11..8]
|
|
|| LDDW.D2T2 *B_x0++[2], B_x7x6:B_x5x4 ;x[7..4]
|
|
|
|
[!A_p0]STDW.D2T2 B_yt0_1_xt0_1:B_yt0_0_xt0_0, *B_y2[B_h4] ;store freq.
|
|
|| PACKLH2.S1 A_xt1_0_yt2_0, A_xt2_0_yt1_0, A_y7y6 ;repack data
|
|
|| SUB2.D1X A_x1x0, B_x5x4, A_xl1_0_xl0_0 ;x1-x5
|
|
|| ADD2.S2X B_x5x4, A_x1x0, B_xh1_0_xh0_0 ;x5+x1
|
|
||[!B_ifi]ADD.L2 B_x1, B_n, B_x1 ;x2+=n>>1
|
|
||[!B_ifi]ADD.L1 A_i, A_n_2, A_i ;j+=n>>2
|
|
|| BITR.M1 A_h0, A_h1 ;h2=_bitr(h2)
|
|
|
|
[!A_p0]STDW.D2T2 B_y9y8:B_y1y0, *B_y0[B_h4] ;store freq.
|
|
|| PACKLH2.S1 A_xt1_1_yt2_1, A_xt2_1_yt1_1, A_yfye ;repack data
|
|
|| PACKLH2.L1 A_xt2_0_yt1_0, A_xt1_0_yt2_0, A_y3y2 ;repack data
|
|
|| ADD2.S2X B_xfxe, A_xbxa, B_xh21_1_xh20_1;x[15]+x[11]
|
|
|| ADD2.L2 B_xh1_0_xh0_0, B_xh21_0_xh20_0, B_y1y0 ;xh1+xh21
|
|
|| SUB2.D1X A_x3x2, B_x7x6, A_xl21_0_xl20_0;x3-x7
|
|
|
|
[!A_p0]STDW.D2T1 A_yfye:A_y7y6, *B_y3[B_h4] ;store freq.
|
|
|| MPYSU.M1 2, A_p0, A_p0 ;prolog count
|
|
|| PACKLH2.S1 A_xt2_1_yt1_1, A_xt1_1_yt2_1, A_ybya ;repack data
|
|
|| ADD2.L2 B_xh1_1_xh0_1, B_xh21_1_xh20_1, B_y9y8 ;xh1+xh21
|
|
|| SUB2.D1X A_xbxa, B_xfxe, A_xl21_1_xl20_1;x[11]-x[15]
|
|
|| SUB2.S2 B_xh1_0_xh0_0, B_xh21_0_xh20_0, B_yt0_0_xt0_0 ;xh1-xh21
|
|
||[!B_ifi]ADD.L1 A_x0, A_n, A_x0 ;x0 += n>>1
|
|
|| AVG2.M2X A_i, B_nm2, B_ifi ;if(j==n>>2){
|
|
|
|
[!A_p0]STDW.D2T1 A_ybya:A_y3y2, *B_y1[B_h4] ;store freq.
|
|
|| SUB2.S1 A_xl0_1_xl1_1, A_xl21_1_xl20_1, A_xt2_1_yt1_1 ;xl0-xl21
|
|
|| ADD2.D1 A_xl0_1_xl1_1, A_xl21_1_xl20_1, A_xt1_1_yt2_1 ;xl0+xl21
|
|
|| ROTL.M1 A_xl1_0_xl0_0, 16, A_xl0_0_xl1_0 ;repack data
|
|
|| SHRU.S2 B_h3, B_l1, B_h4 ;h2 >>= l1
|
|
||[!B_ifi]ADD.L1 A_x1, A_n, A_x1 ;x2 += n>>1
|
|
||[!B_ifi]ADD.L2 B_x0, B_n, B_x0 ;x0 += n>>1
|
|
|| ROTL.M2X A_h1, 16, B_h2 ;rotl(h2, 16)
|
|
|
|
SUB2.S2 B_xh1_1_xh0_1, B_xh21_1_xh20_1, B_yt0_1_xt0_1 ;xh1-xh21
|
|
|| SUB2.L1X A_x9x8, B_xdxc, A_xl1_1_xl0_1 ;x[9]-x[13]
|
|
|| ADD2.L2X B_xdxc, A_x9x8, B_xh1_1_xh0_1 ;x[13]+x[9]
|
|
|| ADD.S1 A_i, 4, A_i ;j += 4;
|
|
|| LDDW.D2T2 *B_x1++[2], B_xfxe:B_xdxc ;x[15..12]
|
|
|| LDDW.D1T1 *A_x0++[2], A_x3x2:A_x1x0 ;x[3..0]
|
|
|| DEAL.M1 A_i, A_h0 ;h2=_deal(j)
|
|
*============================= PIPE LOOP EPILOG ===============================*
|
|
SUB2.L1 A_xl0_0_xl1_0, A_xl21_0_xl20_0, A_xt2_0_yt1_0 ;xl1-xl21
|
|
|| ADD2.S1 A_xl0_0_xl1_0, A_xl21_0_xl20_0, A_xt1_0_yt2_0 ;xl1+xl21
|
|
|| LDW.D2T2 *++B_SP[8], B12 ; safe due to branch ;Restore B12
|
|
|| MV B_SP, A_SP
|
|
|| RET.S2 B3 ;Rtrn to call
|
|
|
|
LDW.D2T1 *-B_SP[7], A12 ; safe due to branch ;Restore A12
|
|
|| LDW.D1T2 *+A_SP[2], B13 ;Restore B13
|
|
|| PACKLH2.S1 A_xt1_0_yt2_0, A_xt2_0_yt1_0, A_y7y6 ;repack data
|
|
;-
|
|
STDW.D2T2 B_yt0_1_xt0_1:B_yt0_0_xt0_0, *B_y2[B_h4] ;store freq.
|
|
|| PACKLH2.S1 A_xt1_1_yt2_1, A_xt2_1_yt1_1, A_yfye ;repack data
|
|
|| PACKLH2.L1 A_xt2_0_yt1_0, A_xt1_0_yt2_0, A_y3y2 ;repack data
|
|
|
|
STDW.D2T1 A_yfye:A_y7y6, *B_y3[B_h4] ;store freq.
|
|
|| PACKLH2.S1 A_xt2_1_yt1_1, A_xt1_1_yt2_1, A_ybya ;repack data
|
|
|| MVC.S2 B_csr, CSR ;rstr interupt
|
|
|
|
STDW.D2T2 B_y9y8:B_y1y0, *B_y0[B_h4] ;store freq.
|
|
|
|
STDW.D2T1 A_ybya:A_y3y2, *B_y1[B_h4] ;store freq.
|
|
;Branch Occurs
|
|
|
|
*================== SYMBOLIC REGISTER ASSIGNMENTS: LOOP_WHILE =================*
|
|
.asg A2, A_i ;1st loop counter
|
|
.asg B18, B_h2 ;offset to n/4 time points
|
|
.asg B19, B_l1 ;offset to n/2 time points
|
|
.asg A3, A_xl21_0_xl20_0 ;data into butterflies
|
|
.asg B26, B_xh21_0_xh20_0 ;data into butterflies
|
|
.asg A7, A_xl21_1_xl20_1 ;data into butterflies
|
|
.asg B11, B_j ;counter for twiddle factors
|
|
.asg A0, A_p0 ;prolog collapse variable
|
|
*==============================================================================*
|
|
; --- this code was relocated from above
|
|
;-
|
|
LOOP_WHILE_B:
|
|
SHRU.S2 B_fft_jmp, 1, B_fft_jmp_1 ;fft_jmp/2
|
|
|| MV.S1X B_h2, A_h2 ;copy h2
|
|
|
|
B.S2 LOOP_Y5 + 24 ;trunc. prolog
|
|
|| SHRU.S1X B_fft_jmp, 3, A_fft_jmp ;w=ptr_w+
|
|
|| ADDAH.D1 A_ptr_w, A_tw_offset, A_w0 ;tw_offset
|
|
|
|
LDDW.D2T2 *B_x[B_l1], B_xl1_3_xl1_2:B_xl1_1_xl1_0 ;xl1[+0..3]
|
|
|| MV.D1X B_l1, A_l1 ;cpy l1
|
|
;-
|
|
LDDW.D2T1 *B_x[0], A_x_3_x_2:A_x_1_x_0 ;x0..3]
|
|
|| SUB.L1 A_fft_jmp, 3, A_fft_jmp ;tw_offset+=
|
|
|| ADD.S1X A_tw_offset, B_fft_jmp_1, A_tw_offset ; fft_jmp
|
|
|
|
SUB2.S1X A_xh2_1_xh2_0, B_xl2_1_xl2_0, A_xl21_0_xl20_0;x[h2]-x[l2]
|
|
|| B.S2 LOOP_Y8 + 12 ;trunc prolog
|
|
|| MV.L2X A_w0, B_w0 ;cpy w0
|
|
|| ADD.L1 A_w0, 8, A_w1 ;w1=w0+1
|
|
;-
|
|
LDDW.D1T2 *A_w1[0], B_co21_si21:B_co20_si20 ;W^2n+1,W^2n
|
|
|| LDDW.D2T1 *B_w0[0], A_co11_si11:A_co10_si10 ;W^n+1,W^n
|
|
|| ADD2.L2X B_xl2_1_xl2_0, A_xh2_1_xh2_0, B_xh21_0_xh20_0;xh2=x[l2]+x[h2]
|
|
|| SUB2.L1X A_xh2_3_xh2_2, B_xl2_3_xl2_2, A_xl21_1_xl20_1;xl2=x[l2]-x[h2]
|
|
|| MPYSU.M1 0, A_zero, A_zero ;zero = 0
|
|
|| MV.S1 A_fft_jmp, A_ifj ;ify=fft_jmp
|
|
;-
|
|
MVD.M1X B_x, A_x_ ;x_=x
|
|
|| BDEC.S1 LOOP_Y, A_i ;trunc prolog
|
|
|| SHRU.S2 B_stride, 2, B_stride ;stride>>=2
|
|
|| MPYSU.M2 0, B_j, B_j ;j = 0
|
|
|| ZERO.D1 A_j ;j = 0
|
|
|| MVK.L1 1, A_p0 ;p0=1
|
|
|| ADD.L2X A_w1, 8, B_w2 ;w2=w1+1
|
|
|
|
* ========================================================================= *
|
|
* End of file: dsp_fft.asm *
|
|
* ------------------------------------------------------------------------- *
|
|
* Copyright (c) 2003 Texas Instruments, Incorporated. *
|
|
* All Rights Reserved. *
|
|
* ========================================================================= *
|
|
|