c6416_sdk/dsplib/fft.asm


								;* ======================================================================== *;

								;*  TEXAS INSTRUMENTS, INC.                                                 *;

								;*                                                                          *;

								;*  DSPLIB  DSP Signal Processing Library                                   *;

								;*                                                                          *;

								;*      Release:        Revision 1.04b                                      *;

								;*      CVS Revision:   1.14    Sun Sep 29 03:32:20 2002 (UTC)              *;

								;*      Snapshot date:  23-Oct-2003                                         *;

								;*                                                                          *;

								;*  This library contains proprietary intellectual property of Texas        *;

								;*  Instruments, Inc.  The library and its source code are protected by     *;

								;*  various copyrights, and portions may also be protected by patents or    *;

								;*  other legal protections.                                                *;

								;*                                                                          *;

								;*  This software is licensed for use with Texas Instruments TMS320         *;

								;*  family DSPs.  This license was provided to you prior to installing      *;

								;*  the software.  You may review this license by consulting the file       *;

								;*  TI_license.PDF which accompanies the files in this library.             *;

								;* ------------------------------------------------------------------------ *;

								;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;

								;*                          All Rights Reserved.                            *;

								;* ======================================================================== *;


								;* ======================================================================== *;

								;*  Assembler compatibility shim for assembling 4.30 and later code on      *;

								;*  tools prior to 4.30.                                                    *;

								;* ======================================================================== *;


								        .if $isdefed(".ASSEMBLER_VERSION")

								        .asg    .ASSEMBLER_VERSION, $asmver

								        .else

								        .asg    0,    $asmver

								        .endif


								        .if ($asmver < 430)


								        .asg    B,    CALL     ; Function Call

								        .asg    B,    RET      ; Return from a Function

								        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.


								        .if .TMS320C6400

								        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call

								        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return

								        .asg    BNOP, CRNOP    ; C64x Fn call w/, Call/Ret chaining via BNOP.

								        .endif


								        .asg    , .asmfunc     ; .func equivalent for hand-assembly code

								        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code


								        .endif


								;* ======================================================================== *;

								;*  End of assembler compatibility shim.                                    *;

								;* ======================================================================== *;


								* ========================================================================= *

								*                                                                           *

								*   TEXAS INSTRUMENTS, INC.                                                 *

								*                                                                           *

								*   NAME                                                                    *

								*       DSP_fft                                                             *

								*                                                                           *

								*                                                                           *

								*   REVISION DATE                                                           *

								*       16-Oct-2000                                                         *

								*                                                                           *

								*   USAGE                                                                   *

								*       This routine is C-callable and can be called as:                    *

								*                                                                           *

								*       void DSP_fft(const short *w, int nsamp, short *x, short *y);        *

								*                                                                           *

								*       nsamp = length of DSP_fft in complex samples                        *

								*       x     = pointer to complex data input, time domain                  *

								*       w     = pointer to complex twiddle factors                          *

								*       y     = pointer to complex data output, frequency domain            *

								*                                                                           *

								*   DESCRIPTION                                                             *

								*       This code performs a Radix-4 FFT with digit reversal.  The code     *

								*       uses a special ordering of twiddle factors and memory accesses      *

								*       to improve performance in the presence of cache.  It operates       *

								*       largely in-place, but the final digit-reversed output is written    *

								*       out-of-place.                                                       *

								*                                                                           *

								*       This code requires a special sequence of twiddle factors stored     *

								*       in Q.15 fixed-point format.  The following C code illustrates       *

								*       one way to generate the desired twiddle-factor array:               *

								*                                                                           *

								*       #include <math.h>                                                   *

								*                                                                           *

								*       #ifndef PI                                                          *

								*       # define PI (3.14159265358979323846)                                *

								*       #endif                                                              *

								*                                                                           *

								*       short d2s(double d)                                                 *

								*       {                                                                   *

								*           d = floor(0.5 + d);  /* Explicit rounding to integer */         *

								*           if (d >=  32767.0) return  32767;                               *

								*           if (d <= -32768.0) return -32768;                               *

								*           return (short)d;                                                *

								*       }                                                                   *

								*                                                                           *

								*       void gen_twiddle(short *w, int n)                                   *

								*       {                                                                   *

								*           double M = 32767.5;                                             *

								*           int i, j, k;                                                    *

								*                                                                           *

								*           for (j = 1, k = 0; j < n >> 2; j = j << 2)                      *

								*           {                                                               *

								*               for (i = 0; i < n >> 2; i += j << 1)                        *

								*               {                                                           *

								*                   w[k + 11] = d2s(M * cos(6.0 * PI * (i + j) / n));       *

								*                   w[k + 10] = d2s(M * sin(6.0 * PI * (i + j) / n));       *

								*                   w[k +  9] = d2s(M * cos(6.0 * PI * (i    ) / n));       *

								*                   w[k +  8] = d2s(M * sin(6.0 * PI * (i    ) / n));       *

								*                                                                           *

								*                   w[k +  7] = d2s(M * cos(4.0 * PI * (i + j) / n));       *

								*                   w[k +  6] = d2s(M * sin(4.0 * PI * (i + j) / n));       *

								*                   w[k +  5] = d2s(M * cos(4.0 * PI * (i    ) / n));       *

								*                   w[k +  4] = d2s(M * sin(4.0 * PI * (i    ) / n));       *

								*                                                                           *

								*                   w[k +  3] = d2s(M * cos(2.0 * PI * (i + j) / n));       *

								*                   w[k +  2] = d2s(M * sin(2.0 * PI * (i + j) / n));       *

								*                   w[k +  1] = d2s(M * cos(2.0 * PI * (i    ) / n));       *

								*                   w[k +  0] = d2s(M * sin(2.0 * PI * (i    ) / n));       *

								*                                                                           *

								*                   k += 12;                                                *

								*               }                                                           *

								*           }                                                               *

								*           w[2*n - 1] = w[2*n - 3] = w[2*n - 5] = 32767;                   *

								*           w[2*n - 2] = w[2*n - 4] = w[2*n - 6] = 0;                       *

								*       }                                                                   *

								*                                                                           *

								*   ASSUMPTIONS                                                             *

								*       n must be a power of 4 and n >= 16 & n < 32768.                     *

								*       FFT data x are aligned on a double word boundary, in real/imag      *

								*       pairs, FFT twiddle factors w are also aligned on a double word      *

								*       boundary in real/imaginary pairs.                                   *

								*                                                                           *

								*       Input FFT coeffs. are in signed Q.15 format.                        *

								*       The memory Configuration is LITTLE ENDIAN.                          *

								*       The complex data will be returned in natural order. This code is    *

								*       uninteruptable, interupts are disabled on entry to the function and *

								*       re-enabled on exit.                                                 *

								*                                                                           *

								*   MEMORY NOTE                                                             *

								*       No bank conflict stalls occur in this code.                         *

								*                                                                           *

								*   TECHNIQUES                                                              *

								*       A special sequence of coefficients. are used (as generated above)   *

								*       to produce the DSP_fft. This collapses the inner 2 loops in the     *

								*       taditional Burrus and Parks implementation Fortran Code.            *

								*                                                                           *

								*      The following C code represents an implementation of the Cooley      *

								*      Tukey radix 4 DIF FFT. It accepts the inputs in normal order and     *

								*      produces the outputs in digit reversed order. The natural C code     *

								*      shown in this file on the other hand, accepts the inputs in nor-     *

								*      mal order and produces the outputs in normal order.                  *

								*                                                                           *

								*      Several transformations have been applied to the original Cooley     *

								*      Tukey code to produce the natural C code description shown here.     *

								*      In order to understand these it would first be educational to        *

								*      understand some of the issues involved in the conventional Cooley    *

								*      Tukey FFT code.                                                      *

								*                                                                           *

								*       void radix4(int n, short x[], short wn[])                           *

								*       {                                                                   *

								*           int    n1,  n2,  ie,   ia1,  ia2, ia3;                          *

								*           int    i0,  i1,  i2,    i3,    i, j,     k;                     *

								*           short  co1, co2, co3,  si1,  si2, si3;                          *

								*           short  xt0, yt0, xt1,  yt1,  xt2, yt2;                          *

								*           short  xh0, xh1, xh20, xh21, xl0, xl1,xl20,xl21;                *

								*                                                                           *

								*           n2 = n;                                                         *

								*           ie = 1;                                                         *

								*           for (k = n; k > 1; k >>= 2)                                     *

								*           {                                                               *

								*               n1 = n2;                                                    *

								*               n2 >>= 2;                                                   *

								*               ia1 = 0;                                                    *

								*                                                                           *

								*               for (j = 0; j < n2; j++)                                    *

								*               {                                                           *

								*                    ia2 = ia1 + ia1;                                       *

								*                    ia3 = ia2 + ia1;                                       *

								*                                                                           *

								*                    co1 = wn[2 * ia1    ];                                 *

								*                    si1 = wn[2 * ia1 + 1];                                 *

								*                    co2 = wn[2 * ia2    ];                                 *

								*                    si2 = wn[2 * ia2 + 1];                                 *

								*                    co3 = wn[2 * ia3    ];                                 *

								*                    si3 = wn[2 * ia3 + 1];                                 *

								*                    ia1 = ia1 + ie;                                        *

								*                                                                           *

								*                    for (i0 = j; i0< n; i0 += n1)                          *

								*                    {                                                      *

								*                        i1 = i0 + n2;                                      *

								*                        i2 = i1 + n2;                                      *

								*                        i3 = i2 + n2;                                      *

								*                                                                           *

								*                                                                           *

								*                        xh0  = x[2 * i0    ] + x[2 * i2    ];              *

								*                        xh1  = x[2 * i0 + 1] + x[2 * i2 + 1];              *

								*                        xl0  = x[2 * i0    ] - x[2 * i2    ];              *

								*                        xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];              *

								*                                                                           *

								*                        xh20 = x[2 * i1    ] + x[2 * i3    ];              *

								*                        xh21 = x[2 * i1 + 1] + x[2 * i3 + 1];              *

								*                        xl20 = x[2 * i1    ] - x[2 * i3    ];              *

								*                        xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];              *

								*                                                                           *

								*                        x[2 * i0    ] = xh0 + xh20;                        *

								*                        x[2 * i0 + 1] = xh1 + xh21;                        *

								*                                                                           *

								*                        xt0  = xh0 - xh20;                                 *

								*                        yt0  = xh1 - xh21;                                 *

								*                        xt1  = xl0 + xl21;                                 *

								*                        yt2  = xl1 + xl20;                                 *

								*                        xt2  = xl0 - xl21;                                 *

								*                        yt1  = xl1 - xl20;                                 *

								*                                                                           *

								*                        x[2 * i1    ] = (xt1 * co1 + yt1 * si1) >> 15;     *

								*                        x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15;     *

								*                        x[2 * i2    ] = (xt0 * co2 + yt0 * si2) >> 15;     *

								*                        x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15;     *

								*                        x[2 * i3    ] = (xt2 * co3 + yt2 * si3) >> 15;     *

								*                        x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15;     *

								*                    }                                                      *

								*              }                                                            *

								*                                                                           *

								*              ie <<= 2;                                                    *

								*          }                                                                *

								*      }                                                                    *

								*                                                                           *

								*       The conventional Cooley Tukey FFT, is written using three loops.    *

								*       The outermost loop "k" cycles through the stages. There are log     *

								*       N to the base 4 stages in all. The loop "j" cycles through the      *

								*       groups of butterflies with different twiddle factors, loop "i"      *

								*       reuses the twiddle factors for the different butterflies within     *

								*       a stage. It is interesting to note the following:                   *

								*                                                                           *

								*---------------------------------------------------------------------------*

								*       Stage#     #Groups     # Butterflies with common     #Groups*Bflys  *

								*                                twiddle factors                            *

								*---------------------------------------------------------------------------*

								*        1         N/4          1                            N/4            *

								*        2         N/16         4                            N/4            *

								*        ..                                                                 *

								*        logN      1            N/4                          N/4            *

								*---------------------------------------------------------------------------*

								*                                                                           *

								*       The following statements can be made based on above observations:   *

								*                                                                           *

								*       a) Inner loop "i0" iterates a veriable number of times. In          *

								*       particular the number of iterations quadruples every time from      *

								*       1..N/4. Hence software pipelining a loop that iterates a vraiable   *

								*       number of times is not profitable.                                  *

								*                                                                           *

								*       b) Outer loop "j" iterates a variable number of times as well.      *

								*       However the number of iterations is quartered every time from       *

								*       N/4 ..1. Hence the behaviour in (a) and (b) are exactly opposite    *

								*       to each other.                                                      *

								*                                                                           *

								*       c) If the two loops "i" and "j" are colaesced together then they    *

								*       will iterate for a fixed number of times namely N/4. This allows    *

								*       us to combine the "i" and "j" loops into 1 loop. Optimized impl-    *

								*       ementations will make use of this fact.                             *

								*                                                                           *

								*       In addition the Cooley Tukey FFT accesses three twiddle factors     *

								*       per iteration of the inner loop, as the butterflies that re-use     *

								*       twiddle factors are lumped together. This leads to accessing the    *

								*       twiddle factor array at three points each sepearted by "ie". Note   *

								*       that "ie" is initially 1, and is quadrupled with every iteration.   *

								*       Therfore these three twiddle factors are not even contiguous in     *

								*       the array.                                                          *

								*                                                                           *

								*       In order to vectorize the FFT, it is desirable to access twiddle    *

								*       factor array using double word wide loads and fetch the twiddle     *

								*       factors needed. In order to do this a modified twiddle factor       *

								*       array is created, in which the factors WN/4, WN/2, W3N/4 are        *

								*       arranged to be contiguous. This eliminates the seperation between   *

								*       twiddle factors within a butterfly. However this implies that as    *

								*       the loop is traversed from one stage to another, that we maintain   *

								*       a redundant version of the twiddle factor array. Hence the size     *

								*       of the twiddle factor array increases as compared to the normal     *

								*       Cooley Tukey FFT.  The modified twiddle factor array is of size     *

								*       "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4"    *

								*       where N is the number of complex points to be transformed. The      *

								*       routine that generates the modified twiddle factor array was        *

								*       presented earlier. With the above transformation of the FFT,        *

								*       both the input data and the twiddle factor array can be accessed    *

								*       using double-word wide loads to enable packed data processing.      *

								*                                                                           *

								*       The final stage is optimised to remove the multiplication as        *

								*       w0 = 1.  This stage also performs digit reversal on the data,       *

								*       so the final output is in natural order.                            *

								*                                                                           *

								*       The DSP_fft() code shown here performs the bulk of the computation  *

								*       in place. However, because digit-reversal cannot be performed       *

								*       in-place, the final result is written to a separate array, y[].     *

								*                                                                           *

								*       There is one slight break in the flow of packed processing that     *

								*       needs to be comprehended. The real part of the complex number is    *

								*       in the lower half, and the imaginary part is in the upper half.     *

								*       The flow breaks in case of "xl0" and "xl1" because in this case     *

								*       the real part needs to be combined with the imaginary part because  *

								*       of the multiplication by "j". This requires a packed quantity like  *

								*       "xl21xl20" to be rotated as "xl20xl21" so that it can be combined   *

								*        using add2's and sub2's. Hence the natural version of C code       *

								*       shown below is transformed using packed data processing as shown:   *

								*                                                                           *

								*                        xl0  = x[2 * i0    ] - x[2 * i2    ];              *

								*                        xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];              *

								*                        xl20 = x[2 * i1    ] - x[2 * i3    ];              *

								*                        xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];              *

								*                                                                           *

								*                        xt1  = xl0 + xl21;                                 *

								*                        yt2  = xl1 + xl20;                                 *

								*                        xt2  = xl0 - xl21;                                 *

								*                        yt1  = xl1 - xl20;                                 *

								*                                                                           *

								*                        xl1_xl0   = _sub2(x21_x20, x21_x20)                *

								*                        xl21_xl20 = _sub2(x32_x22, x23_x22)                *

								*                        xl20_xl21 = _rotl(xl21_xl20, 16)                   *

								*                                                                           *

								*                        yt2_xt1   = _add2(xl1_xl0, xl20_xl21)              *

								*                        yt1_xt2   = _sub2(xl1_xl0, xl20_xl21)              *

								*                                                                           *

								*       Also notice that xt1, yt1 endup on seperate words, these need to    *

								*       be packed together to take advantage of the packed twiddle fact     *

								*       ors that have been loaded. In order for this to be achieved they    *

								*       are re-aligned as follows:                                          *

								*                                                                           *

								*       yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1)                                *

								*       yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2)                                *

								*                                                                           *

								*       The packed words "yt1_xt1" allows the loaded"sc" twiddle factor     *

								*       to be used for the complex multiplies. The real part os the         *

								*       complex multiply is implemented using _dotp2. The imaginary         *

								*       part of the complex multiply is implemented using _dotpn2           *

								*       after the twiddle factors are swizzled within the half word.        *

								*                                                                           *

								*       (X + jY) ( C + j S) = (XC + YS) + j (YC - XS).                      *

								*                                                                           *

								*       The actual twiddle factors for the FFT are cosine, - sine. The      *

								*       twiddle factors stored in the table are csine and sine, hence       *

								*       the sign of the "sine" term is comprehended during multipli-        *

								*       cation as shown above.                                              *

								*                                                                           *

								*   CYCLES                                                                  *

								*       cycles = 1.25*nsamp*log4(nsamp) - 0.5*nsamp + 23*log4(nsamp) - 1    *

								*                                                                           *

								*       For nsamp = 1024,  cycles = 6002                                    *

								*       For nsamp = 256,   cycles = 1243                                    *

								*       For nsamp = 64,    cycles = 276                                     *

								*                                                                           *

								*   CODESIZE                                                                *

								*       988 bytes                                                           *

								*                                                                           *

								*   C CODE                                                                  *

								*       This is the C equivalent of the assembly code without restrictions: *

								*       Note that the assembly code is hand optimized and restrictions may  *

								*       apply.                                                              *

								*                                                                           *

								* void DSP_fft(short *ptr_w, int n, short *ptr_x, short *ptr_y)             *

								* {                                                                         *

								*    int   i, j, l1, l2, h2, predj, tw_offset, stride, fft_jmp;             *

								*    short xt0_0, yt0_0, xt1_0, yt1_0, xt2_0, yt2_0;                        *

								*    short xt0_1, yt0_1, xt1_1, yt1_1, xt2_1, yt2_1;                        *

								*    short xh0_0, xh1_0, xh20_0, xh21_0, xl0_0, xl1_0, xl20_0, xl21_0;      *

								*    short xh0_1, xh1_1, xh20_1, xh21_1, xl0_1, xl1_1, xl20_1, xl21_1;      *

								*    short x_0, x_1, x_2, x_3, x_l1_0, x_l1_1, x_l1_2, x_l1_3, x_l2_0:      *

								*    short x_10, x_11, x_12, x_13, x_14, x_15, x_16, x_17, x_l2_1, x_h2_3;  *

								*    short x_4, x_5, x_6, x_7, x_l2_2, x_l2_3, x_h2_0, x_h2_1, x_h2_2;      *

								*    short si10, si20, si30, co10, co20, co30;                              *

								*    short si11, si21, si31, co11, co21, co31;                              *

								*    short * x, *w, * x2, * x0;                                             *

								*    short * y0, * y1, * y2, *y3;                                           *

								*                                                                           *

								*    stride = n; -* n is the number of complex samples *-                   *

								*    tw_offset = 0;                                                         *

								*    while (stride > 4)  /* for all strides > 4 */                          *

								*    {                                                                      *

								*        j = 0;                                                             *

								*        fft_jmp = stride + (stride>>1);                                    *

								*        h2 = stride>>1;                          /* n/4 */                 *

								*        l1 = stride;                             /* n/2 */                 *

								*        l2 = stride + (stride>>1);               /* 3n/4 */                *

								*        x = ptr_x;                                                         *

								*        w = ptr_w + tw_offset;                                             *

								*        tw_offset += fft_jmp;                                              *

								*        stride = stride>>2;                                                *

								*                                                                           *

								*        for (i = 0; i < n>>1; i += 4)                                      *

								*        {                                                                  *

								*            co10 = w[j+1];    si10 = w[j+0];   /* W  */                    *

								*            co11 = w[j+3];    si11 = w[j+2];                               *

								*            co20 = w[j+5];    si20 = w[j+4];   /* W^2 */                   *

								*            co21 = w[j+7];    si21 = w[j+6];                               *

								*            co30 = w[j+9];    si30 = w[j+8];   /* W^3 */                   *

								*            co31 = w[j+11];   si31 = w[j+10];                              *

								*                                                                           *

								*            x_0 = x[0];       x_1 = x[1];         /* perform 2 parallel */ *

								*            x_2 = x[2];       x_3 = x[3];         /* radix4 butterflies */ *

								*                                                                           *

								*            x_l1_0 = x[l1  ]; x_l1_1 = x[l1+1];                            *

								*            x_l1_2 = x[l1+2]; x_l1_3 = x[l1+3];                            *

								*                                                                           *

								*            x_l2_0 = x[l2  ]; x_l2_1 = x[l2+1];                            *

								*            x_l2_2 = x[l2+2]; x_l2_3 = x[l2+3];                            *

								*                                                                           *

								*            x_h2_0 = x[h2  ]; x_h2_1 = x[h2+1];                            *

								*            x_h2_2 = x[h2+2]; x_h2_3 = x[h2+3];                            *

								*                                                                           *

								*            xh0_0  = x_0    + x_l1_0; xh1_0  = x_1    + x_l1_1;            *

								*            xh0_1  = x_2    + x_l1_2; xh1_1  = x_3    + x_l1_3;            *

								*                                                                           *

								*            xl0_0  = x_0    - x_l1_0; xl1_0  = x_1    - x_l1_1;            *

								*            xl0_1  = x_2    - x_l1_2; xl1_1  = x_3    - x_l1_3;            *

								*                                                                           *

								*            xh20_0 = x_h2_0 + x_l2_0; xh21_0 = x_h2_1 + x_l2_1;            *

								*            xh20_1 = x_h2_2 + x_l2_2; xh21_1 = x_h2_3 + x_l2_3;            *

								*                                                                           *

								*            xl20_0 = x_h2_0 - x_l2_0; xl21_0 = x_h2_1 - x_l2_1;            *

								*            xl20_1 = x_h2_2 - x_l2_2; xl21_1 = x_h2_3 - x_l2_3;            *

								*                                                                           *

								*            x0 = x;                                                        *

								*            x2 = x0;                 /* copy pointers for output*/         *

								*                                                                           *

								*            j += 12;                                                       *

								*            x += 4;                                                        *

								*            predj = (j - fft_jmp);   /* check if reached end of */         *

								*            if (!predj) x += fft_jmp;/* current twiddle factor section */  *

								*            if (!predj) j = 0;                                             *

								*                                                                           *

								*            x0[0] = xh0_0 + xh20_0; x0[1] = xh1_0 + xh21_0;                *

								*            x0[2] = xh0_1 + xh20_1; x0[3] = xh1_1 + xh21_1;                *

								*                                                                           *

								*            xt0_0 = xh0_0 - xh20_0;  yt0_0 = xh1_0 - xh21_0;               *

								*            xt0_1 = xh0_1 - xh20_1;  yt0_1 = xh1_1 - xh21_1;               *

								*                                                                           *

								*            xt1_0 = xl0_0 + xl21_0;  yt2_0 = xl1_0 + xl20_0;               *

								*            xt2_0 = xl0_0 - xl21_0;  yt1_0 = xl1_0 - xl20_0;               *

								*            xt1_1 = xl0_1 + xl21_1;  yt2_1 = xl1_1 + xl20_1;               *

								*            xt2_1 = xl0_1 - xl21_1;  yt1_1 = xl1_1 - xl20_1;               *

								*                                                                           *

								*            x2[h2  ] = (si10 * yt1_0 + co10 * xt1_0) >> 15;                *

								*            x2[h2+1] = (co10 * yt1_0 - si10 * xt1_0) >> 15;                *

								*                                                                           *

								*            x2[h2+2] = (si11 * yt1_1 + co11 * xt1_1) >> 15;                *

								*            x2[h2+3] = (co11 * yt1_1 - si11 * xt1_1) >> 15;                *

								*                                                                           *

								*            x2[l1  ] = (si20 * yt0_0 + co20 * xt0_0) >> 15;                *

								*            x2[l1+1] = (co20 * yt0_0 - si20 * xt0_0) >> 15;                *

								*                                                                           *

								*            x2[l1+2] = (si21 * yt0_1 + co21 * xt0_1) >> 15;                *

								*            x2[l1+3] = (co21 * yt0_1 - si21 * xt0_1) >> 15;                *

								*                                                                           *

								*            x2[l2  ] = (si30 * yt2_0 + co30 * xt2_0) >> 15;                *

								*            x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0) >> 15;                *

								*                                                                           *

								*            x2[l2+2] = (si31 * yt2_1 + co31 * xt2_1) >> 15;                *

								*            x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1) >> 15;                *

								*        }                                                                  *

								*    }-* end while *-                                                       *

								*                                                                           *

								*    y0 = ptr_y;                                                            *

								*    y1 = y0 + (int)(n>>1);                                                 *

								*    y2 = y1 + (int)(n>>1);                                                 *

								*    y3 = y2 + (int)(n>>1);                                                 *

								*    x0 = ptr_x;                                                            *

								*    x2 = ptr_x + (int)(n>>1);                                              *

								*    l1 = _norm(n) + 2;                                                     *

								*    j = 0;                                                                 *

								*    for (i = 0; i < n; i += 8)                                             *

								*    {                                                                      *

								*        h2 = _deal(j);                                                     *

								*        h2 = _bitr(h2);                                                    *

								*        h2 = _rotl(h2, 16);                                                *

								*        h2 = _shfl(h2);                                                    *

								*        h2 >>= l1;                                                         *

								*                                                                           *

								*        x_0 = x0[0]; x_1 = x0[1];                                          *

								*        x_2 = x0[2]; x_3 = x0[3];                                          *

								*        x_4 = x0[4]; x_5 = x0[5];                                          *

								*        x_6 = x0[6]; x_7 = x0[7];                                          *

								*        x0 += 8;                                                           *

								*                                                                           *

								*        xh0_0  = x_0 + x_4; xh1_0  = x_1 + x_5;                            *

								*        xl0_0  = x_0 - x_4; xl1_0  = x_1 - x_5;                            *

								*        xh20_0 = x_2 + x_6; xh21_0 = x_3 + x_7;                            *

								*        xl20_0 = x_2 - x_6; xl21_0 = x_3 - x_7;                            *

								*                                                                           *

								*        xt0_0 = xh0_0 - xh20_0;                                            *

								*        yt0_0 = xh1_0 - xh21_0;                                            *

								*        xt1_0 = xl0_0 + xl21_0;                                            *

								*        yt2_0 = xl1_0 + xl20_0;                                            *

								*        xt2_0 = xl0_0 - xl21_0;                                            *

								*        yt1_0 = xl1_0 - xl20_0;                                            *

								*                                                                           *

								*        y0[2*h2  ] = xh0_0 + xh20_0;                                       *

								*        y0[2*h2+1] = xh1_0 + xh21_0;                                       *

								*        y1[2*h2  ] = xt1_0;                                                *

								*        y1[2*h2+1] = yt1_0;                                                *

								*        y2[2*h2  ] = xt0_0;                                                *

								*        y2[2*h2+1] = yt0_0;                                                *

								*        y3[2*h2  ] = xt2_0;                                                *

								*        y3[2*h2+1] = yt2_0;                                                *

								*                                                                           *

								*        x_10 = x2[0]; x_11 = x2[1];                                        *

								*        x_12 = x2[2]; x_13 = x2[3];                                        *

								*        x_14 = x2[4]; x_15 = x2[5];                                        *

								*        x_16 = x2[6]; x_17 = x2[7];                                        *

								*        x2 += 8;                                                           *

								*                                                                           *

								*        xh0_1  = x_10 + x_14; xh1_1  = x_11 + x_15;                        *

								*        xl0_1  = x_10 - x_14; xl1_1  = x_11 - x_15;                        *

								*        xh20_1 = x_12 + x_16; xh21_1 = x_13 + x_17;                        *

								*        xl20_1 = x_12 - x_16; xl21_1 = x_13 - x_17;                        *

								*                                                                           *

								*        xt0_1 = xh0_1 - xh20_1;                                            *

								*        yt0_1 = xh1_1 - xh21_1;                                            *

								*        xt1_1 = xl0_1 + xl21_1;                                            *

								*        yt2_1 = xl1_1 + xl20_1;                                            *

								*        xt2_1 = xl0_1 - xl21_1;                                            *

								*        yt1_1 = xl1_1 - xl20_1;                                            *

								*                                                                           *

								*        y0[2*h2+2] = xh0_1 + xh20_1;                                       *

								*        y0[2*h2+3] = xh1_1 + xh21_1;                                       *

								*        y1[2*h2+2] = xt1_1;                                                *

								*        y1[2*h2+3] = yt1_1;                                                *

								*        y2[2*h2+2] = xt0_1;                                                *

								*        y2[2*h2+3] = yt0_1;                                                *

								*        y3[2*h2+2] = xt2_1;                                                *

								*        y3[2*h2+3] = yt2_1;                                                *

								*                                                                           *

								*        j += 4;                                                            *

								*        if (j == n>>2)                                                     *

								*        {                                                                  *

								*          j  += n>>2;                                                      *

								*          x0 += (int) n>>1;                                                *

								*          x2 += (int) n>>1;                                                *

								*        }                                                                  *

								*      }                                                                    *

								*  }                                                                        *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *


								        .sect ".text:_fft"

								        .global _DSP_fft

								_DSP_fft:

								*================== SYMBOLIC REGISTER ASSIGNMENTS: SETUP ======================*

								       .asg         B15,        B_SP            ;Stack pointer, B datapath

								       .asg         A31,        A_SP            ;Copy of stack pointer

								       .asg         A4,         A_ptr_w         ;poiter to twiddle factors

								       .asg         B4,         B_n             ;number of points in transform

								       .asg         A6,         A_ptr_x         ;pointer to timne domain data

								       .asg         B6,         B_ptr_y         ;pointer to frequency domain

								       .asg         B12,        B_stride        ;offset to n/2 complex pt

								       .asg         B5,         B_stride_1      ;offset to n/4 complex pt

								       .asg         A12,        A_tw_offset     ;increment to twiddle table

								       .asg         A3,         A_w0            ;pointer to twiddle factor

								       .asg         B5,         B_fft_jmp_1     ;jump factor for time data

								       .asg         A20,        A_zero          ;cont = 0

								       .asg         B16,        B_w0            ;pointer to twiddle factor

								       .asg         A21,        A_w1            ;twiddle factor + 2 dword

								       .asg         B17,        B_w2            ;twiddle factor + 4 dword

								       .asg         B11,        B_j             ;counter for twiddle factors

								       .asg         A19,        A_j             ;counter for twiddle factors

								       .asg         B10,        B_x             ;pointer to time domaindata

								       .asg         B18,        B_h2            ;offset to n/4 time points

								       .asg         B19,        B_l1            ;offset to n/2 time points

								       .asg         B20,        B_l2            ;offset to 3n/4 time points

								       .asg         A22,        A_fft_jmp       ;jump for twiddle factors

								       .asg         B21,        B_fft_jmp       ;jump for twiddle factors

								       .asg         A0,         A_p0            ;prolog collapse variable

								       .asg         A2,         A_i             ;1st loop counter

								       .asg         A25,        A_h2            ;offset to n/4 freq. points

								       .asg         A23,        A_l1            ;offset to n/2 freq. points

								       .asg         A24,        A_l2            ;offset to 3n/4 freq. points

								       .asg         A31,        A_co11_si11     ;W^n+1

								       .asg         A30,        A_co10_si10     ;W^n

								       .asg         B29,        B_co21_si21     ;W^2n+1

								       .asg         B28,        B_co20_si20     ;W^2n

								       .asg         B31,        B_co31_si31     ;W^3n+1

								       .asg         B30,        B_co30_si30     ;W^3n

								       .asg         A29,        A_x_3_x_2       ;data into butterflies

								       .asg         A28,        A_x_1_x_0       ;data into butterflies

								       .asg         A27,        A_xh2_3_xh2_2   ;data itno butterflies

								       .asg         A26,        A_xh2_1_xh2_0   ;data into butterflies

								       .asg         B23,        B_xl1_3_xl1_2   ;data into butterflies

								       .asg         B22,        B_xl1_1_xl1_0   ;data into butterflies

								       .asg         B25,        B_xl2_3_xl2_2   ;data into butterflies

								       .asg         B24,        B_xl2_1_xl2_0   ;data into butterflies

								       .asg         B22,        B_xh1_0_xh0_0   ;data into butterflies

								       .asg         B8,         B_xh1_1_xh0_1   ;data into butterflies

								       .asg         A7,         A_xl1_0_xl0_0   ;data into butterflies

								       .asg         A16,        A_xl1_1_xl0_1   ;data into butterflies

								       .asg         B26,        B_xh21_0_xh20_0 ;data into butterflies

								       .asg         B5,         B_xh21_1_xh20_1 ;data into butterflies

								       .asg         A3,         A_xl21_0_xl20_0 ;data into butterflies

								       .asg         A7,         A_xl21_1_xl20_1 ;data into butterflies

								       .asg         A18,        A_x_            ;copy of freq. pointer

								       .asg         A10,        A_x__           ;copy of freq. pointer

								       .asg         A1,         A_ifj           ;jump condition for DSP_fft

								       .asg         B26,        B_x_1_x_0       ;data out of butterflies

								       .asg         B27,        B_x_3_x_2       ;data out of butterflies

								       .asg         B7,         B_yt0_0_xt0_0   ;data out of butterflies

								       .asg         B8,         B_yt0_1_xt0_1   ;data out of butterflies

								       .asg         A3,         A_xl20_0_xl21_0 ;data out of butterflies

								       .asg         A5,         A_xl20_1_xl21_1 ;data out of butterflies

								       .asg         A8,         A_yt2_0_xt1_0   ;data out of butterflies

								       .asg         A11,        A_yt2_1_xt1_1   ;data out of butterflies

								       .asg         A9,         A_yt1_0_xt2_0   ;data out of butterflies

								       .asg         A16,        A_yt1_1_xt2_1   ;data out of butterflies

								       .asg         A7,         A_xt1_0_yt1_0   ;data out of butterflies

								       .asg         A5,         A_xt1_1_yt1_1   ;data out of butterflies

								       .asg         A8,         A_xt2_0_yt2_0   ;data out of butterflies

								       .asg         A28,        A_xt2_1_yt2_1   ;data out of butterflies

								       .asg         B26,        B_xt0_0_yt0_0   ;data out of butterflies

								       .asg         B9,         B_xt0_1_yt0_1   ;data out of butterflies

								       .asg         A1,         A_yt1_0_xt1_0   ;data out of butterflies

								       .asg         A5,         A_yt1_1_xt1_1   ;data out of butterflies

								       .asg         A3,         A_yt2_0_xt2_0   ;data out of butterflies

								       .asg         A3,         A_yt2_1_xt2_1   ;data out of butterflies

								       .asg         A26,        A_x_h2_0        ;frequency >> 15

								       .asg         A27,        A_x_h2_1        ;frequency >> 15

								       .asg         A17,        A_x_h2_2        ;frequency >> 15

								       .asg         A29,        A_x_h2_3        ;frequency >> 15

								       .asg         B0,         B_x_l1_0        ;frequency >> 15

								       .asg         B2,         B_x_l1_1        ;frequency >> 15

								       .asg         B23,        B_x_l1_2        ;frequency >> 15

								       .asg         B5,         B_x_l1_3        ;frequency >> 15

								       .asg         B1,         B_x_l2_0        ;frequency >> 15

								       .asg         B27,        B_x_l2_1        ;frequency >> 15

								       .asg         B2,         B_x_l2_2        ;frequency >> 15

								       .asg         B7,         B_x_l2_3        ;frequency >> 15

								       .asg         A28,        A_xh2_1_0       ;packed frequencies

								       .asg         A29,        A_xh2_3_2       ;packed frequencies

								       .asg         B22,        B_xl1_1_0       ;packed frequencies

								       .asg         B23,        B_xl1_3_2       ;packed frequencies

								       .asg         B24,        B_xl2_1_0       ;packed frequencies

								       .asg         B25,        B_xl2_3_2       ;packed frequencies

								       .asg         B0,         B_whl           ;outer while loop condition

								       .asg         B13,        B_csr           ;CSR before function call

								       .asg         B10,        B_no_gie        ;CSR before with GIE bit off

								*==============================================================================*

								        MV.L1X     B_SP,          A_SP                          ;make cpy of SP

								||      STW.D2T2   B12,           *B_SP--[8]                    ;Save B12

								||      SHRU.S2    B_n,           3,               B_h2         ;h2=n/8


								        STW.D2T1   A12,           *+B_SP[1]                     ;Save A12 SP[1]

								||      MV.L2      B_n,           B_stride                      ;stride = n

								||      STW.D1T2   B13,           *-A_SP[6]                     ;Save B13 SP[2]

								||      MVC.S2     CSR,           B_csr                         ;save CSR

								||      B.S1       LOOP_WHILE_B                                 ; no ints!

								;-

								        STW.D2T2   B11,           *+B_SP[3]                     ;Save B11 SP[3]

								||      STW.D1T1   A11,           *-A_SP[4]                     ;Save A11 SP[4]

								||      ZERO.L1    A_tw_offset                                  ;tw_offset=0


								        STW.D2T2   B10,           *+B_SP[5]                     ;Save B10 SP[5]

								||      STW.D1T1   A10,           *-A_SP[2]                     ;Save A10 SP[6]

								||      AND.S2     B_csr,         -2,              B_no_gie     ;kill gie bit

								;-

								        MV.L2X     A_ptr_x,       B_x                           ;x= ptr_x

								||      ADDAH.D2   B_h2,          B_h2,            B_l2         ;l2=3*h2

								||      MVC.S2     B_no_gie,      CSR                           ;disable intrpt.


								        LDDW.D2T2  *B_x[B_l2],    B_xl2_3_xl2_2:B_xl2_1_xl2_0   ;x[l2+0..3]

								||      SHRU.S1X   B_n,           3,               A_i          ;i=n>>3

								||      SHL.S2     B_stride,      1,               B_stride_1   ;stride/2


								*============================= PIPE LOOP PROLOG ===============================*

								LOOP_WHILE:

								        LDDW.D2T1  *B_x[B_h2],    A_xh2_3_xh2_2:A_xh2_1_xh2_0   ;x[h2+0..3]

								||      MV.L1X     B_l2,          A_l2                          ;l2 copy

								||      SHRU.S2    B_stride,      2,               B_l1         ;fft_jmp=

								||      ADD.L2     B_stride,      B_stride_1,      B_fft_jmp    ;1.5*stride


								; -- remainder relocated below


								*============================= PIPE LOOP KERNEL ===============================*

								LOOP_Y:

								        ADD.S2     B_x_l1_1,      B_x_l1_1,        B_x_l1_1     ;x[l1+1]>>15

								||      DOTP2.M2X  B_co31_si31,   A_xt2_1_yt2_1,   B_x_l2_2     ;Im{W^3n+1*x2}

								||      ADD.S1     A_x_h2_1,      A_x_h2_1,        A_x_h2_1     ;x[h2+1]>>15

								||      ROTL.M1    A_xt1_0_yt1_0, 16,              A_yt1_0_xt1_0;xt1<=>yt1

								||      PACKLH2.L1 A_yt2_1_xt1_1, A_yt1_1_xt2_1,   A_xt1_1_yt1_1;repack data

								||      ADD.D1     A_j,           3,               A_j          ;j+=3

								||[!A_ifj]ZERO.L2  B_j                                          ;j=0 if at end

								||      LDDW.D2T2  *B_x[B_l1],    B_xl1_3_xl1_2:B_xl1_1_xl1_0   ;x[l1+0..3]


								        PACKH2.S2  B_x_l1_1,      B_x_l1_0,        B_xl1_1_0    ;repack data

								||      PACKH2.L1  A_x_h2_1,      A_x_h2_0,        A_xh2_1_0    ;repack data

								||      ADD2.L2    B_xh21_0_xh20_0, B_xh1_0_xh0_0, B_x_1_x_0    ;xh2x+xhx

								||      ROTL.M2    B_yt0_0_xt0_0, 16,              B_xt0_0_yt0_0;repack data

								||      DOTP2.M1   A_xt1_1_yt1_1, A_co11_si11,     A_x_h2_2     ;Im{W^n+1*X1}

								||      PACKLH2.S1 A_xt1_1_yt1_1, A_xt1_1_yt1_1,   A_yt1_1_xt1_1;repack data

								||[!A_ifj]ZERO.D1  A_j                                          ;j=0?

								||      LDDW.D2T1  *B_x[0],       A_x_3_x_2:A_x_1_x_0           ;x[0..3]


								        ADD.S2     B_x_l2_0,      B_x_l2_0,        B_x_l2_0     ;x[l2]>>15

								||      ADD.L2     B_x_l1_3,      B_x_l1_3,        B_x_l1_3     ;x[l1+3]>>15

								||      ADD.D2     B_x_l1_2,      B_x_l1_2,        B_x_l1_2     ;x[l1+2]>>15

								||      DOTPN2.M2X B_co31_si31,   A_yt2_1_xt2_1,   B_x_l2_3     ;Re{W^3n+1*X2}

								||      STDW.D1T2  B_x_3_x_2:B_x_1_x_0,            *A_x_[0]     ;write x[0..3]

								||      DOTPN2.M1  A_yt1_0_xt1_0, A_co10_si10,     A_x_h2_1     ;Re{X1*W^n}

								||      SUB2.S1X   A_xh2_1_xh2_0, B_xl2_1_xl2_0, A_xl21_0_xl20_0;x[h2]-x[l2]

								||      SUB.L1     A_fft_jmp,     A_j,             A_ifj        ;(fft_jmp==j)?


								        PACKH2.S2  B_x_l1_3,      B_x_l1_2,        B_xl1_3_2    ;repack data

								||      ADD.S1     A_x_h2_3,      A_x_h2_3,        A_x_h2_3     ;x[h2+3]>>15

								||      DOTP2.M2   B_xt0_0_yt0_0, B_co20_si20,     B_x_l1_0     ;Im{W^2n*X0}

								||      DOTP2.M1   A_xt1_0_yt1_0, A_co10_si10,     A_x_h2_0     ;Im{W^n*X1}

								||      LDDW.D1T2  *A_w1[A_j],    B_co21_si21:B_co20_si20       ;W^2n+1,W^2n

								||      LDDW.D2T1  *B_w0[B_j],    A_co11_si11:A_co10_si10       ;W^n+1,W^n

								||      ADD2.L2X   B_xl2_1_xl2_0, A_xh2_1_xh2_0, B_xh21_0_xh20_0;xl2+xh2

								||      SUB2.L1X   A_xh2_3_xh2_2, B_xl2_3_xl2_2, A_xl21_1_xl20_1;xh2-xl2


								  [ A_i]BDEC.S1    LOOP_Y,        A_i                           ;} end for

								||      ADD.D2     B_x_l2_1,      B_x_l2_1,        B_x_l2_1     ;x[l2+1]>>15

								||[!A_p0]STDW.D1T2 B_xl1_3_2:B_xl1_1_0,            *A_x__[A_l1] ;save x[l1+0..3]

								||      ADD.L2     B_x_l2_2,      B_x_l2_2,        B_x_l2_2     ;x[l2+2]>>15

								||      PACKH2.L1  A_x_h2_3,      A_x_h2_2,        A_xh2_3_2    ;repack data

								||      DOTPN2.M2  B_yt0_1_xt0_1, B_co21_si21,     B_x_l1_3     ;Im{W^2n+1*X0}

								||[!A_ifj]ADD.S2   B_x,           B_fft_jmp,       B_x          ;x+=fft_fmp?

								||      MVD.M1X    B_x,           A_x_                          ;x_=x

								LOOP_Y5:

								        PACKH2.S2  B_x_l2_1,      B_x_l2_0,        B_xl2_1_0    ;repack data

								||[!A_p0]STDW.D1T1 A_xh2_3_2:A_xh2_1_0,            *A_x__[A_h2] ;save x[h2+0..3]

								||      PACKLH2.S1 A_yt1_0_xt2_0, A_yt2_0_xt1_0,   A_xt2_0_yt2_0;repack datat

								||      ADD.L1     A_x_h2_2,      A_x_h2_2,        A_x_h2_2     ;x[h2+2]>>15

								||      DOTPN2.M1  A_yt1_1_xt1_1, A_co11_si11,     A_x_h2_3     ;Re{W^n+1*X1}

								||      DOTPN2.M2  B_yt0_0_xt0_0, B_co20_si20,     B_x_l1_1     ;Re{W^2n*X0}

								||      ADD2.D2X   B_xl2_3_xl2_2, A_xh2_3_xh2_2, B_xh21_1_xh20_1;xl2+xh2

								||      ADD.L2     B_x,           8,               B_x          ;x+=8


								        ADD.L2     B_x_l2_3,      B_x_l2_3,        B_x_l2_3     ;x[l2+3]>>15

								||      ROTL.M1    A_xt2_0_yt2_0, 16,              A_yt2_0_xt2_0;repack data

								||      DOTP2.M2   B_xt0_1_yt0_1, B_co21_si21,     B_x_l1_2     ;Im{W^2n+1*X0}

								||      ADD2.S2X   B_xl1_3_xl1_2, A_x_3_x_2,       B_xh1_1_xh0_1;x[l1]+x[0]

								||      PACKLH2.L1 A_xl21_1_xl20_1,A_xl21_1_xl20_1,A_xl20_1_xl21_1;repack data

								||      PACKLH2.S1 A_xl21_0_xl20_0,A_xl21_0_xl20_0,A_xl20_0_xl21_0;repack data

								||      SUB2.D1X   A_x_1_x_0,     B_xl1_1_xl1_0,   A_xl1_0_xl0_0;x[0]-x[l1]

								||      LDDW.D2T2  *B_x[B_l2],    B_xl2_3_xl2_2:B_xl2_1_xl2_0   ;x[l2+0..3]


								        DOTP2.M2X  B_co30_si30,   A_xt2_0_yt2_0,   B_x_l2_0     ;Im{W^3n*X3}

								||      ADD.D1     A_x_h2_0,      A_x_h2_0,        A_x_h2_0     ;x[h2]>>15

								||      MVD.M1     A_x_,          A_x__                         ;x__=x_

								||      SUB2.L1    A_xl1_0_xl0_0, A_xl20_0_xl21_0, A_yt1_0_xt2_0;xl1-xl20

								||      ADD2.S1    A_xl1_0_xl0_0, A_xl20_0_xl21_0, A_yt2_0_xt1_0;xl1+xl20

								||      ADD2.L2    B_xh21_1_xh20_1, B_xh1_1_xh0_1, B_x_3_x_2    ;xh21+xh1

								||      SUB2.S2    B_xh1_1_xh0_1, B_xh21_1_xh20_1, B_yt0_1_xt0_1;xh1-xh21

								||      LDDW.D2T1  *B_x[B_h2],    A_xh2_3_xh2_2:A_xh2_1_xh2_0   ;x[h2+0..3]

								LOOP_Y8:

								        PACKH2.L2  B_x_l2_3,      B_x_l2_2,        B_xl2_3_2    ;repack data

								||      MVD.M1     A_zero,        A_p0                          ;disable prolog

								||      PACKLH2.S1 A_yt1_1_xt2_1, A_yt2_1_xt1_1,   A_xt2_1_yt2_1;reformat

								||      ROTL.M2    B_yt0_1_xt0_1, 16,              B_xt0_1_yt0_1;repack data

								||      PACKLH2.L1 A_yt2_0_xt1_0, A_yt1_0_xt2_0,   A_xt1_0_yt1_0;repack data

								||      SUB2.D1X   A_x_3_x_2,     B_xl1_3_xl1_2,   A_xl1_1_xl0_1;x[0]-x[l1]

								||      ADD2.S2X   B_xl1_1_xl1_0, A_x_1_x_0,       B_xh1_0_xh0_0;x[0]+x[l1]

								||      LDDW.D2T2  *B_w2[B_j],    B_co31_si31:B_co30_si30       ;W^3n+1,W^3n


								  [!A_p0]STDW.D1T2 B_xl2_3_2:B_xl2_1_0,            *A_x__[A_l2] ;x[l2+0..3]

								||      DOTPN2.M2X B_co30_si30,   A_yt2_0_xt2_0,   B_x_l2_1     ;Re{W^3n*X2}

								||      ADD.L2     B_x_l1_0,      B_x_l1_0,        B_x_l1_0     ;x[l1]>>15

								||      ROTL.M1    A_xt2_1_yt2_1, 16,              A_yt2_1_xt2_1;repack data

								||      SUB2.S2    B_xh1_0_xh0_0, B_xh21_0_xh20_0, B_yt0_0_xt0_0;xh1-xh21

								||      ADD.D2     B_j,           3,               B_j          ;j+=3

								||      ADD2.L1    A_xl1_1_xl0_1, A_xl20_1_xl21_1, A_yt2_1_xt1_1;xl1+xl20

								||      SUB2.S1    A_xl1_1_xl0_1, A_xl20_1_xl21_1, A_yt1_1_xt2_1;xl1-xl20

								*============================= PIPE LOOP EPILOG ===============================*

								        ADD.D2     B_x_l1_1,      B_x_l1_1,        B_x_l1_1     ;x[l1+1]>>15

								||      DOTP2.M2X  B_co31_si31,   A_xt2_1_yt2_1,   B_x_l2_2     ;Im{W^3n+1*X2}

								||      ADD.S1     A_x_h2_1,      A_x_h2_1,        A_x_h2_1     ;x[h2+1]>>15

								||      SHRU.S2    B_stride,      3,               B_h2         ;h2=stride/8


								        PACKH2.S2  B_x_l1_1,      B_x_l1_0,        B_xl1_1_0    ;repack data

								||      PACKH2.L1  A_x_h2_1,      A_x_h2_0,        A_xh2_1_0    ;repack data

								||      MV.L2X     A_ptr_x,       B_x                           ;x= ptr_x

								||      ADDAH.D2   B_h2,          B_h2,            B_l2         ;l2=3*h2

								;-

								        ADD.S2     B_x_l2_0,      B_x_l2_0,        B_x_l2_0     ;x[l2]>>15

								||      ADD.L2     B_x_l1_3,      B_x_l1_3,        B_x_l1_3     ;x[l1+3]>>15

								||      ADD.D2     B_x_l1_2,      B_x_l1_2,        B_x_l1_2     ;x[l1+2]>>15

								||      DOTPN2.M2X B_co31_si31,   A_yt2_1_xt2_1,   B_x_l2_3     ;Re{W^3n+1*X2}


								        PACKH2.S2  B_x_l1_3,      B_x_l1_2,        B_xl1_3_2    ;repack data

								||      ADD.S1     A_x_h2_3,      A_x_h2_3,        A_x_h2_3     ;x[h2+3]>>15

								||      CMPLTU.L2  4,             B_stride,        B_whl        ;stride>4 ?

								||      MV.L1X     B_SP,          A_SP                          ;cp B_SP

								;-

								        ADD.D2     B_x_l2_1,      B_x_l2_1,        B_x_l2_1     ;x[l2+1]>>15

								||      STDW.D1T2  B_xl1_3_2:B_xl1_1_0,            *A_x__[A_l1] ;save x[l1+0..3]

								||      ADD.L2     B_x_l2_2,      B_x_l2_2,        B_x_l2_2     ;x[l2+2]>>15

								||      PACKH2.L1  A_x_h2_3,      A_x_h2_2,        A_xh2_3_2    ;repack data

								||[B_whl]B.S1      LOOP_WHILE                                   ;}end while


								        PACKH2.S2  B_x_l2_1,      B_x_l2_0,        B_xl2_1_0    ;repack data

								||      STDW.D1T1  A_xh2_3_2:A_xh2_1_0,            *A_x__[A_h2] ;save x[h2+0..3]

								||[B_whl]B.S1      LOOP_WHILE_B                                 ;}end while


								        ADD.L2     B_x_l2_3,      B_x_l2_3,        B_x_l2_3     ;x[l2+3]>>15

								;-

								        PACKH2.L2  B_x_l2_3,      B_x_l2_2,        B_xl2_3_2    ;repack data

								||      SHL.S2     B_stride,      1,               B_stride_1   ;2*stride

								||      SHRU.S1X   B_n,           3,               A_i          ;i=n>>3

								*======================= SYMBOLIC REGISTER ASSIGNMENTS ========================*

								       .asg        B4,         B_n            ;number of complex points

								       .asg        A6,         A_ptr_x        ;pointer to n-1th transformed data

								       .asg        B6,         B_ptr_y        ;pointer to frequency domain

								       .asg        A6,         A_x0           ;even samples

								       .asg        B16,        B_x0           ;even samples

								       .asg        A24,        A_x1           ;odd samples

								       .asg        B17,        B_x1           ;odd samples

								       .asg        A25,        A_i            ;loop count

								       .asg        B1,         B_j            ;index to input

								       .asg        B18,        B_nm2          ;n-2

								       .asg        A26,        A_n_2          ;n/4

								       .asg        A2,         A_p0           ;prolog collapse counter

								       .asg        A27,        A_n            ;number of complex points

								       .asg        B19,        B_y0           ;y[i]

								       .asg        B20,        B_y1           ;y[i+n/4]

								       .asg        B21,        B_y2           ;y[i+n/2]

								       .asg        B22,        B_y3           ;y[i+3*n/4]

								       .asg        B23,        B_l1           ;shift for digit reverse

								       .asg        A19,        A_h0           ;partial digit reverse

								       .asg        A17,        A_h1           ;partial digit reverse

								       .asg        B28,        B_h2           ;partial digit reverse

								       .asg        B24,        B_h3           ;partial digit reverse

								       .asg        B8,         B_h4           ;digit reversed index

								       .asg        A28,        A_x1x0         ;x[i+1],x[i+0]

								       .asg        A29,        A_x3x2         ;x[i+3],x[i+2]

								       .asg        B24,        B_x5x4         ;x[i+5],x[i+4]

								       .asg        B25,        B_x7x6         ;x[i+7],x[i+6]

								       .asg        A30,        A_x9x8         ;x[i+9],x[i+8]

								       .asg        A31,        A_xbxa         ;x[i+11],x[i+10]

								       .asg        B28,        B_xdxc         ;x[i+13],x[i+12]

								       .asg        B29,        B_xfxe         ;x[i+15],x[i+14]

								       .asg        B26,        B_xh1_0_xh0_0  ;stageI outputs from butterflies

								       .asg        B28,        B_xh21_0_xh20_0;stageI outputs from butterflies

								       .asg        A18,        A_xl1_0_xl0_0  ;stageI outputs from butterflies

								       .asg        A20,        A_xl21_0_xl20_0;stageI outputs from butterflies

								       .asg        A19,        A_xl0_0_xl1_0  ;stageI outputs from butterflies

								       .asg        B26,        B_yt0_0_xt0_0  ;stageI outputs from butterflies

								       .asg        A20,        A_xt1_0_yt2_0  ;stageI outputs from butterflies

								       .asg        A30,        A_xt2_0_yt1_0  ;stageI outputs from butterflies

								       .asg        B9,         B_xh1_1_xh0_1  ;stageII outputs from butterflies

								       .asg        B27,        B_xh21_1_xh20_1;stageII outputs from butterflies

								       .asg        A30,        A_xl1_1_xl0_1  ;stageII outputs from butterflies

								       .asg        A22,        A_xl21_1_xl20_1;stageII outputs from butterflies

								       .asg        A19,        A_xl0_1_xl1_1  ;stageII outputs from butterflies

								       .asg        B27,        B_yt0_1_xt0_1  ;stageII outputs from butterflies

								       .asg        A21,        A_xt1_1_yt2_1  ;stageII outputs from butterflies

								       .asg        A22,        A_xt2_1_yt1_1  ;stageII outputs from butterflies

								       .asg        B30,        B_y1y0         ;complex frequencies

								       .asg        A30,        A_y3y2         ;complex frequencies

								       .asg        A28,        A_y7y6         ;complex frequencies

								       .asg        B31,        B_y9y8         ;complex frequencies

								       .asg        A31,        A_ybya         ;complex frequencies

								       .asg        A29,        A_yfye         ;complex frequencies

								       .asg        B2,         B_ifi          ;jump condition for digit reversal

								*============================PIPE LOOP PROLOG =================================*

								        SHRU.S1X   B_n,           2,               A_n_2          ;n/4

								||      STDW.D1T2  B_xl2_3_2:B_xl2_1_0,            *A_x__[A_l2]   ;x[l2+...3]

								;-

								        SHRU.S2    B_n,           3,               B_j            ;j=n/8

								||      ADD.S1X    A_ptr_x,       B_n,             A_x1           ;n/4 words

								||      LDW.D1T1   *+A_SP[4],     A11                             ;Restore A11

								||[B_whl]LDDW.D2T2 *B_x[B_l2],    B_xl2_3_xl2_2:B_xl2_1_xl2_0     ;x[l2]+0..3]


								        ADD.L2     B_ptr_y,       B_n,             B_y1           ;y1=y+n/4

								||      SUB.S2X    4,             A_n_2,           B_nm2          ;4 - n/4

								||      LDW.D1T1   *+A_SP[6],     A10                             ;Restore A10

								||      LDW.D2T2   *+B_SP[3],     B11                             ;Restore B11

								;-

								        MVK.S1     0xFFFF8000,    A_p0                            ;kill prolog

								||      NORM.L2    B_n,           B_l1                            ;_norm(n)+3

								||      ADD.D2     B_y1,          B_n,             B_y2           ;y2=y1+n/4

								||      ADD.S2X    A_x1,          8,               B_x1           ;ptr_x+n/2


								        ADD.S2     B_y2,          B_n,             B_y3           ;y3=y2+n/4

								||      ADD.L2X    A_ptr_x,       8,               B_x0           ;x = ptr_x

								||      MV.L1X     B_n,           A_n                             ;copy n

								||      LDW.D2T2   *+B_SP[5],     B10                             ;Restore B10

								;-

								        MVK.L2     1,             B_ifi                           ;if(j==n>>2){

								||      MVK.L1     4,             A_i                             ;j += 4;

								||      LDDW.D2T2  *B_x1++[2],    B_xfxe:B_xdxc                   ;x[15..12]

								||      LDDW.D1T1  *A_x0++[2],    A_x3x2:A_x1x0                   ;x[3..0]

								||      MPYSU.M1   0,             A_h0,            A_h0           ;h2=_deal(j)

								||      ADD.S2     B_l1,          3,               B_l1           ;rev dwords

								||      ROTL.M2    B_ptr_y,       0,               B_y0           ;y0= y

								*============================= PIPE LOOP KERNEL2===============================*

								LOOP_Z:

								  [ B_j]BDEC.S2    LOOP_Z,        B_j                             ;}end for

								||      SUB2.L1    A_xl0_0_xl1_0, A_xl21_0_xl20_0, A_xt2_0_yt1_0  ;xl0-xl21

								||      ADD2.S1    A_xl0_0_xl1_0, A_xl21_0_xl20_0, A_xt1_0_yt2_0  ;xl0+xl21

								||      ROTL.M1    A_xl1_1_xl0_1, 16,              A_xl0_1_xl1_1  ;repack data

								||      ADD2.L2X   B_x7x6,        A_x3x2,          B_xh21_0_xh20_0;x7+x3

								||      SHFL.M2    B_h2,          B_h3                            ;h2=_shfl(h2)

								||      LDDW.D1T1  *A_x1++[2],    A_xbxa:A_x9x8                   ;x[11..8]

								||      LDDW.D2T2  *B_x0++[2],    B_x7x6:B_x5x4                   ;x[7..4]


								 [!A_p0]STDW.D2T2  B_yt0_1_xt0_1:B_yt0_0_xt0_0,    *B_y2[B_h4]    ;store freq.

								||      PACKLH2.S1 A_xt1_0_yt2_0, A_xt2_0_yt1_0,   A_y7y6         ;repack data

								||      SUB2.D1X   A_x1x0,        B_x5x4,          A_xl1_0_xl0_0  ;x1-x5

								||      ADD2.S2X   B_x5x4,        A_x1x0,          B_xh1_0_xh0_0  ;x5+x1

								||[!B_ifi]ADD.L2   B_x1,          B_n,             B_x1           ;x2+=n>>1

								||[!B_ifi]ADD.L1   A_i,           A_n_2,           A_i            ;j+=n>>2

								||      BITR.M1    A_h0,          A_h1                            ;h2=_bitr(h2)


								 [!A_p0]STDW.D2T2  B_y9y8:B_y1y0, *B_y0[B_h4]                     ;store freq.

								||      PACKLH2.S1 A_xt1_1_yt2_1, A_xt2_1_yt1_1,   A_yfye         ;repack data

								||      PACKLH2.L1 A_xt2_0_yt1_0, A_xt1_0_yt2_0,   A_y3y2         ;repack data

								||      ADD2.S2X   B_xfxe,        A_xbxa,          B_xh21_1_xh20_1;x[15]+x[11]

								||      ADD2.L2    B_xh1_0_xh0_0, B_xh21_0_xh20_0, B_y1y0         ;xh1+xh21

								||      SUB2.D1X   A_x3x2,        B_x7x6,          A_xl21_0_xl20_0;x3-x7


								 [!A_p0]STDW.D2T1  A_yfye:A_y7y6, *B_y3[B_h4]                     ;store freq.

								||      MPYSU.M1   2,             A_p0,            A_p0           ;prolog count

								||      PACKLH2.S1 A_xt2_1_yt1_1, A_xt1_1_yt2_1,   A_ybya         ;repack data

								||      ADD2.L2    B_xh1_1_xh0_1, B_xh21_1_xh20_1, B_y9y8         ;xh1+xh21

								||      SUB2.D1X   A_xbxa,        B_xfxe,          A_xl21_1_xl20_1;x[11]-x[15]

								||      SUB2.S2    B_xh1_0_xh0_0, B_xh21_0_xh20_0, B_yt0_0_xt0_0  ;xh1-xh21

								||[!B_ifi]ADD.L1   A_x0,          A_n,             A_x0           ;x0 += n>>1

								||      AVG2.M2X   A_i,           B_nm2,           B_ifi          ;if(j==n>>2){


								 [!A_p0]STDW.D2T1  A_ybya:A_y3y2, *B_y1[B_h4]                     ;store freq.

								||      SUB2.S1    A_xl0_1_xl1_1, A_xl21_1_xl20_1, A_xt2_1_yt1_1  ;xl0-xl21

								||      ADD2.D1    A_xl0_1_xl1_1, A_xl21_1_xl20_1, A_xt1_1_yt2_1  ;xl0+xl21

								||      ROTL.M1    A_xl1_0_xl0_0, 16,              A_xl0_0_xl1_0  ;repack data

								||      SHRU.S2    B_h3,          B_l1,            B_h4           ;h2 >>= l1

								||[!B_ifi]ADD.L1   A_x1,          A_n,             A_x1           ;x2 += n>>1

								||[!B_ifi]ADD.L2   B_x0,          B_n,             B_x0           ;x0 += n>>1

								||      ROTL.M2X   A_h1,          16,              B_h2           ;rotl(h2, 16)


								        SUB2.S2    B_xh1_1_xh0_1, B_xh21_1_xh20_1, B_yt0_1_xt0_1  ;xh1-xh21

								||      SUB2.L1X   A_x9x8,        B_xdxc,          A_xl1_1_xl0_1  ;x[9]-x[13]

								||      ADD2.L2X   B_xdxc,        A_x9x8,          B_xh1_1_xh0_1  ;x[13]+x[9]

								||      ADD.S1     A_i,           4,               A_i            ;j += 4;

								||      LDDW.D2T2  *B_x1++[2],    B_xfxe:B_xdxc                   ;x[15..12]

								||      LDDW.D1T1  *A_x0++[2],    A_x3x2:A_x1x0                   ;x[3..0]

								||      DEAL.M1    A_i,           A_h0                            ;h2=_deal(j)

								*============================= PIPE LOOP EPILOG ===============================*

								        SUB2.L1    A_xl0_0_xl1_0, A_xl21_0_xl20_0, A_xt2_0_yt1_0  ;xl1-xl21

								||      ADD2.S1    A_xl0_0_xl1_0, A_xl21_0_xl20_0, A_xt1_0_yt2_0  ;xl1+xl21

								||      LDW.D2T2   *++B_SP[8],    B12   ; safe due to branch      ;Restore B12

								||      MV         B_SP,          A_SP

								||      RET.S2     B3                                             ;Rtrn to call


								        LDW.D2T1   *-B_SP[7],     A12   ; safe due to branch      ;Restore A12

								||      LDW.D1T2   *+A_SP[2],     B13                             ;Restore B13

								||      PACKLH2.S1 A_xt1_0_yt2_0, A_xt2_0_yt1_0,   A_y7y6         ;repack data

								;-

								        STDW.D2T2  B_yt0_1_xt0_1:B_yt0_0_xt0_0,    *B_y2[B_h4]    ;store freq.

								||      PACKLH2.S1 A_xt1_1_yt2_1, A_xt2_1_yt1_1,   A_yfye         ;repack data

								||      PACKLH2.L1 A_xt2_0_yt1_0, A_xt1_0_yt2_0,   A_y3y2         ;repack data


								        STDW.D2T1  A_yfye:A_y7y6, *B_y3[B_h4]                     ;store freq.

								||      PACKLH2.S1 A_xt2_1_yt1_1, A_xt1_1_yt2_1,   A_ybya         ;repack data

								||      MVC.S2     B_csr,         CSR                             ;rstr interupt


								        STDW.D2T2  B_y9y8:B_y1y0, *B_y0[B_h4]                     ;store freq.


								        STDW.D2T1  A_ybya:A_y3y2, *B_y1[B_h4]                     ;store freq.

								        ;Branch Occurs


								*================== SYMBOLIC REGISTER ASSIGNMENTS: LOOP_WHILE =================*

								       .asg         A2,         A_i             ;1st loop counter

								       .asg         B18,        B_h2            ;offset to n/4 time points

								       .asg         B19,        B_l1            ;offset to n/2 time points

								       .asg         A3,         A_xl21_0_xl20_0 ;data into butterflies

								       .asg         B26,        B_xh21_0_xh20_0 ;data into butterflies

								       .asg         A7,         A_xl21_1_xl20_1 ;data into butterflies

								       .asg         B11,        B_j             ;counter for twiddle factors

								       .asg         A0,         A_p0            ;prolog collapse variable

								*==============================================================================*

								; --- this code was relocated from above

								;-

								LOOP_WHILE_B:

								        SHRU.S2    B_fft_jmp,     1,               B_fft_jmp_1  ;fft_jmp/2

								||      MV.S1X     B_h2,          A_h2                          ;copy h2


								        B.S2       LOOP_Y5 + 24                                 ;trunc. prolog

								||      SHRU.S1X   B_fft_jmp,     3,               A_fft_jmp    ;w=ptr_w+

								||      ADDAH.D1   A_ptr_w,       A_tw_offset,     A_w0         ;tw_offset


								        LDDW.D2T2  *B_x[B_l1],    B_xl1_3_xl1_2:B_xl1_1_xl1_0   ;xl1[+0..3]

								||      MV.D1X     B_l1,          A_l1                          ;cpy l1

								;-

								        LDDW.D2T1  *B_x[0],       A_x_3_x_2:A_x_1_x_0           ;x0..3]

								||      SUB.L1     A_fft_jmp,     3,               A_fft_jmp    ;tw_offset+=

								||      ADD.S1X    A_tw_offset,   B_fft_jmp_1,     A_tw_offset  ;  fft_jmp


								        SUB2.S1X   A_xh2_1_xh2_0, B_xl2_1_xl2_0, A_xl21_0_xl20_0;x[h2]-x[l2]

								||      B.S2       LOOP_Y8 + 12                                 ;trunc prolog

								||      MV.L2X     A_w0,          B_w0                          ;cpy w0

								||      ADD.L1     A_w0,          8,               A_w1         ;w1=w0+1

								;-

								        LDDW.D1T2  *A_w1[0],      B_co21_si21:B_co20_si20       ;W^2n+1,W^2n

								||      LDDW.D2T1  *B_w0[0],      A_co11_si11:A_co10_si10       ;W^n+1,W^n

								||      ADD2.L2X   B_xl2_1_xl2_0, A_xh2_1_xh2_0, B_xh21_0_xh20_0;xh2=x[l2]+x[h2]

								||      SUB2.L1X   A_xh2_3_xh2_2, B_xl2_3_xl2_2, A_xl21_1_xl20_1;xl2=x[l2]-x[h2]

								||      MPYSU.M1   0,             A_zero,          A_zero       ;zero = 0

								||      MV.S1      A_fft_jmp,     A_ifj                         ;ify=fft_jmp

								;-

								        MVD.M1X    B_x,           A_x_                          ;x_=x

								||      BDEC.S1    LOOP_Y,        A_i                           ;trunc prolog

								||      SHRU.S2    B_stride,      2,               B_stride     ;stride>>=2

								||      MPYSU.M2   0,             B_j,             B_j          ;j = 0

								||      ZERO.D1    A_j                                          ;j = 0

								||      MVK.L1     1,             A_p0                          ;p0=1

								||      ADD.L2X    A_w1,          8,               B_w2         ;w2=w1+1


								* ========================================================================= *

								*   End of file:  dsp_fft.asm                                               *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *