;* ======================================================================== *; ;* TEXAS INSTRUMENTS, INC. *; ;* *; ;* DSPLIB DSP Signal Processing Library *; ;* *; ;* Release: Revision 1.04b *; ;* CVS Revision: 1.7 Sun Sep 29 03:32:21 2002 (UTC) *; ;* Snapshot date: 23-Oct-2003 *; ;* *; ;* This library contains proprietary intellectual property of Texas *; ;* Instruments, Inc. The library and its source code are protected by *; ;* various copyrights, and portions may also be protected by patents or *; ;* other legal protections. *; ;* *; ;* This software is licensed for use with Texas Instruments TMS320 *; ;* family DSPs. This license was provided to you prior to installing *; ;* the software. You may review this license by consulting the file *; ;* TI_license.PDF which accompanies the files in this library. *; ;* ------------------------------------------------------------------------ *; ;* Copyright (C) 2003 Texas Instruments, Incorporated. *; ;* All Rights Reserved. *; ;* ======================================================================== *; ;* ======================================================================== *; ;* Assembler compatibility shim for assembling 4.30 and later code on *; ;* tools prior to 4.30. *; ;* ======================================================================== *; .if $isdefed(".ASSEMBLER_VERSION") .asg .ASSEMBLER_VERSION, $asmver .else .asg 0, $asmver .endif .if ($asmver < 430) .asg B, CALL ; Function Call .asg B, RET ; Return from a Function .asg B, CALLRET ; Function call with Call / Ret chaining. .if .TMS320C6400 .asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call .asg BNOP, RETNOP ; C64x BNOP as a Fn. Return .asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP. .endif .asg , .asmfunc ; .func equivalent for hand-assembly code .asg , .endasmfunc ; .endfunc equivalent for hand-assembly code .endif ;* ======================================================================== *; ;* End of assembler compatibility shim. *; ;* ======================================================================== *; *========================================================================== * * TEXAS INSTRUMENTS, INC. * * * * NAME * * DSP_fft32x32: Double Precision FFT * * * * USAGE * * This routine is C-callable and can be called as: * * * * void DSP_fft32x32(const int * ptr_w, int npoints, * * int * ptr_x, int * ptr_y ) ; * * * * ptr_w = input twiddle factors * * npoints = number of points * * ptr_x = transformed data reversed * * ptr_y = linear transformed data * * * * (See the C compiler reference guide.) * * * * DESCRIPTION * * The following code performs a mixed radix FFT for "npoints" which * * is either a multiple of 4 or 2. It uses logN4 - 1 stages of radix4 * * transform and performs either a radix2 or radix4 transform on the * * last stage depending on "npoints". If "npoints" is a multiple of 4, * * then this last stage is also a radix4 transform, otherwise it is a * * radix2 transform. 
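*      For reference, a typical calling sequence looks roughly like the
*      sketch below.  The buffer sizes follow the twiddle-table and
*      data-layout notes later in this header, and the DATA_ALIGN
*      pragmas are the usual TI-compiler way of meeting the double-word
*      alignment requirement.  This is an illustrative sketch only (the
*      symbol names and N = 512 are assumptions), not part of the
*      library.
*
*          #define N 512                /* number of complex points     */
*
*          #pragma DATA_ALIGN(x, 8);    /* double-word aligned buffers  */
*          #pragma DATA_ALIGN(y, 8);
*          #pragma DATA_ALIGN(w, 8);
*
*          int x[2 * N];                /* input, re/im interleaved;    */
*                                       /* used as scratch by the       */
*                                       /* intermediate stages          */
*          int y[2 * N];                /* output, natural order        */
*          int w[2 * N];                /* special twiddle vector       */
*
*          void DSP_fft32x32(const int *ptr_w, int npoints,
*                            int *ptr_x, int *ptr_y);
*
*          void fft_example(void)
*          {
*              /* 1. Build w[] with the generation loop shown below     */
*              /*    (available separately as "twiddle_split.c").       */
*              /* 2. Fill x[] with 2*N samples, real at even indices,   */
*              /*    imaginary at odd indices.                          */
*              DSP_fft32x32(w, N, x, y);
*              /* 3. y[] now holds the N-point transform.               */
*          }
*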
*  A C-compilable program, "twiddle_split.c", is available to
*  automatically generate the twiddle factors.
*
*      Generate special vector of twiddle factors
*
*      for (j=1, k=0; j < npoints>>2; j = j << 2)
*      {
*          for (i=0; i < npoints>>2; i += j)
*          {
*              theta1 = 2*PI*i/npoints;
*              x_t = M*cos(theta1);
*              y_t = M*sin(theta1);
*              ptr_w[k+1] = (int) x_t;
*              if (x_t >= M) ptr_w[k+1] = 0x7fffffff;
*              ptr_w[k+0] = (int) y_t;
*              if (y_t >= M) ptr_w[k+0] = 0x7fffffff;
*
*              theta2 = 4*PI*i/npoints;
*              x_t = M*cos(theta2);
*              y_t = M*sin(theta2);
*              ptr_w[k+3] = (int) x_t;
*              if (x_t >= M) ptr_w[k+3] = 0x7fffffff;
*              ptr_w[k+2] = (int) y_t;
*              if (y_t >= M) ptr_w[k+2] = 0x7fffffff;
*
*              theta3 = 6*PI*i/npoints;
*              x_t = M*cos(theta3);
*              y_t = M*sin(theta3);
*              ptr_w[k+5] = (int) x_t;
*              if (x_t >= M) ptr_w[k+5] = 0x7fffffff;
*              ptr_w[k+4] = (int) y_t;
*              if (y_t >= M) ptr_w[k+4] = 0x7fffffff;
*              k += 6;
*          }
*      }
*
*  ASSUMPTIONS
*      This code works when "npoints" is a multiple of either 2 or 4.
*      The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a
*      double-word boundary for the "optimized" implementations.
*      The input and output data are complex, with the real/imaginary
*      components stored in adjacent locations in the array.  The real
*      components are stored at even array indices, and the imaginary
*      components are stored at odd array indices.  The input and twiddle
*      factors are in 32-bit precision.  The 32 x 32 multiplies are
*      done with a 1.5-bit loss in accuracy.  This comes about because
*      the contribution of the low sixteen bits to the 32-bit result
*      is not computed.  In addition the contribution of the low * high
*      term is shifted by 16 as opposed to 15, for a loss of 0.5 bits
*      after rounding.  To illustrate, the real part of the complex
*      multiply
*
*          (X + jY) (C + jS)
*
*      is computed as:
*
*          _mpyhir(si10, yt1_0) + _mpyhir(co10, xt1_0) +
*              (((MPYLUHS(si10, yt1_0) + MPYLUHS(co10, xt1_0)
*                 + 0x8000) >> 16) << 1)
*
*      The intrinsic C version of this code performs this function as:
*
*          _mpyhir(si10, yt1_0) + _mpyhir(co10, xt1_0) +
*              (_dotprsu2(yt1_0xt1_0, si10co10) << 1);
*
*      where the functions _mpyhir, MPYLUHS are as follows:
*
*          #define _mpyhir(x,y)                                           \
*              (((int)((short)(x >> 16) * (unsigned short)(y & 0x0000FFFF) \
*                      + 0x4000) >> 15) +                                 \
*               ((int)((short)(x >> 16) * (short)((y) >> 16)) << 1))
*
*          #define MPYLUHS(x,y)                                           \
*              ((int)((unsigned short)(x & 0x0000FFFF) * (short)(y >> 16)))
*
*  TECHNIQUES
*      The following C code represents an implementation of the Cooley-
*      Tukey radix-4 DIF FFT.  It accepts the inputs in normal order and
*      produces the outputs in digit-reversed order.  The natural C code
*      shown in this file, on the other hand, accepts the inputs in
*      normal order and produces the outputs in normal order.
*
*      Several transformations have been applied to the original Cooley-
*      Tukey code to produce the natural C code description shown here.
*      In order to understand these, it is first useful to understand
*      some of the issues involved in the conventional Cooley-Tukey FFT
*      code.
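*
*      The macros above can be compiled on a host to sanity-check the
*      fixed-point arithmetic.  The sketch below (not part of the
*      library; the helper name cmult_re_q31 and the test values are
*      illustrative assumptions) models the real part of the Q31
*      complex multiply exactly as written above and compares it with
*      a 64-bit reference product.  The conventional radix-4 code
*      referred to above then follows.
*
*          #include <stdio.h>
*
*          #define MPYHIR(x,y)                                            \
*              (((int)((short)((x) >> 16) * (unsigned short)((y) & 0xFFFF) \
*                      + 0x4000) >> 15) +                                 \
*               ((int)((short)((x) >> 16) * (short)((y) >> 16)) << 1))
*
*          #define MPYLUHS(x,y)                                           \
*              ((int)((unsigned short)((x) & 0xFFFF) * (short)((y) >> 16)))
*
*          /* Real part of (X + jY)(C + jS) = XC + YS, Q31 in, Q31 out.  */
*          static int cmult_re_q31(int xt, int yt, int co, int si)
*          {
*              return MPYHIR(si, yt) + MPYHIR(co, xt)
*                   + (((MPYLUHS(si, yt) + MPYLUHS(co, xt)
*                        + 0x8000) >> 16) << 1);
*          }
*
*          int main(void)
*          {
*              int xt = 0x40000000, yt = 0x20000000;   /* 0.5, 0.25     */
*              int co = 0x5A82799A, si = 0x5A82799A;   /* ~0.7071 (Q31) */
*              long long ref = ((long long)xt*co + (long long)yt*si) >> 31;
*              printf("model %d  reference %lld\n",
*                     cmult_re_q31(xt, yt, co, si), ref);
*              return 0;
*          }
*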
*          void radix4(int n, short x[], short wn[])
*          {
*              int n1, n2, ie, ia1, ia2, ia3;
*              int i0, i1, i2, i3, i, j, k;
*              short co1, co2, co3, si1, si2, si3;
*              short xt0, yt0, xt1, yt1, xt2, yt2;
*              short xh0, xh1, xh20, xh21, xl0, xl1, xl20, xl21;
*
*              n2 = n;
*              ie = 1;
*              for (k = n; k > 1; k >>= 2)
*              {
*                  n1 = n2;
*                  n2 >>= 2;
*                  ia1 = 0;
*
*                  for (j = 0; j < n2; j++)
*                  {
*                      ia2 = ia1 + ia1;
*                      ia3 = ia2 + ia1;
*
*                      co1 = wn[2 * ia1    ];
*                      si1 = wn[2 * ia1 + 1];
*                      co2 = wn[2 * ia2    ];
*                      si2 = wn[2 * ia2 + 1];
*                      co3 = wn[2 * ia3    ];
*                      si3 = wn[2 * ia3 + 1];
*                      ia1 = ia1 + ie;
*
*                      for (i0 = j; i0 < n; i0 += n1)
*                      {
*                          i1 = i0 + n2;
*                          i2 = i1 + n2;
*                          i3 = i2 + n2;
*
*                          xh0  = x[2 * i0    ] + x[2 * i2    ];
*                          xh1  = x[2 * i0 + 1] + x[2 * i2 + 1];
*                          xl0  = x[2 * i0    ] - x[2 * i2    ];
*                          xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];
*
*                          xh20 = x[2 * i1    ] + x[2 * i3    ];
*                          xh21 = x[2 * i1 + 1] + x[2 * i3 + 1];
*                          xl20 = x[2 * i1    ] - x[2 * i3    ];
*                          xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];
*
*                          x[2 * i0    ] = xh0 + xh20;
*                          x[2 * i0 + 1] = xh1 + xh21;
*
*                          xt0 = xh0 - xh20;
*                          yt0 = xh1 - xh21;
*                          xt1 = xl0 + xl21;
*                          yt2 = xl1 + xl20;
*                          xt2 = xl0 - xl21;
*                          yt1 = xl1 - xl20;
*
*                          x[2 * i1    ] = (xt1 * co1 + yt1 * si1) >> 15;
*                          x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15;
*                          x[2 * i2    ] = (xt0 * co2 + yt0 * si2) >> 15;
*                          x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15;
*                          x[2 * i3    ] = (xt2 * co3 + yt2 * si3) >> 15;
*                          x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15;
*                      }
*                  }
*
*                  ie <<= 2;
*              }
*          }
*
*      The conventional Cooley-Tukey FFT is written using three loops.
*      The outermost loop "k" cycles through the stages.  There are log N
*      to the base 4 stages in all.  The loop "j" cycles through the
*      groups of butterflies with different twiddle factors; loop "i0"
*      reuses the twiddle factors for the different butterflies within
*      a stage.  It is interesting to note the following:
*
*  ------------------------------------------------------------------------
*   Stage#     #Groups     #Butterflies with common      #Groups*Bflys
*                           twiddle factors
*  ------------------------------------------------------------------------
*    1          N/4             1                           N/4
*    2          N/16            4                           N/4
*    ..
*    logN       1               N/4                         N/4
*  ------------------------------------------------------------------------
*
*      The following statements can be made based on the above
*      observations:
*
*      a) The inner loop "i0" iterates a variable number of times.  In
*         particular, the number of iterations quadruples every time,
*         from 1 up to N/4.  Hence software-pipelining a loop that
*         iterates a variable number of times is not profitable.
*
*      b) The outer loop "j" iterates a variable number of times as
*         well.  However the number of iterations is quartered every
*         time, from N/4 down to 1.  Hence the behaviour in (a) and (b)
*         is exactly opposite.
*
*      c) If the two loops "i0" and "j" are coalesced together then they
*         will iterate for a fixed number of times, namely N/4.  This
*         allows the "i0" and "j" loops to be combined into one loop, as
*         sketched below.  Optimized implementations make use of this
*         fact.
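*
*      As an illustration of point (c), one stage with the "j" and "i0"
*      loops merged into a single loop of N/4 butterflies is sketched
*      below (16-bit data, as in the reference code above).  The counter
*      reset and conditional pointer jump at the bottom of the loop are
*      the C analogue of the predicated "j = 0; x += fft_jmp" used by
*      the optimized kernel; the function and variable names here are
*      illustrative only and do not appear in the library.
*
*          /* One radix-4 stage, butterflies visited block by block.     */
*          /* The reordering is legal because the butterflies of one     */
*          /* stage are independent of each other.                       */
*          void radix4_stage_coalesced(int n, int n2, int ie,
*                                      short x[], const short wn[])
*          {
*              short *px = x;              /* leg 0 of current butterfly */
*              int    j  = 0, bfly;
*
*              for (bfly = 0; bfly < (n >> 2); bfly++)
*              {
*                  int   ia1 = j * ie, ia2 = 2 * ia1, ia3 = 3 * ia1;
*                  short co1 = wn[2*ia1], si1 = wn[2*ia1 + 1];
*                  short co2 = wn[2*ia2], si2 = wn[2*ia2 + 1];
*                  short co3 = wn[2*ia3], si3 = wn[2*ia3 + 1];
*
*                  short xh0  = px[0        ] + px[4*n2    ];
*                  short xh1  = px[1        ] + px[4*n2 + 1];
*                  short xl0  = px[0        ] - px[4*n2    ];
*                  short xl1  = px[1        ] - px[4*n2 + 1];
*                  short xh20 = px[2*n2     ] + px[6*n2    ];
*                  short xh21 = px[2*n2 + 1 ] + px[6*n2 + 1];
*                  short xl20 = px[2*n2     ] - px[6*n2    ];
*                  short xl21 = px[2*n2 + 1 ] - px[6*n2 + 1];
*
*                  short xt0 = xh0 - xh20, yt0 = xh1 - xh21;
*                  short xt1 = xl0 + xl21, yt2 = xl1 + xl20;
*                  short xt2 = xl0 - xl21, yt1 = xl1 - xl20;
*
*                  px[0]          = xh0 + xh20;
*                  px[1]          = xh1 + xh21;
*                  px[2*n2    ]   = (xt1 * co1 + yt1 * si1) >> 15;
*                  px[2*n2 + 1]   = (yt1 * co1 - xt1 * si1) >> 15;
*                  px[4*n2    ]   = (xt0 * co2 + yt0 * si2) >> 15;
*                  px[4*n2 + 1]   = (yt0 * co2 - xt0 * si2) >> 15;
*                  px[6*n2    ]   = (xt2 * co3 + yt2 * si3) >> 15;
*                  px[6*n2 + 1]   = (yt2 * co3 - xt2 * si3) >> 15;
*
*                  px += 2;                /* next butterfly in block    */
*                  if (++j == n2)          /* block of 4*n2 points done  */
*                  {
*                      j   = 0;
*                      px += 6 * n2;       /* skip the three legs above  */
*                  }
*              }
*          }
*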
*      In addition, the Cooley-Tukey FFT accesses three twiddle factors
*      per iteration of the inner loop, as the butterflies that re-use
*      twiddle factors are lumped together.  This leads to accessing the
*      twiddle factor array at three points, each separated by "ie".
*      Note that "ie" is initially 1, and is quadrupled with every
*      iteration.  Therefore these three twiddle factors are not even
*      contiguous in the array.
*
*      In order to vectorize the FFT, it is desirable to access the
*      twiddle factor array using double-word-wide loads and fetch the
*      twiddle factors needed.  In order to do this a modified twiddle
*      factor array is created, in which the factors WN/4, WN/2, W3N/4
*      are arranged to be contiguous.  This eliminates the separation
*      between twiddle factors within a butterfly.  However, this
*      implies that as the loop is traversed from one stage to another,
*      we maintain a redundant version of the twiddle factor array.
*      Hence the size of the twiddle factor array increases as compared
*      to the normal Cooley-Tukey FFT.  The modified twiddle factor
*      array is of size "2 * N", whereas the conventional Cooley-Tukey
*      FFT uses one of size "3N/4", where N is the number of complex
*      points to be transformed.  The routine that generates the
*      modified twiddle factor array was presented earlier.  With the
*      above transformation of the FFT, both the input data and the
*      twiddle factor array can be accessed using double-word-wide
*      loads to enable packed data processing.
*
*      The final stage is optimized to remove the multiplication as
*      w0 = 1.  This stage also performs digit reversal on the data, so
*      the final output is in natural order.
*
*      The fft() code shown here performs the bulk of the computation
*      in place.  However, because digit reversal cannot be performed
*      in place, the final result is written to a separate array, y[].
*
*      There is one slight break in the flow of packed processing that
*      needs to be comprehended.  The real part of the complex number is
*      in the lower half, and the imaginary part is in the upper half.
*      The flow breaks in the case of "xl0" and "xl1" because here the
*      real part needs to be combined with the imaginary part, due to
*      the multiplication by "j".  This requires a packed quantity like
*      "xl21xl20" to be rotated as "xl20xl21" so that it can be combined
*      using add2's and sub2's.  Hence the natural version of the C code
*      shown below is transformed using packed data processing as shown
*      (x1_x0, x11_x10, x21_x20 and x31_x30 denote the packed re/im
*      pairs loaded at indices i0, i1, i2 and i3):
*
*          xl0  = x[2 * i0    ] - x[2 * i2    ];
*          xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];
*          xl20 = x[2 * i1    ] - x[2 * i3    ];
*          xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];
*
*          xt1  = xl0 + xl21;
*          yt2  = xl1 + xl20;
*          xt2  = xl0 - xl21;
*          yt1  = xl1 - xl20;
*
*          xl1_xl0   = _sub2(x1_x0,   x21_x20)
*          xl21_xl20 = _sub2(x11_x10, x31_x30)
*          xl20_xl21 = _rotl(xl21_xl20, 16)
*
*          yt2_xt1   = _add2(xl1_xl0, xl20_xl21)
*          yt1_xt2   = _sub2(xl1_xl0, xl20_xl21)
*
*      Also notice that xt1 and yt1 end up in separate words; these need
*      to be packed together to take advantage of the packed twiddle
*      factors that have been loaded.  In order for this to be achieved
*      they are re-aligned as follows:
*
*          yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1)
*          yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2)
*
*      In the following code, since all data elements are 32 bits, add2
*      and sub2 are replaced with normal 32-bit adds and subtracts.
*      The packed word "yt1_xt1" allows the loaded "sc" twiddle factor
*      to be used for the complex multiplies.  The real part and the
*      imaginary part of the multiply are performed as 16 x 32
*      multiplies using MPYLIR and MPYHIR:
*
*          (X + jY) (C + jS) = (XC + YS) + j (YC - XS).
*
*      The actual twiddle factors for the FFT are cosine and -sine.
The * * twiddle factors stored in the table are csine and sine, hence * * the sign of the "sine" term is comprehended during multipli- * * cation as shown above. * * * * MEMORY NOTE * * The optimized implementations are written for LITTLE ENDIAN. * * * * CYCLES * * [(N/4 + 1) * 10 + 10] * ceil(log4(N) - 1) + 6 * (N/4 + 2) + 27 * * * * N = 512, [1290 + 10] * 4 + 6 * 130 + 27 = 6007 cycles * * * * CODESIZE * * 972 bytes * * ------------------------------------------------------------------------- * * Copyright (c) 2003 Texas Instruments, Incorporated. * * All Rights Reserved. * * ========================================================================= * *============================================================================* .sect ".text:_fft32x32" .global _DSP_fft32x32 _DSP_fft32x32: *================== SYMBOLIC REGISTER ASSIGNMENTS: SETUP ====================* .asg B15, B_SP ; Stack pointer, B datapath .asg A31, A_SP ; Stack pointer, A datapath .asg B0, B_csr ; CSR's value .asg B1, B_no_gie ; CSR w/ GIE bit cleared .asg A0, A_csr ; Copy of CSR's value .asg B3, B_ret ; Return address .asg A0, A_whl ; ====================== SYMBOLIC REGISTER ASSIGNMENTS ======================= .asg A4, A_ptr_w .asg B4, B_n .asg A6, A_ptr_x .asg B6, B_ptr_y .asg A15, A_stride .asg A13, A_tw_offset .asg A14, A_radix .asg B0, B_radix2 .asg A17, A_j .asg A23, A_fft_jmp .asg B20, B_fft_jmp .asg A19, A_h2 .asg A18, A_l1 .asg A20, A_l2 .asg B21, B_l1 .asg B22, B_h2 .asg B23, B_l2 .asg A10, A_x .asg A21, A_w0 .asg B19, B_w1 .asg A22, A_w2 .asg A3, A_fft_jmp_1 .asg A2, A_i .asg A1, A_pro .asg B2, B_pro2 .asg B25, B_xp1 .asg B24, B_xp0 .asg A25, A_xl1p1 .asg A24, A_xl1p0 .asg B29, B_xh2p1 .asg B28, B_xh2p0 .asg A27, A_xl2p1 .asg A26, A_xl2p0 .asg A9, A_xh0 .asg A24, A_xh1 .asg B30, B_xl0 .asg B27, B_xl1 .asg A8, A_xh20 .asg A29, A_xh21 .asg B5, B_xl20 .asg B7, B_xl21 .asg A26, A_y_h1_0 .asg A27, A_y_h1_1 .asg B26, B_j .asg B29, B_co10 .asg B28, B_si10 .asg A31, A_co20 .asg A30, A_si20 .asg B27, B_co30 .asg B26, B_si30 .asg A5, A_xt0 .asg A7, A_yt0 .asg B8, B_xt1 .asg B0, B_yt2 .asg B9, B_xt2 .asg B3, B_yt1 .asg B5, B_co10si10 .asg A5, A_co20si20 .asg B3, B_co30si30 .asg B8, B_si10co10 .asg A8, A_si20co20 .asg B5, B_si30co30 .asg A7, A_yt0xt0 .asg B16, B_yt1xt1 .asg B9, B_yt2xt2 .asg A5, A_si10 .asg B18, B_p0r .asg A11, A_p1r .asg B30, B_y_h2_0 .asg B3, B_p01r .asg B7, B_p0c .asg A3, A_p1c .asg B31, B_y_h2_1 .asg B9, B_p01c .asg A27, A_p2r .asg A3, A_p3r .asg A28, A_y_l1_0 .asg A24, A_p23r .asg A25, A_p2c .asg A3, A_p3c .asg A29, A_y_l1_1 .asg A3, A_p23c .asg B7, B_p4r .asg B26, B_p5r .asg B24, B_y_l2_0 .asg B8, B_p45r .asg B25, B_p4c .asg B24, B_p5c .asg B25, B_y_l2_1 .asg B5, B_p45c .asg A16, A_x_1 .asg B17, B_x__ .asg A0, A_ifj .asg A0, A_whl ; ====================== SYMBOLIC REGISTER ASSIGNMENTS ======================= ; Stack frame. 
14 words: A10..A15, B10..B14, B3, CSR, pad ;- STW .D2T1 A15, *B_SP--[14] ; Reserve stack, Save A15 MV .S1X B_SP, A_SP ; Twin Stack Pointer STW .D1T1 A14, *+A_SP[12] ; Save A14 || STW .D2T2 B14, *+B_SP[11] ; Save B14 || MVC .S2 CSR, B_csr ; Capture CSR's state STW .D1T1 A13, *+A_SP[10] ; Save A13 || STW .D2T2 B13, *+B_SP[ 9] ; Save B13 || AND .L2 B_csr, -2, B_no_gie ; Clear GIE ;- STW .D1T1 A12, *+A_SP[ 8] ; Save A12 || STW .D2T2 B12, *+B_SP[ 7] ; Save B12 STW .D1T1 A11, *+A_SP[ 6] ; Save A11 || STW .D2T2 B11, *+B_SP[ 5] ; Save B11 || MV .L1X B_csr, A_csr ; Partitioning MV STW .D1T1 A10, *+A_SP[ 4] ; Save A10 || STW .D2T2 B10, *+B_SP[ 3] ; Save B10 || MVC .S2 B_no_gie, CSR ; Disable interrupts || NORM .L2 B_n, B_radix2 ;[ 2,0] ; ===== Interrupts masked here ===== AND .L2 B_radix2, 1,B_radix2 ;[ 3,0] _norm(npoints) & 1 || MVK .S1 4, A_radix ;[ 3,0] radix = 4? || STW .D1T1 A_csr, *+A_SP[ 2] ; Save CSR || STW .D2T2 B_ret, *+B_SP[ 1] ; Remember return address [ B_radix2]MVK.D1 2, A_radix ;[ 4,0] radix = 2 || ZERO .L1 A_tw_offset ;[ 4,0] tw_offset = 0; || MV .S1X B_n, A_stride ;[ 4,0] stride=n ; ============================ PIPE LOOP PROLOG ============================== ADDAH .D1 A_ptr_w,A_tw_offset,A_w0 ;[ 6,0] ptr_w + tw_offset || SHRU .S1 A_stride, 2,A_h2 ;[ 6,0] || MVK .L1 1, A_pro ;[11,0] ADDAH .D1 A_h2, A_h2,A_l2 ;[ 7,0] || MVK .L2 1, B_pro2 ; || SHL .S1 A_pro, 29,A_pro ; ADD .L2X A_w0, 8,B_w1 ;[ 8,0] || MPYSU .M1 6,A_stride, A_fft_jmp ;fft_jmp=stride+stride>>1 SHRU .S1X B_n, 2,A_i ;[ 9,0] n>>3 || MV .D2X A_l2, B_l2 ;[ 9,0] MV .L2X A_h2, B_h2 ;[10,0] || SHRU .S1 A_stride, 1,A_l1 ;[10,0] || ROTL .M1 A_ptr_x, 0,A_x ;x = ptr_x || ADD .D1X B_w1, 8,A_w2 ;[12,0] LOOP_WHILE_N: SUB .L1 A_i, 1,A_i ;[11,0] || SHRU .S2X A_fft_jmp, 3,B_fft_jmp ;[11,0] || SHRU .S1 A_fft_jmp, 1,A_fft_jmp_1 ;[11,0] || MPYSU .M1 0, A_j,A_j ;[11,0] j = 0 || LDDW .D1T2 *A_ptr_x[0],B_xp1:B_xp0 ;x[0] (0) MV .S2X A_l1, B_l1 ;[12,0] || ADD .L1 A_tw_offset,A_fft_jmp_1,A_tw_offset;[12,0]tw_offset+= || SUB .L2 B_fft_jmp, 3,B_fft_jmp ;[12,0] fft_jmp || SHRU .S1 A_stride, 2,A_stride ;[12,0] stride = stride>>2 || LDDW .D1T1 *A_ptr_x[A_l1], A_xl1p1:A_xl1p0 ;x[l1] (N/2) ; ============================ PIPE LOOP KERNEL ============================== LOOP_Y: ADD .S1 A_p2r, A_p3r, A_y_l1_0 ;[23,2]y[l1] = (si20*yt0+c || ADDAH .D2 B_y_h2_1, B_p01c, B_y_h2_1 ;[23,2] o20*xt0)>>15 || MPYHIR .M2 B_co10, B_xt1, B_p0r ;[13,3] || PACKH2 .S2 B_yt1, B_xt1, B_yt1xt1 ;[13,3] || ADD .L2 B_xl20, B_xl1, B_yt2 ;[13,3] yt2=xl1+xl20 || SUB .L1 A_xh1, A_xh21, A_yt0 ;[13,3] yt0=xh1-xh21 || LDDW .D1T1 *A_x[A_l2], A_xl2p1:A_xl2p0 ;[ 3,4] x[l2] (3N/4) ADD .L1 A_pro, A_pro, A_pro ;[34,1] || ADDAH .D2 B_y_l2_1, B_p45c, B_y_l2_1 ;[24,2] || MPYHIR .M2 B_si30, B_yt2, B_p5r ;[14,3] || MPYHIR .M1X A_si10, B_yt1, A_p1r ;[14,3] || PACK2 .S2 B_si30, B_co30, B_si30co30 ;[14,3] ()>>16 || SUB .L2 B_xl0, B_xl21, B_xt2 ;[14,3] xt2=xl0-xl21 || ADD .S1 A_xh21, A_xh1, A_y_h1_1 ;[14,3] y[1]=xh1+xh21 || LDDW .D1T2 *A_x[A_h2], B_xh2p1:B_xh2p0 ;[ 4,4] x[h2] (N/4) [!A_pro]STDW .D2T2 B_y_l2_1:B_y_l2_0, *B_x__[B_l2] ;[25,2] || SUB .L1 A_p2c, A_p3c, A_y_l1_1 ;[25,2]y[l1+1]=co20*yt0- || ADDAH .D1 A_y_l1_0, A_p23r, A_y_l1_0 ;[25,2] si20*xt0)>>15 || ADD .L2X B_p0r, A_p1r, B_y_h2_0 ;[25,2]y[h2] = (si10*yt1+ || MPYHIR .M2 B_co30, B_yt2, B_p4c ;[15,3] co10*xt1)>>15 || MPYHIR .M1X A_si10, B_xt1, A_p1c ;[15,3] || PACK2 .S2 B_si10, B_co10, B_si10co10 ;[15,3] ()>>16 || SUB .S1 A_xh0, A_xh20, A_xt0 ;[15,3] xt0=xh0-xh20 ADDAH .D2 B_y_h2_0, B_p01r, B_y_h2_0 ;[26,2] ||[!B_pro2]STDW .D1T1 
A_y_h1_1:A_y_h1_0, *A_x_1[0] ;[16,3] || MPYHIR .M2 B_si30, B_xt2, B_p5c ;[16,3] || MPYHIR .M1 A_co20, A_yt0, A_p2c ;[16,3] || PACK2 .S1 A_si20, A_co20, A_si20co20 ;[16,3] ()>>16 || PACK2 .L2 B_co30, B_si30, B_co30si30 ;[16,3] ()>>16 || SUB .L1X B_fft_jmp, A_j, A_ifj ;[ 6,4] ifj = (j - fft_jmp) || MV .S2X A_j, B_j ;[ 6,4] BDEC .S1 LOOP_Y, A_i ;[37,1] || MPYHIR .M2 B_co30, B_xt2, B_p4r ;[17,3] || MPYHIR .M1 A_si20, A_yt0, A_p3r ;[17,3] || PACKH2 .S2 B_yt2, B_xt2, B_yt2xt2 ;[17,3] || LDDW .D2T1 *B_w1[B_j], A_co20:A_si20 ;[ 7,4] || LDDW .D1T2 *A_w0[A_j], B_co10:B_si10 ;[ 7,4] || SUB .L2X B_xp1, A_xl1p1, B_xl1 ;[ 7,4] xl1=x[1]-x[l1p1] || ADD .L1X B_xp0, A_xl1p0, A_xh0 ;[ 7,4] xh0=x[0]+x[l1] [!A_pro]STDW .D2T2 B_y_h2_1:B_y_h2_0, *B_x__[B_h2] ;[28,2] || ADDAH .D1 A_y_l1_1, A_p23c, A_y_l1_1 ;[28,2] || DOTPRSU2.M2 B_yt2xt2, B_si30co30, B_p45r ;[18,3] || MPYHIR .M1 A_co20, A_xt0, A_p2r ;[18,3] || PACKH2 .L1 A_yt0, A_xt0, A_yt0xt0 ;[18,3] || PACK2 .S2 B_co10, B_si10, B_co10si10 ;[18,3] ()>>16 || SUB .L2X B_xp0, A_xl1p0, B_xl0 ;[ 8,4] xl0=x[0]-x[l1] || ADD .S1X B_xp1, A_xl1p1, A_xh1 ;[ 8,4] xh1=x[1]+x[l1p1] DOTPNRSU2.M2 B_yt1xt1, B_co10si10, B_p01c ;[19,3] ||[!A_ifj]ADD .L1 A_x, A_fft_jmp, A_x ;[ 9,4]if(!predj)x+=fft_jmp || ADD .S1 A_j, 3, A_j ;[ 9,4] j += 1 || MVD .M1 A_x, A_x_1 ;[ 9,4] || LDDW .D1T2 *A_w2[A_j], B_co30:B_si30 ;[ 9,4] || SUB .D2X B_xh2p0, A_xl2p0, B_xl20 ;[ 9,4] xl20=x[h2] -x[l2] || ZERO .L2 B_pro2 [!A_pro]STDW .D2T1 A_y_l1_1:A_y_l1_0, *B_x__[B_l1] ;[30,2] || DOTPNRSU2.M2 B_yt2xt2, B_co30si30, B_p45c ;[20,3] || SUB .S2X B_p0c, A_p1c, B_y_h2_1 ;[20,3] y[h2+1]=(co10*yt1- || MPYHIR .M1 A_si20, A_xt0, A_p3c ;[20,3] si10*xt1)>>15 ||[!A_ifj]ZERO .D1 A_j ;[10,4] if (!predj) j = 0 || ADD .L1 A_x, 8, A_x ;[10,4] || SUB .L2 B_xl1, B_xl20, B_yt1 ;[10,4] yt1=xl1-xl20 || ADD .S1X B_xh2p1, A_xl2p1, A_xh21 ;[10,4]xh21=x[h2p1]+x[l2p1] SUB .S2 B_p4c, B_p5c, B_y_l2_1 ;[21,3]y[l2+1]=(si30*yt2- || DOTPRSU2.M1 A_yt0xt0, A_si20co20, A_p23r ;[21,3] co30*xt2)>>15 || ADD .L2 B_p4r, B_p5r, B_y_l2_0 ;[21,3]y[l2]=(co30*yt2+ || DOTPRSU2.M2 B_yt1xt1, B_si10co10, B_p01r ;[21,3] si30*xt2)>>15 || PACK2 .L1 A_co20, A_si20, A_co20si20 ;()>>16 || SUB .D2X B_xh2p1, A_xl2p1, B_xl21 ;xl21=x[h2p1]-x[l2p1] || ADD .S1X B_xh2p0, A_xl2p0, A_xh20 ;xh20=x[h2]+x[l2] || LDDW .D1T2 *A_x[0], B_xp1:B_xp0 ;x[0] (0) MV .S2X A_x_1, B_x__ ;[22,3] || ADDAH .D2 B_y_l2_0, B_p45r, B_y_l2_0 ;[22,3] || DOTPNRSU2.M1 A_yt0xt0, A_co20si20, A_p23c ;[22,3] || MPYHIR .M2 B_co10, B_yt1, B_p0c ;[12,4] || MV .L1X B_si10, A_si10 ;[12,4] || ADD .L2 B_xl21, B_xl0, B_xt1 ;xt1=xl0+xl21 || ADD .S1 A_xh20, A_xh0, A_y_h1_0 ;y[0]=xh0+xh20 || LDDW .D1T1 *A_x[A_l1], A_xl1p1:A_xl1p0 ;x[l1] (N/2) ; ============================ PIPE LOOP EPILOG ============================== ADD .S1 A_p2r, A_p3r, A_y_l1_0 ;[23,5] y[l1] = (si20*yt0 || ADDAH .D2 B_y_h2_1, B_p01c, B_y_h2_1 ;[23,5] +co20*xt0)>>15 ADDAH .D2 B_y_l2_1, B_p45c, B_y_l2_1 ;[24,5] || CMPGTU .L1 A_stride, A_radix, A_whl ;while (stride > radix) do STDW .D2T2 B_y_l2_1:B_y_l2_0, *B_x__[B_l2];[25,5] || SUB .L1 A_p2c, A_p3c, A_y_l1_1 ;[25,5] y[l1+1]=(co20*yt0- || ADDAH .D1 A_y_l1_0, A_p23r, A_y_l1_0 ;[25,5] si20*xt0)>>15 ||[A_whl]B .S1 LOOP_WHILE_N ;} end while || ADD .L2X B_p0r, A_p1r, B_y_h2_0 ;[25,5] y[h2] = (si10*yt1+ ;co10*xt1)>>15 ADDAH .D2 B_y_h2_0, B_p01r, B_y_h2_0 ;[26,5] || ADDAH .D1 A_ptr_w,A_tw_offset,A_w0 ;[ 6,0] ptr_w + tw_offset || SHRU .S1 A_stride, 2,A_h2 ;[ 6,0] || MVK .L1 1, A_pro ;[11,0] ADDAH .D1 A_h2, A_h2, A_l2 ;[ 7,0] || MVK .L2 1, B_pro2 ; || SHL .S1 A_pro, 29, A_pro ; STDW .D2T2 
B_y_h2_1:B_y_h2_0, *B_x__[B_h2];[28,5] || ADDAH .D1 A_y_l1_1, A_p23c, A_y_l1_1 ;[28,5] || ADD .L2X A_w0, 8,B_w1 ;[ 8,0] || MPYSU .M1 6,A_stride, A_fft_jmp ;fft_jmp=stride+stride>>1 SHRU .S1X B_n, 2,A_i ;[ 9,0] n>>3 || MV .D2X A_l2, B_l2 ;[ 9,0] STDW .D2T1 A_y_l1_1:A_y_l1_0, *B_x__[B_l1];[30,5] || MV .L2X A_h2, B_h2 ;[10,0] || SHRU .S1 A_stride, 1,A_l1 ;[10,0] || ROTL .M1 A_ptr_x, 0,A_x ;x = ptr_x || ADD .D1X B_w1, 8,A_w2 ;[12,0] ; ====================== SYMBOLIC REGISTER ASSIGNMENTS ======================= .asg A14, A_radix .asg A6, A_ptr_x .asg B6, B_ptr_y .asg B4, B_n .asg A0, A_r2 .asg A20, A_p_x0 .asg B8, B_p_x0 .asg B21, B_p_y0 .asg B22, B_p_y2 .asg B23, B_p_y1 .asg B3, B_p_y3 .asg B20, B_l1 .asg B19, B_j0 .asg A18, A_i .asg B9, B_j .asg A1, A_pro .asg B25, B_h0 .asg B7, B_h1 .asg B7, B_h2 .asg B5, B_h3 .asg B16, B_h4 .asg A7, A_x1 .asg A6, A_x0 .asg B29, B_x3 .asg B28, B_x2 .asg A5, A_x5 .asg A4, A_x4 .asg B5, B_x7 .asg B4, B_x6 .asg A21, A_xh0_0 .asg A3, A_xh1_0 .asg B24, B_xh0_1 .asg B26, B_xh1_1 .asg B24, B_y0 .asg B25, B_y1 .asg B6, B_y4 .asg B7, B_y5 .asg A16, A_xl0_0 .asg A19, A_xl1_0 .asg B18, B_xl0_1 .asg B17, B_xl1_1 .asg A16, A_y2 .asg A17, A_y3 .asg A8, A_y6 .asg A9, A_y7 .asg A22, A_temp ; ============================ PIPE LOOP PROLOG ============================== NORM .L2 B_n, B_l1 ;[ 2,0] l1 = _norm(n)+2 || MV .D2 B_ptr_y, B_p_y0 ;[ 2,0] || MVK .L1 1, A_pro ; ZERO .L2 B_j ;[ 3,0] || SUB .D1 A_radix, 2, A_r2 ;[ 3,0] || ADD .S2 B_l1, 2, B_l1 ;[ 3,0] || ADDAW .D2 B_p_y0, B_n, B_p_y2 ;[ 3,0] MVK .S2 4, B_j0 ;[ 4,0] j0 = 4 ||[!A_r2]NORM .L2 B_n, B_l1 ;[ 4,0] l1 = _norm(n)+1; || ADDAH .D2 B_p_y2, B_n, B_p_y3 ;[ 4,0] || SHL .S1 A_pro, 15, A_pro ; SHRU .S1X B_n, 2, A_i ;[ 5,0] ||[!A_r2]MVK .S2 8, B_j0 ;[ 5,0] j0 = 8 ||[!A_r2]ADD .L2 B_l1, 1, B_l1 ;[ 5,0] || ADDAH .D2 B_p_y0, B_n, B_p_y1 ;[ 5,0] [!A_r2]ADD .S2 B_p_y2, B_n, B_p_y3 ;[ 6,0] ||[!A_r2]ADD .L2 B_p_y0, B_n, B_p_y1 ;[ 6,0] || ADD .D2X A_ptr_x, 8, B_p_x0 ;[ 6,0] x = ptr_x || MV .L1 A_ptr_x, A_p_x0 ;[ 6,0] ; ============================ PIPE LOOP KERNEL ============================== LOOP_Z: [!A_r2]ROTL .M1 A_x4, 0, A_xl0_0 ;[13,1] || SUB .L1X A_xl1_0, B_xl0_1, A_y3 ;[13,1] || ADD .S2X A_xh1_0, B_xh1_1, B_y1 ;[13,1] || BDEC .S1 LOOP_Z, A_i ;[13,1] }end for || ADD .L2 B_j, B_j0, B_j ;[ 1,3] j += j0; || LDDW .D2T2 *B_p_x0++[2], B_x3:B_x2 ;[ 1,3] || LDDW .D1T1 *A_p_x0++[2], A_x1:A_x0 ;[ 1,3] || DEAL .M2 B_j, B_h0 ;[ 1,3] h2 = _deal(j); [!A_pro]STDW .D2T2 B_y1:B_y0, *B_p_y0[B_h4] ;[14,1] || MV .S1 A_y3, A_temp ;[14,1] || ADD .L1X A_xl1_0, B_xl0_1,A_y7 ;[14,1] || SUB .L2 B_x2, B_x6, B_xl0_1 ;[ 8,2] || ADD .S2 B_x6, B_x2, B_xh0_1 ;[ 8,2] || ADD .D1 A_x4, A_x0, A_xh0_0 ;[ 8,2] ||[!A_r2]ROTL .M1 A_x0, 0, A_xh0_0 ;[ 8,2] ||[!A_r2]ROTL .M2 B_x2, 0, B_xh0_1 ;[ 8,2] SUB .L1X A_xl0_0, B_xl1_1,A_y6 ;[15,1] || SUB .L2X A_xh1_0, B_xh1_1,B_y5 ;[15,1] || ADD .S1 A_x5, A_x1, A_xh1_0 ;[ 9,2] ||[!A_r2]ROTL .M1 A_x1, 0, A_xh1_0 ;[ 9,2] ||[!A_r2]MV .S2 B_x7, B_xl0_1 ;[ 9,2] || SHFL .M2 B_h2, B_h3 ;[ 9,2] h2 = _shfl(h2); || LDDW .D1T1 *A_p_x0++[2], A_x5:A_x4 ;[ 3,3] || LDDW .D2T2 *B_p_x0++[2], B_x7:B_x6 ;[ 3,3] ADD .S1X A_xl0_0, B_xl1_1,A_y2 ;[16,1] ||[!A_r2]MV .D1 A_y7, A_y3 ;[16,1] ||[!A_pro]STDW .D2T2 B_y5:B_y4, *B_p_y2[B_h4] ;[16,1] || SUB .L1 A_x1, A_x5, A_xl1_0 ;[10,2] ||[!A_r2]ROTL .M1 A_x5, 0, A_xl1_0 ;[10,2] || ADD .S2 B_x7, B_x3, B_xh1_1 ;[10,2] || SUB .L2 B_x3, B_x7, B_xl1_1 ;[10,2] || BITR .M2 B_h0, B_h1 ;[ 4,3] h2 = _bitr(h2); [!A_r2]MV .L1 A_temp, A_y7 ;[17,1] ||[!A_pro]STDW .D2T1 A_y3:A_y2, *B_p_y1[B_h4] ;[17,1] || SUB .D1 A_x0, 
A_x4, A_xl0_0 ;[11,2] || ADD .L2X A_xh0_0, B_xh0_1,B_y0 ;[11,2] || SUB .S2X A_xh0_0, B_xh0_1,B_y4 ;[11,2] ||[!A_r2]ROTL .M2 B_x3, 0, B_xh1_1 ;[11,2] [!A_pro]STDW .D2T1 A_y7:A_y6, *B_p_y3[B_h4] ;[18,1] || SHRU .S2 B_h3, B_l1, B_h4 ;[12,2] h2 >>= l1; ||[!A_r2]MV .L2 B_x6, B_xl1_1 ;[12,2] || ROTL .M2 B_h1, 16, B_h2 ;[ 6,3] h2=_rotl(h2, 16) || MPYSU .M1 2, A_pro, A_pro ;10000 || MV .S1X B_SP, A_SP ; Twin Stack Pointer ; ============================ PIPE LOOP EPILOG ============================== LDW .D1T2 *+A_SP[ 1], B_ret ; Get return address || LDW .D2T1 *+B_SP[ 2], A_csr ; Get CSR's value LDW .D1T2 *+A_SP[ 3], B10 ; Restore B10 || LDW .D2T1 *+B_SP[ 4], A10 ; Restore A10 LDW .D1T2 *+A_SP[ 5], B11 ; Restore B11 || LDW .D2T1 *+B_SP[ 6], A11 ; Restore A11 LDW .D1T2 *+A_SP[ 7], B12 ; Restore B12 || LDW .D2T1 *+B_SP[ 8], A12 ; Restore A12 LDW .D1T2 *+A_SP[ 9], B13 ; Restore B13 || LDW .D2T1 *+B_SP[10], A13 ; Restore A13 LDW .D1T2 *+A_SP[11], B14 ; Restore B14 || LDW .D2T1 *+B_SP[12], A14 ; Restore A14 LDW .D2T1 *++B_SP[14],A15 ; Restore A15 || RETNOP .S2 B_ret, 4 ; Return to caller MVC .S2X A_csr, CSR ; Restore CSR *====== Interruptibility state restored ;====== Branch Occurs ===== *============================================================================* *= End of file: dsp_fft32x32.asm =* *============================================================================* * Copyright (c) 2003 Texas Instruments, Incorporated. * * All Rights Reserved. * *============================================================================*