c6416_sdk/dsplib/fft16x32.asm


								;* ======================================================================== *;

								;*  TEXAS INSTRUMENTS, INC.                                                 *;

								;*                                                                          *;

								;*  DSPLIB  DSP Signal Processing Library                                   *;

								;*                                                                          *;

								;*      Release:        Revision 1.04b                                      *;

								;*      CVS Revision:   1.5     Sun Sep 29 03:32:21 2002 (UTC)              *;

								;*      Snapshot date:  23-Oct-2003                                         *;

								;*                                                                          *;

								;*  This library contains proprietary intellectual property of Texas        *;

								;*  Instruments, Inc.  The library and its source code are protected by     *;

								;*  various copyrights, and portions may also be protected by patents or    *;

								;*  other legal protections.                                                *;

								;*                                                                          *;

								;*  This software is licensed for use with Texas Instruments TMS320         *;

								;*  family DSPs.  This license was provided to you prior to installing      *;

								;*  the software.  You may review this license by consulting the file       *;

								;*  TI_license.PDF which accompanies the files in this library.             *;

								;* ------------------------------------------------------------------------ *;

								;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;

								;*                          All Rights Reserved.                            *;

								;* ======================================================================== *;


								;* ======================================================================== *;

								;*  Assembler compatibility shim for assembling 4.30 and later code on      *;

								;*  tools prior to 4.30.                                                    *;

								;* ======================================================================== *;


								        ; Record the assembler version in the substitution symbol $asmver.
								        ; When .ASSEMBLER_VERSION is not defined, $asmver is set to 0, so the
								        ; "< 430" test below is taken.
								        .if $isdefed(".ASSEMBLER_VERSION")

								        .asg    .ASSEMBLER_VERSION, $asmver

								        .else

								        .asg    0,    $asmver

								        .endif


								        ; The aliases below are defined only when $asmver is below 430.
								        .if ($asmver < 430)


								        ; Call/return mnemonics are mapped onto the plain branch B.
								        .asg    B,    CALL     ; Function Call

								        .asg    B,    RET      ; Return from a Function

								        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.


								        ; The *NOP variants are mapped onto BNOP, and only when
								        ; .TMS320C6400 evaluates true.
								        .if .TMS320C6400

								        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call

								        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return

								        .asg    BNOP, CRNOP    ; C64x Fn call w/ Call/Ret chaining via BNOP.

								        .endif


								        ; .asmfunc / .endasmfunc are assigned an empty substitution string.
								        ; NOTE(review): presumably so that pre-4.30 tools see nothing where
								        ; these directives appear -- confirm against the tools documentation.
								        .asg    , .asmfunc     ; .func equivalent for hand-assembly code

								        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code


								        .endif


								;* ======================================================================== *;

								;*  End of assembler compatibility shim.                                    *;

								;* ======================================================================== *;


								*========================================================================== *

								*      TEXAS INSTRUMENTS, INC.                                              *

								*                                                                           *

								*      NAME                                                                 *

								*            DSP_fft16x32                                                   *

								*                                                                           *

								*      USAGE                                                                *

								*            This routine is C-callable and can be called as:               *

								*                                                                           *

								*           void DSP_fft16x32(const short * ptr_w, int  npoints,            *

								*                            int   * ptr_x, int  *ptr_y ) ;                 *

								*                                                                           *

								*             ptr_w   =  input twiddle factors                              *

								*             npoints =  number of points                                   *

								*             ptr_x   =  transformed data reversed                          *

								*             ptr_y   =  linear transformed data                            *

								*                                                                           *

								*            (See the C compiler reference guide.)                          *

								*                                                                           *

								*   DESCRIPTION                                                             *

								*       The following code performs a mixed radix FFT for "npoints" which   *

								*       is either a multiple of 4 or 2. It uses logN4 - 1 stages of radix4  *

								*       transform and performs either a radix2 or radix4 transform on the   *

								*       last stage depending on "npoints". If "npoints" is a multiple of 4, *

								*       then this last stage is also a radix4 transform, otherwise it is a  *

								*       radix2 transform. A C program that automatically generates the      *

								*       twiddle factors is available as the file "twiddle_split.c":         *

								*                                                                           *

								*         int i, j, k, n = N;                                               *

								*         double theta1, theta2, theta3, x_t, y_t;                          *

								*         const double M = 32768.0, PI = 3.141592654;                       *

								*                                                                           *

								*         for (j=1, k=0; j < n>>2; j = j<<2)                                *

								*         {                                                                 *

								*             for (i=0; i < n>>2; i += j<<1)                                *

								*             {                                                             *

								*                 theta1 = 2*PI*i/n;                                        *

								*                 x_t = M*cos(theta1);                                      *

								*                 y_t = M*sin(theta1);                                      *

								*                 w[k+1] = (short) x_t;                                     *

								*                 if (x_t >= M) w[k+1] = 0x7fff;                            *

								*                 w[k+0] = (short) y_t;                                     *

								*                 if (y_t >= M) w[k+0] = 0x7fff;                            *

								*                                                                           *

								*                 theta1 = 2*PI*(i+j)/n;                                    *

								*                 x_t = M*cos(theta1);                                      *

								*                 y_t = M*sin(theta1);                                      *

								*                 w[k+7] = (short) x_t;                                     *

								*                 if (x_t >= M) w[k+7] = 0x7fff;                            *

								*                 w[k+6] = (short) y_t;                                     *

								*                 if (y_t >= M) w[k+6] = 0x7fff;                            *

								*                                                                           *

								*                 theta2 = 4*PI*i/n;                                        *

								*                 x_t = M*cos(theta2);                                      *

								*                 y_t = M*sin(theta2);                                      *

								*                 w[k+3] = (short) x_t;                                     *

								*                 if (x_t >= M) w[k+3] = 0x7fff;                            *

								*                 w[k+2] = (short) y_t;                                     *

								*                 if (y_t >= M) w[k+2] = 0x7fff;                            *

								*                                                                           *

								*                 theta2 = 4*PI*(i+j)/n;                                    *

								*                 x_t = M*cos(theta2);                                      *

								*                 y_t = M*sin(theta2);                                      *

								*                 w[k+9] = (short) x_t;                                     *

								*                 if (x_t >= M) w[k+9] = 0x7fff;                            *

								*                 w[k+8] = (short) y_t;                                     *

								*                 if (y_t >= M) w[k+8] = 0x7fff;                            *

								*                                                                           *

								*                 theta3 = 6*PI*i/n;                                        *

								*                 x_t = M*cos(theta3);                                      *

								*                 y_t = M*sin(theta3);                                      *

								*                 w[k+5] = (short) x_t;                                     *

								*                 if (x_t >= M) w[k+5] = 0x7fff;                            *

								*                 w[k+4] = (short) y_t;                                     *

								*                 if (y_t >= M) w[k+4] = 0x7fff;                            *

								*                                                                           *

								*                 theta3 = 6*PI*(i+j)/n;                                    *

								*                 x_t = M*cos(theta3);                                      *

								*                 y_t = M*sin(theta3);                                      *

								*                 w[k+11] = (short) x_t;                                    *

								*                 if (x_t >= M) w[k+11] = 0x7fff;                           *

								*                 w[k+10] = (short) y_t;                                    *

								*                 if (y_t >= M) w[k+10] = 0x7fff;                           *

								*                                                                           *

								*                 k += 12;                                                  *

								*             }                                                             *

								*         }                                                                 *

								*         w[2*n-1] = w[2*n-3] = w[2*n-5] = 0x7fff;                          *

								*         w[2*n-2] = w[2*n-4] = w[2*n-6] = 0x0000;                          *

								*                                                                           *

								*   ASSUMPTIONS                                                             *

								*       This code works for  both "npoints" a multiple of 2 or 4.           *

								*       The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a         *

								*       double-word boundary for the "optimized" implementations.           *

								*                                                                           *

								*       The input and output data are complex, with the real/imaginary      *

								*       components stored in adjacent locations in the array.  The real     *

								*       components are stored at even array indices, and the imaginary      *

								*       components are stored at odd array indices.                         *

								*                                                                           *

								*   TECHNIQUES                                                              *

								*       The following C code represents an implementation of the Cooley     *

								*       Tukey radix 4 DIF FFT. It accepts the inputs in normal order and    *

								*       produces the outputs in digit reversed order. The natural C code    *

								*       shown in this file on the other hand, accepts the inputs in nor-    *

								*       mal order and produces the outputs in normal order.                 *

								*                                                                           *

								*       Several transformations have been applied to the original Cooley    *

								*       Tukey code to produce the natural C code description shown here.    *

								*       In order to understand these it would first be educational to       *

								*       understand some of the issues involved in the conventional Cooley   *

								*       Tukey FFT code.                                                     *

								*                                                                           *

								*       void radix4(int n, short x[], short wn[])                           *

								*       {                                                                   *

								*           int    n1,  n2,  ie,   ia1,  ia2, ia3;                          *

								*           int    i0,  i1,  i2,    i3,    i, j,     k;                     *

								*           short  co1, co2, co3,  si1,  si2, si3;                          *

								*           short  xt0, yt0, xt1,  yt1,  xt2, yt2;                          *

								*           short  xh0, xh1, xh20, xh21, xl0, xl1,xl20,xl21;                *

								*                                                                           *

								*           n2 = n;                                                         *

								*           ie = 1;                                                         *

								*           for (k = n; k > 1; k >>= 2)                                     *

								*           {                                                               *

								*               n1 = n2;                                                    *

								*               n2 >>= 2;                                                   *

								*               ia1 = 0;                                                    *

								*                                                                           *

								*               for (j = 0; j < n2; j++)                                    *

								*               {                                                           *

								*                    ia2 = ia1 + ia1;                                       *

								*                    ia3 = ia2 + ia1;                                       *

								*                                                                           *

								*                    co1 = wn[2 * ia1    ];                                 *

								*                    si1 = wn[2 * ia1 + 1];                                 *

								*                    co2 = wn[2 * ia2    ];                                 *

								*                    si2 = wn[2 * ia2 + 1];                                 *

								*                    co3 = wn[2 * ia3    ];                                 *

								*                    si3 = wn[2 * ia3 + 1];                                 *

								*                    ia1 = ia1 + ie;                                        *

								*                                                                           *

								*                    for (i0 = j; i0< n; i0 += n1)                          *

								*                    {                                                      *

								*                        i1 = i0 + n2;                                      *

								*                        i2 = i1 + n2;                                      *

								*                        i3 = i2 + n2;                                      *

								*                                                                           *

								*                                                                           *

								*                        xh0  = x[2 * i0    ] + x[2 * i2    ];              *

								*                        xh1  = x[2 * i0 + 1] + x[2 * i2 + 1];              *

								*                        xl0  = x[2 * i0    ] - x[2 * i2    ];              *

								*                        xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];              *

								*                                                                           *

								*                        xh20 = x[2 * i1    ] + x[2 * i3    ];              *

								*                        xh21 = x[2 * i1 + 1] + x[2 * i3 + 1];              *

								*                        xl20 = x[2 * i1    ] - x[2 * i3    ];              *

								*                        xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];              *

								*                                                                           *

								*                        x[2 * i0    ] = xh0 + xh20;                        *

								*                        x[2 * i0 + 1] = xh1 + xh21;                        *

								*                                                                           *

								*                        xt0  = xh0 - xh20;                                 *

								*                        yt0  = xh1 - xh21;                                 *

								*                        xt1  = xl0 + xl21;                                 *

								*                        yt2  = xl1 + xl20;                                 *

								*                        xt2  = xl0 - xl21;                                 *

								*                        yt1  = xl1 - xl20;                                 *

								*                                                                           *

								*                        x[2 * i1    ] = (xt1 * co1 + yt1 * si1) >> 15;     *

								*                        x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15;     *

								*                        x[2 * i2    ] = (xt0 * co2 + yt0 * si2) >> 15;     *

								*                        x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15;     *

								*                        x[2 * i3    ] = (xt2 * co3 + yt2 * si3) >> 15;     *

								*                        x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15;     *

								*                    }                                                      *

								*              }                                                            *

								*                                                                           *

								*              ie <<= 2;                                                    *

								*          }                                                                *

								*      }                                                                    *

								*                                                                           *

								*       The conventional Cooley Tukey FFT, is written using three loops.    *

								*       The outermost loop "k" cycles through the stages. There are log     *

								*       N to the base 4 stages in all. The loop "j" cycles through the      *

								*       groups of butterflies with different twiddle factors, loop "i"      *

								*       reuses the twiddle factors for the different butterflies within     *

								*       a stage. It is interesting to note the following:                   *

								*                                                                           *

								*-------------------------------------------------------------------------- *

								*       Stage#     #Groups     # Butterflies with common     #Groups*Bflys  *

								*                                twiddle factors                            *

								*-------------------------------------------------------------------------- *

								*        1         N/4          1                            N/4            *

								*        2         N/16         4                            N/4            *

								*        ..                                                                 *

								*        logN      1            N/4                          N/4            *

								*-------------------------------------------------------------------------- *

								*                                                                           *

								*       The following statements can be made based on above observations:   *

								*                                                                           *

								*       a) Inner loop "i0" iterates a variable number of times. In          *

								*       particular the number of iterations quadruples every time from      *

								*       1..N/4. Hence software pipelining a loop that iterates a variable   *

								*       number of times is not profitable.                                  *

								*                                                                           *

								*       b) Outer loop "j" iterates a variable number of times as well.      *

								*       However the number of iterations is quartered every time from       *

								*       N/4..1. Hence the behaviour in (a) and (b) are exactly opposite     *

								*       to each other.                                                      *

								*                                                                           *

								*       c) If the two loops "i" and "j" are coalesced together then they    *

								*       will iterate for a fixed number of times namely N/4. This allows    *

								*       us to combine the "i" and "j" loops into 1 loop. Optimized impl-    *

								*       ementations will make use of this fact.                             *

								*                                                                           *

								*       In addition the Cooley Tukey FFT accesses three twiddle factors     *

								*       per iteration of the inner loop, as the butterflies that re-use     *

								*       twiddle factors are lumped together. This leads to accessing the    *

								*       twiddle factor array at three points each separated by "ie". Note   *

								*       that "ie" is initially 1, and is quadrupled with every iteration.   *

								*       Therefore these three twiddle factors are not even contiguous in    *

								*       the array.                                                          *

								*                                                                           *

								*       In order to vectorize the FFT, it is desirable to access twiddle    *

								*       factor array using double word wide loads and fetch the twiddle     *

								*       factors needed. In order to do this a modified twiddle factor       *

								*       array is created, in which the factors WN/4, WN/2, W3N/4 are        *

								*       arranged to be contiguous. This eliminates the separation between   *

								*       twiddle factors within a butterfly. However this implies that as    *

								*       the loop is traversed from one stage to another, that we maintain   *

								*       a redundant version of the twiddle factor array. Hence the size     *

								*       of the twiddle factor array increases as compared to the normal     *

								*       Cooley Tukey FFT.  The modified twiddle factor array is of size     *

								*       "2 * N" where the conventional Cooley Tukey FFT is of size "3N/4"   *

								*       where N is the number of complex points to be transformed. The      *

								*       routine that generates the modified twiddle factor array was        *

								*       presented earlier. With the above transformation of the FFT,        *

								*       both the input data and the twiddle factor array can be accessed    *

								*       using double-word wide loads to enable packed data processing.      *

								*                                                                           *

								*       The final stage is optimised to remove the multiplication as        *

								*       w0 = 1.  This stage also performs digit reversal on the data,       *

								*       so the final output is in natural order.                            *

								*                                                                           *

								*       The fft() code shown here performs the bulk of the computation      *

								*       in place. However, because digit-reversal cannot be performed       *

								*       in-place, the final result is written to a separate array, y[].     *

								*                                                                           *

								*       There is one slight break in the flow of packed processing that     *

								*       needs to be comprehended. The real part of the complex number is    *

								*       in the lower half, and the imaginary part is in the upper half.     *

								*       The flow breaks in case of "xl0" and "xl1" because in this case     *

								*       the real part needs to be combined with the imaginary part because  *

								*       of the multiplication by "j". This requires a packed quantity like  *

								*       "xl21xl20" to be rotated as "xl20xl21" so that it can be combined   *

								*        using add2's and sub2's. Hence the natural version of C code       *

								*       shown below is transformed using packed data processing as shown:   *

								*                                                                           *

								*                        xl0  = x[2 * i0    ] - x[2 * i2    ];              *

								*                        xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];              *

								*                        xl20 = x[2 * i1    ] - x[2 * i3    ];              *

								*                        xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];              *

								*                                                                           *

								*                        xt1  = xl0 + xl21;                                 *

								*                        yt2  = xl1 + xl20;                                 *

								*                        xt2  = xl0 - xl21;                                 *

								*                        yt1  = xl1 - xl20;                                 *

								*                                                                           *

								*                        xl1_xl0   = _sub2(x21_x20, x21_x20)                *

								*                        xl21_xl20 = _sub2(x32_x22, x23_x22)                *

								*                        xl20_xl21 = _rotl(xl21_xl20, 16)                   *

								*                                                                           *

								*                        yt2_xt1   = _add2(xl1_xl0, xl20_xl21)              *

								*                        yt1_xt2   = _sub2(xl1_xl0, xl20_xl21)              *

								*                                                                           *

								*       Also notice that xt1, yt1 end up on separate words, these need to   *

								*       be packed together to take advantage of the packed twiddle          *

								*       factors that have been loaded. In order for this to be achieved     *

								*       they are re-aligned as follows:                                     *

								*                                                                           *

								*       yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1)                                *

								*       yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2)                                *

								*                                                                           *

								*       The packed words "yt1_xt1" allows the loaded "sc" twiddle factor    *

								*       to be used for the complex multiplies. The real part of the         *

								*       complex multiply is implemented using _dotp2. The imaginary         *

								*       part of the complex multiply is implemented using the 16x32         *

								*       multiply instruction "mpylir" or "mpyhir".                          *

								*                                                                           *

								*       (X + jY) ( C + j S) = (XC + YS) + j (YC - XS).                      *

								*                                                                           *

								*       The actual twiddle factors for the FFT are cosine, - sine. The      *

								*       twiddle factors stored in the table are cosine and sine, hence      *

								*       the sign of the "sine" term is comprehended during multipli-        *

								*       cation as shown above.                                              *

								*                                                                           *

								*   MEMORY NOTE                                                             *

								*       The optimized implementations are written for LITTLE ENDIAN.        *

								*                                                                           *

								*   INTERRUPTS                                                              *

								*       This code is interrupt tolerant but not interruptible. It masks out *

								*   interrupts for the entire duration of the code.                         *

								*                                                                           *

								*   CYCLES                                                                  *

								*       (13 * N/8 + 24) * ceil(log4(N) - 1) + (N + 8) * 1.5 + 27            *

								*                                                                           *

								*       N = 512, (13 * 64 + 24) * 4 + 520 * 1.5 + 27 = 4231 cycles          *

								*                                                                           *

								*   CODESIZE                                                                *

								*       1068 bytes                                                          *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *


								*============================================================================*

								        ; Entry point of _DSP_fft16x32, placed in its own section
								        ; ".text:_fft16x32" and exported with .global.
								        ;
								        ; This first part reserves a 14-word stack frame and starts saving
								        ; registers.  The first STW uses post-decrement addressing, so A15
								        ; is written at the incoming stack address and B_SP then moves down
								        ; by 14 words.  B_SP is copied to A_SP so that both data paths can
								        ; store in parallel.  Frame slots written here (word offsets from
								        ; the new stack pointer): [12]=A14, [11]=B14, [10]=A13, [9]=B13.
								        ;
								        ; CSR is read into B_csr and a copy with bit 0 cleared (AND with -2)
								        ; is prepared in B_no_gie; CSR itself is not written in this part.

								        .sect ".text:_fft16x32"

								        .global _DSP_fft16x32

								_DSP_fft16x32:

								*================== SYMBOLIC REGISTER ASSIGNMENTS: SETUP ====================*

								        .asg            B15,        B_SP          ; Stack pointer, B datapath

								        .asg            A31,        A_SP          ; Stack pointer, A datapath

								        .asg            B0,         B_csr         ; CSR's value

								        .asg            B1,         B_no_gie      ; CSR w/ GIE bit cleared

								        .asg            A0,         A_csr         ; Copy of CSR's value

								        .asg            B3,         B_ret         ; Return address

								; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================

								        .asg            A0,         A_whl

								*============================================================================*

								        ; Stack frame.  14 words:  A10..A15, B10..B14, B3, CSR, pad

								;-

								        STW     .D2T1   A15,        *B_SP--[14]   ; Reserve stack, Save A15


								        MV      .S1X    B_SP,       A_SP          ; Twin Stack Pointer


								        STW     .D1T1   A14,        *+A_SP[12]    ; Save A14

								||      STW     .D2T2   B14,        *+B_SP[11]    ; Save B14

								||      MVC     .S2     CSR,        B_csr         ; Capture CSR's state


								        STW     .D1T1   A13,        *+A_SP[10]    ; Save A13

								||      STW     .D2T2   B13,        *+B_SP[ 9]    ; Save B13

								||      AND     .L2     B_csr,      -2,B_no_gie   ; Clear GIE


								; Symbolic register names used by the stage loops LOOP_WHILE_N / LOOP_Y.
								;
								; A4 (A_ptr_w), B4 (B_n), A6 (A_ptr_x) and B6 (B_ptr_y) are read by the code
								; without being initialised in this routine, so they carry the values
								; supplied by the caller.
								;
								; The second group below repeats several names of the first group
								; (A_w0, B_j, B_x, B_l1, ...) with the same registers.
								;
								; Many names share one physical register, for example
								;   B0  = B_radix2, B_stride, B_x, B_fft_jmp_1 (and B_csr during setup)
								;   A25 = A_co11_si11, A_xl1_3i, A_xl2_3i
								;   B14 = B_radix, B_p2
								; Names that share a register cannot hold live values at the same time.
								;
								; A31 (A_SP) is reused as A_co31_si31 and B3 (B_ret) as B_xl0_0 / B_yt0_0.
								; The return address is written to the stack frame before the loops run,
								; and A_SP is copied from B_SP again before the registers are restored.
								;
								; NOTE(review): B_stride is not referenced by any instruction in this
								; routine.

								; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================

								        .asg            A4,         A_ptr_w

								        .asg            B4,         B_n

								        .asg            A6,         A_ptr_x

								        .asg            B6,         B_ptr_y

								        .asg            B14,        B_radix

								        .asg            B0,         B_radix2

								        .asg            B18,        B_h2

								        .asg            B13,        B_tw_offset

								        .asg            B0,         B_stride

								        .asg            B1,         B_j

								        .asg            B19,        B_fft_jmp

								        .asg            A19,        A_fft_jmp

								        .asg            B16,        B_l1

								        .asg            B17,        B_l2

								        .asg            A16,        A_l1

								        .asg            A18,        A_h2

								        .asg            A17,        A_l2

								        .asg            B0,         B_x

								        .asg            A15,        A_w0

								        .asg            B0,         B_fft_jmp_1

								        .asg            A14,        A_i

								        .asg            B2,         B_pro

								; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================

								        .asg            A15,        A_w0

								        .asg            B1,         B_j

								        .asg            B0,         B_x

								        .asg            B16,        B_l1

								        .asg            B17,        B_l2

								        .asg            B18,        B_h2

								        .asg            A16,        A_l1

								        .asg            A17,        A_l2

								        .asg            A18,        A_h2

								        .asg            A19,        A_fft_jmp

								        .asg            B19,        B_fft_jmp

								        .asg            A14,        A_i

								        .asg            A5,         A_j

								        .asg            B5,         B_w1

								        .asg            A8,         A_w2

								        .asg            B29,        B_co20_si20

								        .asg            B28,        B_co10_si10

								        .asg            A25,        A_co11_si11

								        .asg            A24,        A_co30_si30

								        .asg            A31,        A_co31_si31

								        .asg            A30,        A_co21_si21

								        .asg            B31,        B_x_1

								        .asg            B30,        B_x_0

								        .asg            B25,        B_xl1_1i

								        .asg            B24,        B_xl1_0i

								        .asg            B27,        B_xl2_1i

								        .asg            B26,        B_xl2_0i

								        .asg            B21,        B_xh2_1i

								        .asg            B20,        B_xh2_0i

								        .asg            A10,        A_x

								        .asg            A27,        A_x_3

								        .asg            A26,        A_x_2

								        .asg            A25,        A_xl1_3i

								        .asg            A24,        A_xl1_2i

								        .asg            A25,        A_xl2_3i

								        .asg            A24,        A_xl2_2i

								        .asg            A23,        A_xh2_3i

								        .asg            A22,        A_xh2_2i

								        .asg            B31,        B_xh1_0

								        .asg            B7,         B_xh0_0

								        .asg            A29,        A_xh1_1

								        .asg            A0,         A_xh0_1

								        .asg            B25,        B_xl1_0

								        .asg            B3,         B_xl0_0

								        .asg            A20,        A_xl1_1

								        .asg            A1,         A_xl0_1

								        .asg            B20,        B_xh21_0

								        .asg            B12,        B_xh20_0

								        .asg            A11,        A_xh21_1

								        .asg            A8,         A_xh20_1

								        .asg            B28,        B_xl21_0

								        .asg            B26,        B_xl20_0

								        .asg            A7,         A_xl21_1

								        .asg            A5,         A_xl20_1

								        .asg            A13,        A_x_

								        .asg            B11,        B_x__

								        .asg            A2,         A_ifj

								        .asg            B25,        B_x_1o

								        .asg            B24,        B_x_0o

								        .asg            A29,        A_x_3o

								        .asg            A28,        A_x_2o

								        .asg            B3,         B_yt0_0

								        .asg            B7,         B_xt0_0

								        .asg            A9,         A_yt0_1

								        .asg            A3,         A_xt0_1

								        .asg            B9,         B_yt1_0

								        .asg            B7,         B_xt1_0

								        .asg            A3,         A_yt2_1

								        .asg            A22,        A_xt1_1

								        .asg            B23,        B_yt2_0

								        .asg            B10,        B_xt2_0

								        .asg            A20,        A_yt1_1

								        .asg            A28,        A_xt2_1

								        .asg            B5,         B_p0

								        .asg            B8,         B_p1

								        .asg            B22,        B_xh2_0o

								        .asg            B14,        B_p2

								        .asg            B8,         B_p3

								        .asg            B23,        B_xh2_1o

								        .asg            A8,         A_p4

								        .asg            A23,        A_p5

								        .asg            A22,        A_xh2_2o

								        .asg            A9,         A_p6

								        .asg            A23,        A_p7

								        .asg            A23,        A_xh2_3o

								        .asg            B21,        B_p8

								        .asg            B8,         B_p9

								        .asg            B26,        B_xl1_0o

								        .asg            B27,        B_pa

								        .asg            B9,         B_pb

								        .asg            B27,        B_xl1_1o

								        .asg            A27,        A_pc

								        .asg            A26,        A_pd

								        .asg            A26,        A_xl1_2o

								        .asg            A21,        A_pe

								        .asg            A23,        A_pf

								        .asg            A27,        A_xl1_3o

								        .asg            B29,        B_co30_si30

								        .asg            B5,         B_p10

								        .asg            B8,         B_p11

								        .asg            B30,        B_xl2_0o

								        .asg            B5,         B_p12

								        .asg            B8,         B_p13

								        .asg            B31,        B_xl2_1o

								        .asg            A24,        A_p14

								        .asg            A7,         A_p15

								        .asg            A20,        A_xl2_2o

								        .asg            A23,        A_p16

								        .asg            A21,        A_p17

								        .asg            A21,        A_xl2_3o

								        .asg            A12,        A_x__

								        .asg            B20,        B_l1c

								        .asg            A1,         A_pro

								; ============================================================================


								        ; First-stage setup, interleaved with the remaining register saves.
								        ;
								        ; Values produced here:
								        ;   B_tw_offset = 0
								        ;   B_h2      = n >> 2
								        ;   B_l2      = 3 * h2   (ADDAH adds h2 << 1 to h2)
								        ;   B_l1      = 2 * h2   (h2 << 1)
								        ;   B_fft_jmp = 8 * l2
								        ;   A_i       = n >> 3
								        ; with A_h2 / A_l1 / A_l2 as A-side copies of the B-side values.
								        ;
								        ; Frame slots written here: [8]=A12 [7]=B12 [6]=A11 [5]=B11
								        ; [4]=A10 [3]=B10 [2]=CSR (via A_csr) [1]=B_ret.
								        ;
								        ; Writing B_no_gie (CSR with bit 0 cleared) to CSR disables
								        ; interrupts; the saved CSR value is written back on exit.
								        ;
								        ; The final packet already issues the first data load for
								        ; LOOP_WHILE_N (doubleword at index A_h2 from A_ptr_x).
								        ;
								        ; NOTE(review): MPYSU multiplies 16-bit operand halves, so the
								        ; fft_jmp computation presumably relies on l2 fitting in 16 bits -
								        ; confirm against the documented limits on n.

								        ZERO    .L2     B_tw_offset                             ;tw_offset = 0;

								||      MV      .S2     B_n,        B_h2                        ;stride=n

								||      STW     .D1T1   A12,        *+A_SP[ 8]                  ; Save A12

								||      STW     .D2T2   B12,        *+B_SP[ 7]                  ; Save B12


								        SHRU    .S2     B_h2,       2,          B_h2            ;[ 7,0]

								||      STW     .D1T1   A11,        *+A_SP[ 6]                  ; Save A11

								||      STW     .D2T2   B11,        *+B_SP[ 5]                  ; Save B11

								||      MV      .L1X    B_csr,      A_csr                       ;


								        ADDAH   .D2     B_h2,       B_h2,       B_l2            ;[ 8,0]


								        SHRU    .S1X    B_n,        3,          A_i             ;[ 9,0] n>>3

								||      STW     .D1T1   A10,        *+A_SP[ 4]                  ; Save A10

								||      STW     .D2T2   B10,        *+B_SP[ 3]                  ; Save B10

								||      MVC     .S2     B_no_gie,   CSR                         ; Disable


								        MV      .L1X    B_l2,       A_l2                        ;[10,0]

								||      SHL     .S2     B_h2,       1,          B_l1            ;[10,0]

								||      MPYSU   .M2     8,          B_l2,       B_fft_jmp       ;1.5*stride


								        MV      .S1X    B_h2,       A_h2                        ;[11,0]

								||      STW     .D1T1   A_csr,      *+A_SP[ 2]                  ; Save CSR

								||      STW     .D2T2   B_ret,      *+B_SP[ 1]                  ; return


								        MV      .L1X    B_l1,       A_l1                        ;[12,0]

								||      LDDW    .D1T2   *A_ptr_x[A_h2], B_xh2_1i:B_xh2_0i       ;[ 1,1]


								; Top of the per-stage loop.  It is entered by fall-through from the setup
								; code and re-entered by the conditional branch in the LOOP_Y epilog.
								;
								; Per-stage values prepared here (parallel instructions read the register
								; values from before the packet):
								;   A_fft_jmp   = (B_fft_jmp >> 4) - 3
								;   B_fft_jmp_1 =  B_fft_jmp >> 2
								;   A_w0        =  A_ptr_w + 2 * B_tw_offset   (offset added twice)
								;   B_tw_offset += B_fft_jmp_1                 (after A_w0 has read it)
								;   B_j         =  0   (done as a multiply by 0 on the .M2 unit)
								;   B_x = A_ptr_x,  A_x = A_ptr_x + 8
								;   B_pro       =  1   (predicate that suppresses the kernel's [!B_pro]
								;                      stores until it is counted down at LOOP_Y8)
								;
								; The remaining packets are the first copies of the kernel's loads and
								; adds; the ;[c,i] tags match those on the same instructions in LOOP_Y.
								;
								; Two BDEC instructions are issued: the first targets the expression
								; LOOP_Y8 + 20 (an address inside the kernel rather than its top), the
								; second targets LOOP_Y.  Each BDEC also decrements A_i.
								; NOTE(review): which kernel packet LOOP_Y8 + 20 resolves to depends on
								; the assembled layout - confirm from a listing before editing the kernel.

								LOOP_WHILE_N:


								        SHRU    .S1X    B_fft_jmp,  4,          A_fft_jmp       ;[13,0]

								||      MVK     .L2     1,          B_pro                       ;

								||      ADD     .L1     A_ptr_x,    8,          A_x             ;[ 2,1]


								        ADD     .D1X    A_ptr_w,    B_tw_offset,A_w0            ;[14,0]

								||      SHRU    .S2     B_fft_jmp,  2,          B_fft_jmp_1     ;[14,0]

								||      MPYSU   .M2     0,          B_j,        B_j             ;[14,0] j = 0


								        MV      .L2X    A_ptr_x,    B_x                         ;[15,0] ptr_x

								||      SUB     .D1     A_fft_jmp,  3,          A_fft_jmp       ;[15,0]

								||      ADD     .L1X    A_w0,       B_tw_offset,A_w0            ;[15,0]

								||      ADD     .D2     B_tw_offset,B_fft_jmp_1,B_tw_offset     ;[15,0] fft_jmp


								        MVD     .M1X    B_x,        A_x_                        ;[ 5,1]

								||      LDDW    .D2T2   *B_x[B_l2], B_xl2_1i:B_xl2_0i           ;[ 5,1]

								||      BDEC    .S1     LOOP_Y8 + 20, A_i


								        SUB     .S1     A_fft_jmp,  0,          A_ifj           ;[ 6,1]

								||      LDDW    .D1T1   *A_x[0],    A_x_3:A_x_2                 ;[ 6,1]

								||      LDDW    .D2T2   *B_x[0],    B_x_1:B_x_0                 ;[ 6,1]


								  [!A_ifj]ADD   .S2     B_x,        B_fft_jmp,  B_x             ;[ 7,1]

								||      LDDW    .D2T2   *B_x[B_l1], B_xl1_1i:B_xl1_0i           ;[ 7,1]

								||      MV      .L1X    B_j,        A_j                         ;[ 7,1]


								        LDDW    .D1T1   *A_x[A_h2], A_xh2_3i:A_xh2_2i           ;[ 8,1]


								        LDDW    .D1T1   *A_x[A_l2], A_xl2_3i:A_xl2_2i           ;[ 9,1]

								||      ADD     .L2X    A_w0,       8,          B_w1            ;[ 9,1]


								        SUB     .L2     B_xh2_0i,   B_xl2_0i,   B_xl20_0        ;[10,1]

								||      ADD     .S2     B_xl2_0i,   B_xh2_0i,   B_xh20_0        ;[10,1]

								||      ADD     .D2     B_xl2_1i,   B_xh2_1i,   B_xh21_0        ;[10,1]

								||      BDEC    .S1     LOOP_Y,     A_i

								; Software-pipelined kernel of the stage loop: 13 execute packets, eight
								; starting at LOOP_Y and five starting at LOOP_Y8.  The BDEC in the eighth
								; packet branches back to LOOP_Y while A_i has not run out.
								;
								; The ;[c,i] tags appear to give the cycle within one iteration and the
								; iteration in flight; tags inside one packet differ by 13 cycles per
								; iteration step, i.e. up to three iterations overlap.
								;
								; Control details visible in the code:
								;  - Stores predicated with [!B_pro] are skipped while B_pro is nonzero.
								;    B_pro is counted down to zero by the [B_pro] SUB at LOOP_Y8, so the
								;    oldest-iteration stores only start once that has happened.
								;  - A_ifj = A_fft_jmp - B_j.  When it is zero, B_x is advanced by
								;    B_fft_jmp and B_j is reset to 0; B_j is otherwise stepped by 3 and
								;    B_x by 16 bytes per pass.
								;  - Twiddle factors are loaded as doublewords from A_w0[A_j],
								;    B_w1[B_j] and A_w2[A_j], where B_w1 = A_w0 + 8 and A_w2 = B_w1 + 8.
								;    The value loaded into A_co30_si30 is copied across to B_co30_si30.
								;  - Results are stored through B_x__ (copy of A_x_) and
								;    A_x__ = B_x__ + 8 at doubleword indices 0, h2, l1 and l2.
								;
								; Register reuse is dense (see the .asg tables) and correctness depends on
								; the exact packet order; do not reorder or regroup instructions.

								; ============================ PIPE LOOP KERNEL ==============================

								LOOP_Y:

								  [!B_pro]STDW  .D1T1   A_xl2_3o:A_xl2_2o,      *A_x__[A_l2]    ;[29,1]

								||[!B_pro]STDW  .D2T2   B_xh2_1o:B_xh2_0o,      *B_x__[B_h2]    ;[29,1]

								||      MPYHIR  .M2     B_co30_si30,            B_xt2_0,B_p11   ;[29,1]

								||      MPYHIR  .M1     A_co21_si21,            A_xt0_1,A_pd    ;[29,1]

								||      SUB     .L2     B_xl0_0,    B_xl21_0,   B_xt2_0         ;[16,2]

								||[!A_ifj]ZERO  .S2     B_j                                     ;[16,2]

								||      SUB     .L1     A_x_2,      A_xl1_2i,   A_xl0_1         ;[16,2]

								||      SUB     .S1     A_x_3,      A_xl1_3i,   A_xl1_1         ;[16,2]


								  [!B_pro]STDW  .D1T1   A_x_3o:A_x_2o,          *A_x__[0]       ;[30,1]

								||      MPYLIR  .M1     A_co21_si21,A_xt0_1,    A_pf            ;[30,1]

								||      ADD     .D2     B_p8,       B_p9,       B_xl1_0o        ;[30,1]

								||      MPYLIR  .M2     B_co10_si10,B_yt1_0,    B_p0            ;[17,2]

								||      SUB     .S1     A_xl0_1,    A_xl21_1,   A_xt2_1         ;[17,2]

								||      ADD     .L2     B_xl1_0,    B_xl20_0,   B_yt2_0         ;[17,2]

								||      ADD     .L1     A_xl1_1,    A_xl20_1,   A_yt2_1         ;[17,2]

								||      SUB     .S2     B_xh1_0,    B_xh21_0,   B_yt0_0         ;[17,2]


								        SUB     .S2     B_p12,      B_p13,      B_xl2_1o        ;[31,1]

								||      SUB     .S1     A_p6,       A_p7,       A_xh2_3o        ;[31,1]

								||      MPYHIR  .M2     B_co10_si10,B_xt1_0,    B_p1            ;[18,2]

								||      ADD     .L2     B_xh21_0,   B_xh1_0,    B_x_1o          ;[18,2]

								||      ADD     .D1     A_xl1_2i,   A_x_2,      A_xh0_1         ;[18,2]

								||      ADD     .L1     A_xl1_3i,   A_x_3,      A_xh1_1         ;[18,2]

								||      MVD     .M1X    B_x,        A_x_                        ;[ 5,3]

								||      LDDW    .D2T2   *B_x[B_l2], B_xl2_1i:B_xl2_0i           ;[ 5,3]


								        MPYHIR  .M1     A_co31_si31,A_xt2_1,    A_p15           ;[19,2]

								||      MPYLIR  .M2     B_co10_si10,B_xt1_0,    B_p3            ;[19,2]

								||      SUB     .L1     A_xl1_1,    A_xl20_1,   A_yt1_1         ;[19,2]

								||      ADD     .S2     B_xl1_0i,   B_x_0,      B_xh0_0         ;[19,2]

								||      SUB     .S1X    A_fft_jmp,  B_j,        A_ifj           ;[ 6,3]

								||      LDDW    .D1T1   *A_x[0],    A_x_3:A_x_2                 ;[ 6,3]

								||      LDDW    .D2T2   *B_x[0],    B_x_1:B_x_0                 ;[ 6,3]


								  [!B_pro]STDW  .D1T1   A_xh2_3o:A_xh2_2o,      *A_x__[A_h2]    ;[33,1]

								||      ADD     .L2     B_p10,      B_p11,      B_xl2_0o        ;[33,1]

								||      MPYLIR  .M1     A_co31_si31,A_xt2_1,    A_p17           ;[20,2]

								||      SUB     .S1     A_xh1_1,    A_xh21_1,   A_yt0_1         ;[20,2]

								||      MVD     .M2X    A_x_,       B_x__                       ;[20,2]

								||[!A_ifj]ADD   .S2     B_x,        B_fft_jmp,  B_x             ;[ 7,3]

								||      LDDW    .D2T2   *B_x[B_l1], B_xl1_1i:B_xl1_0i           ;[ 7,3]

								||      MV      .L1X    B_j,        A_j                         ;[ 7,3]


								  [!B_pro]STDW  .D2T2   B_xl2_1o:B_xl2_0o,      *B_x__[B_l2]    ;[34,1]

								||      MPYHIR  .M1     A_co31_si31,A_yt2_1,    A_p16           ;[21,2]

								||      MPYHIR  .M2     B_co20_si20,B_yt0_0,    B_pa            ;[21,2]

								||      ADD     .S1     A_xl0_1,    A_xl21_1,   A_xt1_1         ;[21,2]

								||      SUB     .L2     B_xh0_0,    B_xh20_0,   B_xt0_0         ;[21,2]

								||      ADD     .L1     A_xh20_1,   A_xh0_1,    A_x_2o          ;[21,2]

								||      ADD     .S2     B_xh20_0,   B_xh0_0,    B_x_0o          ;[21,2]

								||      LDDW    .D1T1   *A_x[A_h2], A_xh2_3i:A_xh2_2i           ;[ 8,3]


								  [!B_pro]STDW  .D2T2   B_xl1_1o:B_xl1_0o,      *B_x__[B_l1]    ;[35,1]

								||      SUB     .L1     A_pe,       A_pf,       A_xl1_3o        ;[35,1]

								||      ADD     .S1     A_pc,       A_pd,       A_xl1_2o        ;[35,1]

								||      MPYLIR  .M1     A_co31_si31,A_yt2_1,    A_p14           ;[22,2]

								||      MPYLIR  .M2     B_co20_si20,B_yt0_0,    B_p8            ;[22,2]

								||      ADD     .S2     B_p0,       B_p1,       B_xh2_0o        ;[22,2]

								||      LDDW    .D1T1   *A_x[A_l2], A_xl2_3i:A_xl2_2i           ;[ 9,3]

								||      ADD     .L2X    A_w0,       8,          B_w1            ;[ 9,3]


								        BDEC    .S1     LOOP_Y,     A_i                         ;} end for

								||[!B_pro]STDW  .D1T1   A_xl1_3o:A_xl1_2o,      *A_x__[A_l1]    ;[36,1]

								||      MPYLIR  .M2     B_co20_si20,B_xt0_0,    B_pb            ;[23,2]

								||      MPYLIR  .M1     A_co11_si11,A_yt1_1,    A_p4            ;[23,2]

								||      SUB     .L1     A_xh0_1,    A_xh20_1,   A_xt0_1         ;[23,2]

								||      SUB     .L2     B_xh2_0i,   B_xl2_0i,   B_xl20_0        ;[10,3]

								||      ADD     .S2     B_xl2_0i,   B_xh2_0i,   B_xh20_0        ;[10,3]

								||      ADD     .D2     B_xl2_1i,   B_xh2_1i,   B_xh21_0        ;[10,3]

								LOOP_Y8:

								        STDW    .D2T2   B_x_1o:B_x_0o,          *B_x__[0]       ;[24,2]

								||[B_pro]SUB    .S2     B_pro,      1,          B_pro           ;

								||      MPYHIR  .M1     A_co11_si11,A_xt1_1,    A_p5            ;[24,2]

								||      MPYHIR  .M2     B_co10_si10,B_yt1_0,    B_p2            ;[24,2]

								||      ADD     .S1     A_xh21_1,   A_xh1_1,    A_x_3o          ;[24,2]

								||      SUB     .L2     B_xh2_1i,   B_xl2_1i,   B_xl21_0        ;[11,3]

								||      LDDW    .D1T1   *A_x[A_l1], A_xl1_3i:A_xl1_2i           ;[11,3]

								||      ADD     .L1X    B_w1,       8,          A_w2            ;[11,3]

								;LOOP_Y9:

								        SUB     .S1     A_p16,      A_p17,      A_xl2_3o        ;[25,2]

								||      MV      .S2X    A_co30_si30,            B_co30_si30     ;[25,2]

								||      MPYHIR  .M2     B_co20_si20,B_xt0_0,    B_p9            ;[25,2]

								||      MPYLIR  .M1     A_co11_si11,A_xt1_1,    A_p7            ;[25,2]

								||      ADD     .D2     B_x,        16,         B_x             ;[12,3]

								||      SUB     .L2     B_x_0,      B_xl1_0i,   B_xl0_0         ;[12,3]

								||      LDDW    .D1T2   *A_w0[A_j], B_co20_si20:B_co10_si10     ;[12,3]

								;LOOP_Y10:

								        ADD     .S1X    B_x__,      8,          A_x__           ;[26,2]

								||      ADD     .L1     A_p14,      A_p15,      A_xl2_2o        ;[26,2]

								||      MPYHIR  .M2     B_co30_si30,B_yt2_0,    B_p12           ;[26,2]

								||      MPYHIR  .M1     A_co11_si11,A_yt1_1,    A_p6            ;[26,2]

								||      ADD     .S2     B_xl0_0,    B_xl21_0,   B_xt1_0         ;[13,3]

								||      SUB     .L2     B_x_1,      B_xl1_1i,   B_xl1_0         ;[13,3]

								||      ADD     .D2     B_xl1_1i,   B_x_1,      B_xh1_0         ;[13,3]

								||      LDDW    .D1T1   *A_w2[A_j], A_co31_si31:A_co21_si21     ;[13,3]

								;LOOP_Y11:

								        MPYLIR  .M2     B_co30_si30,B_xt2_0,    B_p13           ;[27,2]

								||      MPYHIR  .M1     A_co21_si21,A_yt0_1,    A_pe            ;[27,2]

								||      SUB     .S2     B_pa,       B_pb,       B_xl1_1o        ;[27,2]

								||      SUB     .L2     B_xl1_0,    B_xl20_0,   B_yt1_0         ;[14,3]

								||      SUB     .S1     A_xh2_2i,   A_xl2_2i,   A_xl20_1        ;[14,3]

								||      SUB     .D1     A_xh2_3i,   A_xl2_3i,   A_xl21_1        ;[14,3]

								||      ADD     .L1     A_xl2_3i,   A_xh2_3i,   A_xh21_1        ;[14,3]

								||      LDDW    .D2T2   *B_x[B_h2], B_xh2_1i:B_xh2_0i           ;[ 1,4]

								;LOOP_Y12:

								        MPYLIR  .M2     B_co30_si30,B_yt2_0,    B_p10           ;[28,2]

								||      MPYLIR  .M1     A_co21_si21,A_yt0_1,    A_pc            ;[28,2]

								||      ADD     .L1     A_p4,       A_p5,       A_xh2_2o        ;[28,2]

								||      SUB     .S2     B_p2,       B_p3,       B_xh2_1o        ;[28,2]

								||      ADD     .L2     B_j,        3,          B_j             ;[15,3]

								||      ADD     .S1     A_xl2_2i,   A_xh2_2i,   A_xh20_1        ;[15,3]

								||      LDDW    .D2T1   *B_w1[B_j], A_co11_si11:A_co30_si30     ;[15,3]

								||      ADD     .D1X    B_x,        8,          A_x             ;[ 2,4]

								; Epilog of the stage loop, merged with the while-loop control.
								;
								; The stores and the multiplies/adds tagged [29..36,4] finish the last
								; iteration of the stage; they are unconditional here (no B_pro predicate).
								;
								; Loop control interleaved with them:
								;   B_radix2 = norm(n) & 1
								;   B_radix  = 4, replaced by 2 when B_radix2 is nonzero
								;   A_whl    = (A_h2 > B_radix), unsigned compare
								; B_radix shares B14 with B_p2, which the kernel overwrites, so it is
								; recomputed on every pass through this epilog.
								;
								; When A_whl is set, the branch to LOOP_WHILE_N is taken and the packets
								; after it recompute the per-stage values (h2 >>= 2, i = n >> 3,
								; l2 = 3*h2, l1 = 2*h2, fft_jmp = 8*l2, A-side copies) and issue the
								; first load of the next stage, mirroring the setup code ahead of
								; LOOP_WHILE_N.
								;
								; When A_whl is clear, execution falls through to the final stage with
								; B_l1c = norm(n) and A_pro = 1 prepared by the last packet.

								; ============================ PIPE LOOP EPILOG ==============================

								        STDW    .D1T1   A_xl2_3o:A_xl2_2o,      *A_x__[A_l2]    ;[29,4]

								||      STDW    .D2T2   B_xh2_1o:B_xh2_0o,      *B_x__[B_h2]    ;[29,4]

								||      MPYHIR  .M2     B_co30_si30,B_xt2_0,    B_p11           ;[29,4]

								||      MPYHIR  .M1     A_co21_si21,A_xt0_1,    A_pd            ;[29,4]

								||      NORM    .L2     B_n,        B_radix2                    ;[ 2,0]


								        STDW    .D1T1   A_x_3o:A_x_2o,          *A_x__[0]       ;[30,4]

								||      MPYLIR  .M1     A_co21_si21,A_xt0_1,    A_pf            ;[30,4]

								||      ADD     .D2     B_p8,       B_p9,       B_xl1_0o        ;[30,4]

								||      AND     .S2     B_radix2,   1,          B_radix2        ;norm(npoints)&1

								||      MVK     .L2     4,          B_radix                     ; radix = 4?


								        SUB     .L2     B_p12,      B_p13,      B_xl2_1o        ;[31,4]

								||      SUB     .L1     A_p6,       A_p7,       A_xh2_3o        ;[31,4]

								||[B_radix2]MVK .D2     2,          B_radix                     ;radix = 2


								        CMPGTU  .L1X    A_h2,       B_radix,    A_whl           ;


								        STDW    .D1T1   A_xh2_3o:A_xh2_2o,      *A_x__[A_h2]    ;[33,4]

								||      ADD     .L2     B_p10,      B_p11,      B_xl2_0o        ;[33,4]

								||[A_whl]B      .S1     LOOP_WHILE_N                            ;} end while


								        STDW    .D2T2   B_xl2_1o:B_xl2_0o,      *B_x__[B_l2]    ;[34,4]

								||[A_whl]SHRU   .S2     B_h2,       2,          B_h2            ;[ 7,0]


								        STDW    .D2T2   B_xl1_1o:B_xl1_0o,      *B_x__[B_l1]    ;[35,4]

								||      SUB     .L1     A_pe,       A_pf,       A_xl1_3o        ;[35,4]

								||      ADD     .D1     A_pc,       A_pd,       A_xl1_2o        ;[35,4]

								||[A_whl]SHRU   .S1X    B_n,        3,          A_i             ;[ 9,0] n>>3


								        STDW    .D1T1   A_xl1_3o:A_xl1_2o,      *A_x__[A_l1]    ;[36,4]

								||[A_whl]ADDAH  .D2     B_h2,       B_h2,       B_l2            ;[ 8,0]

								||[A_whl]MV     .L1X    B_h2,       A_h2                       ;[11,0]


								  [A_whl]MV      .L1X    B_l2,       A_l2                       ;[10,0]

								||[A_whl]SHL     .S2     B_h2,       1,         B_l1            ;[10,0]

								||[A_whl]MPYSU   .M2     8,          B_l2,      B_fft_jmp       ;1.5*stride


								  [A_whl]MV      .S1X    B_l1,       A_l1                        ;[12,0]

								||[A_whl]LDDW    .D1T2   *A_ptr_x[A_h2], B_xh2_1i:B_xh2_0i       ;[ 1,1]

								||[!A_whl]NORM   .L2     B_n,        B_l1c                       ;l1=_norm(n)+2

								||[!A_whl]MVK    .L1     1,          A_pro                       ;


								; Symbolic register names for the final stage (LOOP_Z).
								;
								; Several names used by the earlier loops are re-assigned here and refer
								; to different registers from this point on:
								;   B_l1 -> B20 (was B16),  A_i  -> A18 (was A14),
								;   B_j  -> B9  (was B1),   B_h2 -> B7  (was B18).
								; B20 is the register written through the name B_l1c by the NORM at the
								; end of the previous epilog, so B_l1 starts out holding norm(n).
								;
								; Register sharing to be aware of:
								;   B_h1 and B_h2 are both B7;  B_h3 and B_x7 are both B5.
								;   A_x5:A_x4 is A5:A4 and B_x7:B_x6 is B5:B4, so the data loads in
								;   LOOP_Z overwrite A4 (A_ptr_w) and B4 (B_n); both are read only in
								;   the LOOP_Z prolog, ahead of those loads.
								;   B_p_y3 is B3, the register that held the return address on entry.

								; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================

								        .asg            B14,        B_radix

								        .asg            A6,         A_ptr_x

								        .asg            B6,         B_ptr_y

								        .asg            B4,         B_n

								        .asg            A0,         A_r2

								        .asg            A20,        A_p_x0

								        .asg            B8,         B_p_x0

								        .asg            B22,        B_p_y2

								        .asg            B23,        B_p_y1

								        .asg            B3,         B_p_y3

								        .asg            B20,        B_l1

								        .asg            B19,        B_j0

								        .asg            A18,        A_i

								        .asg            B9,         B_j

								        .asg            A1,         A_pro

								; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================

								        .asg            B9,         B_j

								        .asg            A18,        A_i

								        .asg            B19,        B_j0

								        .asg            B20,        B_l1

								        .asg            B23,        B_p_y1

								        .asg            B22,        B_p_y2

								        .asg            B3,         B_p_y3

								        .asg            A20,        A_p_x0

								        .asg            B8,         B_p_x0

								        .asg            A0,         A_r2

								        .asg            B25,        B_h0

								        .asg            B7,         B_h1

								        .asg            B7,         B_h2

								        .asg            B5,         B_h3

								        .asg            B16,        B_h4

								        .asg            A27,        A_x1

								        .asg            A26,        A_x0

								        .asg            B29,        B_x3

								        .asg            B28,        B_x2

								        .asg            A5,         A_x5

								        .asg            A4,         A_x4

								        .asg            B5,         B_x7

								        .asg            B4,         B_x6

								        .asg            A21,        A_xh0_0

								        .asg            A3,         A_xh1_0

								        .asg            B24,        B_xh0_1

								        .asg            B26,        B_xh1_1

								        .asg            B24,        B_y0

								        .asg            B25,        B_y1

								        .asg            B31,        B_y5

								        .asg            B30,        B_y4

								        .asg            A16,        A_xl0_0

								        .asg            A19,        A_xl1_0

								        .asg            B18,        B_xl0_1

								        .asg            B17,        B_xl1_1

								        .asg            A16,        A_y2

								        .asg            A17,        A_y3

								        .asg            A8,         A_y6

								        .asg            A9,         A_y7

								        .asg            A22,        A_temp

								; Setup for the final stage (LOOP_Z).
								;
								;   B_j    = 0
								;   A_r2   = radix - 2            (zero selects the radix-2 variant)
								;   B_l1   = norm(n) + 2, minus 1 again when A_r2 is zero
								;   B_j0   = 4, or 8 when A_r2 is zero
								;   A_i    = n >> 2               (BDEC loop counter)
								;   A_pro  = 1 << 15              (store-suppress predicate, see LOOP_Z)
								;   B_p_x0 = A_ptr_x + 8          (second input read pointer)
								;
								; Output pointers, as byte offsets from B_ptr_y (ADDAW scales the index
								; by 4, ADDAH by 2):
								;   radix 4:  B_p_y1 = +2n, B_p_y2 = +4n, B_p_y3 = +6n
								;   radix 2:  B_p_y1 = +n,  B_p_y2 = +4n, B_p_y3 = +5n
								; The radix-2 values are written by the predicated ADDs in the last
								; packet, overriding the unconditional ADDAH results.

								; ============================ PIPE LOOP PROLOG ==============================

								        ZERO    .L2     B_j                                     ;[ 3,0]

								||      SUB     .D1X    B_radix,    2,          A_r2            ;[ 3,0]

								||      ADD     .S2     B_l1,       2,          B_l1            ;[ 3,0]

								||      ADDAW   .D2     B_ptr_y,    B_n,        B_p_y2          ;[ 3,0]


								        MVK     .S2     4,          B_j0                        ;[ 4,0] j0 = 4

								||      ADDAH   .D2     B_p_y2,     B_n,        B_p_y3          ;[ 4,0]

								||      SHL     .S1     A_pro,      15,         A_pro           ;


								        SHRU    .S1X    B_n,        2,          A_i             ;[ 5,0]

								||[!A_r2]MVK    .S2     8,          B_j0                        ;[ 5,0] j0 = 8

								||[!A_r2]SUB    .L2     B_l1,       1,          B_l1            ;[ 5,0]

								||      ADDAH   .D2     B_ptr_y,    B_n,        B_p_y1          ;[ 5,0]


								  [!A_r2]ADD    .S2     B_p_y2,     B_n,        B_p_y3          ;[ 6,0]

								||[!A_r2]ADD    .L2     B_ptr_y,    B_n,        B_p_y1          ;[ 6,0]

								||      ADD     .D2X    A_ptr_x,    8,          B_p_x0          ;x = ptr_x

								; Kernel of the final stage: 6 execute packets, looped by the BDEC in the
								; first packet while A_i has not run out.
								;
								; Each pass loads four doublewords through the post-incremented pointers
								; A_ptr_x and B_p_x0 (x1:x0, x3:x2, x5:x4, x7:x6), forms sums and
								; differences, and stores four doublewords at index B_h4 relative to
								; B_ptr_y, B_p_y1, B_p_y2 and B_p_y3.
								;
								; Output index: B_j advances by B_j0 per pass and is transformed by
								; DEAL -> BITR -> ROTL by 16 -> SHFL, then shifted right by B_l1 to give
								; B_h4 (see the h2= comments on those instructions).
								;
								; Radix-2 variant: the [!A_r2] instructions take effect only when A_r2 is
								; zero.  The ROTL-by-0 forms act as register copies on the .M units that
								; write the same destinations as the ADD/SUB issued alongside them.
								; NOTE(review): this presumably relies on the .M-unit result arriving
								; after the ADD/SUB result - confirm against the instruction latencies.
								;
								; Store suppression: stores are predicated on [!A_pro].  A_pro enters as
								; 1 << 15 and the MPYSU by 2 (16-bit operands) turns it into 0x10000 and
								; then 0, so stores are skipped on the first passes, before the first
								; results are valid.
								;
								; The MV of B_SP to A_SP in the last packet re-creates the A-side stack
								; pointer (A31 is also A_co31_si31 in LOOP_Y) for the restore code that
								; follows the loop.

								; ============================ PIPE LOOP KERNEL ==============================

								LOOP_Z:

								  [!A_r2]ROTL   .M1     A_x4,       0,          A_xl0_0         ;[13,1]

								||      SUB     .L1X    A_xl1_0,    B_xl0_1,    A_y3            ;[13,1]

								||      ADD     .S2X    A_xh1_0,    B_xh1_1,    B_y1            ;[13,1]

								||      BDEC    .S1     LOOP_Z,     A_i                         ;[13,1] }end for

								||      ADD     .L2     B_j,        B_j0,       B_j             ;j  += j0;

								||      LDDW    .D2T2   *B_p_x0++[2],           B_x3:B_x2       ;[ 1,3]

								||      LDDW    .D1T1   *A_ptr_x++[2],          A_x1:A_x0       ;[ 1,3]

								||      DEAL    .M2     B_j,        B_h0                        ;h2=_deal(j);


								 [!A_pro]STDW   .D2T2   B_y1:B_y0,  *B_ptr_y[B_h4]              ;[14,1]

								||      MV      .S1     A_y3,       A_temp                      ;[14,1]

								||      ADD     .L1X    A_xl1_0,    B_xl0_1,    A_y7            ;[14,1]

								||      SUB     .L2     B_x2,       B_x6,       B_xl0_1         ;[ 8,2]

								||      ADD     .S2     B_x6,       B_x2,       B_xh0_1         ;[ 8,2]

								||      ADD     .D1     A_x4,       A_x0,       A_xh0_0         ;[ 8,2]

								||[!A_r2]ROTL   .M1     A_x0,       0,          A_xh0_0         ;[ 8,2]

								||[!A_r2]ROTL   .M2     B_x2,       0,          B_xh0_1         ;[ 8,2]


								        SUB     .L1X    A_xl0_0,    B_xl1_1,    A_y6            ;[15,1]

								||      SUB     .L2X    A_xh1_0,    B_xh1_1,    B_y5            ;[15,1]

								||      ADD     .S1     A_x5,       A_x1,       A_xh1_0         ;[ 9,2]

								||[!A_r2]ROTL   .M1     A_x1,       0,          A_xh1_0         ;[ 9,2]

								||[!A_r2]MV     .S2     B_x7,       B_xl0_1                     ;[ 9,2]

								||      SHFL    .M2     B_h2,       B_h3                        ;h2= _shfl(h2);

								||      LDDW    .D1T1   *A_ptr_x++[2],          A_x5:A_x4       ;[ 3,3]

								||      LDDW    .D2T2   *B_p_x0++[2],           B_x7:B_x6       ;[ 3,3]


								        ADD     .S1X    A_xl0_0,    B_xl1_1,    A_y2            ;[16,1]

								||[!A_r2]MV     .D1     A_y7,       A_y3                        ;[16,1]

								||[!A_pro]STDW  .D2T2   B_y5:B_y4,  *B_p_y2[B_h4]               ;[16,1]

								||      SUB     .L1     A_x1,       A_x5,       A_xl1_0         ;[10,2]

								||[!A_r2]ROTL   .M1     A_x5,       0,          A_xl1_0         ;[10,2]

								||      ADD     .S2     B_x7,       B_x3,       B_xh1_1         ;[10,2]

								||      SUB     .L2     B_x3,       B_x7,       B_xl1_1         ;[10,2]

								||      BITR    .M2     B_h0,       B_h1                        ;h2=_bitr(h2);


								  [!A_r2]MV     .L1     A_temp,     A_y7                        ;[17,1]

								||[!A_pro]STDW  .D2T1   A_y3:A_y2,  *B_p_y1[B_h4]               ;[17,1]

								||      SUB     .D1     A_x0,       A_x4,       A_xl0_0         ;[11,2]

								||      ADD     .L2X    A_xh0_0,    B_xh0_1,    B_y0            ;[11,2]

								||      SUB     .S2X    A_xh0_0,    B_xh0_1,    B_y4            ;[11,2]

								||[!A_r2]ROTL   .M2     B_x3,       0,          B_xh1_1         ;[11,2]


								 [!A_pro]STDW   .D2T1   A_y7:A_y6,  *B_p_y3[B_h4]               ;[18,1]

								||      SHRU    .S2     B_h3,       B_l1,       B_h4            ;h2 >>= l1;

								||[!A_r2]MV     .L2     B_x6,       B_xl1_1                     ;[12,2]

								||      ROTL    .M2     B_h1,       16,         B_h2            ;h2=_rotl(h2,16)

								||      MPYSU   .M1     2,          A_pro,      A_pro           ;10000

								||      MV      .S1X    B_SP,       A_SP        ; Twin Stack Pointer

								; Exit sequence: reload everything that was written to the stack frame on
								; entry, release the frame, return, and write the saved CSR value back.
								;
								; Frame slots read here mirror the stores at the top of the routine:
								;   [1]=B_ret [2]=CSR (into A_csr) [3]=B10 [4]=A10 [5]=B11 [6]=A11
								;   [7]=B12 [8]=A12 [9]=B13 [10]=A13 [11]=B14 [12]=A14
								; Loads alternate between A_SP and B_SP so two can issue per cycle.
								;
								; The last load uses pre-increment addressing: B_SP moves up by 14 words
								; (releasing the frame) and A15 is read from the resulting address, which
								; is where the entry code stored it.
								;
								; RETNOP branches to B_ret; the MVC that follows it restores CSR from
								; A_csr before the branch takes effect (see the "Branch Occurs" marker).

								; ============================ PIPE LOOP EPILOG ==============================

								        LDW     .D1T2   *+A_SP[ 1], B_ret       ; Get return address

								||      LDW     .D2T1   *+B_SP[ 2], A_csr       ; Get CSR's value


								        LDW     .D1T2   *+A_SP[ 3], B10         ; Restore B10

								||      LDW     .D2T1   *+B_SP[ 4], A10         ; Restore A10


								        LDW     .D1T2   *+A_SP[ 5], B11         ; Restore B11

								||      LDW     .D2T1   *+B_SP[ 6], A11         ; Restore A11


								        LDW     .D1T2   *+A_SP[ 7], B12         ; Restore B12

								||      LDW     .D2T1   *+B_SP[ 8], A12         ; Restore A12


								        LDW     .D1T2   *+A_SP[ 9], B13         ; Restore B13

								||      LDW     .D2T1   *+B_SP[10], A13         ; Restore A13


								        LDW     .D1T2   *+A_SP[11], B14         ; Restore B14

								||      LDW     .D2T1   *+B_SP[12], A14         ; Restore A14


								        LDW     .D2T1   *++B_SP[14],A15         ; Restore A15

								||      RETNOP  .S2     B_ret, 4                ; Return to caller


								        MVC     .S2X    A_csr,      CSR         ; Restore CSR

								*====== Interruptibility state restored

								;====== Branch Occurs =====


								*============================================================================*

								*=  End of file:  fft16x32_h.asm                                            =*

								*============================================================================*

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								*============================================================================*