;* ======================================================================== *; ;* TEXAS INSTRUMENTS, INC. *; ;* *; ;* DSPLIB DSP Signal Processing Library *; ;* *; ;* Release: Revision 1.04b *; ;* CVS Revision: 1.7 Sun Sep 29 03:32:21 2002 (UTC) *; ;* Snapshot date: 23-Oct-2003 *; ;* *; ;* This library contains proprietary intellectual property of Texas *; ;* Instruments, Inc. The library and its source code are protected by *; ;* various copyrights, and portions may also be protected by patents or *; ;* other legal protections. *; ;* *; ;* This software is licensed for use with Texas Instruments TMS320 *; ;* family DSPs. This license was provided to you prior to installing *; ;* the software. You may review this license by consulting the file *; ;* TI_license.PDF which accompanies the files in this library. *; ;* ------------------------------------------------------------------------ *; ;* Copyright (C) 2003 Texas Instruments, Incorporated. *; ;* All Rights Reserved. *; ;* ======================================================================== *; ;* ======================================================================== *; ;* Assembler compatibility shim for assembling 4.30 and later code on *; ;* tools prior to 4.30. *; ;* ======================================================================== *; .if $isdefed(".ASSEMBLER_VERSION") .asg .ASSEMBLER_VERSION, $asmver .else .asg 0, $asmver .endif .if ($asmver < 430) .asg B, CALL ; Function Call .asg B, RET ; Return from a Function .asg B, CALLRET ; Function call with Call / Ret chaining. .if .TMS320C6400 .asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call .asg BNOP, RETNOP ; C64x BNOP as a Fn. Return .asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP. .endif .asg , .asmfunc ; .func equivalent for hand-assembly code .asg , .endasmfunc ; .endfunc equivalent for hand-assembly code .endif ;* ======================================================================== *; ;* End of assembler compatibility shim. *; ;* ======================================================================== *; *========================================================================== * * TEXAS INSTRUMENTS, INC. * * * * NAME * * DSP_fft32x32: Double Precision FFT * * * * USAGE * * This routine is C-callable and can be called as: * * * * void DSP_fft32x32(const int * ptr_w, int npoints, * * int * ptr_x, int * ptr_y ) ; * * * * ptr_w = input twiddle factors * * npoints = number of points * * ptr_x = transformed data reversed * * ptr_y = linear transformed data * * * * (See the C compiler reference guide.) * * * * DESCRIPTION * * The following code performs a mixed radix FFT for "npoints" which * * is either a multiple of 4 or 2. It uses logN4 - 1 stages of radix4 * * transform and performs either a radix2 or radix4 transform on the * * last stage depending on "npoints". If "npoints" is a multiple of 4, * * then this last stage is also a radix4 transform, otherwise it is a * * radix2 transform. 
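*      For reference, a typical calling sequence looks roughly like the
*      sketch below.  The buffer sizes follow the twiddle-table and
*      data-layout notes later in this header, and the DATA_ALIGN
*      pragmas are the usual TI-compiler way of meeting the double-word
*      alignment requirement.  This is an illustrative sketch only (the
*      symbol names and N = 512 are assumptions), not part of the
*      library.
*
*          #define N 512                /* number of complex points     */
*
*          #pragma DATA_ALIGN(x, 8);    /* double-word aligned buffers  */
*          #pragma DATA_ALIGN(y, 8);
*          #pragma DATA_ALIGN(w, 8);
*
*          int x[2 * N];                /* input, re/im interleaved;    */
*                                       /* used as scratch by the       */
*                                       /* intermediate stages          */
*          int y[2 * N];                /* output, natural order        */
*          int w[2 * N];                /* special twiddle vector       */
*
*          void DSP_fft32x32(const int *ptr_w, int npoints,
*                            int *ptr_x, int *ptr_y);
*
*          void fft_example(void)
*          {
*              /* 1. Build w[] with the generation loop shown below     */
*              /*    (available separately as "twiddle_split.c").       */
*              /* 2. Fill x[] with 2*N samples, real at even indices,   */
*              /*    imaginary at odd indices.                          */
*              DSP_fft32x32(w, N, x, y);
*              /* 3. y[] now holds the N-point transform.               */
*          }
*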
*  A C-compilable program, "twiddle_split.c", is available to
*  automatically generate the twiddle factors.
*
*      Generate special vector of twiddle factors
*
*      for (j=1, k=0; j < npoints>>2; j = j << 2)
*      {
*          for (i=0; i < npoints>>2; i += j)
*          {
*              theta1 = 2*PI*i/npoints;
*              x_t = M*cos(theta1);
*              y_t = M*sin(theta1);
*              ptr_w[k+1] = (int) x_t;
*              if (x_t >= M) ptr_w[k+1] = 0x7fffffff;
*              ptr_w[k+0] = (int) y_t;
*              if (y_t >= M) ptr_w[k+0] = 0x7fffffff;
*
*              theta2 = 4*PI*i/npoints;
*              x_t = M*cos(theta2);
*              y_t = M*sin(theta2);
*              ptr_w[k+3] = (int) x_t;
*              if (x_t >= M) ptr_w[k+3] = 0x7fffffff;
*              ptr_w[k+2] = (int) y_t;
*              if (y_t >= M) ptr_w[k+2] = 0x7fffffff;
*
*              theta3 = 6*PI*i/npoints;
*              x_t = M*cos(theta3);
*              y_t = M*sin(theta3);
*              ptr_w[k+5] = (int) x_t;
*              if (x_t >= M) ptr_w[k+5] = 0x7fffffff;
*              ptr_w[k+4] = (int) y_t;
*              if (y_t >= M) ptr_w[k+4] = 0x7fffffff;
*              k += 6;
*          }
*      }
*
*  ASSUMPTIONS
*      This code works when "npoints" is a multiple of either 2 or 4.
*      The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a
*      double-word boundary for the "optimized" implementations.
*      The input and output data are complex, with the real/imaginary
*      components stored in adjacent locations in the array.  The real
*      components are stored at even array indices, and the imaginary
*      components are stored at odd array indices.  The input and twiddle
*      factors are in 32-bit precision.  The 32 x 32 multiplies are
*      done with a 1.5-bit loss in accuracy.  This comes about because
*      the contribution of the low sixteen bits to the 32-bit result
*      is not computed.  In addition the contribution of the low * high
*      term is shifted by 16 as opposed to 15, for a loss of 0.5 bits
*      after rounding.  To illustrate, the real part of the complex
*      multiply
*
*          (X + jY) (C + jS)
*
*      is computed as:
*
*          _mpyhir(si10, yt1_0) + _mpyhir(co10, xt1_0) +
*              (((MPYLUHS(si10, yt1_0) + MPYLUHS(co10, xt1_0)
*                 + 0x8000) >> 16) << 1)
*
*      The intrinsic C version of this code performs this function as:
*
*          _mpyhir(si10, yt1_0) + _mpyhir(co10, xt1_0) +
*              (_dotprsu2(yt1_0xt1_0, si10co10) << 1);
*
*      where the functions _mpyhir, MPYLUHS are as follows:
*
*          #define _mpyhir(x,y)                                           \
*              (((int)((short)(x >> 16) * (unsigned short)(y & 0x0000FFFF) \
*                      + 0x4000) >> 15) +                                 \
*               ((int)((short)(x >> 16) * (short)((y) >> 16)) << 1))
*
*          #define MPYLUHS(x,y)                                           \
*              ((int)((unsigned short)(x & 0x0000FFFF) * (short)(y >> 16)))
*
*  TECHNIQUES
*      The following C code represents an implementation of the Cooley-
*      Tukey radix-4 DIF FFT.  It accepts the inputs in normal order and
*      produces the outputs in digit-reversed order.  The natural C code
*      shown in this file, on the other hand, accepts the inputs in
*      normal order and produces the outputs in normal order.
*
*      Several transformations have been applied to the original Cooley-
*      Tukey code to produce the natural C code description shown here.
*      In order to understand these, it is first useful to understand
*      some of the issues involved in the conventional Cooley-Tukey FFT
*      code.
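*
*      The macros above can be compiled on a host to sanity-check the
*      fixed-point arithmetic.  The sketch below (not part of the
*      library; the helper name cmult_re_q31 and the test values are
*      illustrative assumptions) models the real part of the Q31
*      complex multiply exactly as written above and compares it with
*      a 64-bit reference product.  The conventional radix-4 code
*      referred to above then follows.
*
*          #include <stdio.h>
*
*          #define MPYHIR(x,y)                                            \
*              (((int)((short)((x) >> 16) * (unsigned short)((y) & 0xFFFF) \
*                      + 0x4000) >> 15) +                                 \
*               ((int)((short)((x) >> 16) * (short)((y) >> 16)) << 1))
*
*          #define MPYLUHS(x,y)                                           \
*              ((int)((unsigned short)((x) & 0xFFFF) * (short)((y) >> 16)))
*
*          /* Real part of (X + jY)(C + jS) = XC + YS, Q31 in, Q31 out.  */
*          static int cmult_re_q31(int xt, int yt, int co, int si)
*          {
*              return MPYHIR(si, yt) + MPYHIR(co, xt)
*                   + (((MPYLUHS(si, yt) + MPYLUHS(co, xt)
*                        + 0x8000) >> 16) << 1);
*          }
*
*          int main(void)
*          {
*              int xt = 0x40000000, yt = 0x20000000;   /* 0.5, 0.25     */
*              int co = 0x5A82799A, si = 0x5A82799A;   /* ~0.7071 (Q31) */
*              long long ref = ((long long)xt*co + (long long)yt*si) >> 31;
*              printf("model %d  reference %lld\n",
*                     cmult_re_q31(xt, yt, co, si), ref);
*              return 0;
*          }
*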
*          void radix4(int n, short x[], short wn[])
*          {
*              int n1, n2, ie, ia1, ia2, ia3;
*              int i0, i1, i2, i3, i, j, k;
*              short co1, co2, co3, si1, si2, si3;
*              short xt0, yt0, xt1, yt1, xt2, yt2;
*              short xh0, xh1, xh20, xh21, xl0, xl1, xl20, xl21;
*
*              n2 = n;
*              ie = 1;
*              for (k = n; k > 1; k >>= 2)
*              {
*                  n1 = n2;
*                  n2 >>= 2;
*                  ia1 = 0;
*
*                  for (j = 0; j < n2; j++)
*                  {
*                      ia2 = ia1 + ia1;
*                      ia3 = ia2 + ia1;
*
*                      co1 = wn[2 * ia1    ];
*                      si1 = wn[2 * ia1 + 1];
*                      co2 = wn[2 * ia2    ];
*                      si2 = wn[2 * ia2 + 1];
*                      co3 = wn[2 * ia3    ];
*                      si3 = wn[2 * ia3 + 1];
*                      ia1 = ia1 + ie;
*
*                      for (i0 = j; i0 < n; i0 += n1)
*                      {
*                          i1 = i0 + n2;
*                          i2 = i1 + n2;
*                          i3 = i2 + n2;
*
*                          xh0  = x[2 * i0    ] + x[2 * i2    ];
*                          xh1  = x[2 * i0 + 1] + x[2 * i2 + 1];
*                          xl0  = x[2 * i0    ] - x[2 * i2    ];
*                          xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];
*
*                          xh20 = x[2 * i1    ] + x[2 * i3    ];
*                          xh21 = x[2 * i1 + 1] + x[2 * i3 + 1];
*                          xl20 = x[2 * i1    ] - x[2 * i3    ];
*                          xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];
*
*                          x[2 * i0    ] = xh0 + xh20;
*                          x[2 * i0 + 1] = xh1 + xh21;
*
*                          xt0 = xh0 - xh20;
*                          yt0 = xh1 - xh21;
*                          xt1 = xl0 + xl21;
*                          yt2 = xl1 + xl20;
*                          xt2 = xl0 - xl21;
*                          yt1 = xl1 - xl20;
*
*                          x[2 * i1    ] = (xt1 * co1 + yt1 * si1) >> 15;
*                          x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15;
*                          x[2 * i2    ] = (xt0 * co2 + yt0 * si2) >> 15;
*                          x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15;
*                          x[2 * i3    ] = (xt2 * co3 + yt2 * si3) >> 15;
*                          x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15;
*                      }
*                  }
*
*                  ie <<= 2;
*              }
*          }
*
*      The conventional Cooley-Tukey FFT is written using three loops.
*      The outermost loop "k" cycles through the stages.  There are log N
*      to the base 4 stages in all.  The loop "j" cycles through the
*      groups of butterflies with different twiddle factors; loop "i0"
*      reuses the twiddle factors for the different butterflies within
*      a stage.  It is interesting to note the following:
*
*  ------------------------------------------------------------------------
*   Stage#     #Groups     #Butterflies with common      #Groups*Bflys
*                           twiddle factors
*  ------------------------------------------------------------------------
*    1          N/4             1                           N/4
*    2          N/16            4                           N/4
*    ..
*    logN       1               N/4                         N/4
*  ------------------------------------------------------------------------
*
*      The following statements can be made based on the above
*      observations:
*
*      a) The inner loop "i0" iterates a variable number of times.  In
*         particular, the number of iterations quadruples every time,
*         from 1 up to N/4.  Hence software-pipelining a loop that
*         iterates a variable number of times is not profitable.
*
*      b) The outer loop "j" iterates a variable number of times as
*         well.  However the number of iterations is quartered every
*         time, from N/4 down to 1.  Hence the behaviour in (a) and (b)
*         is exactly opposite.
*
*      c) If the two loops "i0" and "j" are coalesced together then they
*         will iterate for a fixed number of times, namely N/4.  This
*         allows the "i0" and "j" loops to be combined into one loop, as
*         sketched below.  Optimized implementations make use of this
*         fact.
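*
*      As an illustration of point (c), one stage with the "j" and "i0"
*      loops merged into a single loop of N/4 butterflies is sketched
*      below (16-bit data, as in the reference code above).  The counter
*      reset and conditional pointer jump at the bottom of the loop are
*      the C analogue of the predicated "j = 0; x += fft_jmp" used by
*      the optimized kernel; the function and variable names here are
*      illustrative only and do not appear in the library.
*
*          /* One radix-4 stage, butterflies visited block by block.     */
*          /* The reordering is legal because the butterflies of one     */
*          /* stage are independent of each other.                       */
*          void radix4_stage_coalesced(int n, int n2, int ie,
*                                      short x[], const short wn[])
*          {
*              short *px = x;              /* leg 0 of current butterfly */
*              int    j  = 0, bfly;
*
*              for (bfly = 0; bfly < (n >> 2); bfly++)
*              {
*                  int   ia1 = j * ie, ia2 = 2 * ia1, ia3 = 3 * ia1;
*                  short co1 = wn[2*ia1], si1 = wn[2*ia1 + 1];
*                  short co2 = wn[2*ia2], si2 = wn[2*ia2 + 1];
*                  short co3 = wn[2*ia3], si3 = wn[2*ia3 + 1];
*
*                  short xh0  = px[0        ] + px[4*n2    ];
*                  short xh1  = px[1        ] + px[4*n2 + 1];
*                  short xl0  = px[0        ] - px[4*n2    ];
*                  short xl1  = px[1        ] - px[4*n2 + 1];
*                  short xh20 = px[2*n2     ] + px[6*n2    ];
*                  short xh21 = px[2*n2 + 1 ] + px[6*n2 + 1];
*                  short xl20 = px[2*n2     ] - px[6*n2    ];
*                  short xl21 = px[2*n2 + 1 ] - px[6*n2 + 1];
*
*                  short xt0 = xh0 - xh20, yt0 = xh1 - xh21;
*                  short xt1 = xl0 + xl21, yt2 = xl1 + xl20;
*                  short xt2 = xl0 - xl21, yt1 = xl1 - xl20;
*
*                  px[0]          = xh0 + xh20;
*                  px[1]          = xh1 + xh21;
*                  px[2*n2    ]   = (xt1 * co1 + yt1 * si1) >> 15;
*                  px[2*n2 + 1]   = (yt1 * co1 - xt1 * si1) >> 15;
*                  px[4*n2    ]   = (xt0 * co2 + yt0 * si2) >> 15;
*                  px[4*n2 + 1]   = (yt0 * co2 - xt0 * si2) >> 15;
*                  px[6*n2    ]   = (xt2 * co3 + yt2 * si3) >> 15;
*                  px[6*n2 + 1]   = (yt2 * co3 - xt2 * si3) >> 15;
*
*                  px += 2;                /* next butterfly in block    */
*                  if (++j == n2)          /* block of 4*n2 points done  */
*                  {
*                      j   = 0;
*                      px += 6 * n2;       /* skip the three legs above  */
*                  }
*              }
*          }
*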
*      In addition, the Cooley-Tukey FFT accesses three twiddle factors
*      per iteration of the inner loop, as the butterflies that re-use
*      twiddle factors are lumped together.  This leads to accessing the
*      twiddle factor array at three points, each separated by "ie".
*      Note that "ie" is initially 1, and is quadrupled with every
*      iteration.  Therefore these three twiddle factors are not even
*      contiguous in the array.
*
*      In order to vectorize the FFT, it is desirable to access the
*      twiddle factor array using double-word-wide loads and fetch the
*      twiddle factors needed.  In order to do this a modified twiddle
*      factor array is created, in which the factors WN/4, WN/2, W3N/4
*      are arranged to be contiguous.  This eliminates the separation
*      between twiddle factors within a butterfly.  However, this
*      implies that as the loop is traversed from one stage to another,
*      we maintain a redundant version of the twiddle factor array.
*      Hence the size of the twiddle factor array increases as compared
*      to the normal Cooley-Tukey FFT.  The modified twiddle factor
*      array is of size "2 * N", whereas the conventional Cooley-Tukey
*      FFT uses one of size "3N/4", where N is the number of complex
*      points to be transformed.  The routine that generates the
*      modified twiddle factor array was presented earlier.  With the
*      above transformation of the FFT, both the input data and the
*      twiddle factor array can be accessed using double-word-wide
*      loads to enable packed data processing.
*
*      The final stage is optimized to remove the multiplication as
*      w0 = 1.  This stage also performs digit reversal on the data, so
*      the final output is in natural order.
*
*      The fft() code shown here performs the bulk of the computation
*      in place.  However, because digit reversal cannot be performed
*      in place, the final result is written to a separate array, y[].
*
*      There is one slight break in the flow of packed processing that
*      needs to be comprehended.  The real part of the complex number is
*      in the lower half, and the imaginary part is in the upper half.
*      The flow breaks in the case of "xl0" and "xl1" because here the
*      real part needs to be combined with the imaginary part, due to
*      the multiplication by "j".  This requires a packed quantity like
*      "xl21xl20" to be rotated as "xl20xl21" so that it can be combined
*      using add2's and sub2's.  Hence the natural version of the C code
*      shown below is transformed using packed data processing as shown
*      (x1_x0, x11_x10, x21_x20 and x31_x30 denote the packed re/im
*      pairs loaded at indices i0, i1, i2 and i3):
*
*          xl0  = x[2 * i0    ] - x[2 * i2    ];
*          xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];
*          xl20 = x[2 * i1    ] - x[2 * i3    ];
*          xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];
*
*          xt1  = xl0 + xl21;
*          yt2  = xl1 + xl20;
*          xt2  = xl0 - xl21;
*          yt1  = xl1 - xl20;
*
*          xl1_xl0   = _sub2(x1_x0,   x21_x20)
*          xl21_xl20 = _sub2(x11_x10, x31_x30)
*          xl20_xl21 = _rotl(xl21_xl20, 16)
*
*          yt2_xt1   = _add2(xl1_xl0, xl20_xl21)
*          yt1_xt2   = _sub2(xl1_xl0, xl20_xl21)
*
*      Also notice that xt1 and yt1 end up in separate words; these need
*      to be packed together to take advantage of the packed twiddle
*      factors that have been loaded.  In order for this to be achieved
*      they are re-aligned as follows:
*
*          yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1)
*          yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2)
*
*      In the following code, since all data elements are 32 bits, add2
*      and sub2 are replaced with normal 32-bit adds and subtracts.
*      The packed word "yt1_xt1" allows the loaded "sc" twiddle factor
*      to be used for the complex multiplies.  The real part and the
*      imaginary part of the multiply are performed as 16 x 32
*      multiplies using MPYLIR and MPYHIR:
*
*          (X + jY) (C + jS) = (XC + YS) + j (YC - XS).
*
*      The actual twiddle factors for the FFT are cosine and -sine.
The * * twiddle factors stored in the table are csine and sine, hence * * the sign of the "sine" term is comprehended during multipli- * * cation as shown above. * * * * MEMORY NOTE * * The optimized implementations are written for LITTLE ENDIAN. * * * * CYCLES * * [(N/4 + 1) * 10 + 10] * ceil(log4(N) - 1) + 6 * (N/4 + 2) + 27 * * * * N = 512, [1290 + 10] * 4 + 6 * 130 + 27 = 6007 cycles * * * * CODESIZE * * 972 bytes * * ------------------------------------------------------------------------- * * Copyright (c) 2003 Texas Instruments, Incorporated. * * All Rights Reserved. * * ========================================================================= * *============================================================================* .sect ".text:_fft32x32" .global _DSP_fft32x32 _DSP_fft32x32: *================== SYMBOLIC REGISTER ASSIGNMENTS: SETUP ====================* .asg B15, B_SP ; Stack pointer, B datapath .asg A31, A_SP ; Stack pointer, A datapath .asg B0, B_csr ; CSR's value .asg B1, B_no_gie ; CSR w/ GIE bit cleared .asg A0, A_csr ; Copy of CSR's value .asg B3, B_ret ; Return address .asg A0, A_whl ; ====================== SYMBOLIC REGISTER ASSIGNMENTS ======================= .asg A4, A_ptr_w .asg B4, B_n .asg A6, A_ptr_x .asg B6, B_ptr_y .asg A15, A_stride .asg A13, A_tw_offset .asg A14, A_radix .asg B0, B_radix2 .asg A17, A_j .asg A23, A_fft_jmp .asg B20, B_fft_jmp .asg A19, A_h2 .asg A18, A_l1 .asg A20, A_l2 .asg B21, B_l1 .asg B22, B_h2 .asg B23, B_l2 .asg A10, A_x .asg A21, A_w0 .asg B19, B_w1 .asg A22, A_w2 .asg A3, A_fft_jmp_1 .asg A2, A_i .asg A1, A_pro .asg B2, B_pro2 .asg B25, B_xp1 .asg B24, B_xp0 .asg A25, A_xl1p1 .asg A24, A_xl1p0 .asg B29, B_xh2p1 .asg B28, B_xh2p0 .asg A27, A_xl2p1 .asg A26, A_xl2p0 .asg A9, A_xh0 .asg A24, A_xh1 .asg B30, B_xl0 .asg B27, B_xl1 .asg A8, A_xh20 .asg A29, A_xh21 .asg B5, B_xl20 .asg B7, B_xl21 .asg A26, A_y_h1_0 .asg A27, A_y_h1_1 .asg B26, B_j .asg B29, B_co10 .asg B28, B_si10 .asg A31, A_co20 .asg A30, A_si20 .asg B27, B_co30 .asg B26, B_si30 .asg A5, A_xt0 .asg A7, A_yt0 .asg B8, B_xt1 .asg B0, B_yt2 .asg B9, B_xt2 .asg B3, B_yt1 .asg B5, B_co10si10 .asg A5, A_co20si20 .asg B3, B_co30si30 .asg B8, B_si10co10 .asg A8, A_si20co20 .asg B5, B_si30co30 .asg A7, A_yt0xt0 .asg B16, B_yt1xt1 .asg B9, B_yt2xt2 .asg A5, A_si10 .asg B18, B_p0r .asg A11, A_p1r .asg B30, B_y_h2_0 .asg B3, B_p01r .asg B7, B_p0c .asg A3, A_p1c .asg B31, B_y_h2_1 .asg B9, B_p01c .asg A27, A_p2r .asg A3, A_p3r .asg A28, A_y_l1_0 .asg A24, A_p23r .asg A25, A_p2c .asg A3, A_p3c .asg A29, A_y_l1_1 .asg A3, A_p23c .asg B7, B_p4r .asg B26, B_p5r .asg B24, B_y_l2_0 .asg B8, B_p45r .asg B25, B_p4c .asg B24, B_p5c .asg B25, B_y_l2_1 .asg B5, B_p45c .asg A16, A_x_1 .asg B17, B_x__ .asg A0, A_ifj .asg A0, A_whl ; ====================== SYMBOLIC REGISTER ASSIGNMENTS ======================= ; Stack frame. 
14 words: A10..A15, B10..B14, B3, CSR, pad ;- STW .D2T1 A15, *B_SP--[14] ; Reserve stack, Save A15 MV .S1X B_SP, A_SP ; Twin Stack Pointer STW .D1T1 A14, *+A_SP[12] ; Save A14 || STW .D2T2 B14, *+B_SP[11] ; Save B14 || MVC .S2 CSR, B_csr ; Capture CSR's state STW .D1T1 A13, *+A_SP[10] ; Save A13 || STW .D2T2 B13, *+B_SP[ 9] ; Save B13 || AND .L2 B_csr, -2, B_no_gie ; Clear GIE ;- STW .D1T1 A12, *+A_SP[ 8] ; Save A12 || STW .D2T2 B12, *+B_SP[ 7] ; Save B12 STW .D1T1 A11, *+A_SP[ 6] ; Save A11 || STW .D2T2 B11, *+B_SP[ 5] ; Save B11 || MV .L1X B_csr, A_csr ; Partitioning MV STW .D1T1 A10, *+A_SP[ 4] ; Save A10 || STW .D2T2 B10, *+B_SP[ 3] ; Save B10 || MVC .S2 B_no_gie, CSR ; Disable interrupts || NORM .L2 B_n, B_radix2 ;[ 2,0] ; ===== Interrupts masked here ===== AND .L2 B_radix2, 1,B_radix2 ;[ 3,0] _norm(npoints) & 1 || MVK .S1 4, A_radix ;[ 3,0] radix = 4? || STW .D1T1 A_csr, *+A_SP[ 2] ; Save CSR || STW .D2T2 B_ret, *+B_SP[ 1] ; Remember return address [ B_radix2]MVK.D1 2, A_radix ;[ 4,0] radix = 2 || ZERO .L1 A_tw_offset ;[ 4,0] tw_offset = 0; || MV .S1X B_n, A_stride ;[ 4,0] stride=n ; ============================ PIPE LOOP PROLOG ============================== ADDAH .D1 A_ptr_w,A_tw_offset,A_w0 ;[ 6,0] ptr_w + tw_offset || SHRU .S1 A_stride, 2,A_h2 ;[ 6,0] || MVK .L1 1, A_pro ;[11,0] ADDAH .D1 A_h2, A_h2,A_l2 ;[ 7,0] || MVK .L2 1, B_pro2 ; || SHL .S1 A_pro, 29,A_pro ; ADD .L2X A_w0, 8,B_w1 ;[ 8,0] || MPYSU .M1 6,A_stride, A_fft_jmp ;fft_jmp=stride+stride>>1 SHRU .S1X B_n, 2,A_i ;[ 9,0] n>>3 || MV .D2X A_l2, B_l2 ;[ 9,0] MV .L2X A_h2, B_h2 ;[10,0] || SHRU .S1 A_stride, 1,A_l1 ;[10,0] || ROTL .M1 A_ptr_x, 0,A_x ;x = ptr_x || ADD .D1X B_w1, 8,A_w2 ;[12,0] LOOP_WHILE_N: SUB .L1 A_i, 1,A_i ;[11,0] || SHRU .S2X A_fft_jmp, 3,B_fft_jmp ;[11,0] || SHRU .S1 A_fft_jmp, 1,A_fft_jmp_1 ;[11,0] || MPYSU .M1 0, A_j,A_j ;[11,0] j = 0 || LDDW .D1T2 *A_ptr_x[0],B_xp1:B_xp0 ;x[0] (0) MV .S2X A_l1, B_l1 ;[12,0] || ADD .L1 A_tw_offset,A_fft_jmp_1,A_tw_offset;[12,0]tw_offset+= || SUB .L2 B_fft_jmp, 3,B_fft_jmp ;[12,0] fft_jmp || SHRU .S1 A_stride, 2,A_stride ;[12,0] stride = stride>>2 || LDDW .D1T1 *A_ptr_x[A_l1], A_xl1p1:A_xl1p0 ;x[l1] (N/2) ; ============================ PIPE LOOP KERNEL ============================== LOOP_Y: ADD .S1 A_p2r, A_p3r, A_y_l1_0 ;[23,2]y[l1] = (si20*yt0+c || ADDAH .D2 B_y_h2_1, B_p01c, B_y_h2_1 ;[23,2] o20*xt0)>>15 || MPYHIR .M2 B_co10, B_xt1, B_p0r ;[13,3] || PACKH2 .S2 B_yt1, B_xt1, B_yt1xt1 ;[13,3] || ADD .L2 B_xl20, B_xl1, B_yt2 ;[13,3] yt2=xl1+xl20 || SUB .L1 A_xh1, A_xh21, A_yt0 ;[13,3] yt0=xh1-xh21 || LDDW .D1T1 *A_x[A_l2], A_xl2p1:A_xl2p0 ;[ 3,4] x[l2] (3N/4) ADD .L1 A_pro, A_pro, A_pro ;[34,1] || ADDAH .D2 B_y_l2_1, B_p45c, B_y_l2_1 ;[24,2] || MPYHIR .M2 B_si30, B_yt2, B_p5r ;[14,3] || MPYHIR .M1X A_si10, B_yt1, A_p1r ;[14,3] || PACK2 .S2 B_si30, B_co30, B_si30co30 ;[14,3] ()>>16 || SUB .L2 B_xl0, B_xl21, B_xt2 ;[14,3] xt2=xl0-xl21 || ADD .S1 A_xh21, A_xh1, A_y_h1_1 ;[14,3] y[1]=xh1+xh21 || LDDW .D1T2 *A_x[A_h2], B_xh2p1:B_xh2p0 ;[ 4,4] x[h2] (N/4) [!A_pro]STDW .D2T2 B_y_l2_1:B_y_l2_0, *B_x__[B_l2] ;[25,2] || SUB .L1 A_p2c, A_p3c, A_y_l1_1 ;[25,2]y[l1+1]=co20*yt0- || ADDAH .D1 A_y_l1_0, A_p23r, A_y_l1_0 ;[25,2] si20*xt0)>>15 || ADD .L2X B_p0r, A_p1r, B_y_h2_0 ;[25,2]y[h2] = (si10*yt1+ || MPYHIR .M2 B_co30, B_yt2, B_p4c ;[15,3] co10*xt1)>>15 || MPYHIR .M1X A_si10, B_xt1, A_p1c ;[15,3] || PACK2 .S2 B_si10, B_co10, B_si10co10 ;[15,3] ()>>16 || SUB .S1 A_xh0, A_xh20, A_xt0 ;[15,3] xt0=xh0-xh20 ADDAH .D2 B_y_h2_0, B_p01r, B_y_h2_0 ;[26,2] ||[!B_pro2]STDW .D1T1 
A_y_h1_1:A_y_h1_0, *A_x_1[0] ;[16,3] || MPYHIR .M2 B_si30, B_xt2, B_p5c ;[16,3] || MPYHIR .M1 A_co20, A_yt0, A_p2c ;[16,3] || PACK2 .S1 A_si20, A_co20, A_si20co20 ;[16,3] ()>>16 || PACK2 .L2 B_co30, B_si30, B_co30si30 ;[16,3] ()>>16 || SUB .L1X B_fft_jmp, A_j, A_ifj ;[ 6,4] ifj = (j - fft_jmp) || MV .S2X A_j, B_j ;[ 6,4] BDEC .S1 LOOP_Y, A_i ;[37,1] || MPYHIR .M2 B_co30, B_xt2, B_p4r ;[17,3] || MPYHIR .M1 A_si20, A_yt0, A_p3r ;[17,3] || PACKH2 .S2 B_yt2, B_xt2, B_yt2xt2 ;[17,3] || LDDW .D2T1 *B_w1[B_j], A_co20:A_si20 ;[ 7,4] || LDDW .D1T2 *A_w0[A_j], B_co10:B_si10 ;[ 7,4] || SUB .L2X B_xp1, A_xl1p1, B_xl1 ;[ 7,4] xl1=x[1]-x[l1p1] || ADD .L1X B_xp0, A_xl1p0, A_xh0 ;[ 7,4] xh0=x[0]+x[l1] [!A_pro]STDW .D2T2 B_y_h2_1:B_y_h2_0, *B_x__[B_h2] ;[28,2] || ADDAH .D1 A_y_l1_1, A_p23c, A_y_l1_1 ;[28,2] || DOTPRSU2.M2 B_yt2xt2, B_si30co30, B_p45r ;[18,3] || MPYHIR .M1 A_co20, A_xt0, A_p2r ;[18,3] || PACKH2 .L1 A_yt0, A_xt0, A_yt0xt0 ;[18,3] || PACK2 .S2 B_co10, B_si10, B_co10si10 ;[18,3] ()>>16 || SUB .L2X B_xp0, A_xl1p0, B_xl0 ;[ 8,4] xl0=x[0]-x[l1] || ADD .S1X B_xp1, A_xl1p1, A_xh1 ;[ 8,4] xh1=x[1]+x[l1p1] DOTPNRSU2.M2 B_yt1xt1, B_co10si10, B_p01c ;[19,3] ||[!A_ifj]ADD .L1 A_x, A_fft_jmp, A_x ;[ 9,4]if(!predj)x+=fft_jmp || ADD .S1 A_j, 3, A_j ;[ 9,4] j += 1 || MVD .M1 A_x, A_x_1 ;[ 9,4] || LDDW .D1T2 *A_w2[A_j], B_co30:B_si30 ;[ 9,4] || SUB .D2X B_xh2p0, A_xl2p0, B_xl20 ;[ 9,4] xl20=x[h2] -x[l2] || ZERO .L2 B_pro2 [!A_pro]STDW .D2T1 A_y_l1_1:A_y_l1_0, *B_x__[B_l1] ;[30,2] || DOTPNRSU2.M2 B_yt2xt2, B_co30si30, B_p45c ;[20,3] || SUB .S2X B_p0c, A_p1c, B_y_h2_1 ;[20,3] y[h2+1]=(co10*yt1- || MPYHIR .M1 A_si20, A_xt0, A_p3c ;[20,3] si10*xt1)>>15 ||[!A_ifj]ZERO .D1 A_j ;[10,4] if (!predj) j = 0 || ADD .L1 A_x, 8, A_x ;[10,4] || SUB .L2 B_xl1, B_xl20, B_yt1 ;[10,4] yt1=xl1-xl20 || ADD .S1X B_xh2p1, A_xl2p1, A_xh21 ;[10,4]xh21=x[h2p1]+x[l2p1] SUB .S2 B_p4c, B_p5c, B_y_l2_1 ;[21,3]y[l2+1]=(si30*yt2- || DOTPRSU2.M1 A_yt0xt0, A_si20co20, A_p23r ;[21,3] co30*xt2)>>15 || ADD .L2 B_p4r, B_p5r, B_y_l2_0 ;[21,3]y[l2]=(co30*yt2+ || DOTPRSU2.M2 B_yt1xt1, B_si10co10, B_p01r ;[21,3] si30*xt2)>>15 || PACK2 .L1 A_co20, A_si20, A_co20si20 ;()>>16 || SUB .D2X B_xh2p1, A_xl2p1, B_xl21 ;xl21=x[h2p1]-x[l2p1] || ADD .S1X B_xh2p0, A_xl2p0, A_xh20 ;xh20=x[h2]+x[l2] || LDDW .D1T2 *A_x[0], B_xp1:B_xp0 ;x[0] (0) MV .S2X A_x_1, B_x__ ;[22,3] || ADDAH .D2 B_y_l2_0, B_p45r, B_y_l2_0 ;[22,3] || DOTPNRSU2.M1 A_yt0xt0, A_co20si20, A_p23c ;[22,3] || MPYHIR .M2 B_co10, B_yt1, B_p0c ;[12,4] || MV .L1X B_si10, A_si10 ;[12,4] || ADD .L2 B_xl21, B_xl0, B_xt1 ;xt1=xl0+xl21 || ADD .S1 A_xh20, A_xh0, A_y_h1_0 ;y[0]=xh0+xh20 || LDDW .D1T1 *A_x[A_l1], A_xl1p1:A_xl1p0 ;x[l1] (N/2) ; ============================ PIPE LOOP EPILOG ============================== ADD .S1 A_p2r, A_p3r, A_y_l1_0 ;[23,5] y[l1] = (si20*yt0 || ADDAH .D2 B_y_h2_1, B_p01c, B_y_h2_1 ;[23,5] +co20*xt0)>>15 ADDAH .D2 B_y_l2_1, B_p45c, B_y_l2_1 ;[24,5] || CMPGTU .L1 A_stride, A_radix, A_whl ;while (stride > radix) do STDW .D2T2 B_y_l2_1:B_y_l2_0, *B_x__[B_l2];[25,5] || SUB .L1 A_p2c, A_p3c, A_y_l1_1 ;[25,5] y[l1+1]=(co20*yt0- || ADDAH .D1 A_y_l1_0, A_p23r, A_y_l1_0 ;[25,5] si20*xt0)>>15 ||[A_whl]B .S1 LOOP_WHILE_N ;} end while || ADD .L2X B_p0r, A_p1r, B_y_h2_0 ;[25,5] y[h2] = (si10*yt1+ ;co10*xt1)>>15 ADDAH .D2 B_y_h2_0, B_p01r, B_y_h2_0 ;[26,5] || ADDAH .D1 A_ptr_w,A_tw_offset,A_w0 ;[ 6,0] ptr_w + tw_offset || SHRU .S1 A_stride, 2,A_h2 ;[ 6,0] || MVK .L1 1, A_pro ;[11,0] ADDAH .D1 A_h2, A_h2, A_l2 ;[ 7,0] || MVK .L2 1, B_pro2 ; || SHL .S1 A_pro, 29, A_pro ; STDW .D2T2 
B_y_h2_1:B_y_h2_0, *B_x__[B_h2];[28,5] || ADDAH .D1 A_y_l1_1, A_p23c, A_y_l1_1 ;[28,5] || ADD .L2X A_w0, 8,B_w1 ;[ 8,0] || MPYSU .M1 6,A_stride, A_fft_jmp ;fft_jmp=stride+stride>>1 SHRU .S1X B_n, 2,A_i ;[ 9,0] n>>3 || MV .D2X A_l2, B_l2 ;[ 9,0] STDW .D2T1 A_y_l1_1:A_y_l1_0, *B_x__[B_l1];[30,5] || MV .L2X A_h2, B_h2 ;[10,0] || SHRU .S1 A_stride, 1,A_l1 ;[10,0] || ROTL .M1 A_ptr_x, 0,A_x ;x = ptr_x || ADD .D1X B_w1, 8,A_w2 ;[12,0] ; ====================== SYMBOLIC REGISTER ASSIGNMENTS ======================= .asg A14, A_radix .asg A6, A_ptr_x .asg B6, B_ptr_y .asg B4, B_n .asg A0, A_r2 .asg A20, A_p_x0 .asg B8, B_p_x0 .asg B21, B_p_y0 .asg B22, B_p_y2 .asg B23, B_p_y1 .asg B3, B_p_y3 .asg B20, B_l1 .asg B19, B_j0 .asg A18, A_i .asg B9, B_j .asg A1, A_pro .asg B25, B_h0 .asg B7, B_h1 .asg B7, B_h2 .asg B5, B_h3 .asg B16, B_h4 .asg A7, A_x1 .asg A6, A_x0 .asg B29, B_x3 .asg B28, B_x2 .asg A5, A_x5 .asg A4, A_x4 .asg B5, B_x7 .asg B4, B_x6 .asg A21, A_xh0_0 .asg A3, A_xh1_0 .asg B24, B_xh0_1 .asg B26, B_xh1_1 .asg B24, B_y0 .asg B25, B_y1 .asg B6, B_y4 .asg B7, B_y5 .asg A16, A_xl0_0 .asg A19, A_xl1_0 .asg B18, B_xl0_1 .asg B17, B_xl1_1 .asg A16, A_y2 .asg A17, A_y3 .asg A8, A_y6 .asg A9, A_y7 .asg A22, A_temp ; ============================ PIPE LOOP PROLOG ============================== NORM .L2 B_n, B_l1 ;[ 2,0] l1 = _norm(n)+2 || MV .D2 B_ptr_y, B_p_y0 ;[ 2,0] || MVK .L1 1, A_pro ; ZERO .L2 B_j ;[ 3,0] || SUB .D1 A_radix, 2, A_r2 ;[ 3,0] || ADD .S2 B_l1, 2, B_l1 ;[ 3,0] || ADDAW .D2 B_p_y0, B_n, B_p_y2 ;[ 3,0] MVK .S2 4, B_j0 ;[ 4,0] j0 = 4 ||[!A_r2]NORM .L2 B_n, B_l1 ;[ 4,0] l1 = _norm(n)+1; || ADDAH .D2 B_p_y2, B_n, B_p_y3 ;[ 4,0] || SHL .S1 A_pro, 15, A_pro ; SHRU .S1X B_n, 2, A_i ;[ 5,0] ||[!A_r2]MVK .S2 8, B_j0 ;[ 5,0] j0 = 8 ||[!A_r2]ADD .L2 B_l1, 1, B_l1 ;[ 5,0] || ADDAH .D2 B_p_y0, B_n, B_p_y1 ;[ 5,0] [!A_r2]ADD .S2 B_p_y2, B_n, B_p_y3 ;[ 6,0] ||[!A_r2]ADD .L2 B_p_y0, B_n, B_p_y1 ;[ 6,0] || ADD .D2X A_ptr_x, 8, B_p_x0 ;[ 6,0] x = ptr_x || MV .L1 A_ptr_x, A_p_x0 ;[ 6,0] ; ============================ PIPE LOOP KERNEL ============================== LOOP_Z: [!A_r2]ROTL .M1 A_x4, 0, A_xl0_0 ;[13,1] || SUB .L1X A_xl1_0, B_xl0_1, A_y3 ;[13,1] || ADD .S2X A_xh1_0, B_xh1_1, B_y1 ;[13,1] || BDEC .S1 LOOP_Z, A_i ;[13,1] }end for || ADD .L2 B_j, B_j0, B_j ;[ 1,3] j += j0; || LDDW .D2T2 *B_p_x0++[2], B_x3:B_x2 ;[ 1,3] || LDDW .D1T1 *A_p_x0++[2], A_x1:A_x0 ;[ 1,3] || DEAL .M2 B_j, B_h0 ;[ 1,3] h2 = _deal(j); [!A_pro]STDW .D2T2 B_y1:B_y0, *B_p_y0[B_h4] ;[14,1] || MV .S1 A_y3, A_temp ;[14,1] || ADD .L1X A_xl1_0, B_xl0_1,A_y7 ;[14,1] || SUB .L2 B_x2, B_x6, B_xl0_1 ;[ 8,2] || ADD .S2 B_x6, B_x2, B_xh0_1 ;[ 8,2] || ADD .D1 A_x4, A_x0, A_xh0_0 ;[ 8,2] ||[!A_r2]ROTL .M1 A_x0, 0, A_xh0_0 ;[ 8,2] ||[!A_r2]ROTL .M2 B_x2, 0, B_xh0_1 ;[ 8,2] SUB .L1X A_xl0_0, B_xl1_1,A_y6 ;[15,1] || SUB .L2X A_xh1_0, B_xh1_1,B_y5 ;[15,1] || ADD .S1 A_x5, A_x1, A_xh1_0 ;[ 9,2] ||[!A_r2]ROTL .M1 A_x1, 0, A_xh1_0 ;[ 9,2] ||[!A_r2]MV .S2 B_x7, B_xl0_1 ;[ 9,2] || SHFL .M2 B_h2, B_h3 ;[ 9,2] h2 = _shfl(h2); || LDDW .D1T1 *A_p_x0++[2], A_x5:A_x4 ;[ 3,3] || LDDW .D2T2 *B_p_x0++[2], B_x7:B_x6 ;[ 3,3] ADD .S1X A_xl0_0, B_xl1_1,A_y2 ;[16,1] ||[!A_r2]MV .D1 A_y7, A_y3 ;[16,1] ||[!A_pro]STDW .D2T2 B_y5:B_y4, *B_p_y2[B_h4] ;[16,1] || SUB .L1 A_x1, A_x5, A_xl1_0 ;[10,2] ||[!A_r2]ROTL .M1 A_x5, 0, A_xl1_0 ;[10,2] || ADD .S2 B_x7, B_x3, B_xh1_1 ;[10,2] || SUB .L2 B_x3, B_x7, B_xl1_1 ;[10,2] || BITR .M2 B_h0, B_h1 ;[ 4,3] h2 = _bitr(h2); [!A_r2]MV .L1 A_temp, A_y7 ;[17,1] ||[!A_pro]STDW .D2T1 A_y3:A_y2, *B_p_y1[B_h4] ;[17,1] || SUB .D1 A_x0, 
A_x4, A_xl0_0 ;[11,2] || ADD .L2X A_xh0_0, B_xh0_1,B_y0 ;[11,2] || SUB .S2X A_xh0_0, B_xh0_1,B_y4 ;[11,2] ||[!A_r2]ROTL .M2 B_x3, 0, B_xh1_1 ;[11,2] [!A_pro]STDW .D2T1 A_y7:A_y6, *B_p_y3[B_h4] ;[18,1] || SHRU .S2 B_h3, B_l1, B_h4 ;[12,2] h2 >>= l1; ||[!A_r2]MV .L2 B_x6, B_xl1_1 ;[12,2] || ROTL .M2 B_h1, 16, B_h2 ;[ 6,3] h2=_rotl(h2, 16) || MPYSU .M1 2, A_pro, A_pro ;10000 || MV .S1X B_SP, A_SP ; Twin Stack Pointer ; ============================ PIPE LOOP EPILOG ============================== LDW .D1T2 *+A_SP[ 1], B_ret ; Get return address || LDW .D2T1 *+B_SP[ 2], A_csr ; Get CSR's value LDW .D1T2 *+A_SP[ 3], B10 ; Restore B10 || LDW .D2T1 *+B_SP[ 4], A10 ; Restore A10 LDW .D1T2 *+A_SP[ 5], B11 ; Restore B11 || LDW .D2T1 *+B_SP[ 6], A11 ; Restore A11 LDW .D1T2 *+A_SP[ 7], B12 ; Restore B12 || LDW .D2T1 *+B_SP[ 8], A12 ; Restore A12 LDW .D1T2 *+A_SP[ 9], B13 ; Restore B13 || LDW .D2T1 *+B_SP[10], A13 ; Restore A13 LDW .D1T2 *+A_SP[11], B14 ; Restore B14 || LDW .D2T1 *+B_SP[12], A14 ; Restore A14 LDW .D2T1 *++B_SP[14],A15 ; Restore A15 || RETNOP .S2 B_ret, 4 ; Return to caller MVC .S2X A_csr, CSR ; Restore CSR *====== Interruptibility state restored ;====== Branch Occurs ===== *============================================================================* *= End of file: dsp_fft32x32.asm =* *============================================================================* * Copyright (c) 2003 Texas Instruments, Incorporated. * * All Rights Reserved. * *============================================================================*