You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
923 lines
56 KiB
923 lines
56 KiB
;* ======================================================================== *;
|
|
;* TEXAS INSTRUMENTS, INC. *;
|
|
;* *;
|
|
;* DSPLIB DSP Signal Processing Library *;
|
|
;* *;
|
|
;* Release: Revision 1.04b *;
|
|
;* CVS Revision: 1.5 Sun Sep 29 03:32:21 2002 (UTC) *;
|
|
;* Snapshot date: 23-Oct-2003 *;
|
|
;* *;
|
|
;* This library contains proprietary intellectual property of Texas *;
|
|
;* Instruments, Inc. The library and its source code are protected by *;
|
|
;* various copyrights, and portions may also be protected by patents or *;
|
|
;* other legal protections. *;
|
|
;* *;
|
|
;* This software is licensed for use with Texas Instruments TMS320 *;
|
|
;* family DSPs. This license was provided to you prior to installing *;
|
|
;* the software. You may review this license by consulting the file *;
|
|
;* TI_license.PDF which accompanies the files in this library. *;
|
|
;* ------------------------------------------------------------------------ *;
|
|
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
|
|
;* All Rights Reserved. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
;* ======================================================================== *;
|
|
;* Assembler compatibility shim for assembling 4.30 and later code on *;
|
|
;* tools prior to 4.30. *;
|
|
;* ======================================================================== *;
|
|
|
|
; $asmver holds the assembler version: the value of .ASSEMBLER_VERSION when
; the assembler defines that symbol, otherwise 0 (so such tools are treated
; as older than 4.30 by the test below).
.if $isdefed(".ASSEMBLER_VERSION")
|
|
.asg .ASSEMBLER_VERSION, $asmver
|
|
.else
|
|
.asg 0, $asmver
|
|
.endif
|
|
|
|
; Pre-4.30 tools only: substitute B for the CALL / RET / CALLRET mnemonics,
; substitute BNOP for the *NOP variants when .TMS320C6400 is set, and define
; .asmfunc / .endasmfunc as empty substitutions.
.if ($asmver < 430)
|
|
|
|
.asg B, CALL ; Function Call
|
|
.asg B, RET ; Return from a Function
|
|
.asg B, CALLRET ; Function call with Call / Ret chaining.
|
|
|
|
.if .TMS320C6400
|
|
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
|
|
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
|
|
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
|
|
.endif
|
|
|
|
.asg , .asmfunc ; .func equivalent for hand-assembly code
|
|
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
|
|
|
|
.endif
|
|
|
|
;* ======================================================================== *;
|
|
;* End of assembler compatibility shim. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
*========================================================================== *
|
|
* TEXAS INSTRUMENTS, INC. *
|
|
* *
|
|
* NAME *
|
|
* DSP_fft16x32 *
|
|
* *
|
|
* USAGE *
|
|
* This routine is C-callable and can be called as: *
|
|
* *
|
|
* void DSP_fft16x32(const short * ptr_w, int npoints, *
|
|
* int * ptr_x, int *ptr_y ) ; *
|
|
* *
|
|
* ptr_w = input twiddle factors *
|
|
* npoints = number of points *
|
|
* ptr_x = transformed data reversed *
|
|
* ptr_y = linear transformed data *
|
|
* *
|
|
* (See the C compiler reference guide.) *
|
|
* *
|
|
* DESCRIPTION *
|
|
* The following code performs a mixed radix FFT for "npoints" which *
|
|
* is either a multiple of 4 or 2. It uses logN4 - 1 stages of radix4 *
|
|
* transform and performs either a radix2 or radix4 transform on the *
|
|
* last stage depending on "npoints". If "npoints" is a multiple of 4, *
|
|
* then this last stage is also a radix4 transform, otherwise it is a *
|
|
* radix2 transform. This program is available as a C compilable file *
|
|
* to automatically generate the twiddle factors "twiddle_split.c" *
|
|
* *
|
|
* int i, j, k, n = N; *
|
|
* double theta1, theta2, theta3, x_t, y_t; *
|
|
* const double M = 32768.0, PI = 3.141592654; *
|
|
* *
|
|
* for (j=1, k=0; j < n>>2; j = j<<2) *
|
|
* { *
|
|
* for (i=0; i < n>>2; i += j<<1) *
|
|
* { *
|
|
* theta1 = 2*PI*i/n; *
|
|
* x_t = M*cos(theta1); *
|
|
* y_t = M*sin(theta1); *
|
|
* w[k+1] = (short) x_t; *
|
|
* if (x_t >= M) w[k+1] = 0x7fff; *
|
|
* w[k+0] = (short) y_t; *
|
|
* if (y_t >= M) w[k+0] = 0x7fff; *
|
|
* *
|
|
* theta1 = 2*PI*(i+j)/n; *
|
|
* x_t = M*cos(theta1); *
|
|
* y_t = M*sin(theta1); *
|
|
* w[k+7] = (short) x_t; *
|
|
* if (x_t >= M) w[k+7] = 0x7fff; *
|
|
* w[k+6] = (short) y_t; *
|
|
* if (y_t >= M) w[k+6] = 0x7fff; *
|
|
* *
|
|
* theta2 = 4*PI*i/n; *
|
|
* x_t = M*cos(theta2); *
|
|
* y_t = M*sin(theta2); *
|
|
* w[k+3] = (short) x_t; *
|
|
* if (x_t >= M) w[k+3] = 0x7fff; *
|
|
* w[k+2] = (short) y_t; *
|
|
* if (y_t >= M) w[k+2] = 0x7fff; *
|
|
* *
|
|
* theta2 = 4*PI*(i+j)/n; *
|
|
* x_t = M*cos(theta2); *
|
|
* y_t = M*sin(theta2); *
|
|
* w[k+9] = (short) x_t; *
|
|
* if (x_t >= M) w[k+9] = 0x7fff; *
|
|
* w[k+8] = (short) y_t; *
|
|
* if (y_t >= M) w[k+8] = 0x7fff; *
|
|
* *
|
|
* theta3 = 6*PI*i/n; *
|
|
* x_t = M*cos(theta3); *
|
|
* y_t = M*sin(theta3); *
|
|
* w[k+5] = (short) x_t; *
|
|
* if (x_t >= M) w[k+5] = 0x7fff; *
|
|
* w[k+4] = (short) y_t; *
|
|
* if (y_t >= M) w[k+4] = 0x7fff; *
|
|
* *
|
|
* theta3 = 6*PI*(i+j)/n; *
|
|
* x_t = M*cos(theta3); *
|
|
* y_t = M*sin(theta3); *
|
|
* w[k+11] = (short) x_t; *
|
|
* if (x_t >= M) w[k+11] = 0x7fff; *
|
|
* w[k+10] = (short) y_t; *
|
|
* if (y_t >= M) w[k+10] = 0x7fff; *
|
|
* *
|
|
* k += 12; *
|
|
* } *
|
|
* } *
|
|
* w[2*n-1] = w[2*n-3] = w[2*n-5] = 0x7fff; *
|
|
* w[2*n-2] = w[2*n-4] = w[2*n-6] = 0x0000; *
|
|
* *
|
|
* ASSUMPTIONS *
|
|
* This code works for both "npoints" a multiple of 2 or 4. *
|
|
* The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a *
|
|
* double-word boundary for the "optimized" implementations. *
|
|
* *
|
|
* The input and output data are complex, with the real/imaginary *
|
|
* components stored in adjacent locations in the array. The real *
|
|
* components are stored at even array indices, and the imaginary *
|
|
* components are stored at odd array indices. *
|
|
* *
|
|
* TECHNIQUES *
|
|
* The following C code represents an implementation of the Cooley *
|
|
* Tukey radix 4 DIF FFT. It accepts the inputs in normal order and *
|
|
* produces the outputs in digit reversed order. The natural C code *
|
|
* shown in this file on the other hand, accepts the inputs in nor- *
|
|
* mal order and produces the outputs in normal order. *
|
|
* *
|
|
* Several transformations have been applied to the original Cooley *
|
|
* Tukey code to produce the natural C code description shown here. *
|
|
* In order to understand these it would first be educational to *
|
|
* understand some of the issues involved in the conventional Cooley *
|
|
* Tukey FFT code. *
|
|
* *
|
|
* void radix4(int n, short x[], short wn[]) *
|
|
* { *
|
|
* int n1, n2, ie, ia1, ia2, ia3; *
|
|
* int i0, i1, i2, i3, i, j, k; *
|
|
* short co1, co2, co3, si1, si2, si3; *
|
|
* short xt0, yt0, xt1, yt1, xt2, yt2; *
|
|
* short xh0, xh1, xh20, xh21, xl0, xl1,xl20,xl21; *
|
|
* *
|
|
* n2 = n; *
|
|
* ie = 1; *
|
|
* for (k = n; k > 1; k >>= 2) *
|
|
* { *
|
|
* n1 = n2; *
|
|
* n2 >>= 2; *
|
|
* ia1 = 0; *
|
|
* *
|
|
* for (j = 0; j < n2; j++) *
|
|
* { *
|
|
* ia2 = ia1 + ia1; *
|
|
* ia3 = ia2 + ia1; *
|
|
* *
|
|
* co1 = wn[2 * ia1 ]; *
|
|
* si1 = wn[2 * ia1 + 1]; *
|
|
* co2 = wn[2 * ia2 ]; *
|
|
* si2 = wn[2 * ia2 + 1]; *
|
|
* co3 = wn[2 * ia3 ]; *
|
|
* si3 = wn[2 * ia3 + 1]; *
|
|
* ia1 = ia1 + ie; *
|
|
* *
|
|
* for (i0 = j; i0< n; i0 += n1) *
|
|
* { *
|
|
* i1 = i0 + n2; *
|
|
* i2 = i1 + n2; *
|
|
* i3 = i2 + n2; *
|
|
* *
|
|
* *
|
|
* xh0 = x[2 * i0 ] + x[2 * i2 ]; *
|
|
* xh1 = x[2 * i0 + 1] + x[2 * i2 + 1]; *
|
|
* xl0 = x[2 * i0 ] - x[2 * i2 ]; *
|
|
* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; *
|
|
* *
|
|
* xh20 = x[2 * i1 ] + x[2 * i3 ]; *
|
|
* xh21 = x[2 * i1 + 1] + x[2 * i3 + 1]; *
|
|
* xl20 = x[2 * i1 ] - x[2 * i3 ]; *
|
|
* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; *
|
|
* *
|
|
* x[2 * i0 ] = xh0 + xh20; *
|
|
* x[2 * i0 + 1] = xh1 + xh21; *
|
|
* *
|
|
* xt0 = xh0 - xh20; *
|
|
* yt0 = xh1 - xh21; *
|
|
* xt1 = xl0 + xl21; *
|
|
* yt2 = xl1 + xl20; *
|
|
* xt2 = xl0 - xl21; *
|
|
* yt1 = xl1 - xl20; *
|
|
* *
|
|
* x[2 * i1 ] = (xt1 * co1 + yt1 * si1) >> 15; *
|
|
* x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15; *
|
|
* x[2 * i2 ] = (xt0 * co2 + yt0 * si2) >> 15; *
|
|
* x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15; *
|
|
* x[2 * i3 ] = (xt2 * co3 + yt2 * si3) >> 15; *
|
|
* x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15; *
|
|
* } *
|
|
* } *
|
|
* *
|
|
* ie <<= 2; *
|
|
* } *
|
|
* } *
|
|
* *
|
|
* The conventional Cooley Tukey FFT, is written using three loops. *
|
|
* The outermost loop "k" cycles through the stages. There are log *
|
|
* N to the base 4 stages in all. The loop "j" cycles through the *
|
|
* groups of butterflies with different twiddle factors, loop "i" *
|
|
* reuses the twiddle factors for the different butterflies within *
|
|
* a stage. It is interesting to note the following: *
|
|
* *
|
|
*-------------------------------------------------------------------------- *
|
|
* Stage# #Groups # Butterflies with common #Groups*Bflys *
|
|
* twiddle factors *
|
|
*-------------------------------------------------------------------------- *
|
|
* 1 N/4 1 N/4 *
|
|
* 2 N/16 4 N/4 *
|
|
* .. *
|
|
* logN 1 N/4 N/4 *
|
|
*-------------------------------------------------------------------------- *
|
|
* *
|
|
* The following statements can be made based on above observations: *
|
|
* *
|
|
* a) Inner loop "i0" iterates a variable number of times. In *
|
|
* particular the number of iterations quadruples every time from *
|
|
* 1..N/4. Hence software pipelining a loop that iterates a variable *
|
|
* number of times is not profitable. *
|
|
* *
|
|
* b) Outer loop "j" iterates a variable number of times as well. *
|
|
* However the number of iterations is quartered every time from *
|
|
* N/4 . . Hence the behaviour in (a) and (b) are exactly opposite *
|
|
* to each other. *
|
|
* *
|
|
* c) If the two loops "i" and "j" are coalesced together then they *
|
|
* will iterate for a fixed number of times namely N/4. This allows *
|
|
* us to combine the "i" and "j" loops into 1 loop. Optimized impl- *
|
|
* ementations will make use of this fact. *
|
|
* *
|
|
* In addition the Cooley Tukey FFT accesses three twiddle factors *
|
|
* per iteration of the inner loop, as the butterflies that re-use *
|
|
* twiddle factors are lumped together. This leads to accessing the *
|
|
* twiddle factor array at three points each separated by "ie". Note *
|
|
* that "ie" is initially 1, and is quadrupled with every iteration. *
|
|
* Therefore these three twiddle factors are not even contiguous in *
|
|
* the array. *
|
|
* *
|
|
* In order to vectorize the FFT, it is desirable to access twiddle *
|
|
* factor array using double word wide loads and fetch the twiddle *
|
|
* factors needed. In order to do this a modified twiddle factor *
|
|
* array is created, in which the factors WN/4, WN/2, W3N/4 are *
|
|
* arranged to be contiguous. This eliminates the separation between *
|
|
* twiddle factors within a butterfly. However this implies that as *
|
|
* the loop is traversed from one stage to another, that we maintain *
|
|
* a redundant version of the twiddle factor array. Hence the size *
|
|
* of the twiddle factor array increases as compared to the normal *
|
|
* Cooley Tukey FFT. The modified twiddle factor array is of size *
|
|
* "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4" *
|
|
* where N is the number of complex points to be transformed. The *
|
|
* routine that generates the modified twiddle factor array was *
|
|
* presented earlier. With the above transformation of the FFT, *
|
|
* both the input data and the twiddle factor array can be accessed *
|
|
* using double-word wide loads to enable packed data processing. *
|
|
* *
|
|
* The final stage is optimised to remove the multiplication as *
|
|
* w0 = 1. This stage also performs digit reversal on the data, *
|
|
* so the final output is in natural order. *
|
|
* *
|
|
* The fft() code shown here performs the bulk of the computation *
|
|
* in place. However, because digit-reversal cannot be performed *
|
|
* in-place, the final result is written to a separate array, y[]. *
|
|
* *
|
|
* There is one slight break in the flow of packed processing that *
|
|
* needs to be comprehended. The real part of the complex number is *
|
|
* in the lower half, and the imaginary part is in the upper half. *
|
|
* The flow breaks in case of "xl0" and "xl1" because in this case *
|
|
* the real part needs to be combined with the imaginary part because *
|
|
* of the multiplication by "j". This requires a packed quantity like *
|
|
* "xl21xl20" to be rotated as "xl20xl21" so that it can be combined *
|
|
* using add2's and sub2's. Hence the natural version of C code *
|
|
* shown below is transformed using packed data processing as shown: *
|
|
* *
|
|
* xl0 = x[2 * i0 ] - x[2 * i2 ]; *
|
|
* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; *
|
|
* xl20 = x[2 * i1 ] - x[2 * i3 ]; *
|
|
* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; *
|
|
* *
|
|
* xt1 = xl0 + xl21; *
|
|
* yt2 = xl1 + xl20; *
|
|
* xt2 = xl0 - xl21; *
|
|
* yt1 = xl1 - xl20; *
|
|
* *
|
|
* xl1_xl0 = _sub2(x21_x20, x21_x20) *
|
|
* xl21_xl20 = _sub2(x32_x22, x23_x22) *
|
|
* xl20_xl21 = _rotl(xl21_xl20, 16) *
|
|
* *
|
|
* yt2_xt1 = _add2(xl1_xl0, xl20_xl21) *
|
|
* yt1_xt2 = _sub2(xl1_xl0, xl20_xl21) *
|
|
* *
|
|
* Also notice that xt1, yt1 end up on separate words, these need to *
|
|
* be packed together to take advantage of the packed twiddle fact *
|
|
* ors that have been loaded. In order for this to be achieved they *
|
|
* are re-aligned as follows: *
|
|
* *
|
|
* yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1) *
|
|
* yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2) *
|
|
* *
|
|
* The packed words "yt1_xt1" allows the loaded "sc" twiddle factor *
|
|
* to be used for the complex multiplies. The real part of the *
|
|
* complex multiply is implemented using _dotp2. The imaginary *
|
|
* part of the complex multiply is implemented using the 16x32 *
|
|
* multiply instruction "mpylir" or "mpyhir". *
|
|
* *
|
|
* (X + jY) ( C + j S) = (XC + YS) + j (YC - XS). *
|
|
* *
|
|
* The actual twiddle factors for the FFT are cosine, - sine. The *
|
|
* twiddle factors stored in the table are cosine and sine, hence *
|
|
* the sign of the "sine" term is comprehended during multipli- *
|
|
* cation as shown above. *
|
|
* *
|
|
* MEMORY NOTE *
|
|
* The optimized implementations are written for LITTLE ENDIAN. *
|
|
* *
|
|
* INTERRUPTS *
|
|
* This code is interrupt tolerant but not interruptible. It masks out *
|
|
* interrupts for the entire duration of the code. *
|
|
* *
|
|
* CYCLES *
|
|
* (13 * N/8 + 24) * ceil(log4(N) - 1) + (N + 8) * 1.5 + 27 *
|
|
* *
|
|
* N = 512, (13 * 64 + 24) * 4 + 520 * 1.5 + 27 = 4231 cycles *
|
|
* *
|
|
* CODESIZE *
|
|
* 1068 bytes *
|
|
* ------------------------------------------------------------------------- *
|
|
* Copyright (c) 2003 Texas Instruments, Incorporated. *
|
|
* All Rights Reserved. *
|
|
* ========================================================================= *
|
|
|
|
*============================================================================*
|
|
; Entry of DSP_fft16x32. The code is placed in its own section and the symbol
; is exported. Per the register aliases defined further down in this routine,
; the arguments are read from A4 (ptr_w), B4 (npoints), A6 (ptr_x) and
; B6 (ptr_y).
.sect ".text:_fft16x32"
|
|
.global _DSP_fft16x32
|
|
_DSP_fft16x32:
|
|
*================== SYMBOLIC REGISTER ASSIGNMENTS: SETUP ====================*
|
|
; A_SP (A31) is a copy of B_SP so that both D units can address the frame in
; the same cycle. A31 is reused as A_co31_si31 inside the radix-4 loop, which
; is why the copy is made again before the registers are restored.
.asg B15, B_SP ; Stack pointer, B datapath
|
|
.asg A31, A_SP ; Stack pointer, A datapath
|
|
.asg B0, B_csr ; CSR's value
|
|
.asg B1, B_no_gie ; CSR w/ GIE bit cleared
|
|
.asg A0, A_csr ; Copy of CSR's value
|
|
.asg B3, B_ret ; Return address
|
|
; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
|
|
; A_whl shares A0 with A_csr; A_csr is stored to the frame before A_whl is
; first written by the while-loop test.
.asg A0, A_whl
|
|
*============================================================================*
|
|
; Stack frame. 14 words: A10..A15, B10..B14, B3, CSR, pad
|
|
;-
|
|
; A15 is stored at the incoming stack pointer, which is then lowered by 14
; words; all later slots are addressed as positive word offsets from the new
; stack pointer.
STW .D2T1 A15, *B_SP--[14] ; Reserve stack, Save A15
|
|
|
|
MV .S1X B_SP, A_SP ; Twin Stack Pointer
|
|
|
|
STW .D1T1 A14, *+A_SP[12] ; Save A14
|
|
|| STW .D2T2 B14, *+B_SP[11] ; Save B14
|
|
|| MVC .S2 CSR, B_csr ; Capture CSR's state
|
|
|
|
; B_no_gie = captured CSR with bit 0 cleared (AND with -2); it is written
; back to CSR later in the setup to disable interrupts.
STW .D1T1 A13, *+A_SP[10] ; Save A13
|
|
|| STW .D2T2 B13, *+B_SP[ 9] ; Save B13
|
|
|| AND .L2 B_csr, -2,B_no_gie ; Clear GIE
|
|
|
|
; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
; Register aliases used by the radix-4 stages (LOOP_WHILE_N / LOOP_Y).
; Several names are defined twice with the same register, and many distinct
; names deliberately share one physical register, for example:
;   B0  : B_radix2, B_stride, B_x, B_fft_jmp_1 (and B_csr from the setup map)
;   A25 : A_co11_si11, A_xl1_3i, A_xl2_3i
;   A24 : A_co30_si30, A_xl1_2i, A_xl2_2i, A_p14
;   B3  : B_xl0_0, B_yt0_0 (B_ret from the setup map; saved to the frame first)
;   B14 : B_radix, B_p2
;   A31 : A_co31_si31 (A_SP from the setup map)
; NOTE(review): correctness of this sharing depends on the exact instruction
; schedule in the loops below; do not reorder code without re-checking
; register lifetimes.
|
|
.asg A4, A_ptr_w
|
|
.asg B4, B_n
|
|
.asg A6, A_ptr_x
|
|
.asg B6, B_ptr_y
|
|
.asg B14, B_radix
|
|
.asg B0, B_radix2
|
|
.asg B18, B_h2
|
|
.asg B13, B_tw_offset
|
|
.asg B0, B_stride
|
|
.asg B1, B_j
|
|
.asg B19, B_fft_jmp
|
|
.asg A19, A_fft_jmp
|
|
.asg B16, B_l1
|
|
.asg B17, B_l2
|
|
.asg A16, A_l1
|
|
.asg A18, A_h2
|
|
.asg A17, A_l2
|
|
.asg B0, B_x
|
|
.asg A15, A_w0
|
|
.asg B0, B_fft_jmp_1
|
|
.asg A14, A_i
|
|
.asg B2, B_pro
|
|
; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
|
|
.asg A15, A_w0
|
|
.asg B1, B_j
|
|
.asg B0, B_x
|
|
.asg B16, B_l1
|
|
.asg B17, B_l2
|
|
.asg B18, B_h2
|
|
.asg A16, A_l1
|
|
.asg A17, A_l2
|
|
.asg A18, A_h2
|
|
.asg A19, A_fft_jmp
|
|
.asg B19, B_fft_jmp
|
|
.asg A14, A_i
|
|
.asg A5, A_j
|
|
.asg B5, B_w1
|
|
.asg A8, A_w2
|
|
.asg B29, B_co20_si20
|
|
.asg B28, B_co10_si10
|
|
.asg A25, A_co11_si11
|
|
.asg A24, A_co30_si30
|
|
.asg A31, A_co31_si31
|
|
.asg A30, A_co21_si21
|
|
.asg B31, B_x_1
|
|
.asg B30, B_x_0
|
|
.asg B25, B_xl1_1i
|
|
.asg B24, B_xl1_0i
|
|
.asg B27, B_xl2_1i
|
|
.asg B26, B_xl2_0i
|
|
.asg B21, B_xh2_1i
|
|
.asg B20, B_xh2_0i
|
|
.asg A10, A_x
|
|
.asg A27, A_x_3
|
|
.asg A26, A_x_2
|
|
.asg A25, A_xl1_3i
|
|
.asg A24, A_xl1_2i
|
|
.asg A25, A_xl2_3i
|
|
.asg A24, A_xl2_2i
|
|
.asg A23, A_xh2_3i
|
|
.asg A22, A_xh2_2i
|
|
.asg B31, B_xh1_0
|
|
.asg B7, B_xh0_0
|
|
.asg A29, A_xh1_1
|
|
.asg A0, A_xh0_1
|
|
.asg B25, B_xl1_0
|
|
.asg B3, B_xl0_0
|
|
.asg A20, A_xl1_1
|
|
.asg A1, A_xl0_1
|
|
.asg B20, B_xh21_0
|
|
.asg B12, B_xh20_0
|
|
.asg A11, A_xh21_1
|
|
.asg A8, A_xh20_1
|
|
.asg B28, B_xl21_0
|
|
.asg B26, B_xl20_0
|
|
.asg A7, A_xl21_1
|
|
.asg A5, A_xl20_1
|
|
.asg A13, A_x_
|
|
.asg B11, B_x__
|
|
.asg A2, A_ifj
|
|
.asg B25, B_x_1o
|
|
.asg B24, B_x_0o
|
|
.asg A29, A_x_3o
|
|
.asg A28, A_x_2o
|
|
.asg B3, B_yt0_0
|
|
.asg B7, B_xt0_0
|
|
.asg A9, A_yt0_1
|
|
.asg A3, A_xt0_1
|
|
.asg B9, B_yt1_0
|
|
.asg B7, B_xt1_0
|
|
.asg A3, A_yt2_1
|
|
.asg A22, A_xt1_1
|
|
.asg B23, B_yt2_0
|
|
.asg B10, B_xt2_0
|
|
.asg A20, A_yt1_1
|
|
.asg A28, A_xt2_1
|
|
.asg B5, B_p0
|
|
.asg B8, B_p1
|
|
.asg B22, B_xh2_0o
|
|
.asg B14, B_p2
|
|
.asg B8, B_p3
|
|
.asg B23, B_xh2_1o
|
|
.asg A8, A_p4
|
|
.asg A23, A_p5
|
|
.asg A22, A_xh2_2o
|
|
.asg A9, A_p6
|
|
.asg A23, A_p7
|
|
.asg A23, A_xh2_3o
|
|
.asg B21, B_p8
|
|
.asg B8, B_p9
|
|
.asg B26, B_xl1_0o
|
|
.asg B27, B_pa
|
|
.asg B9, B_pb
|
|
.asg B27, B_xl1_1o
|
|
.asg A27, A_pc
|
|
.asg A26, A_pd
|
|
.asg A26, A_xl1_2o
|
|
.asg A21, A_pe
|
|
.asg A23, A_pf
|
|
.asg A27, A_xl1_3o
|
|
.asg B29, B_co30_si30
|
|
.asg B5, B_p10
|
|
.asg B8, B_p11
|
|
.asg B30, B_xl2_0o
|
|
.asg B5, B_p12
|
|
.asg B8, B_p13
|
|
.asg B31, B_xl2_1o
|
|
.asg A24, A_p14
|
|
.asg A7, A_p15
|
|
.asg A20, A_xl2_2o
|
|
.asg A23, A_p16
|
|
.asg A21, A_p17
|
|
.asg A21, A_xl2_3o
|
|
.asg A12, A_x__
|
|
.asg B20, B_l1c
|
|
.asg A1, A_pro
|
|
; ============================================================================
|
|
|
|
; One-time setup before the first radix-4 stage. Interleaved with the
; remaining register saves, it computes:
;   tw_offset = 0
;   h2 = n >> 2, l1 = 2 * h2, l2 = 3 * h2 (ADDAH of h2 with itself)
;   fft_jmp = 8 * l2
;   A_i = n >> 3 (loop counter for the first stage)
; It also copies CSR to the A side for saving, writes the GIE-cleared value
; to CSR, stores CSR and the return address in the frame, and issues the
; first data load of the pipelined loop.
ZERO .L2 B_tw_offset ;tw_offset = 0;
|
|
|| MV .S2 B_n, B_h2 ;stride=n
|
|
|| STW .D1T1 A12, *+A_SP[ 8] ; Save A12
|
|
|| STW .D2T2 B12, *+B_SP[ 7] ; Save B12
|
|
|
|
SHRU .S2 B_h2, 2, B_h2 ;[ 7,0]
|
|
|| STW .D1T1 A11, *+A_SP[ 6] ; Save A11
|
|
|| STW .D2T2 B11, *+B_SP[ 5] ; Save B11
|
|
|| MV .L1X B_csr, A_csr ;
|
|
|
|
ADDAH .D2 B_h2, B_h2, B_l2 ;[ 8,0]
|
|
|
|
; From this packet on interrupts are disabled (CSR written with bit 0 clear)
; until CSR is restored at the end of the routine.
SHRU .S1X B_n, 3, A_i ;[ 9,0] n>>3
|
|
|| STW .D1T1 A10, *+A_SP[ 4] ; Save A10
|
|
|| STW .D2T2 B10, *+B_SP[ 3] ; Save B10
|
|
|| MVC .S2 B_no_gie, CSR ; Disable
|
|
|
|
MV .L1X B_l2, A_l2 ;[10,0]
|
|
|| SHL .S2 B_h2, 1, B_l1 ;[10,0]
|
|
|| MPYSU .M2 8, B_l2, B_fft_jmp ;1.5*stride
|
|
|
|
; B_ret (B3) must be saved here because B3 is reused as B_xl0_0 / B_yt0_0 in
; the loop and as B_p_y3 in the final stage.
MV .S1X B_h2, A_h2 ;[11,0]
|
|
|| STW .D1T1 A_csr, *+A_SP[ 2] ; Save CSR
|
|
|| STW .D2T2 B_ret, *+B_SP[ 1] ; return
|
|
|
|
MV .L1X B_l1, A_l1 ;[12,0]
|
|
|| LDDW .D1T2 *A_ptr_x[A_h2], B_xh2_1i:B_xh2_0i ;[ 1,1]
|
|
|
|
; Per-stage prolog of the radix-4 loop; re-entered once per stage from the
; conditional branch in the loop epilog. It derives the per-stage values:
;   A_fft_jmp   = (B_fft_jmp >> 4) - 3
;   A_w0        = ptr_w + 2 * tw_offset (two ADDs of the old tw_offset)
;   tw_offset  += B_fft_jmp >> 2
;   B_j         = 0 (via a multiply by 0), B_x = ptr_x, A_x = ptr_x + 8
;   B_pro       = 1, which keeps the [!B_pro] stores in the kernel disabled
;                 until LOOP_Y8 decrements it
; and issues the first loads of the software pipeline.
; The bracketed tags in the trailing comments are the original schedule
; annotations (they appear to be [cycle, iteration] - TODO confirm).
LOOP_WHILE_N:
|
|
|
|
SHRU .S1X B_fft_jmp, 4, A_fft_jmp ;[13,0]
|
|
|| MVK .L2 1, B_pro ;
|
|
|| ADD .L1 A_ptr_x, 8, A_x ;[ 2,1]
|
|
|
|
ADD .D1X A_ptr_w, B_tw_offset,A_w0 ;[14,0]
|
|
|| SHRU .S2 B_fft_jmp, 2, B_fft_jmp_1 ;[14,0]
|
|
|| MPYSU .M2 0, B_j, B_j ;[14,0] j = 0
|
|
|
|
; B_x and B_fft_jmp_1 are both B0: the MV below overwrites B0 in the same
; packet in which the ADD still consumes it as B_fft_jmp_1.
; NOTE(review): relies on parallel instructions reading their sources before
; any result of the packet is written - confirm against the CPU reference.
MV .L2X A_ptr_x, B_x ;[15,0] ptr_x
|
|
|| SUB .D1 A_fft_jmp, 3, A_fft_jmp ;[15,0]
|
|
|| ADD .L1X A_w0, B_tw_offset,A_w0 ;[15,0]
|
|
|| ADD .D2 B_tw_offset,B_fft_jmp_1,B_tw_offset ;[15,0] fft_jmp
|
|
|
|
; The branch target is LOOP_Y8 plus a byte offset, i.e. not the label itself.
; NOTE(review): with 4-byte opcodes this presumably enters the first LOOP_Y8
; packet at its sixth instruction, skipping that packet's store, B_pro
; decrement and stage-24 work on the first pass - confirm.
MVD .M1X B_x, A_x_ ;[ 5,1]
|
|
|| LDDW .D2T2 *B_x[B_l2], B_xl2_1i:B_xl2_0i ;[ 5,1]
|
|
|| BDEC .S1 LOOP_Y8 + 20, A_i
|
|
|
|
SUB .S1 A_fft_jmp, 0, A_ifj ;[ 6,1]
|
|
|| LDDW .D1T1 *A_x[0], A_x_3:A_x_2 ;[ 6,1]
|
|
|| LDDW .D2T2 *B_x[0], B_x_1:B_x_0 ;[ 6,1]
|
|
|
|
; When A_ifj is zero the data pointer additionally advances by B_fft_jmp.
[!A_ifj]ADD .S2 B_x, B_fft_jmp, B_x ;[ 7,1]
|
|
|| LDDW .D2T2 *B_x[B_l1], B_xl1_1i:B_xl1_0i ;[ 7,1]
|
|
|| MV .L1X B_j, A_j ;[ 7,1]
|
|
|
|
LDDW .D1T1 *A_x[A_h2], A_xh2_3i:A_xh2_2i ;[ 8,1]
|
|
|
|
LDDW .D1T1 *A_x[A_l2], A_xl2_3i:A_xl2_2i ;[ 9,1]
|
|
|| ADD .L2X A_w0, 8, B_w1 ;[ 9,1]
|
|
|
|
SUB .L2 B_xh2_0i, B_xl2_0i, B_xl20_0 ;[10,1]
|
|
|| ADD .S2 B_xl2_0i, B_xh2_0i, B_xh20_0 ;[10,1]
|
|
|| ADD .D2 B_xl2_1i, B_xh2_1i, B_xh21_0 ;[10,1]
|
|
|| BDEC .S1 LOOP_Y, A_i
|
|
; ============================ PIPE LOOP KERNEL ==============================
|
|
; First 8 execute packets of the 13-packet radix-4 kernel (the remaining 5
; follow at LOOP_Y8). Each pass mixes work from different loop iterations, as
; shown by the differing tags in the trailing comments. The A side and the
; B side each process one butterfly: sums/differences of the four inputs,
; 16x32 multiplies by the loaded twiddle pairs (MPYHIR / MPYLIR), and
; double-word stores back through A_x__ / B_x__.
; All stores in this part are predicated on !B_pro, so they do nothing until
; B_pro has been decremented to zero in LOOP_Y8.
; NOTE(review): register reuse and load/multiply latencies make the order of
; these packets significant; treat the block as a fixed schedule.
LOOP_Y:
|
|
[!B_pro]STDW .D1T1 A_xl2_3o:A_xl2_2o, *A_x__[A_l2] ;[29,1]
|
|
||[!B_pro]STDW .D2T2 B_xh2_1o:B_xh2_0o, *B_x__[B_h2] ;[29,1]
|
|
|| MPYHIR .M2 B_co30_si30, B_xt2_0,B_p11 ;[29,1]
|
|
|| MPYHIR .M1 A_co21_si21, A_xt0_1,A_pd ;[29,1]
|
|
|| SUB .L2 B_xl0_0, B_xl21_0, B_xt2_0 ;[16,2]
|
|
||[!A_ifj]ZERO .S2 B_j ;[16,2]
|
|
|| SUB .L1 A_x_2, A_xl1_2i, A_xl0_1 ;[16,2]
|
|
|| SUB .S1 A_x_3, A_xl1_3i, A_xl1_1 ;[16,2]
|
|
|
|
[!B_pro]STDW .D1T1 A_x_3o:A_x_2o, *A_x__[0] ;[30,1]
|
|
|| MPYLIR .M1 A_co21_si21,A_xt0_1, A_pf ;[30,1]
|
|
|| ADD .D2 B_p8, B_p9, B_xl1_0o ;[30,1]
|
|
|| MPYLIR .M2 B_co10_si10,B_yt1_0, B_p0 ;[17,2]
|
|
|| SUB .S1 A_xl0_1, A_xl21_1, A_xt2_1 ;[17,2]
|
|
|| ADD .L2 B_xl1_0, B_xl20_0, B_yt2_0 ;[17,2]
|
|
|| ADD .L1 A_xl1_1, A_xl20_1, A_yt2_1 ;[17,2]
|
|
|| SUB .S2 B_xh1_0, B_xh21_0, B_yt0_0 ;[17,2]
|
|
|
|
SUB .S2 B_p12, B_p13, B_xl2_1o ;[31,1]
|
|
|| SUB .S1 A_p6, A_p7, A_xh2_3o ;[31,1]
|
|
|| MPYHIR .M2 B_co10_si10,B_xt1_0, B_p1 ;[18,2]
|
|
|| ADD .L2 B_xh21_0, B_xh1_0, B_x_1o ;[18,2]
|
|
|| ADD .D1 A_xl1_2i, A_x_2, A_xh0_1 ;[18,2]
|
|
|| ADD .L1 A_xl1_3i, A_x_3, A_xh1_1 ;[18,2]
|
|
|| MVD .M1X B_x, A_x_ ;[ 5,3]
|
|
|| LDDW .D2T2 *B_x[B_l2], B_xl2_1i:B_xl2_0i ;[ 5,3]
|
|
|
|
; A_ifj = A_fft_jmp - B_j; a zero result triggers the extra pointer jump in
; the next packet and the reset of B_j in the first packet of the kernel.
MPYHIR .M1 A_co31_si31,A_xt2_1, A_p15 ;[19,2]
|
|
|| MPYLIR .M2 B_co10_si10,B_xt1_0, B_p3 ;[19,2]
|
|
|| SUB .L1 A_xl1_1, A_xl20_1, A_yt1_1 ;[19,2]
|
|
|| ADD .S2 B_xl1_0i, B_x_0, B_xh0_0 ;[19,2]
|
|
|| SUB .S1X A_fft_jmp, B_j, A_ifj ;[ 6,3]
|
|
|| LDDW .D1T1 *A_x[0], A_x_3:A_x_2 ;[ 6,3]
|
|
|| LDDW .D2T2 *B_x[0], B_x_1:B_x_0 ;[ 6,3]
|
|
|
|
[!B_pro]STDW .D1T1 A_xh2_3o:A_xh2_2o, *A_x__[A_h2] ;[33,1]
|
|
|| ADD .L2 B_p10, B_p11, B_xl2_0o ;[33,1]
|
|
|| MPYLIR .M1 A_co31_si31,A_xt2_1, A_p17 ;[20,2]
|
|
|| SUB .S1 A_xh1_1, A_xh21_1, A_yt0_1 ;[20,2]
|
|
|| MVD .M2X A_x_, B_x__ ;[20,2]
|
|
||[!A_ifj]ADD .S2 B_x, B_fft_jmp, B_x ;[ 7,3]
|
|
|| LDDW .D2T2 *B_x[B_l1], B_xl1_1i:B_xl1_0i ;[ 7,3]
|
|
|| MV .L1X B_j, A_j ;[ 7,3]
|
|
|
|
[!B_pro]STDW .D2T2 B_xl2_1o:B_xl2_0o, *B_x__[B_l2] ;[34,1]
|
|
|| MPYHIR .M1 A_co31_si31,A_yt2_1, A_p16 ;[21,2]
|
|
|| MPYHIR .M2 B_co20_si20,B_yt0_0, B_pa ;[21,2]
|
|
|| ADD .S1 A_xl0_1, A_xl21_1, A_xt1_1 ;[21,2]
|
|
|| SUB .L2 B_xh0_0, B_xh20_0, B_xt0_0 ;[21,2]
|
|
|| ADD .L1 A_xh20_1, A_xh0_1, A_x_2o ;[21,2]
|
|
|| ADD .S2 B_xh20_0, B_xh0_0, B_x_0o ;[21,2]
|
|
|| LDDW .D1T1 *A_x[A_h2], A_xh2_3i:A_xh2_2i ;[ 8,3]
|
|
|
|
[!B_pro]STDW .D2T2 B_xl1_1o:B_xl1_0o, *B_x__[B_l1] ;[35,1]
|
|
|| SUB .L1 A_pe, A_pf, A_xl1_3o ;[35,1]
|
|
|| ADD .S1 A_pc, A_pd, A_xl1_2o ;[35,1]
|
|
|| MPYLIR .M1 A_co31_si31,A_yt2_1, A_p14 ;[22,2]
|
|
|| MPYLIR .M2 B_co20_si20,B_yt0_0, B_p8 ;[22,2]
|
|
|| ADD .S2 B_p0, B_p1, B_xh2_0o ;[22,2]
|
|
|| LDDW .D1T1 *A_x[A_l2], A_xl2_3i:A_xl2_2i ;[ 9,3]
|
|
|| ADD .L2X A_w0, 8, B_w1 ;[ 9,3]
|
|
|
|
; Loop-back branch: decrements A_i and targets LOOP_Y; the packets of LOOP_Y8
; that follow execute before control returns to LOOP_Y.
BDEC .S1 LOOP_Y, A_i ;} end for
|
|
||[!B_pro]STDW .D1T1 A_xl1_3o:A_xl1_2o, *A_x__[A_l1] ;[36,1]
|
|
|| MPYLIR .M2 B_co20_si20,B_xt0_0, B_pb ;[23,2]
|
|
|| MPYLIR .M1 A_co11_si11,A_yt1_1, A_p4 ;[23,2]
|
|
|| SUB .L1 A_xh0_1, A_xh20_1, A_xt0_1 ;[23,2]
|
|
|| SUB .L2 B_xh2_0i, B_xl2_0i, B_xl20_0 ;[10,3]
|
|
|| ADD .S2 B_xl2_0i, B_xh2_0i, B_xh20_0 ;[10,3]
|
|
|| ADD .D2 B_xl2_1i, B_xh2_1i, B_xh21_0 ;[10,3]
|
|
; Last 5 execute packets of the radix-4 kernel. The stage prolog enters the
; loop here (at LOOP_Y8 + 20) rather than at LOOP_Y.
; This part loads the twiddle pairs (via A_w0, A_w2 and B_w1, indexed by
; A_j / B_j), advances B_x by 16 bytes and B_j by 3 per pass, and decrements
; the prolog counter B_pro while it is non-zero.
; The first store below is the only kernel store not predicated on B_pro.
LOOP_Y8:
|
|
STDW .D2T2 B_x_1o:B_x_0o, *B_x__[0] ;[24,2]
|
|
||[B_pro]SUB .S2 B_pro, 1, B_pro ;
|
|
|| MPYHIR .M1 A_co11_si11,A_xt1_1, A_p5 ;[24,2]
|
|
|| MPYHIR .M2 B_co10_si10,B_yt1_0, B_p2 ;[24,2]
|
|
|| ADD .S1 A_xh21_1, A_xh1_1, A_x_3o ;[24,2]
|
|
|| SUB .L2 B_xh2_1i, B_xl2_1i, B_xl21_0 ;[11,3]
|
|
|| LDDW .D1T1 *A_x[A_l1], A_xl1_3i:A_xl1_2i ;[11,3]
|
|
|| ADD .L1X B_w1, 8, A_w2 ;[11,3]
|
|
|
|
;LOOP_Y9:
|
|
|
|
; A_co30_si30 is loaded on the A side (last packet of this block) and copied
; here to B_co30_si30 for the B-side multiplies.
SUB .S1 A_p16, A_p17, A_xl2_3o ;[25,2]
|
|
|| MV .S2X A_co30_si30, B_co30_si30 ;[25,2]
|
|
|| MPYHIR .M2 B_co20_si20,B_xt0_0, B_p9 ;[25,2]
|
|
|| MPYLIR .M1 A_co11_si11,A_xt1_1, A_p7 ;[25,2]
|
|
|| ADD .D2 B_x, 16, B_x ;[12,3]
|
|
|| SUB .L2 B_x_0, B_xl1_0i, B_xl0_0 ;[12,3]
|
|
|| LDDW .D1T2 *A_w0[A_j], B_co20_si20:B_co10_si10 ;[12,3]
|
|
|
|
;LOOP_Y10:
|
|
|
|
; A_x__ is the A-side store pointer: B_x__ + 8.
ADD .S1X B_x__, 8, A_x__ ;[26,2]
|
|
|| ADD .L1 A_p14, A_p15, A_xl2_2o ;[26,2]
|
|
|| MPYHIR .M2 B_co30_si30,B_yt2_0, B_p12 ;[26,2]
|
|
|| MPYHIR .M1 A_co11_si11,A_yt1_1, A_p6 ;[26,2]
|
|
|| ADD .S2 B_xl0_0, B_xl21_0, B_xt1_0 ;[13,3]
|
|
|| SUB .L2 B_x_1, B_xl1_1i, B_xl1_0 ;[13,3]
|
|
|| ADD .D2 B_xl1_1i, B_x_1, B_xh1_0 ;[13,3]
|
|
|| LDDW .D1T1 *A_w2[A_j], A_co31_si31:A_co21_si21 ;[13,3]
|
|
|
|
;LOOP_Y11:
|
|
|
|
MPYLIR .M2 B_co30_si30,B_xt2_0, B_p13 ;[27,2]
|
|
|| MPYHIR .M1 A_co21_si21,A_yt0_1, A_pe ;[27,2]
|
|
|| SUB .S2 B_pa, B_pb, B_xl1_1o ;[27,2]
|
|
|| SUB .L2 B_xl1_0, B_xl20_0, B_yt1_0 ;[14,3]
|
|
|| SUB .S1 A_xh2_2i, A_xl2_2i, A_xl20_1 ;[14,3]
|
|
|| SUB .D1 A_xh2_3i, A_xl2_3i, A_xl21_1 ;[14,3]
|
|
|| ADD .L1 A_xl2_3i, A_xh2_3i, A_xh21_1 ;[14,3]
|
|
|| LDDW .D2T2 *B_x[B_h2], B_xh2_1i:B_xh2_0i ;[ 1,4]
|
|
|
|
;LOOP_Y12:
|
|
|
|
; Last packet of the kernel: B_j advances by 3 and A_x is set to B_x + 8 for
; the next pass.
MPYLIR .M2 B_co30_si30,B_yt2_0, B_p10 ;[28,2]
|
|
|| MPYLIR .M1 A_co21_si21,A_yt0_1, A_pc ;[28,2]
|
|
|| ADD .L1 A_p4, A_p5, A_xh2_2o ;[28,2]
|
|
|| SUB .S2 B_p2, B_p3, B_xh2_1o ;[28,2]
|
|
|| ADD .L2 B_j, 3, B_j ;[15,3]
|
|
|| ADD .S1 A_xl2_2i, A_xh2_2i, A_xh20_1 ;[15,3]
|
|
|| LDDW .D2T1 *B_w1[B_j], A_co11_si11:A_co30_si30 ;[15,3]
|
|
|| ADD .D1X B_x, 8, A_x ;[ 2,4]
|
|
; ============================ PIPE LOOP EPILOG ==============================
|
|
; Epilog of the radix-4 kernel, merged with the outer "while" test.
; It drains the remaining multiplies, adds and stores of the last iteration
; (stores here are unconditional) and, in the free slots:
;   B_radix2 = NORM(n) & 1; radix = 4, or 2 when B_radix2 is non-zero
;   A_whl    = (h2 > radix), unsigned compare
; If A_whl is set it branches back to LOOP_WHILE_N, and the packets after the
; branch recompute the next stage's values under [A_whl]:
;   h2 >>= 2, A_i = n >> 3, l2 = 3 * h2, l1 = 2 * h2, fft_jmp = 8 * l2
; and issue the first load of the next stage. If A_whl is clear, the last
; packet instead prepares the final stage: B_l1c = NORM(n), A_pro = 1.
STDW .D1T1 A_xl2_3o:A_xl2_2o, *A_x__[A_l2] ;[29,4]
|
|
|| STDW .D2T2 B_xh2_1o:B_xh2_0o, *B_x__[B_h2] ;[29,4]
|
|
|| MPYHIR .M2 B_co30_si30,B_xt2_0, B_p11 ;[29,4]
|
|
|| MPYHIR .M1 A_co21_si21,A_xt0_1, A_pd ;[29,4]
|
|
|| NORM .L2 B_n, B_radix2 ;[ 2,0]
|
|
|
|
STDW .D1T1 A_x_3o:A_x_2o, *A_x__[0] ;[30,4]
|
|
|| MPYLIR .M1 A_co21_si21,A_xt0_1, A_pf ;[30,4]
|
|
|| ADD .D2 B_p8, B_p9, B_xl1_0o ;[30,4]
|
|
|| AND .S2 B_radix2, 1, B_radix2 ;norm(npoints)&1
|
|
|| MVK .L2 4, B_radix ; radix = 4?
|
|
|
|
SUB .L2 B_p12, B_p13, B_xl2_1o ;[31,4]
|
|
|| SUB .L1 A_p6, A_p7, A_xh2_3o ;[31,4]
|
|
||[B_radix2]MVK .D2 2, B_radix ;radix = 2
|
|
|
|
CMPGTU .L1X A_h2, B_radix, A_whl ;
|
|
|
|
; NOTE(review): the five execute packets after this branch are presumably its
; delay slots, so they run whether or not the branch is taken - confirm
; against the CPU reference before moving any of them.
STDW .D1T1 A_xh2_3o:A_xh2_2o, *A_x__[A_h2] ;[33,4]
|
|
|| ADD .L2 B_p10, B_p11, B_xl2_0o ;[33,4]
|
|
||[A_whl]B .S1 LOOP_WHILE_N ;} end while
|
|
|
|
STDW .D2T2 B_xl2_1o:B_xl2_0o, *B_x__[B_l2] ;[34,4]
|
|
||[A_whl]SHRU .S2 B_h2, 2, B_h2 ;[ 7,0]
|
|
|
|
|
|
|
STDW .D2T2 B_xl1_1o:B_xl1_0o, *B_x__[B_l1] ;[35,4]
|
|
|| SUB .L1 A_pe, A_pf, A_xl1_3o ;[35,4]
|
|
|| ADD .D1 A_pc, A_pd, A_xl1_2o ;[35,4]
|
|
||[A_whl]SHRU .S1X B_n, 3, A_i ;[ 9,0] n>>3
|
|
|
|
STDW .D1T1 A_xl1_3o:A_xl1_2o, *A_x__[A_l1] ;[36,4]
|
|
||[A_whl]ADDAH .D2 B_h2, B_h2, B_l2 ;[ 8,0]
|
|
||[A_whl]MV .L1X B_h2, A_h2 ;[11,0]
|
|
|
|
[A_whl]MV .L1X B_l2, A_l2 ;[10,0]
|
|
||[A_whl]SHL .S2 B_h2, 1, B_l1 ;[10,0]
|
|
||[A_whl]MPYSU .M2 8, B_l2, B_fft_jmp ;1.5*stride
|
|
|
|
; B_l1c is B20, the same register the final stage uses as B_l1; the "+2" of
; the trailing comment is applied in the final-stage prolog.
[A_whl]MV .S1X B_l1, A_l1 ;[12,0]
|
|
||[A_whl]LDDW .D1T2 *A_ptr_x[A_h2], B_xh2_1i:B_xh2_0i ;[ 1,1]
|
|
||[!A_whl]NORM .L2 B_n, B_l1c ;l1=_norm(n)+2
|
|
||[!A_whl]MVK .L1 1, A_pro ;
|
|
|
|
; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
; Register aliases for the final stage (LOOP_Z). Names that also exist in the
; radix-4 map are rebound here, e.g. B_l1 moves to B20, B_h2 to B7, B_j to B9
; and A_i to A18. Several argument registers are recycled as data registers:
;   A4 (ptr_w)   -> A_x4
;   B4 (B_n)     -> B_x6  (B_n is only read before LOOP_Z starts loading)
;   B3 (B_ret)   -> B_p_y3 (the return address is reloaded from the frame)
; B_h1 and B_h2 share B7, B_h3 and B_x7 share B5, B_h0 and B_y1 share B25.
; NOTE(review): as in the radix-4 map, this sharing is only valid for the
; instruction schedule as written.
|
|
.asg B14, B_radix
|
|
.asg A6, A_ptr_x
|
|
.asg B6, B_ptr_y
|
|
.asg B4, B_n
|
|
.asg A0, A_r2
|
|
.asg A20, A_p_x0
|
|
.asg B8, B_p_x0
|
|
.asg B22, B_p_y2
|
|
.asg B23, B_p_y1
|
|
.asg B3, B_p_y3
|
|
.asg B20, B_l1
|
|
.asg B19, B_j0
|
|
.asg A18, A_i
|
|
.asg B9, B_j
|
|
.asg A1, A_pro
|
|
; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
|
|
.asg B9, B_j
|
|
.asg A18, A_i
|
|
.asg B19, B_j0
|
|
.asg B20, B_l1
|
|
.asg B23, B_p_y1
|
|
.asg B22, B_p_y2
|
|
.asg B3, B_p_y3
|
|
.asg A20, A_p_x0
|
|
.asg B8, B_p_x0
|
|
.asg A0, A_r2
|
|
.asg B25, B_h0
|
|
.asg B7, B_h1
|
|
.asg B7, B_h2
|
|
.asg B5, B_h3
|
|
.asg B16, B_h4
|
|
.asg A27, A_x1
|
|
.asg A26, A_x0
|
|
.asg B29, B_x3
|
|
.asg B28, B_x2
|
|
.asg A5, A_x5
|
|
.asg A4, A_x4
|
|
.asg B5, B_x7
|
|
.asg B4, B_x6
|
|
.asg A21, A_xh0_0
|
|
.asg A3, A_xh1_0
|
|
.asg B24, B_xh0_1
|
|
.asg B26, B_xh1_1
|
|
.asg B24, B_y0
|
|
.asg B25, B_y1
|
|
.asg B31, B_y5
|
|
.asg B30, B_y4
|
|
.asg A16, A_xl0_0
|
|
.asg A19, A_xl1_0
|
|
.asg B18, B_xl0_1
|
|
.asg B17, B_xl1_1
|
|
.asg A16, A_y2
|
|
.asg A17, A_y3
|
|
.asg A8, A_y6
|
|
.asg A9, A_y7
|
|
.asg A22, A_temp
|
|
; ============================ PIPE LOOP PROLOG ==============================
|
|
; Setup for the final stage (LOOP_Z). A_r2 = radix - 2, so [!A_r2] selects the
; radix-2 variant. Values produced:
;   B_j   = 0
;   B_j0  = 4 (radix 4) or 8 (radix 2)
;   B_l1  = NORM(n) + 2 (radix 4) or NORM(n) + 1 (radix 2)
;   A_i   = n >> 2 (loop counter)
;   A_pro = 1 << 15 (store-enable primer, consumed by the MPYSU in LOOP_Z)
;   B_p_y2 = ptr_y + 4*n bytes
;   B_p_y1 = ptr_y + 2*n bytes (radix 4) or ptr_y + n bytes (radix 2)
;   B_p_y3 = B_p_y2 + 2*n bytes (radix 4) or B_p_y2 + n bytes (radix 2)
;   B_p_x0 = ptr_x + 8
ZERO .L2 B_j ;[ 3,0]
|
|
|| SUB .D1X B_radix, 2, A_r2 ;[ 3,0]
|
|
|| ADD .S2 B_l1, 2, B_l1 ;[ 3,0]
|
|
|| ADDAW .D2 B_ptr_y, B_n, B_p_y2 ;[ 3,0]
|
|
|
|
MVK .S2 4, B_j0 ;[ 4,0] j0 = 4
|
|
|| ADDAH .D2 B_p_y2, B_n, B_p_y3 ;[ 4,0]
|
|
|| SHL .S1 A_pro, 15, A_pro ;
|
|
|
|
SHRU .S1X B_n, 2, A_i ;[ 5,0]
|
|
||[!A_r2]MVK .S2 8, B_j0 ;[ 5,0] j0 = 8
|
|
||[!A_r2]SUB .L2 B_l1, 1, B_l1 ;[ 5,0]
|
|
|| ADDAH .D2 B_ptr_y, B_n, B_p_y1 ;[ 5,0]
|
|
|
|
; Radix-2 only: replace the two output pointers computed above with the
; byte-offset (n instead of 2*n) versions.
[!A_r2]ADD .S2 B_p_y2, B_n, B_p_y3 ;[ 6,0]
|
|
||[!A_r2]ADD .L2 B_ptr_y, B_n, B_p_y1 ;[ 6,0]
|
|
|| ADD .D2X A_ptr_x, 8, B_p_x0 ;x = ptr_x
|
|
; ============================ PIPE LOOP KERNEL ==============================
|
|
; Final-stage kernel: 6 execute packets per pass, looping on A_i via BDEC.
; Each pass loads four double words (two through A_ptr_x, two through B_p_x0,
; each pointer post-incremented by 2 double words per load), forms the
; sums/differences B_y0..B_y5 / A_y2..A_y7, and stores four double words to
; B_ptr_y, B_p_y1, B_p_y2 and B_p_y3 at index B_h4.
; B_h4 is derived from the counter B_j by DEAL -> BITR -> ROTL 16 -> SHFL ->
; SHRU by B_l1 (the digit-reversed output index described in the file header).
; Instructions predicated on !A_r2 are the radix-2 variant.
; NOTE(review): the [!A_r2] ROTL-by-0 instructions on the .M units write the
; same destinations as parallel ADD/SUB instructions; presumably the longer
; multiplier-unit latency makes the ROTL result land last and override the
; radix-4 value - confirm against the CPU reference.
; All stores are predicated on !A_pro. A_pro starts at 1 << 15 and is
; multiplied by 2 with MPYSU at the end of every pass.
; NOTE(review): MPYSU presumably uses only the low 16 bits of A_pro, giving
; 0x10000 and then 0, so stores stay disabled while the pipeline fills.
LOOP_Z:
|
|
[!A_r2]ROTL .M1 A_x4, 0, A_xl0_0 ;[13,1]
|
|
|| SUB .L1X A_xl1_0, B_xl0_1, A_y3 ;[13,1]
|
|
|| ADD .S2X A_xh1_0, B_xh1_1, B_y1 ;[13,1]
|
|
|| BDEC .S1 LOOP_Z, A_i ;[13,1] }end for
|
|
|| ADD .L2 B_j, B_j0, B_j ;j += j0;
|
|
|| LDDW .D2T2 *B_p_x0++[2], B_x3:B_x2 ;[ 1,3]
|
|
|| LDDW .D1T1 *A_ptr_x++[2], A_x1:A_x0 ;[ 1,3]
|
|
|| DEAL .M2 B_j, B_h0 ;h2=_deal(j);
|
|
|
|
[!A_pro]STDW .D2T2 B_y1:B_y0, *B_ptr_y[B_h4] ;[14,1]
|
|
|| MV .S1 A_y3, A_temp ;[14,1]
|
|
|| ADD .L1X A_xl1_0, B_xl0_1, A_y7 ;[14,1]
|
|
|| SUB .L2 B_x2, B_x6, B_xl0_1 ;[ 8,2]
|
|
|| ADD .S2 B_x6, B_x2, B_xh0_1 ;[ 8,2]
|
|
|| ADD .D1 A_x4, A_x0, A_xh0_0 ;[ 8,2]
|
|
||[!A_r2]ROTL .M1 A_x0, 0, A_xh0_0 ;[ 8,2]
|
|
||[!A_r2]ROTL .M2 B_x2, 0, B_xh0_1 ;[ 8,2]
|
|
|
|
SUB .L1X A_xl0_0, B_xl1_1, A_y6 ;[15,1]
|
|
|| SUB .L2X A_xh1_0, B_xh1_1, B_y5 ;[15,1]
|
|
|| ADD .S1 A_x5, A_x1, A_xh1_0 ;[ 9,2]
|
|
||[!A_r2]ROTL .M1 A_x1, 0, A_xh1_0 ;[ 9,2]
|
|
||[!A_r2]MV .S2 B_x7, B_xl0_1 ;[ 9,2]
|
|
|| SHFL .M2 B_h2, B_h3 ;h2= _shfl(h2);
|
|
|| LDDW .D1T1 *A_ptr_x++[2], A_x5:A_x4 ;[ 3,3]
|
|
|| LDDW .D2T2 *B_p_x0++[2], B_x7:B_x6 ;[ 3,3]
|
|
|
|
; Radix-2 only: A_y3 and A_y7 are exchanged via A_temp (A_y7 -> A_y3 here,
; saved A_y3 -> A_y7 in the next packet).
ADD .S1X A_xl0_0, B_xl1_1, A_y2 ;[16,1]
|
|
||[!A_r2]MV .D1 A_y7, A_y3 ;[16,1]
|
|
||[!A_pro]STDW .D2T2 B_y5:B_y4, *B_p_y2[B_h4] ;[16,1]
|
|
|| SUB .L1 A_x1, A_x5, A_xl1_0 ;[10,2]
|
|
||[!A_r2]ROTL .M1 A_x5, 0, A_xl1_0 ;[10,2]
|
|
|| ADD .S2 B_x7, B_x3, B_xh1_1 ;[10,2]
|
|
|| SUB .L2 B_x3, B_x7, B_xl1_1 ;[10,2]
|
|
|| BITR .M2 B_h0, B_h1 ;h2=_bitr(h2);
|
|
|
|
[!A_r2]MV .L1 A_temp, A_y7 ;[17,1]
|
|
||[!A_pro]STDW .D2T1 A_y3:A_y2, *B_p_y1[B_h4] ;[17,1]
|
|
|| SUB .D1 A_x0, A_x4, A_xl0_0 ;[11,2]
|
|
|| ADD .L2X A_xh0_0, B_xh0_1, B_y0 ;[11,2]
|
|
|| SUB .S2X A_xh0_0, B_xh0_1, B_y4 ;[11,2]
|
|
||[!A_r2]ROTL .M2 B_x3, 0, B_xh1_1 ;[11,2]
|
|
|
|
; A_SP (A31) was overwritten as A_co31_si31 in the radix-4 loop, so the twin
; stack pointer is re-created here on every pass, ready for the restore
; sequence that follows the loop.
[!A_pro]STDW .D2T1 A_y7:A_y6, *B_p_y3[B_h4] ;[18,1]
|
|
|| SHRU .S2 B_h3, B_l1, B_h4 ;h2 >>= l1;
|
|
||[!A_r2]MV .L2 B_x6, B_xl1_1 ;[12,2]
|
|
|| ROTL .M2 B_h1, 16, B_h2 ;h2=_rotl(h2,16)
|
|
|| MPYSU .M1 2, A_pro, A_pro ;10000
|
|
|| MV .S1X B_SP, A_SP ; Twin Stack Pointer
|
|
; ============================ PIPE LOOP EPILOG ==============================
|
|
; Function exit. Reloads the return address, the saved CSR value and the
; saved registers A10..A14 / B10..B14 from the same frame slots they were
; stored to on entry ([1] return address, [2] CSR, [3]..[12] registers), two
; loads per cycle using both stack-pointer copies.
LDW .D1T2 *+A_SP[ 1], B_ret ; Get return address
|
|
|| LDW .D2T1 *+B_SP[ 2], A_csr ; Get CSR's value
|
|
|
|
LDW .D1T2 *+A_SP[ 3], B10 ; Restore B10
|
|
|| LDW .D2T1 *+B_SP[ 4], A10 ; Restore A10
|
|
|
|
LDW .D1T2 *+A_SP[ 5], B11 ; Restore B11
|
|
|| LDW .D2T1 *+B_SP[ 6], A11 ; Restore A11
|
|
|
|
LDW .D1T2 *+A_SP[ 7], B12 ; Restore B12
|
|
|| LDW .D2T1 *+B_SP[ 8], A12 ; Restore A12
|
|
|
|
LDW .D1T2 *+A_SP[ 9], B13 ; Restore B13
|
|
|| LDW .D2T1 *+B_SP[10], A13 ; Restore A13
|
|
|
|
LDW .D1T2 *+A_SP[11], B14 ; Restore B14
|
|
|| LDW .D2T1 *+B_SP[12], A14 ; Restore A14
|
|
|
|
; The pre-increment by 14 words releases the frame and reads A15 from the
; slot it was stored to on entry; the return branch is issued in parallel.
LDW .D2T1 *++B_SP[14],A15 ; Restore A15
|
|
|| RETNOP .S2 B_ret, 4 ; Return to caller
|
|
|
|
; Writing the saved value back to CSR restores the caller's interrupt state.
; NOTE(review): this instruction follows the RETNOP, so it presumably runs in
; the branch's last delay slot - confirm against the CPU reference.
MVC .S2X A_csr, CSR ; Restore CSR
|
|
*====== Interruptibility state restored
|
|
;====== Branch Occurs =====
|
|
|
|
*============================================================================*
|
|
*= End of file: fft32x16_h.asm =*
|
|
*============================================================================*
|
|
* Copyright (c) 2003 Texas Instruments, Incorporated. *
|
|
* All Rights Reserved. *
|
|
*============================================================================*
|
|
|