You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
353 lines
27 KiB
353 lines
27 KiB
/* ======================================================================== */
|
|
/* TEXAS INSTRUMENTS, INC. */
|
|
/* */
|
|
/* DSPLIB DSP Signal Processing Library */
|
|
/* */
|
|
/* Release: Revision 1.04b */
|
|
/* CVS Revision: 1.6 Sun Sep 29 03:32:21 2002 (UTC) */
|
|
/* Snapshot date: 23-Oct-2003 */
|
|
/* */
|
|
/* This library contains proprietary intellectual property of Texas */
|
|
/* Instruments, Inc. The library and its source code are protected by */
|
|
/* various copyrights, and portions may also be protected by patents or */
|
|
/* other legal protections. */
|
|
/* */
|
|
/* This software is licensed for use with Texas Instruments TMS320 */
|
|
/* family DSPs. This license was provided to you prior to installing */
|
|
/* the software. You may review this license by consulting the file */
|
|
/* TI_license.PDF which accompanies the files in this library. */
|
|
/* ------------------------------------------------------------------------ */
|
|
/* Copyright (C) 2003 Texas Instruments, Incorporated. */
|
|
/* All Rights Reserved. */
|
|
/* ======================================================================== */
|
|
/* ======================================================================== */
|
|
/* Assembler compatibility shim for assembling 4.30 and later code on */
|
|
/* tools prior to 4.30. */
|
|
/* ======================================================================== */
|
|
/* ======================================================================== */
|
|
/* End of assembler compatibility shim. */
|
|
/* ======================================================================== */
|
|
/*=========================================================================S*/
|
|
/* TEXAS INSTRUMENTS, INC. */
|
|
/* */
|
|
/* NAME */
|
|
/* DSP_fft32x32s: Double Precision FFT with scaling */
|
|
/* */
|
|
/* USAGE */
|
|
/* This routine is C-callable and can be called as: */
|
|
/* */
|
|
/* void DSP_fft32x32s(const int * ptr_w, int npoints, */
|
|
/* int * ptr_x, int * ptr_y ) ; */
|
|
/* */
|
|
/* ptr_w = input twiddle factors */
|
|
/* npoints = number of points */
|
|
/* ptr_x = transformed data reversed */
|
|
/* ptr_y = linear transformed data */
|
|
/* */
|
|
/* (See the C compiler reference guide.) */
|
|
/* */
|
|
/* DESCRIPTION */
|
|
/* The following code performs a mixed radix FFT for "npoints" which */
|
|
/* is either a multiple of 4 or 2. It uses logN4 - 1 stages of radix4 */
|
|
/* transform and performs either a radix2 or radix4 transform on the */
|
|
/* last stage depending on "npoints". If "npoints" is a multiple of 4, */
|
|
/* then this last stage is also a radix4 transform, otherwise it is a */
|
|
/* radix2 transform. This program is available as a C compilable file */
|
|
/* to automatically generate the twiddle factors "twiddle_split.c" */
|
|
/* */
|
|
/* Generate special vector of twiddle factors */
|
|
/* */
|
|
/* for (j=1, k=0; j < npoints>>2; j = j <<2 ) */
|
|
/* { */
|
|
/* for (i=0; i < npoints>>2; i += j) */
|
|
/* { */
|
|
/* theta1 = 2*PI*i/npoints; */
|
|
/* x_t = M*cos(theta1); */
|
|
/* y_t = M*sin(theta1); */
|
|
/* ptr_w[k+1] = (int) x_t; */
|
|
/* if (x_t >= M) ptr_w[k+1] = 0x7fffffff; */
|
|
/* ptr_w[k+0] = (int) y_t; */
|
|
/* if (y_t >= M) ptr_w[k+0] = 0x7fffffff; */
|
|
/* */
|
|
/* theta2 = 4*PI*i/npoints; */
|
|
/* x_t = M*cos(theta2); */
|
|
/* y_t = M*sin(theta2); */
|
|
/* ptr_w[k+3] = (int) x_t; */
|
|
/* */
|
|
/* if (x_t >= M) ptr_w[k+3] = 0x7fffffff; */
|
|
/* ptr_w[k+2] = (int) y_t; */
|
|
/* if (y_t >= M) ptr_w[k+2] = 0x7fffffff; */
|
|
/* */
|
|
/* theta3 = 6*PI*i/npoints; */
|
|
/* x_t = M*cos(theta3); */
|
|
/* y_t = M*sin(theta3); */
|
|
/* ptr_w[k+5] = (int) x_t; */
|
|
/* if (x_t >= M) ptr_w[k+5] = 0x7fffffff; */
|
|
/* ptr_w[k+4] = (int) y_t; */
|
|
/* if (y_t >= M) ptr_w[k+4] = 0x7fffffff; */
|
|
/* k += 6; */
|
|
/* } */
|
|
/* } */
|
|
/* */
|
|
/* */
|
|
/* ASSUMPTIONS */
|
|
/* This code works for both "npoints" a multiple of 2 or 4. */
|
|
/* The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a */
|
|
/* double-word boundary for the "optimized" implementations. */
|
|
/* The input and output data are complex, with the real/imaginary */
|
|
/* components stored in adjacent locations in the array. The real */
|
|
/* components are stored at even array indices, and the imaginary */
|
|
/* components are stored at odd array indices. The input, twiddle */
|
|
/* factors are in 32 bit precision. The 32 by 32 multiplies are */
|
|
/* done with a 1.5 bit loss in accuracy. This comes about because */
|
|
/* the contribution of the low sixteen bits to the 32 bit result */
|
|
/* is not computed. In addition the contribution of the low * high */
|
|
/* term is shifted by 16 as opposed to 15, for a loss 0f 0.5 bits */
|
|
/* after rounding. To illustrate real part of complex multiply of: */
|
|
/* (X + jY) ( C + jS) = */
|
|
/* */
|
|
/* _mpyhir(si10 , yt1_0) + _mpyhir(co10 , xt1_0) + */
|
|
/* (((MPYLUHS(si10,yt1_0) + MPYLUHS(co10, xt1_0) */
|
|
/* + 0x8000) >> 16) << 1) */
|
|
/* */
|
|
/* The intrinsic C version of this code performs this function as: */
|
|
/* */
|
|
/* _mpyhir(si10 , yt1_0) + _mpyhir(co10 , xt1_0) + */
|
|
/* (_dotprsu2(yt1_0xt1_0, si10co10) << 1); */
|
|
/* */
|
|
/* */
|
|
/* where the functions _mpyhir, MPYLUHS are as follows: */
|
|
/* */
|
|
/* #define _mpyhir(x,y) \ */
|
|
/* (((int)((short)(x>>16)*(unsigned short)(y&0x0000FFFF)+0x4000) >> 15) */
|
|
/* + \ ((int)((short)(x >> 16) * (short)((y) >> 16)) << 1)) */
|
|
/* */
|
|
/* #define MPYLUHS(x,y) \ */
|
|
/* ( (int) ((unsigned short)(x & 0x0000FFFF) * (short) (y >> 16)) ) */
|
|
/* */
|
|
/* */
|
|
/* TECHNIQUES */
|
|
/* The following C code represents an implementation of the Cooley */
|
|
/* Tukey radix 4 DIF FFT. It accepts the inputs in normal order and */
|
|
/* produces the outputs in digit reversed order. The natural C code */
|
|
/* shown in this file on the other hand, accepts the inputs in nor- */
|
|
/* mal order and produces the outputs in normal order. */
|
|
/* */
|
|
/* Several transformations have been applied to the original Cooley */
|
|
/* Tukey code to produce the natural C code description shown here. */
|
|
/* In order to understand these it would first be educational to */
|
|
/* understand some of the issues involved in the conventional Cooley */
|
|
/* Tukey FFT code. */
|
|
/* */
|
|
/* void radix4(int n, short x[], short wn[]) */
|
|
/* { */
|
|
/* int n1, n2, ie, ia1, ia2, ia3; */
|
|
/* int i0, i1, i2, i3, i, j, k; */
|
|
/* short co1, co2, co3, si1, si2, si3; */
|
|
/* short xt0, yt0, xt1, yt1, xt2, yt2; */
|
|
/* short xh0, xh1, xh20, xh21, xl0, xl1,xl20,xl21; */
|
|
/* */
|
|
/* n2 = n; */
|
|
/* ie = 1; */
|
|
/* for (k = n; k > 1; k >>= 2) */
|
|
/* { */
|
|
/* n1 = n2; */
|
|
/* n2 >>= 2; */
|
|
/* ia1 = 0; */
|
|
/* */
|
|
/* for (j = 0; j < n2; j++) */
|
|
/* { */
|
|
/* ia2 = ia1 + ia1; */
|
|
/* ia3 = ia2 + ia1; */
|
|
/* */
|
|
/* co1 = wn[2 * ia1 ]; */
|
|
/* si1 = wn[2 * ia1 + 1]; */
|
|
/* co2 = wn[2 * ia2 ]; */
|
|
/* si2 = wn[2 * ia2 + 1]; */
|
|
/* co3 = wn[2 * ia3 ]; */
|
|
/* si3 = wn[2 * ia3 + 1]; */
|
|
/* ia1 = ia1 + ie; */
|
|
/* */
|
|
/* for (i0 = j; i0< n; i0 += n1) */
|
|
/* { */
|
|
/* i1 = i0 + n2; */
|
|
/* i2 = i1 + n2; */
|
|
/* i3 = i2 + n2; */
|
|
/* */
|
|
/* */
|
|
/* xh0 = x[2 * i0 ] + x[2 * i2 ]; */
|
|
/* xh1 = x[2 * i0 + 1] + x[2 * i2 + 1]; */
|
|
/* xl0 = x[2 * i0 ] - x[2 * i2 ]; */
|
|
/* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; */
|
|
/* */
|
|
/* xh20 = x[2 * i1 ] + x[2 * i3 ]; */
|
|
/* xh21 = x[2 * i1 + 1] + x[2 * i3 + 1]; */
|
|
/* xl20 = x[2 * i1 ] - x[2 * i3 ]; */
|
|
/* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; */
|
|
/* */
|
|
/* x[2 * i0 ] = xh0 + xh20; */
|
|
/* x[2 * i0 + 1] = xh1 + xh21; */
|
|
/* */
|
|
/* xt0 = xh0 - xh20; */
|
|
/* yt0 = xh1 - xh21; */
|
|
/* xt1 = xl0 + xl21; */
|
|
/* yt2 = xl1 + xl20; */
|
|
/* xt2 = xl0 - xl21; */
|
|
/* yt1 = xl1 - xl20; */
|
|
/* */
|
|
/* x[2 * i1 ] = (xt1 * co1 + yt1 * si1) >> 15; */
|
|
/* x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15; */
|
|
/* x[2 * i2 ] = (xt0 * co2 + yt0 * si2) >> 15; */
|
|
/* x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15; */
|
|
/* x[2 * i3 ] = (xt2 * co3 + yt2 * si3) >> 15; */
|
|
/* x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15; */
|
|
/* } */
|
|
/* } */
|
|
/* */
|
|
/* ie <<= 2; */
|
|
/* } */
|
|
/* } */
|
|
/* */
|
|
/* The conventional Cooley Tukey FFT, is written using three loops. */
|
|
/* The outermost loop "k" cycles through the stages. There are log */
|
|
/* N to the base 4 stages in all. The loop "j" cycles through the */
|
|
/* groups of butterflies with different twiddle factors, loop "i" */
|
|
/* reuses the twiddle factors for the different butterflies within */
|
|
/* a stage. It is interesting to note the following: */
|
|
/* */
|
|
/*-------------------------------------------------------------------------S*/
|
|
/* Stage# #Groups # Butterflies with common #Groups*Bflys */
|
|
/* twiddle factors */
|
|
/*-------------------------------------------------------------------------S*/
|
|
/* 1 N/4 1 N/4 */
|
|
/* 2 N/16 4 N/4 */
|
|
/* .. */
|
|
/* logN 1 N/4 N/4 */
|
|
/*-------------------------------------------------------------------------S*/
|
|
/* */
|
|
/* The following statements can be made based on above observations: */
|
|
/* */
|
|
/* a) Inner loop "i0" iterates a veriable number of times. In */
|
|
/* particular the number of iterations quadruples every time from */
|
|
/* 1..N/4. Hence software pipelining a loop that iterates a vraiable */
|
|
/* number of times is not profitable. */
|
|
/* */
|
|
/* b) Outer loop "j" iterates a variable number of times as well. */
|
|
/* However the number of iterations is quartered every time from */
|
|
/* N/4 . . Hence the behaviour in (a) and (b) are exactly opposite */
|
|
/* to each other. */
|
|
/* */
|
|
/* c) If the two loops "i" and "j" are colaesced together then they */
|
|
/* will iterate for a fixed number of times namely N/4. This allows */
|
|
/* us to combine the "i" and "j" loops into 1 loop. Optimized impl- */
|
|
/* ementations will make use of this fact. */
|
|
/* */
|
|
/* In addition the Cooley Tukey FFT accesses three twiddle factors */
|
|
/* per iteration of the inner loop, as the butterflies that re-use */
|
|
/* twiddle factors are lumped together. This leads to accessing the */
|
|
/* twiddle factor array at three points each sepearted by "ie". Note */
|
|
/* that "ie" is initially 1, and is quadrupled with every iteration. */
|
|
/* Therfore these three twiddle factors are not even contiguous in */
|
|
/* the array. */
|
|
/* */
|
|
/* In order to vectorize the FFT, it is desirable to access twiddle */
|
|
/* factor array using double word wide loads and fetch the twiddle */
|
|
/* factors needed. In order to do this a modified twiddle factor */
|
|
/* array is created, in which the factors WN/4, WN/2, W3N/4 are */
|
|
/* arranged to be contiguous. This eliminates the seperation between */
|
|
/* twiddle factors within a butterfly. However this implies that as */
|
|
/* the loop is traversed from one stage to another, that we maintain */
|
|
/* a redundant version of the twiddle factor array. Hence the size */
|
|
/* of the twiddle factor array increases as compared to the normal */
|
|
/* Cooley Tukey FFT. The modified twiddle factor array is of size */
|
|
/* "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4" */
|
|
/* where N is the number of complex points to be transformed. The */
|
|
/* routine that generates the modified twiddle factor array was */
|
|
/* presented earlier. With the above transformation of the FFT, */
|
|
/* both the input data and the twiddle factor array can be accessed */
|
|
/* using double-word wide loads to enable packed data processing. */
|
|
/* */
|
|
/* The final stage is optimised to remove the multiplication as */
|
|
/* w0 = 1. This stage also performs digit reversal on the data, */
|
|
/* so the final output is in natural order. */
|
|
/* */
|
|
/* The fft() code shown here performs the bulk of the computation */
|
|
/* in place. However, because digit-reversal cannot be performed */
|
|
/* in-place, the final result is written to a separate array, y[]. */
|
|
/* */
|
|
/* There is one slight break in the flow of packed processing that */
|
|
/* needs to be comprehended. The real part of the complex number is */
|
|
/* in the lower half, and the imaginary part is in the upper half. */
|
|
/* The flow breaks in case of "xl0" and "xl1" because in this case */
|
|
/* the real part needs to be combined with the imaginary part because */
|
|
/* of the multiplication by "j". This requires a packed quantity like */
|
|
/* "xl21xl20" to be rotated as "xl20xl21" so that it can be combined */
|
|
/* using add2's and sub2's. Hence the natural version of C code */
|
|
/* shown below is transformed using packed data processing as shown: */
|
|
/* */
|
|
/* xl0 = x[2 * i0 ] - x[2 * i2 ]; */
|
|
/* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; */
|
|
/* xl20 = x[2 * i1 ] - x[2 * i3 ]; */
|
|
/* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; */
|
|
/* */
|
|
/* xt1 = xl0 + xl21; */
|
|
/* yt2 = xl1 + xl20; */
|
|
/* xt2 = xl0 - xl21; */
|
|
/* yt1 = xl1 - xl20; */
|
|
/* */
|
|
/* xl1_xl0 = _sub2(x21_x20, x21_x20) */
|
|
/* xl21_xl20 = _sub2(x32_x22, x23_x22) */
|
|
/* xl20_xl21 = _rotl(xl21_xl20, 16) */
|
|
/* */
|
|
/* yt2_xt1 = _add2(xl1_xl0, xl20_xl21) */
|
|
/* yt1_xt2 = _sub2(xl1_xl0, xl20_xl21) */
|
|
/* */
|
|
/* Also notice that xt1, yt1 endup on seperate words, these need to */
|
|
/* be packed together to take advantage of the packed twiddle fact */
|
|
/* ors that have been loaded. In order for this to be achieved they */
|
|
/* are re-aligned as follows: */
|
|
/* */
|
|
/* yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1) */
|
|
/* yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2) */
|
|
/* */
|
|
/* In the folllowing code since all data elements are 32 bits, add2 */
|
|
/* sub2 are replaced with normal 32 bit add's and subtracts. */
|
|
/* The packed words "yt1_xt1" allows the loaded"sc" twiddle factor */
|
|
/* to be used for the complex multiplies. The real part of the */
|
|
/* multiply and the imaginary part of the multiply are performed */
|
|
/* as 16x32 multiplies using MPYLIR and MPYHIR */
|
|
/* */
|
|
/* (X + jY) ( C + j S) = (XC + YS) + j (YC - XS). */
|
|
/* */
|
|
/* The actual twiddle factors for the FFT are cosine, - sine. The */
|
|
/* twiddle factors stored in the table are csine and sine, hence */
|
|
/* the sign of the "sine" term is comprehended during multipli- */
|
|
/* cation as shown above. */
|
|
/* */
|
|
/* MEMORY NOTE */
|
|
/* The optimized implementations are written for LITTLE ENDIAN. */
|
|
/* */
|
|
/* CYCLES */
|
|
/* [(N/4 + 1) * 10 + 10] * ceil(log4(N) - 1) + 6 * (N/4 + 2) + 27 */
|
|
/* */
|
|
/* N = 512, [1290 + 10] * 4 + 6 * 130 + 27 = 6007 cycles */
|
|
/* */
|
|
/* CODESIZE */
|
|
/* 932 bytes */
|
|
/* ------------------------------------------------------------------------ */
|
|
/* Copyright (c) 2003 Texas Instruments, Incorporated. */
|
|
/* All Rights Reserved. */
|
|
/* ======================================================================== */
|
|
#ifndef DSP_FFT32X32S_H_
|
|
#define DSP_FFT32X32S_H_ 1
|
|
|
|
void DSP_fft32x32s(const int * ptr_w, int npoints,
|
|
int * ptr_x, int * ptr_y ) ;
|
|
|
|
#endif
|
|
/* ======================================================================== */
|
|
/* End of file: dsp_fft32x32s.h */
|
|
/* ------------------------------------------------------------------------ */
|
|
/* Copyright (c) 2003 Texas Instruments, Incorporated. */
|
|
/* All Rights Reserved. */
|
|
/* ======================================================================== */
|
|
|