You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

247 lines
13 KiB

/*============================================================================*/
/* */
/* */
/* TEXAS INSTRUMENTS, INC. */
/* */
/* NAME */
/* wave_horz : 1D Wavelet Transform */
/* */
/* USAGE */
/* This routine is C-callable and can be called as: */
/* */
/* void wave_horz_co(short *iptr, short *qmf, short *filter, */
/* short * optr, int ish_x_dim ) */
/* iptr = input row of data */
/* qmf = qmf filter-bank for Low-Pass */
/* filter = Mirror qmf filter bank for High_pass */
/* optr = output row of detailed/reference decimated outputs */
/* ish_x_dim = width of the input row */
/* */
/* (See the C compiler reference guide.) */
/* */
/* DESCRIPTION */
/* */
/* This kernel performs a 1D Periodic Orthogonal Wavelet decomp- */
/* osition. This also performs athe row decomposition in a 2D wavelet */
/* transform. An in put signal x[n] is first low pass and high pass */
/* filterd and decimated by two. This results in a reference signal */
/* r1[n] which is the decimated output obtained by dropping the odd */
/* samples of the low pass filtered output and a detail signal d[n] */
/* obtained by dropping the odd samples of the high-pass output. A */
/* circular convolution algorithm is implemented and hence the wave- */
/* let transform is periodic. The reference signal and the detail */
/* signal are half the size of the original signal. The reference */
/* signal may then be iterated again to perform another scale of */
/* multi-resolution analysis. */
/* */
/* */
/* ASSUMPTIONS */
/* This kernel uses the Daubechies D4 filter bank for analysis with 4 */
/* vansishing moments. Hence the length of the analyzing low-pass and */
/* high pass filters is 8. */
/* */
/* MEMORY NOTE */
/* Hand Assembly kernel should not have any bank conflicts */
/* */
/* TECHNIQUES */
/* The main idea behind the optimized C code is to issue one set of */
/* reads to the x array and to perform low-pass and high pass filtering */
/* together and to perfrom the filtering operations together to maximize */
/* the number of multiplies. The last 6 elements of the low-pass filter */
/* and the first 6 elements of the high pass filter use the same input */
/* This is used to appropraitely change the output pointer to the low */
/* pass filter after 6 iterations. However for the first six iterations */
/* pointer wrap-around can occurr and hence this creates a dependency. */
/* Pre-reading those 6 values outside the array prevents the checks that */
/* introduce this dependency. In addtion the input data is read as word */
/* wide quantities and the low-pass and high-pass filter coefficients */
/* are stored in registers allowing for the input loop to be completely */
/* unrolled. Thus the intrinsic C code has only one loop. A predication */
/* register is used to reset the low-pass output pointer after three */
/* iterations. The merging of the loops in this fashion allows for the */
/* maximum number of multiplies with the minimum number of reads. */
/* The dotp2 instruction performs the computation of the partial products */
/* and is used to compute the low pass and high pass summation results. */
/*============================================================================*/
/* Copyright (C) 1997-1999 Texas Instruments Incorporated. */
/* All Rights Reserved */
/*============================================================================*/
#define Qpt 15
#define Qr 16384
#define Qs 32767
#include <stdio.h>
#include <stdlib.h>
#pragma CODE_SECTION(wave_horz_c, ".text:intrinsic");
static const char Copyright[] = "Copyright (C) 1999 Texas Instruments "
"Incorporated. All Rights Reserved.";
void wave_horz_c
(
const short *restrict iptr, /* Row of input pixels */
const short *restrict qmf, /* Low-pass QMF filter */
const short *restrict filter, /* High-pass QMF filter */
short *restrict optr, /* Row of output data */
int ish_x_dim /* Length of input. */
)
{
const int M = 8;
int i, iters;
short res_h, res_l;
int prod_h10, prod_h32, prod_h54, prod_h76;
int prod_l10, prod_l32, prod_l54, prod_l76;
int sum_h, sum_l;
int h10, h32, h54, h76;
int l10, l32, l54, l76;
int x10, x32, x54, x76;
int h01, h23, h45, h67;
double hf_val0, hf_val1;
double lf_val0, lf_val1;
int p;
int offset;
const short *restrict xstart;
short *restrict yptr_h, *restrict yptr_l0, *restrict yptr_l1;
const double *restrict lf_ptr, *restrict hf_ptr;
const int *restrict xiptr;
/*---------------------------------------------------------------------*/
/* Cast filters into double pointers and perform double word loads. */
/* Obtain low and high halves of the double words for the different */
/* filter coefficients. However in order to achieve the multiply and */
/* accumulate capability using dotp2 's the filter coefficients need */
/* to be reversed in order, since the high pass filter computes: */
/* x0h7 + x1h6 + x2h5 + x3h4 + x4h3 + x5h2 + x6h1 + x7h0 */
/* This is done using the _packlh2 that accomplishes the swap2 */
/* ility, that is being sought after. The low pass filters are fine */
/* in the sense that their order need not be reversed. */
/*---------------------------------------------------------------------*/
hf_ptr = (const double *) filter;
lf_ptr = (const double *) qmf;
hf_val0 = hf_ptr[0];
hf_val1 = hf_ptr[1];
lf_val0 = lf_ptr[0];
lf_val1 = lf_ptr[1];
h10 = (int) _lo(hf_val0);
h32 = (int) _hi(hf_val0);
h54 = (int) _lo(hf_val1);
h76 = (int) _hi(hf_val1);
h01 = _packlh2(h10, h10);
h23 = _packlh2(h32, h32);
h45 = _packlh2(h54, h54);
h67 = _packlh2(h76, h76);
l10 = (int) _lo(lf_val0);
l32 = (int) _hi(lf_val0);
l54 = (int) _lo(lf_val1);
l76 = (int) _hi(lf_val1);
/*-----------------------------------------------------------------------*/
/* The number of loop iterations is x_dim >> 1 and for each iteration 2 */
/* outputs one for the low-pass and one for the high pass will be produ */
/* ced. Pre-read the last 6 values of the input line into registers so */
/* that the sependency that arises from checking pointer updates and */
/* wrapping around need not be done. Data reuse is achieved by issuing */
/* moves within the kernel. Because of this the input data reads now */
/* start from the beginning of the input line in the loop. Predication */
/* register is used to switch the location where the output low-pass */
/* samples are stored. The high pass output pointer need not be switched */
/* though. */
/*-----------------------------------------------------------------------*/
iters = ish_x_dim >> 1 ;
offset = (ish_x_dim - M) + 2;
xstart = iptr + offset;
yptr_h = optr + (ish_x_dim >> 1);
yptr_l0 = optr + ( offset >> 1);
yptr_l1 = optr;
xiptr = (const int *) xstart;
x10 = *xiptr++;
x32 = *xiptr++;
x54 = *xiptr++;
/*-----------------------------------------------------------------------*/
/* Re-allign input pointer to point to the beginning of the input line */
/* and cast as an int pointer to perform word wide loads. The _dotp2 */
/* instruction computes the various partial products and sums these */
/* up together to form the high pass output and low pass output sam- */
/* ples. These are shifted by 15 because of Q15 math and stored as */
/* low pass and high pass outputs. */
/* */
/* The foll: are the intermediate results being computed */
/* */
/* High Pass: */
/* prod_h0 = x0 * h7 prod_h1 = x1 * h6 */
/* prod_h2 = x2 * h5 prod_h3 = x3 * h4 */
/* prod_h4 = x4 * h3 prod_h5 = x5 * h2 */
/* prod_h6 = x6 * h1 prod_h7 = x7 * h0 */
/* */
/* Low Pass: */
/* prod_l0 = x0 * g0 prod_l1 = x1 * g1 */
/* prod_l2 = x2 * g2 prod_l3 = x3 * g3 */
/* prod_l4 = x4 * g4 prod_l5 = x5 * g5 */
/* prod_l6 = x6 * g6 prod_l7 = x7 * g7 */
/* */
/* In addition note the use of p which serves as a pedication register */
/* to switch pointers to the low pass filter. The first three low-pass */
/* samples that are computed actually need to be stored at the end of */
/* the low-pass array, after which samples need to be stored at the */
/* beginning of the array. The predication register p accomplishes this. */
/* */
/*-----------------------------------------------------------------------*/
xiptr = (int *) iptr;
p = 3;
for(i = iters; i > 0; i--)
{
sum_l = Qr;
sum_h = Qr;
x76 = *xiptr++;
prod_h10 = _dotp2(x10, h67);
prod_h32 = _dotp2(x32, h45);
prod_h54 = _dotp2(x54, h23);
prod_h76 = _dotp2(x76, h01);
prod_l10 = _dotp2(x10, l10);
prod_l32 = _dotp2(x32, l32);
prod_l54 = _dotp2(x54, l54);
prod_l76 = _dotp2(x76, l76);
sum_h += prod_h10 + prod_h32 + prod_h54 + prod_h76;
sum_l += prod_l10 + prod_l32 + prod_l54 + prod_l76;
x10 = x32;
x32 = x54;
x54 = x76;
res_h = (sum_h >> Qpt);
res_l = (sum_l >> Qpt);
*yptr_h++ = res_h;
if (p) *yptr_l0++ = res_l;
if (!p) *yptr_l1++ = res_l;
if (p) p--;
}
}
/*============================================================================*/
/* Copyright (C) 1997-1999 Texas Instruments Incorporated. */
/* All Rights Reserved */
/*============================================================================*/