You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
495 lines
30 KiB
495 lines
30 KiB
;* ======================================================================== *;
|
|
;* TEXAS INSTRUMENTS, INC. *;
|
|
;* *;
|
|
;* IMGLIB DSP Image/Video Processing Library *;
|
|
;* *;
|
|
;* Release: Revision 1.04b *;
|
|
;* CVS Revision: 1.11 Sun Sep 29 03:32:31 2002 (UTC) *;
|
|
;* Snapshot date: 23-Oct-2003 *;
|
|
;* *;
|
|
;* This library contains proprietary intellectual property of Texas *;
|
|
;* Instruments, Inc. The library and its source code are protected by *;
|
|
;* various copyrights, and portions may also be protected by patents or *;
|
|
;* other legal protections. *;
|
|
;* *;
|
|
;* This software is licensed for use with Texas Instruments TMS320 *;
|
|
;* family DSPs. This license was provided to you prior to installing *;
|
|
;* the software. You may review this license by consulting the file *;
|
|
;* TI_license.PDF which accompanies the files in this library. *;
|
|
;* ------------------------------------------------------------------------ *;
|
|
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
|
|
;* All Rights Reserved. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
;* ======================================================================== *;
|
|
;* Assembler compatibility shim for assembling 4.30 and later code on *;
|
|
;* tools prior to 4.30. *;
|
|
;* ======================================================================== *;
|
|
|
|
; Record the assembler version in $asmver. Tools that do not define
; .ASSEMBLER_VERSION are treated as version 0, so they take the "< 430"
; branch below.
.if $isdefed(".ASSEMBLER_VERSION")
|
|
.asg .ASSEMBLER_VERSION, $asmver
|
|
.else
|
|
.asg 0, $asmver
|
|
.endif
|
|

|
; On tools older than 4.30, map the newer call/return mnemonics onto plain
; branches and turn the function-boundary directives into empty text.
.if ($asmver < 430)
|
|

|
.asg B, CALL ; Function call -> plain branch
|
|
.asg B, RET ; Return from a function -> plain branch
|
|
.asg B, CALLRET ; Function call with Call / Ret chaining -> plain branch
|
|

|
; The BNOP-based aliases are only defined when assembling for C64x.
.if .TMS320C6400
|
|
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
|
|
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
|
|
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
|
|
.endif
|
|

|
.asg , .asmfunc ; Substitute empty text for .asmfunc (.func equivalent for hand-assembly code)
|
|
.asg , .endasmfunc ; Substitute empty text for .endasmfunc (.endfunc equivalent for hand-assembly code)
|
|

|
.endif
|
|
|
|
;* ======================================================================== *;
|
|
;* End of assembler compatibility shim. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
* ========================================================================= *
|
|
* TEXAS INSTRUMENTS, INC. *
|
|
* *
|
|
* NAME *
|
|
* IMG_wave_horz : 1D Wavelet Transform *
|
|
* *
|
|
* REVISION DATE *
|
|
* 21-Jan-1999 *
|
|
* *
|
|
* USAGE *
|
|
* This routine is C-callable and can be called as: *
|
|
* *
|
|
* void IMG_wave_horz *
|
|
* ( *
|
|
* const short *restrict in_data, /* Row of input pixels */ *
|
|
* const short *restrict qmf, /* Low-pass QMF filter */ *
|
|
* const short *restrict mqmf, /* High-pass QMF filter */ *
|
|
* short *restrict out_data, /* Row of output data */ *
|
|
* int cols /* Length of input. */ *
|
|
* ); *
|
|
* *
|
|
* DESCRIPTION *
|
|
* This kernel performs a 1D Periodic Orthogonal Wavelet *
|
|
* decomposition. This also performs the row decomposition in a *
|
|
* 2D wavelet transform. An input signal x[n] is low pass and *
|
|
* high pass filtered and decimated by two. This results in a *
|
|
* reference signal r1[n] which is the decimated output obtained *
|
|
* by dropping the odd samples of the low pass filtered output and *
|
|
* a detail signal d[n] obtained by dropping the odd samples of *
|
|
* the high-pass output. A circular convolution algorithm is *
|
|
* implemented and hence the wavelet transform is periodic. The *
|
|
* reference signal and the detail signal are half the size of the *
|
|
* original signal. The reference signal may then be iterated *
|
|
* again to perform another scale of multi-resolution analysis. *
|
|
* *
|
|
* TECHNIQUES *
|
|
* The main idea in optimizing the code is to issue one set of *
|
|
* reads to the x array and to perform low-pass and high pass *
|
|
* filtering together and to perform the filtering operations *
|
|
* together to maximize the number of multiplies. The last 6 *
|
|
* elements of the low-pass filter and the first 6 elements of the *
|
|
* high pass filter use the same input. This is used to *
|
|
* appropriately change the output pointer to the low pass filter *
|
|
* after 6 iterations. However for the first six iterations *
|
|
* pointer wrap-around can occur and hence this creates a *
|
|
* dependency. Pre-reading those 6 values outside the array *
|
|
* prevents the checks that introduce this dependency. In addition *
|
|
* the input data is read as word wide quantities and the low-pass *
|
|
* and high-pass filter coefficients are stored in registers *
|
|
* allowing for the input loop to be completely unrolled. Thus *
|
|
* the assembly code has only one loop. A predication register is *
|
|
* used to reset the low-pass output pointer after three *
|
|
* iterations. The merging of the loops in this fashion allows for *
* the maximum number of multiplies with the minimum number of reads. *
|
|
* *
|
|
* ASSUMPTIONS *
|
|
* This kernel assumes that the # of filter taps for the qmf and *
|
|
* mqmf is 8. *
|
|
* *
|
|
* Both the filters are assumed to be double-word aligned and have *
|
|
* 8 taps. *
|
|
* *
|
|
* The input line is assumed to be word aligned so that LDWs *
|
|
* may be performed. *
|
|
* *
|
|
* This code assumes that filter coefficients are maintained as *
|
|
* shorts in Q15 format. *
|
|
* *
|
|
* It also assumes that input data is an array of shorts (16 bit) *
|
|
* (The input is assumed to be an array of shorts to allow for *
|
|
* re-using this kernel to perform Multi Resolution Analysis as *
|
|
* the output of this code will feedback again as input in the *
|
|
* next stage.) *
|
|
* *
|
|
* Since the transform is a dyadic wavelet cols should be a power *
|
|
* of 2. Cols must also be >=8. *
|
|
* *
|
|
* *
|
|
* MEMORY NOTE *
|
|
* This code has no bank conflicts. *
|
|
* *
|
|
* This code is ENDIAN Neutral. *
|
|
* *
|
|
* *
|
|
* NOTES *
|
|
* This code masks interrupts for nearly its entire duration. As *
|
|
* a result, the code is interrupt tolerant but not *
|
|
* interruptible. *
|
|
* *
|
|
* This code can implement the Daubechies D4 filterbank for *
|
|
* analysis with 4 vanishing moments. The length of the analyzing *
|
|
* low-pass and high pass filters is 8 in that case. *
|
|
* *
|
|
* C CODE *
|
|
* *
|
|
* This is the C equivalent of the assembly code without restrictions: *
|
|
* Note that the assembly code is hand optimized and restrictions *
|
|
* apply as noted under "ASSUMPTIONS". *
|
|
* *
|
|
* void IMG_wave_horz *
|
|
* ( *
|
|
* const short *restrict in_data, /* Row of input pixels */ *
|
|
* const short *restrict qmf, /* Low-pass QMF filter */ *
|
|
* const short *restrict mqmf, /* High-pass QMF filter */ *
|
|
* short *restrict out_data, /* Row of output data */ *
|
|
* int cols /* Length of input. */ *
|
|
* ) *
|
|
* *
|
|
* { *
|
|
* int i, res, iters; *
|
|
* int j, sum, prod; *
|
|
* short *xptr = in_data; *
|
|
* short *yptr = out_data; *
|
|
* short *x_end = &in_data[cols - 1]; *
|
|
* short xdata, hdata; *
|
|
* short *xstart; *
|
|
* short *filt_ptr; *
|
|
* int M = 8; *
|
|
* *
|
|
* /* ------------------------------------------------- */ *
|
|
* /* Set our loop trip count and starting x posn. */ *
|
|
* /* 'xstart' is used in the high-pass filter loop. */ *
|
|
* /* ------------------------------------------------- */ *
|
|
* iters = cols; *
|
|
* xstart = in_data + (cols - M) + 2; *
|
|
* *
|
|
* /* ------------------------------------------------- */ *
|
|
* /* Low pass filter. Iterate for cols/2 iterations */ *
|
|
* /* generating cols/2 low pass sample points with */ *
|
|
* /* the low-pass quadrature mirror filter. */ *
|
|
* /* ------------------------------------------------- */ *
|
|
* for (i = 0; i < iters; i += 2) *
|
|
* { *
|
|
* /* --------------------------------------------- */ *
|
|
* /* Initialize our sum to the rounding value */ *
|
|
* /* and reset our pointer. */ *
|
|
* /* --------------------------------------------- */ *
|
|
* sum = Qr; *
|
|
* xptr = in_data + i; *
|
|
* *
|
|
* /* --------------------------------------------- */ *
|
|
* /* Iterate over the taps in our QMF. */ *
|
|
* /* --------------------------------------------- */ *
|
|
* for (j = 0; j < M; j++) *
|
|
* { *
|
|
* xdata = *xptr++; *
|
|
* hdata = qmf[j]; *
|
|
* prod = xdata * hdata; *
|
|
* sum += prod; *
|
|
* if (xptr > x_end) xptr = in_data; *
|
|
* } *
|
|
* *
|
|
* /* --------------------------------------------- */ *
|
|
* /* Adjust the Qpt of our sum and store result. */ *
|
|
* /* --------------------------------------------- */ *
|
|
* res = (sum >> Qpt); *
|
|
* *out_data++ = res; *
|
|
* } *
|
|
* *
|
|
* *
|
|
* /* ------------------------------------------------- */ *
|
|
* /* High pass filter. Iterate for cols/2 iters */ *
|
|
* /* generating cols/2 high pass sample points with */ *
|
|
* /* the high-pass quadrature mirror filter. */ *
|
|
* /* ------------------------------------------------- */ *
|
|
* for (i = 0; i < iters ; i+=2) *
|
|
* { *
|
|
* /* --------------------------------------------- */ *
|
|
* /* Initialize our sum and filter pointer. */ *
|
|
* /* --------------------------------------------- */ *
|
|
* sum = Qr; *
|
|
* filt_ptr = mqmf + (M - 1); *
|
|
* *
|
|
* /* --------------------------------------------- */ *
|
|
* /* Set up our data pointer. This is slightly */ *
|
|
* /* more complicated due to how the data wraps */ *
|
|
* /* around the edge of the buffer. */ *
|
|
* /* --------------------------------------------- */ *
|
|
* xptr = xstart; *
|
|
* xstart += 2; *
|
|
* if (xstart > x_end) xstart = in_data; *
|
|
* *
|
|
* /* --------------------------------------------- */ *
|
|
* /* Iterate over the taps in our QMF. */ *
|
|
* /* --------------------------------------------- */ *
|
|
* for ( j = 0; j < M; j++) *
|
|
* { *
|
|
* xdata = *xptr++; *
|
|
* hdata = *filt_ptr--; *
|
|
* prod = xdata * hdata; *
|
|
* if (xptr > x_end) xptr = in_data; *
|
|
* sum += prod; *
|
|
* } *
|
|
* *
|
|
* /* --------------------------------------------- */ *
|
|
* /* Adjust the Qpt of our sum and store result. */ *
|
|
* /* --------------------------------------------- */ *
|
|
* res = (sum >> Qpt); *
|
|
* *out_data++ = res; *
|
|
* } *
|
|
* } *
|
|
* *
|
|
* CYCLES *
|
|
* cycles = cols * 2 + 25. *
|
|
* *
|
|
* For cols = 256, cycles = 537. *
|
|
* For cols = 512, cycles = 1049. *
|
|
* *
|
|
* CODESIZE *
|
|
* 360 bytes *
|
|
* *
|
|
* ------------------------------------------------------------------------- *
|
|
* Copyright (c) 2003 Texas Instruments, Incorporated. *
|
|
* All Rights Reserved. *
|
|
* ========================================================================= *
|
|
|
|
|
|
.sect ".text:_wave_horz"
|
|
.global _IMG_wave_horz
|
|
_IMG_wave_horz:
|
|

|

|
; Entry point and setup phase. The file header documents the C prototype
;   IMG_wave_horz(in_data, qmf, mqmf, out_data, cols).
; NOTE(review): the setup code reads its inputs from A4, B4, A6, B6 and A8
; (named A_iptr, B_qmf, A_filter, B_yptr_l1 and A_ish_x_dim below) --
; presumably the five C arguments in order; confirm against the C6000
; calling convention.
Qr .set 16384 ; Rounding constant added to each sum before the ">> 15"
|
|
M .set 8 ; Number of filter taps (see ASSUMPTIONS in the header)
|
|
; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
; Setup-phase register names. Several physical registers carry two names:
;   A4: A_iptr / A_h01      A6: A_filter / A_qr     A8: A_ish_x_dim / A_yptr_h
;   A5: A_h23  / A_optr     A7: A_offset / A_x76    A0: A_i / A_qmf
;   B4: B_qmf  / B_l10
; B_yptr_l1 is assigned to B6 twice in this list (same register both times).
|
|
.asg A4, A_iptr
|
|
.asg B4, B_qmf
|
|
.asg A6, A_filter
|
|
.asg B6, B_yptr_l1
|
|
.asg A8, A_ish_x_dim
|
|
.asg A6, A_qr
|
|
.asg B20, B_iptr
|
|
.asg B7, B_h67
|
|
.asg B16, B_h45
|
|
.asg A5, A_h23
|
|
.asg A4, A_h01
|
|
.asg B4, B_l10
|
|
.asg B5, B_l32
|
|
.asg A2, A_l54
|
|
.asg A3, A_l76
|
|
.asg B19, B_x10
|
|
.asg B18, B_x32
|
|
.asg A9, A_x54
|
|
.asg A8, A_yptr_h
|
|
.asg B17, B_yptr_l0
|
|
.asg B6, B_yptr_l1
|
|
.asg A0, A_i
|
|
.asg B0, B_p
|
|
.asg A17, A_h32
|
|
.asg A16, A_h10
|
|
.asg B9, B_h76
|
|
.asg B8, B_h54
|
|
.asg A7, A_offset
|
|
.asg A1, A_xiptr
|
|
.asg B2, B_xiptr
|
|
.asg A5, A_optr
|
|
.asg A7, A_x76
|
|
.asg B21, B_prod_l10
|
|
.asg B29, B_csr
|
|
.asg B28, B_no_gie
|
|
.asg A0, A_qmf
|
|
; ============================================================================
|
|

|
; Packet 1: compute the start offset, save CSR, begin loading the filter
; addressed by A_filter (upper double-word).
SUB .L1 A_ish_x_dim, M - 2, A_offset ; offset = x_dim - (M - 2)
|
|
|| MVC .S2 CSR, B_csr ; Save CSR; it is written back in the last epilog packet
|
|
|| LDDW .D1T2 *A_filter[1], B_h76:B_h54 ; High-pass filter, double-word 1
|
|

|
; Packet 2: load the lower double-word of the same filter, build a CSR
; value with bit 0 cleared, and copy the input pointer to the B side.
LDDW .D1T1 *A_filter[0], A_h32:A_h10 ; High-pass filter, double-word 0
|
|
|| AND .L2 B_csr, -2, B_no_gie ; Saved CSR with bit 0 cleared
|
|
|| MV .S2X A_iptr, B_iptr ; B-side copy of the input pointer
|
|

|
; Packet 3: form the second input pointer, write the masked CSR value, and
; issue the first input load through B_iptr.
ADDAH .D1 A_iptr, A_offset, A_xiptr ; xiptr = iptr + offset (halfword-scaled add)
|
|
|| MVC .S2 B_no_gie, CSR ; Disable interrupts (header: code masks interrupts)
|
|
|| LDW .D2T1 *B_iptr++, A_x76 ; x76
|
|

|
; Packet 4: derive the B-side pointer one word past A_xiptr, start the x10
; load, and copy the low-pass filter pointer to the A side.
ADD .L2X A_xiptr, 4, B_xiptr ; B_xiptr = A_xiptr + 4 bytes
|
|
|| LDW .D1T2 *A_xiptr++[2], B_x10 ; x10; pointer post-incremented by 2 words
|
|
|| MV .S1X B_qmf, A_qmf ; A-side copy of the low-pass filter pointer
|
|

|
; Packet 5: load both double-words of the low-pass filter in parallel.
LDDW .D1T1 *A_qmf[1], A_l76:A_l54 ; Low-pass filter, double-word 1
|
|
|| LDDW .D2T2 *B_qmf[0], B_l32:B_l10 ; Low-pass filter, double-word 0
|
|

|
; Packet 6: load the remaining two input words and set the rounding value.
; A_qr shares register A6 with A_filter, whose last use is in packet 2.
LDW .D1T1 *A_xiptr++[2], A_x54 ; x54
|
|
|| LDW .D2T2 *B_xiptr++[2], B_x32 ; x32
|
|
|| MVK .S1 Qr, A_qr ; A_qr = Qr
|
|

|
; Packet 7: set up the loop counter, the B_p countdown and the output
; pointers.
SHRU .S1 A_ish_x_dim, 1, A_i ; A_i = x_dim >> 1
|
|
|| MV .L1X B_yptr_l1, A_optr ; A-side copy of the output pointer
|
|
|| MVK .S2 3, B_p ; B_p = 3: selects B_yptr_l0 for low-pass stores while non-zero
|
|
|| ADD .L2X B_yptr_l1, A_offset, B_yptr_l0 ; yptr_l0 = yptr_l1 + offset (plain add, not halfword-scaled)
|
|

|
; Packet 8: high-pass output pointer.
ADDAH .D1 A_optr, A_i, A_yptr_h ; yptr_h = optr + (x_dim >> 1) halfwords
|
|

|
; Packet 9: build half-swapped copies of the four high-pass coefficient
; words (each PACKLH2 uses the same register for both sources).
PACKLH2 .S1 A_h10, A_h10, A_h01 ; h0:h1
|
|
|| PACKLH2 .L1 A_h32, A_h32, A_h23 ; h2:h3
|
|
|| PACKLH2 .L2 B_h54, B_h54, B_h45 ; h4:h5
|
|
|| PACKLH2 .S2 B_h76, B_h76, B_h67 ; h6:h7
|
|
|
|
|
|
; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
; Register names for the pipelined loop (prolog, kernel, epilog). Several
; names share one physical register:
;   A16: A_tmp_l / A_res_low      A17: A_prod_h54 / A_tmph1
;   A18: A_prod_h76 / A_prod_l76 / A_tmpl1
;   A19: A_sum_l / A_prod_l54     B21: B_prod_l10 / B_tmph0
;   B22: B_prod_h32 / B_tmpl0     B23: B_prod_l32 / B_tmp_h
;   B24: B_sum_h / B_res_hi
; NOTE(review): correctness of this sharing presumably depends on the exact
; packet schedule below; do not reorder instructions without re-checking it.
|
|
.asg A6, A_qr
|
|
.asg B20, B_iptr
|
|
.asg B7, B_h67
|
|
.asg B16, B_h45
|
|
.asg A5, A_h23
|
|
.asg A4, A_h01
|
|
.asg B4, B_l10
|
|
.asg B5, B_l32
|
|
.asg A2, A_l54
|
|
.asg A3, A_l76
|
|
.asg B19, B_x10
|
|
.asg B18, B_x32
|
|
.asg A9, A_x54
|
|
.asg A8, A_yptr_h
|
|
.asg B17, B_yptr_l0
|
|
.asg B6, B_yptr_l1
|
|
.asg A0, A_i
|
|
.asg B0, B_p
|
|
.asg B24, B_sum_h
|
|
.asg A19, A_sum_l
|
|
.asg A7, A_x76
|
|
.asg B9, B_prod_h10
|
|
.asg B22, B_prod_h32
|
|
.asg A17, A_prod_h54
|
|
.asg A18, A_prod_h76
|
|
.asg B21, B_prod_l10
|
|
.asg B23, B_prod_l32
|
|
.asg A19, A_prod_l54
|
|
.asg A18, A_prod_l76
|
|
.asg B21, B_tmph0
|
|
.asg A17, A_tmph1
|
|
.asg B22, B_tmpl0
|
|
.asg A18, A_tmpl1
|
|
.asg B23, B_tmp_h
|
|
.asg A16, A_tmp_l
|
|
.asg B24, B_res_hi
|
|
.asg A16, A_res_low
|
|
.asg B3, B_return
|
|
.asg A1, A_p
|
|
; ============================================================================
|
|
; START:
|
|
; ============================ PIPE LOOP PROLOG ==============================
|
|
; PROLOG:
; Six packets that start the loads and dot products ahead of the kernel.
; NOTE(review): each of the first four packets issues a branch to
; "kernel label + byte offset" (L_3 + 8, L_4 + 12, L_1 + 4, L_2 + 4). This
; looks like a way to enter those kernel packets part-way through, skipping
; their leading store/shift instructions while the pipeline is still
; filling -- confirm against the C64x branch and execute-packet rules.
|
|

|
LDW .D2T1 *B_iptr++, A_x76 ; Load x76
|
|
|| MVK .S1 1, A_p ; A_p = 1 (decremented under its own predicate at L_2)
|
|
|| DOTP2 .M2 B_x10, B_l10, B_prod_l10 ; x0l0 + x1l1
|
|
|| B .S2 L_3 + 8 ; Branch to 8 bytes past L_3
|
|

|
DOTP2 .M1 A_x76, A_l76, A_prod_l76 ; x6l6 + x7l7
|
|
|| DOTP2 .M2 B_x32, B_l32, B_prod_l32 ; x2l2 + x3l3
|
|
|| B .S2 L_4 + 12 ; Branch to 12 bytes past L_4
|
|

|
MV .L2 B_x32, B_x10 ; x10 = x32
|
|
|| DOTP2 .M1 A_x54, A_l54, A_prod_l54 ; x4l4 + x5l5
|
|
|| DOTP2 .M2 B_x10, B_h67, B_prod_h10 ; x0h7 + x1h6
|
|
|| B .S2 L_1 + 4 ; Branch to 4 bytes past L_1
|
|

|
DOTP2 .M1 A_x54, A_h23, A_prod_h54 ; x5h2 + x4h3
|
|
|| DOTP2 .M2 B_x10, B_l10, B_prod_l10 ; x0l0 + x1l1
|
|
|| SUB .L1 A_i, 3, A_i ; Loop counter adjustment: A_i -= 3
|
|
|| B .S2 L_2 + 4 ; Branch to 4 bytes past L_2
|
|

|
MV .D1 A_x76, A_x54 ; x54 = x76
|
|
|| MV .L2X A_x54, B_x32 ; x32 = x54
|
|
|| DOTP2 .M1 A_x76, A_h01, A_prod_h76 ; x7h0 + x6h1
|
|
|| DOTP2 .M2 B_x32, B_h45, B_prod_h32 ; x3h4 + x2h5
|
|
|| LDW .D2T1 *B_iptr++, A_x76 ; Load x76
|
|

|
ADD .L2 B_prod_l10, B_prod_l32, B_tmpl0 ; l10 + l32
|
|
|| DOTP2 .M1 A_x76, A_l76, A_prod_l76 ; x6l6 + x7l7
|
|
|| DOTP2 .M2 B_x32, B_l32, B_prod_l32 ; x2l2 + x3l3
|
|
|
|
; ============================ PIPE LOOP KERNEL ==============================
|
|
; Steady-state loop: four execute packets (L_1..L_4) per pass. Each pass
; stores one high-pass result through A_yptr_h (at L_2) and one low-pass
; result, through B_yptr_l0 while B_p is non-zero (at L_4) or through
; B_yptr_l1 once B_p is zero (at L_3). The L_n labels are also the targets
; of the "label + byte offset" branches issued in the prolog.
LOOP:
|
|

|
; L_1: shift out the high-pass result, round the low-pass sum, slide the
; input window by one word and fetch the next input word.
L_1:
|
|
SHR .S2 B_sum_h, 15, B_res_hi ; res_hi = sum_h >> 15
|
|
|| ADD .L1 A_qr, A_tmp_l, A_sum_l ; sum_l = Qr + tmp_l
|
|
|| ADD .S1 A_prod_h54, A_prod_h76, A_tmph1 ; h54 + h76
|
|
|| MV .D1 A_x76, A_x54 ; x54 = x76
|
|
|| MV .L2X A_x54, B_x32 ; x32 = x54
|
|
|| DOTP2 .M1 A_x76, A_h01, A_prod_h76 ; x7h0 + x6h1
|
|
|| DOTP2 .M2 B_x32, B_h45, B_prod_h32 ; x3h4 + x2h5
|
|
|| LDW .D2T1 *B_iptr++, A_x76 ; Load x76
|
|

|
; L_2: store the high-pass result and shift out the low-pass result.
L_2:
|
|
STH .D1T2 B_res_hi, *A_yptr_h++ ; Store *y_hp
|
|
|| SHR .S1 A_sum_l, 15, A_res_low ; res_low = sum_l >> 15
|
|
|| ADD .S2 B_prod_h10, B_prod_h32, B_tmph0 ; h10 + h32
|
|
|| ADD .L2 B_prod_l10, B_prod_l32, B_tmpl0 ; l10 + l32
|
|
|| DOTP2 .M1 A_x76, A_l76, A_prod_l76 ; x6l6 + x7l7
|
|
|| DOTP2 .M2 B_x32, B_l32, B_prod_l32 ; x2l2 + x3l3
|
|
||[A_p] SUB .L1 A_p, 1, A_p ; Decrement A_p while it is non-zero
|
|

|
; L_3: low-pass store through B_yptr_l1 (only once B_p has reached zero)
; and the loop-back branch.
L_3:
|
|
[!B_p]STH .D2T1 A_res_low, *B_yptr_l1++ ; Store *y_lp1
|
|
|| ADD .L2X B_tmph0, A_tmph1, B_tmp_h ; tmp_h
|
|
|| BDEC .S1 LOOP, A_i ; Branch to LOOP and decrement A_i (see BDEC in the ISA guide)
|
|
|| ADD .D1 A_prod_l54, A_prod_l76, A_tmpl1 ; l54 + l76
|
|
|| MV .S2 B_x32, B_x10 ; x10 = x32
|
|
|| DOTP2 .M1 A_x54, A_l54, A_prod_l54 ; x4l4 + x5l5
|
|
|| DOTP2 .M2 B_x10, B_h67, B_prod_h10 ; x0h7 + x1h6
|
|

|
; L_4: low-pass store through B_yptr_l0 while B_p is non-zero, with B_p
; counted down in the same packet; round the high-pass sum.
L_4:
|
|
[ B_p]SUB .L2 B_p, 1, B_p ; Count B_p down to zero (predicate for low-pass store target)
|
|
||[ B_p]STH .D2T1 A_res_low, *B_yptr_l0++ ; Store *y_lp0
|
|
|| ADD .S2X A_qr, B_tmp_h, B_sum_h ; sum_h = Qr + tmp_h
|
|
|| ADD .D1X A_tmpl1, B_tmpl0, A_tmp_l ; tmp_l
|
|
|| DOTP2 .M1 A_x54, A_h23, A_prod_h54 ; x5h2 + x4h3
|
|
|| DOTP2 .M2 B_x10, B_l10, B_prod_l10 ; x0l0 + x1l1
|
|
|
|
; ============================ PIPE LOOP EPILOG ==============================
|
|
; EPILOG:
; Drains the results still in flight after the last loop pass: two
; high-pass stores through A_yptr_h and one low-pass store through
; B_yptr_l1. The return branch is issued in the first packet and five more
; packets follow it; the last one writes the CSR value saved at entry back.
; NOTE(review): those five packets presumably fill the branch delay slots
; of the return -- confirm before adding or removing a packet here.
|
|
SHR .S1X B_sum_h, 15, A0 ; A0 = sum_h >> 15
|
|
|| ADD .L1 A_qr, A_tmp_l, A_sum_l ; sum_l = Qr + tmp_l
|
|
|| ADD .D1 A_prod_h54, A_prod_h76, A_tmph1 ; h54 + h76
|
|
|| RET .S2 B_return ; Return via B_return (B3)
|
|

|
STH .D1T1 A0, *A_yptr_h++ ; Store *y_hp
|
|
|| SHR .S1 A_sum_l, 15, A_res_low ; res_low = sum_l >> 15
|
|
|| ADD .L2 B_prod_h10, B_prod_h32, B_tmph0 ; h10 + h32
|
|

|
STH .D2T1 A_res_low, *B_yptr_l1++ ; Store *y_lp1
|
|
|| ADD .L2X B_tmph0, A_tmph1, B_tmp_h ; tmp_h
|
|

|
ADD .S2X A_qr, B_tmp_h, B_sum_h ; sum_h = Qr + tmp_h
|
|

|
SHR .S2 B_sum_h, 15, B_res_hi ; res_hi = sum_h >> 15
|
|

|
STH .D1T2 B_res_hi, *A_yptr_h++ ; Store *y_hp
|
|
|| MVC .S2 B_csr, CSR ; Restore the CSR value saved at entry
|
|
|
|
* ========================================================================= *
|
|
* End of file: img_wave_horz.asm *
|
|
* ------------------------------------------------------------------------- *
|
|
* Copyright (c) 2003 Texas Instruments, Incorporated. *
|
|
* All Rights Reserved. *
|
|
* ========================================================================= *
|
|
|