You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

496 lines
30 KiB

;* ======================================================================== *;
;* TEXAS INSTRUMENTS, INC. *;
;* *;
;* IMGLIB DSP Image/Video Processing Library *;
;* *;
;* Release: Revision 1.04b *;
;* CVS Revision: 1.11 Sun Sep 29 03:32:31 2002 (UTC) *;
;* Snapshot date: 23-Oct-2003 *;
;* *;
;* This library contains proprietary intellectual property of Texas *;
;* Instruments, Inc. The library and its source code are protected by *;
;* various copyrights, and portions may also be protected by patents or *;
;* other legal protections. *;
;* *;
;* This software is licensed for use with Texas Instruments TMS320 *;
;* family DSPs. This license was provided to you prior to installing *;
;* the software. You may review this license by consulting the file *;
;* TI_license.PDF which accompanies the files in this library. *;
;* ------------------------------------------------------------------------ *;
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
;* All Rights Reserved. *;
;* ======================================================================== *;
;* ======================================================================== *;
;* Assembler compatibility shim for assembling 4.30 and later code on *;
;* tools prior to 4.30. *;
;* ======================================================================== *;
; Capture the assembler version in $asmver when the tools define
; .ASSEMBLER_VERSION; otherwise use 0 so that the version test below selects
; the compatibility definitions.
.if $isdefed(".ASSEMBLER_VERSION")
.asg .ASSEMBLER_VERSION, $asmver
.else
.asg 0, $asmver
.endif
; For tools older than 4.30, alias the newer call/return mnemonics to the
; plain branch instructions they stand for.
.if ($asmver < 430)
.asg B, CALL ; Function Call
.asg B, RET ; Return from a Function
.asg B, CALLRET ; Function call with Call / Ret chaining.
; The BNOP-based forms are only aliased when assembling for the C6400.
.if .TMS320C6400
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
.endif
; The two aliases below substitute an empty string, so .asmfunc/.endasmfunc
; lines expand to nothing on these older tools.
.asg , .asmfunc ; .func equivalent for hand-assembly code
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
.endif
;* ======================================================================== *;
;* End of assembler compatibility shim. *;
;* ======================================================================== *;
* ========================================================================= *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* IMG_wave_horz : 1D Wavelet Transform *
* *
* REVISION DATE *
* 21-Jan-1999 *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void IMG_wave_horz *
* ( *
* const short *restrict in_data, /* Row of input pixels */ *
* const short *restrict qmf, /* Low-pass QMF filter */ *
* const short *restrict mqmf, /* High-pass QMF filter */ *
* short *restrict out_data, /* Row of output data */ *
* int cols /* Length of input. */ *
* ); *
* *
* DESCRIPTION *
* This kernel performs a 1D Periodic Orthogonal Wavelet *
* decomposition. This also performs the row decomposition in a *
* 2D wavelet transform. An input signal x[n] is low pass and *
* high pass filtered and decimated by two. This results in a *
* reference signal r1[n] which is the decimated output obtained *
* by dropping the odd samples of the low pass filtered output and *
* a detail signal d[n] obtained by dropping the odd samples of *
* the high-pass output. A circular convolution algorithm is *
* implemented and hence the wavelet transform is periodic. The *
* reference signal and the detail signal are half the size of the *
* original signal. The reference signal may then be iterated *
* again to perform another scale of multi-resolution analysis. *
* *
* TECHNIQUES *
* The main idea in optimizing the code is to issue one set of *
* reads to the x array and to perform the low-pass and high-pass *
* filtering operations together, to maximize the number of *
* multiplies. The last 6 elements of the low-pass filter and the *
* first 6 elements of the high-pass filter use the same input. *
* This is used to appropriately change the output pointer to the *
* low-pass filter after 6 iterations. However, for the first six *
* iterations pointer wrap-around can occur and hence this creates *
* a dependency. Pre-reading those 6 values outside the loop *
* prevents the checks that introduce this dependency. In addition *
* the input data is read as word wide quantities and the low-pass *
* and high-pass filter coefficients are stored in registers *
* allowing for the input loop to be completely unrolled. Thus *
* the assembly code has only one loop. A predication register is *
* used to reset the low-pass output pointer after three *
* iterations. The merging of the loops in this fashion allows for *
* the maximum number of multiplies with the minimum number of *
* reads. *
* *
* ASSUMPTIONS *
* This kernel assumes that the # of filter taps for the qmf and *
* mqmf is 8. *
* *
* Both the filters are assumed to be double-word aligned and have *
* 8 taps. *
* *
* The input line is assumed to be word aligned so that LDWs *
* may be performed. *
* *
* This code assumes that filter coefficients are maintained as *
* shorts in Q15 format. *
* *
* It also assumes that input data is an array of shorts (16 bit) *
* (The input is assumed to be an array of shorts to allow for *
* re-using this kernel to perform Multi Resolution Analysis as *
* the output of this code will feedback again as input in the *
* next stage.) *
* *
* Since the transform is a dyadic wavelet cols should be a power *
* of 2. Cols must also be >=8. *
* *
* *
* MEMORY NOTE *
* This code has no bank conflicts. *
* *
* This code is ENDIAN Neutral. *
* *
* *
* NOTES *
* This code masks interrupts for nearly its entire duration. As *
* a result, the code is interrupt tolerant but not *
* interruptible. *
* *
* This code can implement the Daubechies D4 filterbank for *
* analysis with 4 vanishing moments. The length of the analyzing *
* low-pass and high pass filters is 8 in that case. *
* *
* C CODE *
* *
* This is the C equivalent of the assembly code without restrictions: *
* Note that the assembly code is hand optimized and restrictions *
* apply as noted under "ASSUMPTIONS". *
* *
* void IMG_wave_horz *
* ( *
* const short *restrict in_data, /* Row of input pixels */ *
* const short *restrict qmf, /* Low-pass QMF filter */ *
* const short *restrict mqmf, /* High-pass QMF filter */ *
* short *restrict out_data, /* Row of output data */ *
* int cols /* Length of input. */ *
* ) *
* *
* { *
* int i, res, iters; *
* int j, sum, prod; *
* short *xptr = in_data; *
* short *yptr = out_data; *
* short *x_end = &in_data[cols - 1]; *
* short xdata, hdata; *
* short *xstart; *
* short *filt_ptr; *
* int M = 8; *
* *
* /* ------------------------------------------------- */ *
* /* Set our loop trip count and starting x posn. */ *
* /* 'xstart' is used in the high-pass filter loop. */ *
* /* ------------------------------------------------- */ *
* iters = cols; *
* xstart = in_data + (cols - M) + 2; *
* *
* /* ------------------------------------------------- */ *
* /* Low pass filter. Iterate for cols/2 iterations */ *
* /* generating cols/2 low pass sample points with */ *
* /* the low-pass quadrature mirror filter. */ *
* /* ------------------------------------------------- */ *
* for (i = 0; i < iters; i += 2) *
* { *
* /* --------------------------------------------- */ *
* /* Initialize our sum to the rounding value */ *
* /* and reset our pointer. */ *
* /* --------------------------------------------- */ *
* sum = Qr; *
* xptr = in_data + i; *
* *
* /* --------------------------------------------- */ *
* /* Iterate over the taps in our QMF. */ *
* /* --------------------------------------------- */ *
* for (j = 0; j < M; j++) *
* { *
* xdata = *xptr++; *
* hdata = qmf[j]; *
* prod = xdata * hdata; *
* sum += prod; *
* if (xptr > x_end) xptr = in_data; *
* } *
* *
* /* --------------------------------------------- */ *
* /* Adjust the Qpt of our sum and store result. */ *
* /* --------------------------------------------- */ *
* res = (sum >> Qpt); *
* *out_data++ = res; *
* } *
* *
* *
* /* ------------------------------------------------- */ *
* /* High pass filter. Iterate for cols/2 iters */ *
* /* generating cols/2 high pass sample points with */ *
* /* the high-pass quadrature mirror filter. */ *
* /* ------------------------------------------------- */ *
* for (i = 0; i < iters ; i+=2) *
* { *
* /* --------------------------------------------- */ *
* /* Initialize our sum and filter pointer. */ *
* /* --------------------------------------------- */ *
* sum = Qr; *
* filt_ptr = mqmf + (M - 1); *
* *
* /* --------------------------------------------- */ *
* /* Set up our data pointer. This is slightly */ *
* /* more complicated due to how the data wraps */ *
* /* around the edge of the buffer. */ *
* /* --------------------------------------------- */ *
* xptr = xstart; *
* xstart += 2; *
* if (xstart > x_end) xstart = in_data; *
* *
* /* --------------------------------------------- */ *
* /* Iterate over the taps in our QMF. */ *
* /* --------------------------------------------- */ *
* for ( j = 0; j < M; j++) *
* { *
* xdata = *xptr++; *
* hdata = *filt_ptr--; *
* prod = xdata * hdata; *
* if (xptr > x_end) xptr = in_data; *
* sum += prod; *
* } *
* *
* /* --------------------------------------------- */ *
* /* Adjust the Qpt of our sum and store result. */ *
* /* --------------------------------------------- */ *
* res = (sum >> Qpt); *
* *out_data++ = res; *
* } *
* } *
* *
* CYCLES *
* cycles = cols * 2 + 25. *
* *
* For cols = 256, cycles = 537. *
* For cols = 512, cycles = 1049. *
* *
* CODESIZE *
* 360 bytes *
* *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
; ----------------------------------------------------------------------------
; _IMG_wave_horz: one row of an 8-tap periodic wavelet decomposition.
; Low-pass results are stored to the first half of the output row and
; high-pass results to the second half (yptr_h = out_data + cols/2 shorts).
; Each result is (sum of 8 products + Qr) >> 15.
;
; Incoming registers (named by the .asg lines below):
; A4 = in_data, B4 = qmf, A6 = mqmf, B6 = out_data, A8 = cols, B3 = return.
; NOTE(review): the mapping of registers to C parameters is presumed to
; follow the C6000 calling convention -- confirm against the compiler guide.
;
; Interrupts are disabled (CSR bit 0 cleared) from the setup code until the
; final delay slot of the return, where the saved CSR is written back.
; ----------------------------------------------------------------------------
.sect ".text:_wave_horz"
.global _IMG_wave_horz
_IMG_wave_horz:
; Rounding constant added to every sum before the >> 15.
Qr .set 16384
; Number of filter taps.
M .set 8
; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
; Several physical registers carry more than one name here (for example A6 is
; both A_filter and A_qr, A7 is both A_offset and A_x76, A8 is both
; A_ish_x_dim and A_yptr_h); each name is only valid for part of the setup.
.asg A4, A_iptr
.asg B4, B_qmf
.asg A6, A_filter
.asg B6, B_yptr_l1
.asg A8, A_ish_x_dim
.asg A6, A_qr
.asg B20, B_iptr
.asg B7, B_h67
.asg B16, B_h45
.asg A5, A_h23
.asg A4, A_h01
.asg B4, B_l10
.asg B5, B_l32
.asg A2, A_l54
.asg A3, A_l76
.asg B19, B_x10
.asg B18, B_x32
.asg A9, A_x54
.asg A8, A_yptr_h
.asg B17, B_yptr_l0
.asg B6, B_yptr_l1
.asg A0, A_i
.asg B0, B_p
.asg A17, A_h32
.asg A16, A_h10
.asg B9, B_h76
.asg B8, B_h54
.asg A7, A_offset
.asg A1, A_xiptr
.asg B2, B_xiptr
.asg A5, A_optr
.asg A7, A_x76
.asg B21, B_prod_l10
.asg B29, B_csr
.asg B28, B_no_gie
.asg A0, A_qmf
; ============================================================================
; ---- Setup: save CSR, load both filters, pre-read the last 6 input samples --
SUB .L1 A_ish_x_dim, M - 2, A_offset ; offset = cols - (M - 2) = cols - 6
|| MVC .S2 CSR, B_csr ; Save the caller's CSR
|| LDDW .D1T2 *A_filter[1], B_h76:B_h54 ; High-pass taps 4..7
LDDW .D1T1 *A_filter[0], A_h32:A_h10 ; High-pass taps 0..3
|| AND .L2 B_csr, -2, B_no_gie ; CSR copy with bit 0 (GIE) cleared
|| MV .S2X A_iptr, B_iptr ; B-side copy of the input pointer
ADDAH .D1 A_iptr, A_offset, A_xiptr ; xiptr = &in_data[cols - 6]
|| MVC .S2 B_no_gie, CSR ; Disable interrupts
|| LDW .D2T1 *B_iptr++, A_x76 ; x76 = first input word (wrap-around pair)
; A_x76 and A_offset are both A7. A_offset is read again by the ADD that forms
; B_yptr_l0 a few packets below, after this load has been issued.
; NOTE(review): this relies on the load's delay slots keeping the old A7
; value visible until then -- confirm the timing before rescheduling anything.
ADD .L2X A_xiptr, 4, B_xiptr ; B_xiptr = A_xiptr + 4 bytes = &in_data[cols - 4]
|| LDW .D1T2 *A_xiptr++[2], B_x10 ; x10 = in_data[cols-6 .. cols-5]; xiptr += 2 words
|| MV .S1X B_qmf, A_qmf ; A-side copy of the low-pass filter pointer
LDDW .D1T1 *A_qmf[1], A_l76:A_l54 ; Low-pass taps 4..7
|| LDDW .D2T2 *B_qmf[0], B_l32:B_l10 ; Low-pass taps 0..3
LDW .D1T1 *A_xiptr++[2], A_x54 ; x54 = in_data[cols-2 .. cols-1]
|| LDW .D2T2 *B_xiptr++[2], B_x32 ; x32 = in_data[cols-4 .. cols-3]
|| MVK .S1 Qr, A_qr ; A_qr = rounding constant
SHRU .S1 A_ish_x_dim, 1, A_i ; i = cols >> 1
|| MV .L1X B_yptr_l1, A_optr ; Copy of the output pointer
|| MVK .S2 3, B_p ; First 3 low-pass stores go through yptr_l0
|| ADD .L2X B_yptr_l1, A_offset, B_yptr_l0 ; yptr_l0 = out_data + (cols-6) bytes = &out_data[cols/2 - 3]
ADDAH .D1 A_optr, A_i, A_yptr_h ; yptr_h = &out_data[cols >> 1]
; Swap the halfwords of each high-pass coefficient pair so the taps line up
; in reversed order against the input pairs in the DOTP2s below.
PACKLH2 .S1 A_h10, A_h10, A_h01 ; h0:h1
|| PACKLH2 .L1 A_h32, A_h32, A_h23 ; h2:h3
|| PACKLH2 .L2 B_h54, B_h54, B_h45 ; h4:h5
|| PACKLH2 .S2 B_h76, B_h76, B_h67 ; h6:h7
; ====================== SYMBOLIC REGISTER ASSIGNMENTS =======================
; Names used by the loop. Some share a register (B24 is B_sum_h and B_res_hi,
; A16 is A_tmp_l and A_res_low, A19 is A_sum_l and A_prod_l54, ...), so the
; instruction order inside the kernel matters.
.asg A6, A_qr
.asg B20, B_iptr
.asg B7, B_h67
.asg B16, B_h45
.asg A5, A_h23
.asg A4, A_h01
.asg B4, B_l10
.asg B5, B_l32
.asg A2, A_l54
.asg A3, A_l76
.asg B19, B_x10
.asg B18, B_x32
.asg A9, A_x54
.asg A8, A_yptr_h
.asg B17, B_yptr_l0
.asg B6, B_yptr_l1
.asg A0, A_i
.asg B0, B_p
.asg B24, B_sum_h
.asg A19, A_sum_l
.asg A7, A_x76
.asg B9, B_prod_h10
.asg B22, B_prod_h32
.asg A17, A_prod_h54
.asg A18, A_prod_h76
.asg B21, B_prod_l10
.asg B23, B_prod_l32
.asg A19, A_prod_l54
.asg A18, A_prod_l76
.asg B21, B_tmph0
.asg A17, A_tmph1
.asg B22, B_tmpl0
.asg A18, A_tmpl1
.asg B23, B_tmp_h
.asg A16, A_tmp_l
.asg B24, B_res_hi
.asg A16, A_res_low
.asg B3, B_return
.asg A1, A_p
; ============================================================================
; START:
; ============================ PIPE LOOP PROLOG ==============================
; The four branches below target label+offset addresses inside the kernel.
; NOTE(review): with 4-byte instructions, +4/+8/+12 would enter a kernel
; packet past its first 1/2/3 instructions, i.e. past the stores and the
; sum_h shift/accumulate whose inputs do not exist yet during pipe fill --
; confirm against the CPU reference before touching kernel instruction order.
; PROLOG:
LDW .D2T1 *B_iptr++, A_x76 ; Load x76
|| MVK .S1 1, A_p ; A_p = 1 (only read by the predicated SUB in L_2)
|| DOTP2 .M2 B_x10, B_l10, B_prod_l10 ; x0l0 + x1l1
|| B .S2 L_3 + 8 ; enter L_3 at its 3rd instruction
DOTP2 .M1 A_x76, A_l76, A_prod_l76 ; x6l6 + x7l7
|| DOTP2 .M2 B_x32, B_l32, B_prod_l32 ; x2l2 + x3l3
|| B .S2 L_4 + 12 ; enter L_4 at its 4th instruction
MV .L2 B_x32, B_x10 ; x10 = x32
|| DOTP2 .M1 A_x54, A_l54, A_prod_l54 ; x4l4 + x5l5
|| DOTP2 .M2 B_x10, B_h67, B_prod_h10 ; x0h7 + x1h6
|| B .S2 L_1 + 4 ; enter L_1 at its 2nd instruction
DOTP2 .M1 A_x54, A_h23, A_prod_h54 ; x5h2 + x4h3
|| DOTP2 .M2 B_x10, B_l10, B_prod_l10 ; x0l0 + x1l1
|| SUB .L1 A_i, 3, A_i ; Trip count for BDEC = (cols >> 1) - 3
|| B .S2 L_2 + 4 ; enter L_2 at its 2nd instruction
MV .D1 A_x76, A_x54 ; x54 = x76
|| MV .L2X A_x54, B_x32 ; x32 = x54
|| DOTP2 .M1 A_x76, A_h01, A_prod_h76 ; x7h0 + x6h1
|| DOTP2 .M2 B_x32, B_h45, B_prod_h32 ; x3h4 + x2h5
|| LDW .D2T1 *B_iptr++, A_x76 ; Load x76
ADD .L2 B_prod_l10, B_prod_l32, B_tmpl0 ; l10 + l32
|| DOTP2 .M1 A_x76, A_l76, A_prod_l76 ; x6l6 + x7l7
|| DOTP2 .M2 B_x32, B_l32, B_prod_l32 ; x2l2 + x3l3
; ============================ PIPE LOOP KERNEL ==============================
; Four execute packets per pass. Each pass loads one new input word, slides
; the x10/x32/x54/x76 window along by one word, and stores one high-pass and
; one low-pass result.
LOOP:
L_1:
SHR .S2 B_sum_h, 15, B_res_hi ; >> 15
|| ADD .L1 A_qr, A_tmp_l, A_sum_l ; += tmp_l
|| ADD .S1 A_prod_h54, A_prod_h76, A_tmph1 ; h54 + h76
|| MV .D1 A_x76, A_x54 ; x54 = x76
|| MV .L2X A_x54, B_x32 ; x32 = x54
|| DOTP2 .M1 A_x76, A_h01, A_prod_h76 ; x7h0 + x6h1
|| DOTP2 .M2 B_x32, B_h45, B_prod_h32 ; x3h4 + x2h5
|| LDW .D2T1 *B_iptr++, A_x76 ; Load x76
L_2:
STH .D1T2 B_res_hi, *A_yptr_h++ ; *y_hp
|| SHR .S1 A_sum_l, 15, A_res_low ; >> 15
|| ADD .S2 B_prod_h10, B_prod_h32, B_tmph0 ; h10 + h32
|| ADD .L2 B_prod_l10, B_prod_l32, B_tmpl0 ; l10 + l32
|| DOTP2 .M1 A_x76, A_l76, A_prod_l76 ; x6l6 + x7l7
|| DOTP2 .M2 B_x32, B_l32, B_prod_l32 ; x2l2 + x3l3
||[A_p] SUB .L1 A_p, 1, A_p ; A_p counts down to 0; not read by any other visible instruction
L_3:
; Once B_p has reached 0, low-pass results are stored through yptr_l1.
[!B_p]STH .D2T1 A_res_low, *B_yptr_l1++ ; Store *y_lp1
|| ADD .L2X B_tmph0, A_tmph1, B_tmp_h ; tmp_h
|| BDEC .S1 LOOP, A_i ; if (i) B LOOP
|| ADD .D1 A_prod_l54, A_prod_l76, A_tmpl1 ; l54 + l76
|| MV .S2 B_x32, B_x10 ; x10 = x32
|| DOTP2 .M1 A_x54, A_l54, A_prod_l54 ; x4l4 + x5l5
|| DOTP2 .M2 B_x10, B_h67, B_prod_h10 ; x0h7 + x1h6
L_4:
; While B_p is non-zero (it starts at 3), low-pass results are stored through
; yptr_l0, which was set up as &out_data[cols/2 - 3].
[ B_p]SUB .L2 B_p, 1, B_p ; pred.for LP
||[ B_p]STH .D2T1 A_res_low, *B_yptr_l0++ ; Store *y_lp0
|| ADD .S2X A_qr, B_tmp_h, B_sum_h ; += tmp_h
|| ADD .D1X A_tmpl1, B_tmpl0, A_tmp_l ; tmp_l
|| DOTP2 .M1 A_x54, A_h23, A_prod_h54 ; x5h2 + x4h3
|| DOTP2 .M2 B_x10, B_l10, B_prod_l10 ; x0l0 + x1l1
; ============================ PIPE LOOP EPILOG ==============================
; Drain the pipeline: finish and store the results still in flight. The return
; branch is issued in the first packet, so the five packets after it sit in
; its delay slots; the saved CSR is written back in the last of them.
; EPILOG:
SHR .S1X B_sum_h, 15, A0 ; sum_h >> 15
|| ADD .L1 A_qr, A_tmp_l, A_sum_l ; sum_l+= tmp_l
|| ADD .D1 A_prod_h54, A_prod_h76, A_tmph1 ; h54 + h76
|| RET .S2 B_return ; Return to caller
STH .D1T1 A0, *A_yptr_h++ ; Store *y_hp
|| SHR .S1 A_sum_l, 15, A_res_low ; sum_l >> 15
|| ADD .L2 B_prod_h10, B_prod_h32, B_tmph0 ; h10 + h32
STH .D2T1 A_res_low, *B_yptr_l1++ ; Store *y_lp1
|| ADD .L2X B_tmph0, A_tmph1, B_tmp_h ; tmp_h
ADD .S2X A_qr, B_tmp_h, B_sum_h ; sum_h += tmp_h
SHR .S2 B_sum_h, 15, B_res_hi ; sum_h >> 15
STH .D1T2 B_res_hi, *A_yptr_h++ ; Store *y_hp
|| MVC .S2 B_csr, CSR ; Restore the caller's CSR
* ========================================================================= *
* End of file: img_wave_horz.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *