;* File: img_corr_3x3.asm                                                   *;
;* ======================================================================== *;
|
|
;* TEXAS INSTRUMENTS, INC. *;
|
|
;* *;
|
|
;* IMGLIB DSP Image/Video Processing Library *;
|
|
;* *;
|
|
;* Release: Revision 1.04b *;
|
|
;* CVS Revision: 1.4 Sun Sep 29 03:32:19 2002 (UTC) *;
|
|
;* Snapshot date: 23-Oct-2003 *;
|
|
;* *;
|
|
;* This library contains proprietary intellectual property of Texas *;
|
|
;* Instruments, Inc. The library and its source code are protected by *;
|
|
;* various copyrights, and portions may also be protected by patents or *;
|
|
;* other legal protections. *;
|
|
;* *;
|
|
;* This software is licensed for use with Texas Instruments TMS320 *;
|
|
;* family DSPs. This license was provided to you prior to installing *;
|
|
;* the software. You may review this license by consulting the file *;
|
|
;* TI_license.PDF which accompanies the files in this library. *;
|
|
;* ------------------------------------------------------------------------ *;
|
|
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
|
|
;* All Rights Reserved. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
;* ======================================================================== *;
|
|
;* Assembler compatibility shim for assembling 4.30 and later code on *;
|
|
;* tools prior to 4.30. *;
|
|
;* ======================================================================== *;
|
|
|
|
.if $isdefed(".ASSEMBLER_VERSION")
|
|
.asg .ASSEMBLER_VERSION, $asmver
|
|
.else
|
|
.asg 0, $asmver
|
|
.endif
|
|
|
|
.if ($asmver < 430)
|
|
|
|
.asg B, CALL ; Function Call
|
|
.asg B, RET ; Return from a Function
|
|
.asg B, CALLRET ; Function call with Call / Ret chaining.
|
|
|
|
.if .TMS320C6400
|
|
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
|
|
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
|
|
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
|
|
.endif
|
|
|
|
.asg , .asmfunc ; .func equivalent for hand-assembly code
|
|
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
|
|
|
|
.endif
|
|
|
|
;* ======================================================================== *;
|
|
;* End of assembler compatibility shim. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
* ========================================================================= *
|
|
* TEXAS INSTRUMENTS, INC. *
|
|
* *
|
|
* NAME *
|
|
* IMG_corr_3x3: 3x3 correlation for 8-bit data, 32-bit results *
|
|
* *
|
|
* REVISION DATE *
|
|
* 14-Mar-2002 *
|
|
* *
|
|
* USAGE *
|
|
* This routine is C-callable and can be called as: *
|
|
* *
|
|
* void IMG_corr_3x3 *
|
|
* ( *
|
|
* const unsigned char *i_data, // input image // *
|
|
* int *restrict o_data, // output image // *
|
|
* const unsigned char mask[3][3], // convolution mask // *
|
|
* int x_dim, // width of image // *
|
|
* int n_out // number of outputs // *
|
|
* ); *
|
|
* *
|
|
* DESCRIPTION *
|
|
* The correlation performs a point by point multiplication of the *
|
|
* 3 by 3 mask with the input image. The results of the nine *
|
|
* multiplications are then summed up together to produce a *
|
|
* correlation sum. This sum is then stored to the output array. *
|
|
* *
|
|
* The image mask to be correlated is typically part of the input *
|
|
* image and indicates the area of the best match between the *
|
|
* input image and mask. The mask is moved one column at a time, *
|
|
* advancing the mask over the portion of the row specified by *
|
|
* 'n_out'. When 'n_out' is larger than 'x_dim', multiple rows *
|
|
* will be processed. *
|
|
* *
|
|
* An application may call this kernel once per row to calculate *
|
|
* the correlation for an entire image: *
|
|
* *
|
|
* for (i = 0; i < rows; i++) *
|
|
* { *
|
|
* IMG_corr_3x3(&i_data[i * x_dim], &o_data[i * n_out], *
|
|
* mask, x_dim, n_out); *
|
|
* } *
|
|
* *
|
|
* Alternately, the kernel may be invoked for multiple rows at *
|
|
* a time, although the two outputs at the end of each row will *
|
|
* have meaningless values. For example: *
|
|
* *
|
|
* IMG_corr_3x3(i_data, o_data, mask, x_dim, 2 * x_dim); *
|
|
* *
|
|
* This will produce two rows of outputs into 'o_data'. The *
|
|
* outputs at locations o_data[x_dim - 2], o_data[x_dim - 1], *
|
|
* o_data[2*x_dim - 2] and o_data[2*x_dim - 1] will have *
|
|
* meaningless values. This is harmless, although the application *
|
|
* will have to account for this when interpreting the results. *
|
|
* *
|
|
* ASSUMPTIONS *
|
|
* The array pointed to by o_data does not alias with the array *
|
|
* pointed to by i_data or mask. *
|
|
* *
|
|
* The number of outputs 'n_out' must be a multiple of 8. In cases *
|
|
* where 'n_out' is not a multiple of 8, most applications can safely *
|
|
* round 'n_out' up to the next multiple of 8 and ignore the extra *
|
|
* outputs. This kernel does not round 'n_out' up for the user. *
|
|
* *
|
|
* NOTE *
|
|
* This kernel is fully interruptible. *
|
|
* *
|
|
* MEMORY NOTE *
|
|
* This kernel places no restrictions on the alignment of its input. *
|
|
* *
|
|
* No bank conflicts occur. *
|
|
* *
|
|
* This code assumes a LITTLE ENDIAN configuration. *
|
|
* *
|
|
* TECHNIQUES *
|
|
* The inner loops are unrolled completely, and the outer loop is *
|
|
* unrolled 8 times. *
|
|
* *
|
|
* We use 3 DOTPU4s to calculate the 3 rows of each output pixel. *
|
|
* We then accumulate the 3 DOTPU4s to a 32-bit result and store *
|
|
* them out. (Note that only 3 of every 4 8-bit MPYs in the DOTPU4 *
|
|
* are actually used. The fourth MPY is unused.) *
|
|
* *
|
|
* We use non-aligned loads and stores to avoid alignment issues. *
|
|
* *
|
|
* CYCLES *
|
|
* cycles = 1.5 * n_out + 22 *
|
|
* For n_out = 248, cycles = 394. *
|
|
* *
|
|
* This number includes 6 cycles of function call overhead. The *
|
|
* exact overhead will vary depending on compiler options used. *
|
|
* *
|
|
* CODESIZE *
|
|
* 296 bytes. *
|
|
* ------------------------------------------------------------------------- *
|
|
* Copyright (c) 2003 Texas Instruments, Incorporated. *
|
|
* All Rights Reserved. *
|
|
* ========================================================================= *
|
|
|
|
|
|
|
|
.sect ".text:_corr_3x3_32"
|
|
.global _IMG_corr_3x3
|
|
_IMG_corr_3x3:
|
|
|
|
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
|
|
.asg A4, A_row0
|
|
.asg B4, B_o_ptr
|
|
.asg A6, A_mask
|
|
.asg B6, B_x_dim
|
|
.asg A8, A_n_out
|
|
.asg B3, B_ret_addr
|
|
|
|
.asg A1, A_i
|
|
.asg A16, A_o0_r2
|
|
.asg A2, A_h3210
|
|
.asg A3, A_h7654
|
|
.asg A5, A_h8
|
|
.asg A7, A_h6543
|
|
.asg A8, A_h8765
|
|
.asg A16, A_o1_r1
|
|
.asg A17, A_o1_r2
|
|
.asg A17, A_o1_r21
|
|
.asg A17, A_o2_r2
|
|
.asg A17, A_o2_r21
|
|
.asg A18, A_o0_r0
|
|
.asg A18, A_o2_r1
|
|
.asg A19, A_o3_r1
|
|
.asg A19, A_o3_r21
|
|
.asg A20, A_row2
|
|
.asg A21, A_row1
|
|
.asg A22, A_h2_210_
|
|
.asg A23, A_h2__210
|
|
.asg A24, A_h1_210_
|
|
.asg A25, A_h1__210
|
|
.asg A26, A_h0_210_
|
|
.asg A27, A_h0__210
|
|
.asg A28, A_o2
|
|
.asg A28, A_o2_r0
|
|
.asg A29, A_o3
|
|
.asg A29, A_o3_r0
|
|
.asg A30, A_o0
|
|
.asg A30, A_o0_r1
|
|
.asg A30, A_o0_r21
|
|
.asg A31, A_o1
|
|
.asg A31, A_o1_r0
|
|
.asg A31, A_o3_r2
|
|
.asg B1, B_p
|
|
.asg B6, B_o4_r1
|
|
.asg B6, B_o4_r21
|
|
.asg B7, B_o5_r2
|
|
.asg B7, B_o7_r0
|
|
.asg B8, B_o4_r0
|
|
.asg B9, B_o4_r2
|
|
.asg B9, B_o5_r0
|
|
.asg B16, B_o5_r1
|
|
.asg B16, B_o7_r2
|
|
.asg B17, B_o7_r1
|
|
.asg B18, B_o6
|
|
.asg B18, B_o6_r0
|
|
.asg B19, B_o6_r1
|
|
.asg B19, B_o7
|
|
.asg B20, B_l1_5432
|
|
.asg B20, B_o4
|
|
.asg B21, B_l1_9876
|
|
.asg B21, B_o5
|
|
.asg B21, B_o5_r21
|
|
.asg B22, B_l2_5432
|
|
.asg B23, B_l2_9876
|
|
.asg B23, B_o6_r2
|
|
.asg B24, B_l0_5432
|
|
.asg B25, B_l0_9876
|
|
.asg B26, B_l0_3210
|
|
.asg B26, B_o6_r21
|
|
.asg B27, B_l0_7654
|
|
.asg B28, B_l2_3210
|
|
.asg B29, B_l2_7654
|
|
.asg B30, B_l1_3210
|
|
.asg B31, B_l1_7654
|
|
.asg B31, B_o7_r21
|
|
* ========================================================================= *
|
|
|
|
LDNDW .D1T1 *A_mask(0), A_h7654:A_h3210
|
|
|
|
LDBU .D1T1 *A_mask(8), A_h8
|
|
|
|
ADD .L1X A_row0, B_x_dim, A_row1
|
|
|
|
ADD .L1X A_row1, B_x_dim, A_row2
|
|
|
|
SHR .S1 A_n_out, 3, A_i
|
|
|| B loop_6 ; prolog collapse
|
|
* =========================== PIPE LOOP PROLOG ============================ *
|
|
LDNDW .D1T2 *+A_row1(2), B_l1_9876:B_l1_5432 ;[ 1,1]
|
|
|| SHLMB .L1 A_h3210, A_h7654, A_h6543
|
|
|
|
LDNDW .D1T2 *+A_row0(2), B_l0_9876:B_l0_5432 ;[ 2,1]
|
|
|| SHRMB .L1 A_h8, A_h7654, A_h8765
|
|
|| CLR .S1 A_h6543, 24, 31, A_h1__210
|
|
|
|
LDNDW .D1T2 *+A_row2(2), B_l2_9876:B_l2_5432 ;[ 3,1]
|
|
|| CLR .S1 A_h8765, 0, 7, A_h2_210_
|
|
|| ROTL .M1 A_h1__210, 8, A_h1_210_
|
|
|
|
LDNDW .D1T2 *A_row2++(8), B_l2_7654:B_l2_3210 ;[ 4,1]
|
|
|| ROTL .M1 A_h2_210_, 24, A_h2__210
|
|
|| CLR .S1 A_h3210, 24, 31, A_h0__210
|
|
|
|
LDNDW .D1T2 *A_row1++(8), B_l1_7654:B_l1_3210 ;[ 5,1]
|
|
|| ROTL .M1 A_h0__210, 8, A_h0_210_
|
|
|| MVK .D2 1, B_p ; prolog collapse
|
|
|
|
; ===== 6 cycles of prolog collapsed
|
|
* =========================== PIPE LOOP KERNEL ============================ *
|
|
loop:
|
|
DOTPU4 .M2X B_l0_9876, A_h0_210_, B_o7_r0 ;[12,1]
|
|
|| DOTPU4 .M1X B_l1_3210, A_h1__210, A_o0_r1 ;[12,1]
|
|
||[ A_i]SUB A_i, 1, A_i
|
|
|
|
DOTPU4 .M2X B_l1_7654, A_h1__210, B_o4_r1 ;[13,1]
|
|
|| DOTPU4 .M1X B_l2_3210, A_h2__210, A_o0_r2 ;[13,1]
|
|
||[ A_i]LDNDW .D1T2 *+A_row1(2), B_l1_9876:B_l1_5432 ;[ 1,2]
|
|
|
|
ADD .L1 A_o2_r2, A_o2_r1, A_o2_r21 ;[14,1]
|
|
|| ADD .D2 B_o7_r2, B_o7_r1, B_o7_r21 ;[14,1]
|
|
|| DOTPU4 .M1X B_l0_3210, A_h0__210, A_o0_r0 ;[14,1]
|
|
|| DOTPU4 .M2X B_l1_7654, A_h1_210_, B_o5_r1 ;[14,1]
|
|
||[ A_i]LDNDW .D1T2 *+A_row0(2), B_l0_9876:B_l0_5432 ;[ 2,2]
|
|
|
|
ADD .L1 A_o2_r21, A_o2_r0, A_o2 ;[15,1]
|
|
|| ADD .S1 A_o3_r2, A_o3_r1, A_o3_r21 ;[15,1]
|
|
|| ADD .D2 B_o6_r2, B_o6_r1, B_o6_r21 ;[15,1]
|
|
|| DOTPU4 .M1X B_l0_3210, A_h0_210_, A_o1_r0 ;[15,1]
|
|
|| DOTPU4 .M2X B_l0_7654, A_h0__210, B_o4_r0 ;[15,1]
|
|
||[ A_i]LDNDW .D1T2 *+A_row2(2), B_l2_9876:B_l2_5432 ;[ 3,2]
|
|
|
|
ADD .S1 A_o3_r21, A_o3_r0, A_o3 ;[16,1]
|
|
|| ADD .D2 B_o7_r21, B_o7_r0, B_o7 ;[16,1]
|
|
|| DOTPU4 .M1X B_l2_3210, A_h2_210_, A_o1_r2 ;[16,1]
|
|
|| DOTPU4 .M2X B_l2_7654, A_h2_210_, B_o5_r2 ;[16,1]
|
|
||[ A_i]LDNDW .D1T2 *A_row2++(8), B_l2_7654:B_l2_3210 ;[ 4,2]
|
|
|
|
ADD .S2 B_o6_r21, B_o6_r0, B_o6 ;[17,1]
|
|
|| ADD .L1 A_o0_r2, A_o0_r1, A_o0_r21 ;[17,1]
|
|
|| ADD .D2 B_o4_r2, B_o4_r1, B_o4_r21 ;[17,1]
|
|
|| DOTPU4 .M2X B_l0_7654, A_h0_210_, B_o5_r0 ;[17,1]
|
|
|| DOTPU4 .M1X B_l1_3210, A_h1_210_, A_o1_r1 ;[17,1]
|
|
||[ A_i]LDNDW .D1T2 *A_row1++(8), B_l1_7654:B_l1_3210 ;[ 5,2]
|
|
loop_6:
|
|
[!A_i]RET .S2 B_ret_addr
|
|
||[ A_i]B .S1 loop ;[18,1]
|
|
||[!B_p]ADD .L1 A_o0_r21, A_o0_r0, A_o0 ;[18,1]
|
|
||[ A_i]DOTPU4 .M1X B_l1_5432, A_h1_210_, A_o3_r1 ;[ 6,2]
|
|
||[ A_i]DOTPU4 .M2X B_l1_9876, A_h1_210_, B_o7_r1 ;[ 6,2]
|
|
||[ A_i]LDNDW .D1T2 *A_row0++(8), B_l0_7654:B_l0_3210 ;[ 6,2]
|
|
|
|
[!B_p]STNDW .D2T2 B_o7:B_o6, *+B_o_ptr[3] ;[19,1]
|
|
||[!B_p]ADD .S2 B_o4_r21, B_o4_r0, B_o4 ;[19,1]
|
|
||[ A_i]DOTPU4 .M1X B_l1_5432, A_h1__210, A_o2_r1 ;[ 7,2]
|
|
||[ A_i]DOTPU4 .M2X B_l1_9876, A_h1__210, B_o6_r1 ;[ 7,2]
|
|
|
|
[!B_p]STNDW .D2T1 A_o3:A_o2, *+B_o_ptr[1] ;[20,1]
|
|
||[!B_p]ADD .L2 B_o5_r2, B_o5_r1, B_o5_r21 ;[20,1]
|
|
||[ A_i]DOTPU4 .M1X B_l0_5432, A_h0__210, A_o2_r0 ;[ 8,2]
|
|
||[ A_i]DOTPU4 .M2X B_l2_9876, A_h2_210_, B_o7_r2 ;[ 8,2]
|
|
|
|
[!B_p]ADD .L2 B_o5_r21, B_o5_r0, B_o5 ;[21,1]
|
|
||[!B_p]ADD .D1 A_o1_r2, A_o1_r1, A_o1_r21 ;[21,1]
|
|
||[ A_i]DOTPU4 .M1X B_l0_5432, A_h0_210_, A_o3_r0 ;[ 9,2]
|
|
||[ A_i]DOTPU4 .M2X B_l0_9876, A_h0__210, B_o6_r0 ;[ 9,2]
|
|
|
|
[!B_p]STNDW .D2T2 B_o5:B_o4, *+B_o_ptr[2] ;[22,1]
|
|
||[!B_p]ADD .D1 A_o1_r21, A_o1_r0, A_o1 ;[22,1]
|
|
||[ A_i]DOTPU4 .M1X B_l2_5432, A_h2__210, A_o2_r2 ;[10,2]
|
|
||[ A_i]DOTPU4 .M2X B_l2_9876, A_h2__210, B_o6_r2 ;[10,2]
|
|
|
|
[!B_p]STNDW .D2T1 A_o1:A_o0, *B_o_ptr++[4] ;[23,1]
|
|
||[ A_i]DOTPU4 .M1X B_l2_5432, A_h2_210_, A_o3_r2 ;[11,2]
|
|
||[ A_i]DOTPU4 .M2X B_l2_7654, A_h2__210, B_o4_r2 ;[11,2]
|
|
|| ZERO .S2 B_p
|
|
* =========================== PIPE LOOP EPILOG ============================ *
|
|
; ===== epilog collapsed completely
|
|
* ========================================================================= *
|
|
* End of file: img_corr_3x3.asm *
|
|
* ------------------------------------------------------------------------- *
|
|
* Copyright (c) 2003 Texas Instruments, Incorporated. *
|
|
* All Rights Reserved. *
|
|
* ========================================================================= *
|
|
|