You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

345 lines
20 KiB

;* ======================================================================== *;
;* TEXAS INSTRUMENTS, INC. *;
;* *;
;* IMGLIB DSP Image/Video Processing Library *;
;* *;
;* Release: Revision 1.04b *;
;* CVS Revision: 1.4 Sun Sep 29 03:32:19 2002 (UTC) *;
;* Snapshot date: 23-Oct-2003 *;
;* *;
;* This library contains proprietary intellectual property of Texas *;
;* Instruments, Inc. The library and its source code are protected by *;
;* various copyrights, and portions may also be protected by patents or *;
;* other legal protections. *;
;* *;
;* This software is licensed for use with Texas Instruments TMS320 *;
;* family DSPs. This license was provided to you prior to installing *;
;* the software. You may review this license by consulting the file *;
;* TI_license.PDF which accompanies the files in this library. *;
;* ------------------------------------------------------------------------ *;
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
;* All Rights Reserved. *;
;* ======================================================================== *;
;* ======================================================================== *;
;* Assembler compatibility shim for assembling 4.30 and later code on *;
;* tools prior to 4.30. *;
;* ======================================================================== *;
; Derive $asmver: alias it to .ASSEMBLER_VERSION when that symbol is defined,
; otherwise fall back to 0 so the "< 430" test below is taken.
.if $isdefed(".ASSEMBLER_VERSION")
.asg .ASSEMBLER_VERSION, $asmver
.else
.asg 0, $asmver
.endif
; When $asmver is below 430, define the call/return mnemonics as plain
; substitution symbols for the branch instructions so that code written with
; them still assembles.
.if ($asmver < 430)
.asg B, CALL ; Function Call
.asg B, RET ; Return from a Function
.asg B, CALLRET ; Function call with Call / Ret chaining.
; The BNOP-based aliases are only defined when .TMS320C6400 is true.
.if .TMS320C6400
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
.endif
; .asmfunc / .endasmfunc are aliased to the empty string, i.e. they expand to
; nothing under this branch.
.asg , .asmfunc ; .func equivalent for hand-assembly code
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
.endif
;* ======================================================================== *;
;* End of assembler compatibility shim. *;
;* ======================================================================== *;
* ========================================================================= *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* corr_3x3: 3x3 correlation with rounding for 8 bit data *
* *
* REVISION DATE *
* 14-Mar-2002 *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void IMG_corr_3x3 *
* ( *
* const unsigned char *i_data, // input image // *
* int *restrict o_data, // output image // *
* const unsigned char mask[3][3], // convolution mask // *
* int x_dim, // width of image // *
* int n_out // number of outputs // *
* ); *
* *
* DESCRIPTION *
* The correlation performs a point by point multiplication of the *
* 3 by 3 mask with the input image. The results of the nine *
* multiplications are then summed together to produce a *
* convolution sum. This sum is then stored to the output array. *
* *
* The image mask to be correlated is typically part of the input *
* image and indicates the area of the best match between the *
* input image and mask. The mask is moved one column at a time, *
* advancing the mask over the portion of the row specified by *
* 'n_out'. When 'n_out' is larger than 'x_dim', multiple rows *
* will be processed. *
* *
* An application may call this kernel once per row to calculate *
* the correlation for an entire image: *
* *
* for (i = 0; i < rows; i++) *
* { *
* IMG_corr_3x3(&i_data[i * x_dim], &o_data[i * n_out], *
* mask, x_dim, n_out); *
* } *
* *
* Alternately, the kernel may be invoked for multiple rows at *
* a time, although the two outputs at the end of each row will *
* have meaningless values. For example: *
* *
* IMG_corr_3x3(i_data, o_data, mask, x_dim, 2 * x_dim); *
* *
* This will produce two rows of outputs into 'o_data'. The *
* outputs at locations o_data[x_dim - 2], o_data[x_dim - 1], *
* o_data[2*x_dim - 2] and o_data[2*x_dim - 1] will have *
* meaningless values. This is harmless, although the application *
* will have to account for this when interpreting the results. *
* *
* ASSUMPTIONS *
* The array pointed to by o_data does not alias with the array *
* pointed to by i_data or mask. *
* *
* The number of outputs 'n_out' must be a multiple of 8. In cases *
* where 'n_out' is not a multiple of 8, most applications can safely *
* round 'n_out' up to the next multiple of 8 and ignore the extra *
* outputs. This kernel does not round 'n_out' up for the user. *
* *
* NOTE *
* This kernel is fully interruptible. *
* *
* MEMORY NOTE *
* This kernel places no restrictions on the alignment of its input. *
* *
* No bank conflicts occur. *
* *
* This code assumes a LITTLE ENDIAN configuration. *
* *
* TECHNIQUES *
* The inner loops are unrolled completely, and the outer loop is *
* unrolled 8 times. *
* *
* We use 3 DOTPU4s to calculate the 3 rows of each output pixel. *
* We then accumulate the 3 DOTPU4s to a 32-bit result and store *
* them out. (Note that only 3 of every 4 8-bit MPYs in the DOTPU4 *
* are actually used. The fourth MPY is unused.) *
* *
* We use non-aligned loads and stores to avoid alignment issues. *
* *
* CYCLES *
* cycles = 1.5 * n_out + 22 *
* For n_out = 248, cycles = 394. *
* *
* This number includes 6 cycles of function call overhead. The *
* exact overhead will vary depending on compiler options used. *
* *
* CODESIZE *
* 296 bytes. *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
; ---------------------------------------------------------------------------
; _IMG_corr_3x3: 3x3 correlation kernel entry point.
;
; Each pass of the loop produces eight outputs (o0..o7). Every output is the
; sum of three DOTPU4 results, one per mask row (r0, r1, r2), and the eight
; sums are written with four STNDW register-pair stores through B_o_ptr,
; which is then advanced by the post-increment on the last store.
; The trip count is A_i = A_n_out >> 3, so any remainder of n_out modulo 8
; is not processed here (the header requires n_out to be a multiple of 8).
;
; NOTE(review): the file header titles this routine "correlation with
; rounding", but no shift or rounding instruction appears in this block; the
; stored value is the plain sum of the three DOTPU4 results. Confirm whether
; the header wording is stale.
; ---------------------------------------------------------------------------
.sect ".text:_corr_3x3_32"
.global _IMG_corr_3x3
_IMG_corr_3x3:
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
; Naming scheme used below (as evidenced by how the symbols are used):
;   A_row0/1/2      - read pointers for three consecutive image rows
;                     (row1 = row0 + x_dim, row2 = row1 + x_dim).
;   A_hNNNN, A_h8   - raw mask bytes as loaded from A_mask.
;   A_hR__210       - mask row R with the top byte (bits 24..31) cleared.
;   A_hR_210_       - the same three mask bytes moved up one byte position,
;                     low byte (bits 0..7) clear.
;   B_lR_dcba       - four image bytes of row R; the digits name the byte
;                     offsets from the current row pointer.
;   x_oK_rR         - DOTPU4 partial sum for output K from row R.
;   x_oK_r21        - x_oK_r2 + x_oK_r1.
;   x_oK            - final sum for output K (x_oK_r21 + x_oK_r0).
;
; Many symbols deliberately share one physical register because their live
; ranges do not overlap. Two cases involve incoming arguments:
;   A8 is A_n_out and later A_h8765 (A_n_out is consumed by the SHR first).
;   B6 is B_x_dim and later B_o4_r1 / B_o4_r21 (B_x_dim is consumed by the
;   two row-pointer ADDs before the loop starts).
;
; Incoming arguments and return address (names follow the C prototype in the
; file header):
.asg A4, A_row0
.asg B4, B_o_ptr
.asg A6, A_mask
.asg B6, B_x_dim
.asg A8, A_n_out
.asg B3, B_ret_addr
; Loop counter and A-side working registers:
.asg A1, A_i
.asg A16, A_o0_r2
.asg A2, A_h3210
.asg A3, A_h7654
.asg A5, A_h8
.asg A7, A_h6543
.asg A8, A_h8765
.asg A16, A_o1_r1
.asg A17, A_o1_r2
.asg A17, A_o1_r21
.asg A17, A_o2_r2
.asg A17, A_o2_r21
.asg A18, A_o0_r0
.asg A18, A_o2_r1
.asg A19, A_o3_r1
.asg A19, A_o3_r21
.asg A20, A_row2
.asg A21, A_row1
.asg A22, A_h2_210_
.asg A23, A_h2__210
.asg A24, A_h1_210_
.asg A25, A_h1__210
.asg A26, A_h0_210_
.asg A27, A_h0__210
.asg A28, A_o2
.asg A28, A_o2_r0
.asg A29, A_o3
.asg A29, A_o3_r0
.asg A30, A_o0
.asg A30, A_o0_r1
.asg A30, A_o0_r21
.asg A31, A_o1
.asg A31, A_o1_r0
.asg A31, A_o3_r2
; First-pass guard flag and B-side working registers:
.asg B1, B_p
.asg B6, B_o4_r1
.asg B6, B_o4_r21
.asg B7, B_o5_r2
.asg B7, B_o7_r0
.asg B8, B_o4_r0
.asg B9, B_o4_r2
.asg B9, B_o5_r0
.asg B16, B_o5_r1
.asg B16, B_o7_r2
.asg B17, B_o7_r1
.asg B18, B_o6
.asg B18, B_o6_r0
.asg B19, B_o6_r1
.asg B19, B_o7
.asg B20, B_l1_5432
.asg B20, B_o4
.asg B21, B_l1_9876
.asg B21, B_o5
.asg B21, B_o5_r21
.asg B22, B_l2_5432
.asg B23, B_l2_9876
.asg B23, B_o6_r2
.asg B24, B_l0_5432
.asg B25, B_l0_9876
.asg B26, B_l0_3210
.asg B26, B_o6_r21
.asg B27, B_l0_7654
.asg B28, B_l2_3210
.asg B29, B_l2_7654
.asg B30, B_l1_3210
.asg B31, B_l1_7654
.asg B31, B_o7_r21
* ========================================================================= *
; --- Setup ----------------------------------------------------------------
; Fetch all nine mask bytes: bytes 0..7 with one non-aligned doubleword load,
; byte 8 with an unsigned byte load.
LDNDW .D1T1 *A_mask(0), A_h7654:A_h3210
LDBU .D1T1 *A_mask(8), A_h8
; Derive the row 1 and row 2 pointers from row 0 by adding x_dim once each.
ADD .L1X A_row0, B_x_dim, A_row1
ADD .L1X A_row1, B_x_dim, A_row2
; A_i = n_out / 8 = number of 8-output iterations. The branch enters the loop
; body at loop_6 rather than at 'loop'. The five prolog packets that follow
; are presumably issued in this branch's delay slots (C6000 branches have
; five) - confirm against the ISA guide before rescheduling.
SHR .S1 A_n_out, 3, A_i
|| B loop_6 ; prolog collapse
* =========================== PIPE LOOP PROLOG ============================ *
; The prolog overlaps two jobs: issuing the first image loads for iteration 1
; and reshaping the mask bytes into the six A_hR__210 / A_hR_210_ operands
; used by the DOTPU4s. The ";[c,i]" tags look like scheduler annotations
; (cycle within iteration, iteration number) - treat as informational.
;
; SHLMB merges A_h3210/A_h7654 into A_h6543.
LDNDW .D1T2 *+A_row1(2), B_l1_9876:B_l1_5432 ;[ 1,1]
|| SHLMB .L1 A_h3210, A_h7654, A_h6543
; SHRMB merges A_h8/A_h7654 into A_h8765; CLR of bits 24..31 of A_h6543
; leaves mask row 1 in the low three bytes (A_h1__210).
LDNDW .D1T2 *+A_row0(2), B_l0_9876:B_l0_5432 ;[ 2,1]
|| SHRMB .L1 A_h8, A_h7654, A_h8765
|| CLR .S1 A_h6543, 24, 31, A_h1__210
; CLR of bits 0..7 of A_h8765 leaves mask row 2 in the upper three bytes
; (A_h2_210_); ROTL by 8 produces the upper-byte form of row 1.
LDNDW .D1T2 *+A_row2(2), B_l2_9876:B_l2_5432 ;[ 3,1]
|| CLR .S1 A_h8765, 0, 7, A_h2_210_
|| ROTL .M1 A_h1__210, 8, A_h1_210_
; ROTL by 24 moves row 2 down into the low three bytes (A_h2__210); CLR of
; bits 24..31 of A_h3210 isolates mask row 0 (A_h0__210).
; This load also advances A_row2 via the post-increment operand.
LDNDW .D1T2 *A_row2++(8), B_l2_7654:B_l2_3210 ;[ 4,1]
|| ROTL .M1 A_h2_210_, 24, A_h2__210
|| CLR .S1 A_h3210, 24, 31, A_h0__210
; ROTL by 8 produces the upper-byte form of row 0. B_p = 1 marks "first pass":
; every [!B_p] instruction in the loop_6 half is suppressed until B_p is
; zeroed at the end of that half, so no sums or stores are issued before any
; results exist.
LDNDW .D1T2 *A_row1++(8), B_l1_7654:B_l1_3210 ;[ 5,1]
|| ROTL .M1 A_h0__210, 8, A_h0_210_
|| MVK .D2 1, B_p ; prolog collapse
; ===== 6 cycles of prolog collapsed
* =========================== PIPE LOOP KERNEL ============================ *
; The kernel is 12 execute packets: six from 'loop' to 'loop_6' and six from
; 'loop_6' to the end. Each packet mixes work for the iteration being
; finished (ADDs / STNDWs) with DOTPU4s and loads for the one being started.
; Each image word is multiplied against both mask forms, yielding two
; adjacent outputs per word per row:
;   l_3210 -> o0 (__210) and o1 (_210_)     l_5432 -> o2 and o3
;   l_7654 -> o4 and o5                     l_9876 -> o6 and o7
loop:
; Count down the remaining iterations; the [A_i] loads below only fetch data
; for a following iteration while A_i is still nonzero.
DOTPU4 .M2X B_l0_9876, A_h0_210_, B_o7_r0 ;[12,1]
|| DOTPU4 .M1X B_l1_3210, A_h1__210, A_o0_r1 ;[12,1]
||[ A_i]SUB A_i, 1, A_i
DOTPU4 .M2X B_l1_7654, A_h1__210, B_o4_r1 ;[13,1]
|| DOTPU4 .M1X B_l2_3210, A_h2__210, A_o0_r2 ;[13,1]
||[ A_i]LDNDW .D1T2 *+A_row1(2), B_l1_9876:B_l1_5432 ;[ 1,2]
; Start reducing: rows 2 and 1 are summed first (..._r21), row 0 is added in
; a later packet.
ADD .L1 A_o2_r2, A_o2_r1, A_o2_r21 ;[14,1]
|| ADD .D2 B_o7_r2, B_o7_r1, B_o7_r21 ;[14,1]
|| DOTPU4 .M1X B_l0_3210, A_h0__210, A_o0_r0 ;[14,1]
|| DOTPU4 .M2X B_l1_7654, A_h1_210_, B_o5_r1 ;[14,1]
||[ A_i]LDNDW .D1T2 *+A_row0(2), B_l0_9876:B_l0_5432 ;[ 2,2]
ADD .L1 A_o2_r21, A_o2_r0, A_o2 ;[15,1]
|| ADD .S1 A_o3_r2, A_o3_r1, A_o3_r21 ;[15,1]
|| ADD .D2 B_o6_r2, B_o6_r1, B_o6_r21 ;[15,1]
|| DOTPU4 .M1X B_l0_3210, A_h0_210_, A_o1_r0 ;[15,1]
|| DOTPU4 .M2X B_l0_7654, A_h0__210, B_o4_r0 ;[15,1]
||[ A_i]LDNDW .D1T2 *+A_row2(2), B_l2_9876:B_l2_5432 ;[ 3,2]
ADD .S1 A_o3_r21, A_o3_r0, A_o3 ;[16,1]
|| ADD .D2 B_o7_r21, B_o7_r0, B_o7 ;[16,1]
|| DOTPU4 .M1X B_l2_3210, A_h2_210_, A_o1_r2 ;[16,1]
|| DOTPU4 .M2X B_l2_7654, A_h2_210_, B_o5_r2 ;[16,1]
||[ A_i]LDNDW .D1T2 *A_row2++(8), B_l2_7654:B_l2_3210 ;[ 4,2]
ADD .S2 B_o6_r21, B_o6_r0, B_o6 ;[17,1]
|| ADD .L1 A_o0_r2, A_o0_r1, A_o0_r21 ;[17,1]
|| ADD .D2 B_o4_r2, B_o4_r1, B_o4_r21 ;[17,1]
|| DOTPU4 .M2X B_l0_7654, A_h0_210_, B_o5_r0 ;[17,1]
|| DOTPU4 .M1X B_l1_3210, A_h1_210_, A_o1_r1 ;[17,1]
||[ A_i]LDNDW .D1T2 *A_row1++(8), B_l1_7654:B_l1_3210 ;[ 5,2]
; Entry point used by the setup branch. From here on:
;   [ A_i] instructions start work for a further iteration and are skipped
;          once the counter has reached zero;
;   [!B_p] instructions finish the previous iteration and are skipped on the
;          very first pass (B_p == 1), when there is nothing to finish.
; Exactly one of the two branches below is taken: back to 'loop' while A_i is
; nonzero, otherwise return through B_ret_addr. The remaining packets of this
; half are presumably the delay slots of whichever branch fires, so the final
; sums and stores still complete before control leaves - confirm against the
; ISA guide before rescheduling.
loop_6:
[!A_i]RET .S2 B_ret_addr
||[ A_i]B .S1 loop ;[18,1]
||[!B_p]ADD .L1 A_o0_r21, A_o0_r0, A_o0 ;[18,1]
||[ A_i]DOTPU4 .M1X B_l1_5432, A_h1_210_, A_o3_r1 ;[ 6,2]
||[ A_i]DOTPU4 .M2X B_l1_9876, A_h1_210_, B_o7_r1 ;[ 6,2]
||[ A_i]LDNDW .D1T2 *A_row0++(8), B_l0_7654:B_l0_3210 ;[ 6,2]
; Store o6/o7 at element offset [3] from B_o_ptr.
[!B_p]STNDW .D2T2 B_o7:B_o6, *+B_o_ptr[3] ;[19,1]
||[!B_p]ADD .S2 B_o4_r21, B_o4_r0, B_o4 ;[19,1]
||[ A_i]DOTPU4 .M1X B_l1_5432, A_h1__210, A_o2_r1 ;[ 7,2]
||[ A_i]DOTPU4 .M2X B_l1_9876, A_h1__210, B_o6_r1 ;[ 7,2]
; Store o2/o3 at element offset [1].
[!B_p]STNDW .D2T1 A_o3:A_o2, *+B_o_ptr[1] ;[20,1]
||[!B_p]ADD .L2 B_o5_r2, B_o5_r1, B_o5_r21 ;[20,1]
||[ A_i]DOTPU4 .M1X B_l0_5432, A_h0__210, A_o2_r0 ;[ 8,2]
||[ A_i]DOTPU4 .M2X B_l2_9876, A_h2_210_, B_o7_r2 ;[ 8,2]
[!B_p]ADD .L2 B_o5_r21, B_o5_r0, B_o5 ;[21,1]
||[!B_p]ADD .D1 A_o1_r2, A_o1_r1, A_o1_r21 ;[21,1]
||[ A_i]DOTPU4 .M1X B_l0_5432, A_h0_210_, A_o3_r0 ;[ 9,2]
||[ A_i]DOTPU4 .M2X B_l0_9876, A_h0__210, B_o6_r0 ;[ 9,2]
; Store o4/o5 at element offset [2].
[!B_p]STNDW .D2T2 B_o5:B_o4, *+B_o_ptr[2] ;[22,1]
||[!B_p]ADD .D1 A_o1_r21, A_o1_r0, A_o1 ;[22,1]
||[ A_i]DOTPU4 .M1X B_l2_5432, A_h2__210, A_o2_r2 ;[10,2]
||[ A_i]DOTPU4 .M2X B_l2_9876, A_h2__210, B_o6_r2 ;[10,2]
; Store o0/o1 at the current B_o_ptr and post-increment it by [4] elements,
; stepping past the eight outputs just written. ZERO B_p ends the first-pass
; suppression so later passes perform their adds and stores.
[!B_p]STNDW .D2T1 A_o1:A_o0, *B_o_ptr++[4] ;[23,1]
||[ A_i]DOTPU4 .M1X B_l2_5432, A_h2_210_, A_o3_r2 ;[11,2]
||[ A_i]DOTPU4 .M2X B_l2_7654, A_h2__210, B_o4_r2 ;[11,2]
|| ZERO .S2 B_p
* =========================== PIPE LOOP EPILOG ============================ *
; ===== epilog collapsed completely
* ========================================================================= *
* End of file: img_corr_3x3.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *