c6416_sdk/imglib/corr_3x3.asm


								;* ======================================================================== *;

								;*  TEXAS INSTRUMENTS, INC.                                                 *;

								;*                                                                          *;

								;*  IMGLIB  DSP Image/Video Processing Library                              *;

								;*                                                                          *;

								;*      Release:        Revision 1.04b                                      *;

								;*      CVS Revision:   1.4     Sun Sep 29 03:32:19 2002 (UTC)              *;

								;*      Snapshot date:  23-Oct-2003                                         *;

								;*                                                                          *;

								;*  This library contains proprietary intellectual property of Texas        *;

								;*  Instruments, Inc.  The library and its source code are protected by     *;

								;*  various copyrights, and portions may also be protected by patents or    *;

								;*  other legal protections.                                                *;

								;*                                                                          *;

								;*  This software is licensed for use with Texas Instruments TMS320         *;

								;*  family DSPs.  This license was provided to you prior to installing      *;

								;*  the software.  You may review this license by consulting the file       *;

								;*  TI_license.PDF which accompanies the files in this library.             *;

								;* ------------------------------------------------------------------------ *;

								;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;

								;*                          All Rights Reserved.                            *;

								;* ======================================================================== *;


								;* ======================================================================== *;

								;*  Assembler compatibility shim for assembling 4.30 and later code on      *;

								;*  tools prior to 4.30.                                                    *;

								;* ======================================================================== *;


								        ; Capture the assembler version in the substitution symbol $asmver.
								        ; Tools that do not define .ASSEMBLER_VERSION are treated as version 0.
								        .if $isdefed(".ASSEMBLER_VERSION")

								        .asg    .ASSEMBLER_VERSION, $asmver

								        .else

								        .asg    0,    $asmver

								        .endif


								        ; Everything inside this conditional is assembled only when the
								        ; version is below 430: the newer call/return mnemonics are mapped
								        ; onto the plain branch instructions.  The routine in this file
								        ; uses RET, which therefore becomes B on such tools.
								        .if ($asmver < 430)


								        .asg    B,    CALL     ; Function Call

								        .asg    B,    RET      ; Return from a Function

								        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.


								        ; The BNOP-based aliases are defined only when .TMS320C6400 is true.
								        .if .TMS320C6400

								        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call

								        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return

								        .asg    BNOP, CRNOP    ; C64x Fn call w/ Call/Ret chaining via BNOP.

								        .endif


								        ; Both names are assigned an empty string, so any use of them
								        ; expands to nothing.
								        .asg    , .asmfunc     ; .func equivalent for hand-assembly code

								        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code


								        .endif


								;* ======================================================================== *;

								;*  End of assembler compatibility shim.                                    *;

								;* ======================================================================== *;


								* ========================================================================= *

								*   TEXAS INSTRUMENTS, INC.                                                 *

								*                                                                           *

								*   NAME                                                                    *

								*       corr_3x3: 3x3 correlation with rounding for 8 bit data              *

								*                                                                           *

								*   REVISION DATE                                                           *

								*       14-Mar-2002                                                         *

								*                                                                           *

								*   USAGE                                                                   *

								*       This routine is C-callable and can be called as:                    *

								*                                                                           *

								*           void IMG_corr_3x3                                               *

								*           (                                                               *

								*               const unsigned char *i_data,       // input image       //  *

								*               int        *restrict o_data,       // output image      //  *

								*               const unsigned char  mask[3][3],   // convolution mask  //  *

								*               int                  x_dim,        // width of image    //  *

								*               int                  n_out         // number of outputs //  *

								*           );                                                              *

								*                                                                           *

								*   DESCRIPTION                                                             *

								*       The correlation performs a point by point multiplication of the     *

								*       3 by 3 mask with the input image.  The results of the nine          *

								*       multiplications are then summed together to produce a               *

								*       correlation sum.  This sum is then stored to the output array.      *

								*                                                                           *

								*       The image mask to be correlated is typically part of the input      *

								*       image; the result indicates the area of the best match between the  *

								*       input image and mask.  The mask is moved one column at a time,      *

								*       advancing the mask over the portion of the row specified by         *

								*       'n_out'.  When 'n_out' is larger than 'x_dim', multiple rows        *

								*       will be processed.                                                  *

								*                                                                           *

								*       An application may call this kernel once per row to calculate       *

								*       the correlation for an entire image:                                *

								*                                                                           *

								*           for (i = 0; i < rows; i++)                                      *

								*           {                                                               *

								*               IMG_corr_3x3(&i_data[i * x_dim], &o_data[i * n_out],        *

								*                           mask, x_dim, n_out);                            *

								*           }                                                               *

								*                                                                           *

								*       Alternately, the kernel may be invoked for multiple rows at         *

								*       a time, although the two outputs at the end of each row will        *

								*       have meaningless values.  For example:                              *

								*                                                                           *

								*           IMG_corr_3x3(i_data, o_data, mask, x_dim, 2 * x_dim);           *

								*                                                                           *

								*       This will produce two rows of outputs into 'o_data'.  The           *

								*       outputs at locations o_data[x_dim - 2], o_data[x_dim - 1],          *

								*       o_data[2*x_dim - 2] and o_data[2*x_dim - 1] will have               *

								*       meaningless values.  This is harmless, although the application     *

								*       will have to account for this when interpreting the results.        *

								*                                                                           *

								*   ASSUMPTIONS                                                             *

								*       The array pointed to by o_data does not alias with the array        *

								*       pointed to by i_data or mask.                                       *

								*                                                                           *

								*       The number of outputs 'n_out' must be a multiple of 8.  In cases    *

								*       where 'n_out' is not a multiple of 8, most applications can safely  *

								*       round 'n_out' up to the next multiple of 8 and ignore the extra     *

								*       outputs.  This kernel does not round 'n_out' up for the user.       *

								*                                                                           *

								*   NOTE                                                                    *

								*       This kernel is fully interruptible.                                 *

								*                                                                           *

								*   MEMORY NOTE                                                             *

								*       This kernel places no restrictions on the alignment of its input.   *

								*                                                                           *

								*       No bank conflicts occur.                                            *

								*                                                                           *

								*       This code assumes a LITTLE ENDIAN configuration.                    *

								*                                                                           *

								*   TECHNIQUES                                                              *

								*       The inner loops are unrolled completely, and the outer loop is      *

								*       unrolled 8 times.                                                   *

								*                                                                           *

								*       We use 3 DOTPU4s to calculate the 3 rows of each output pixel.      *

								*       We then accumulate the 3 DOTPU4s to a 32-bit result and store       *

								*       them out.  (Note that only 3 of every 4 8-bit MPYs in the DOTPU4    *

								*       are actually used.  The fourth MPY is unused.)                      *

								*                                                                           *

								*       We use non-aligned loads and stores to avoid alignment issues.      *

								*                                                                           *

								*   CYCLES                                                                  *

								*       cycles = 1.5 * n_out + 22                                           *

								*       For n_out = 248, cycles = 394.                                      *

								*                                                                           *

								*       This number includes 6 cycles of function call overhead.  The       *

								*       exact overhead will vary depending on compiler options used.        *

								*                                                                           *

								*   CODESIZE                                                                *

								*       296 bytes.                                                          *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *


								        .sect ".text:_corr_3x3_32"

								        .global _IMG_corr_3x3

								; Entry point of the 3x3 correlation kernel.  Each pass of the loop below
								; reads 10 bytes from each of three consecutive input rows and stores
								; eight 32-bit sums through B_o_ptr.
								_IMG_corr_3x3:


								* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *

								; The .asg directives only attach symbolic names to physical registers.
								; First group: the incoming arguments, in the order of the C prototype
								; (i_data, o_data, mask, x_dim, n_out), followed by the return address.
								        .asg            A4,         A_row0

								        .asg            B4,         B_o_ptr

								        .asg            A6,         A_mask

								        .asg            B6,         B_x_dim

								        .asg            A8,         A_n_out

								        .asg            B3,         B_ret_addr


								; Loop state, mask operands and temporaries.  Naming used by the code:
								;   A_i                    remaining passes (n_out >> 3)
								;   B_p                    1 during the first pass, 0 afterwards
								;   A_hR__210 / A_hR_210_  mask row R in the low / high three bytes
								;   B_lR_dcba              four consecutive input bytes of row R
								;   x_oN_rR                DOTPU4 of mask row R for output N
								;   x_oN_r21               x_oN_r2 + x_oN_r1
								;   x_oN                   x_oN_r21 + x_oN_r0, the value that is stored
								; Several names share one physical register (e.g. A8 is both A_n_out and
								; A_h8765, B6 is both B_x_dim and B_o4_r1), so their lifetimes must not
								; overlap; keep that in mind before moving any instruction.
								        .asg            A1,         A_i

								        .asg            A16,        A_o0_r2

								        .asg            A2,         A_h3210

								        .asg            A3,         A_h7654

								        .asg            A5,         A_h8

								        .asg            A7,         A_h6543

								        .asg            A8,         A_h8765

								        .asg            A16,        A_o1_r1

								        .asg            A17,        A_o1_r2

								        .asg            A17,        A_o1_r21

								        .asg            A17,        A_o2_r2

								        .asg            A17,        A_o2_r21

								        .asg            A18,        A_o0_r0

								        .asg            A18,        A_o2_r1

								        .asg            A19,        A_o3_r1

								        .asg            A19,        A_o3_r21

								        .asg            A20,        A_row2

								        .asg            A21,        A_row1

								        .asg            A22,        A_h2_210_

								        .asg            A23,        A_h2__210

								        .asg            A24,        A_h1_210_

								        .asg            A25,        A_h1__210

								        .asg            A26,        A_h0_210_

								        .asg            A27,        A_h0__210

								        .asg            A28,        A_o2

								        .asg            A28,        A_o2_r0

								        .asg            A29,        A_o3

								        .asg            A29,        A_o3_r0

								        .asg            A30,        A_o0

								        .asg            A30,        A_o0_r1

								        .asg            A30,        A_o0_r21

								        .asg            A31,        A_o1

								        .asg            A31,        A_o1_r0

								        .asg            A31,        A_o3_r2

								        .asg            B1,         B_p

								        .asg            B6,         B_o4_r1

								        .asg            B6,         B_o4_r21

								        .asg            B7,         B_o5_r2

								        .asg            B7,         B_o7_r0

								        .asg            B8,         B_o4_r0

								        .asg            B9,         B_o4_r2

								        .asg            B9,         B_o5_r0

								        .asg            B16,        B_o5_r1

								        .asg            B16,        B_o7_r2

								        .asg            B17,        B_o7_r1

								        .asg            B18,        B_o6

								        .asg            B18,        B_o6_r0

								        .asg            B19,        B_o6_r1

								        .asg            B19,        B_o7

								        .asg            B20,        B_l1_5432

								        .asg            B20,        B_o4

								        .asg            B21,        B_l1_9876

								        .asg            B21,        B_o5

								        .asg            B21,        B_o5_r21

								        .asg            B22,        B_l2_5432

								        .asg            B23,        B_l2_9876

								        .asg            B23,        B_o6_r2

								        .asg            B24,        B_l0_5432

								        .asg            B25,        B_l0_9876

								        .asg            B26,        B_l0_3210

								        .asg            B26,        B_o6_r21

								        .asg            B27,        B_l0_7654

								        .asg            B28,        B_l2_3210

								        .asg            B29,        B_l2_7654

								        .asg            B30,        B_l1_3210

								        .asg            B31,        B_l1_7654

								        .asg            B31,        B_o7_r21

								* ========================================================================= *


								; Fetch the nine mask bytes: bytes 0..7 with one non-aligned double-word
								; load, byte 8 with an unsigned byte load.
								        LDNDW   .D1T1   *A_mask(0), A_h7654:A_h3210


								        LDBU    .D1T1   *A_mask(8), A_h8


								; Row pointers: row1 = row0 + x_dim, row2 = row1 + x_dim.
								        ADD     .L1X    A_row0,     B_x_dim,    A_row1


								        ADD     .L1X    A_row1,     B_x_dim,    A_row2


								; A_i = n_out >> 3, i.e. one loop pass per eight outputs.  The branch
								; enters the kernel at loop_6 rather than at loop; the five prolog
								; execute packets below run in its delay slots.
								        SHR     .S1     A_n_out,    3,          A_i

								||      B               loop_6                  ; prolog collapse

								* =========================== PIPE LOOP PROLOG ============================ *

								; The prolog issues the first input loads while rearranging the mask so
								; that every mask row exists in two byte alignments for DOTPU4.
								;
								; Load row-1 bytes 2..9.  A_h6543 = mask bytes 3..6 (A_h7654 shifted left
								; one byte with the top byte of A_h3210 merged in at the bottom).
								        LDNDW   .D1T2   *+A_row1(2),            B_l1_9876:B_l1_5432 ;[ 1,1]

								||      SHLMB   .L1     A_h3210,    A_h7654,    A_h6543


								; Load row-0 bytes 2..9.  A_h8765 = mask bytes 5..8.  A_h1__210 = mask
								; row 1 (bytes 3..5) in the low three bytes, top byte cleared.
								        LDNDW   .D1T2   *+A_row0(2),            B_l0_9876:B_l0_5432 ;[ 2,1]

								||      SHRMB   .L1     A_h8,       A_h7654,    A_h8765

								||      CLR     .S1     A_h6543,    24, 31,     A_h1__210


								; Load row-2 bytes 2..9.  A_h2_210_ = mask row 2 (bytes 6..8) in the high
								; three bytes, low byte cleared.  A_h1_210_ = row 1 moved up one byte.
								        LDNDW   .D1T2   *+A_row2(2),            B_l2_9876:B_l2_5432 ;[ 3,1]

								||      CLR     .S1     A_h8765,    0,  7,      A_h2_210_

								||      ROTL    .M1     A_h1__210,  8,          A_h1_210_


								; Load row-2 bytes 0..7 and advance A_row2 by 8.  A_h2__210 = row 2 moved
								; down one byte (rotate left by 24).  A_h0__210 = mask row 0 (bytes 0..2).
								        LDNDW   .D1T2   *A_row2++(8),           B_l2_7654:B_l2_3210 ;[ 4,1]

								||      ROTL    .M1     A_h2_210_,  24,         A_h2__210

								||      CLR     .S1     A_h3210,    24, 31,     A_h0__210


								; Load row-1 bytes 0..7 and advance A_row1 by 8.  A_h0_210_ = row 0 moved
								; up one byte.  B_p = 1 marks the first pass through the loop tail.
								        LDNDW   .D1T2   *A_row1++(8),           B_l1_7654:B_l1_3210 ;[ 5,1]

								||      ROTL    .M1     A_h0__210,  8,          A_h0_210_

								||      MVK     .D2     1,          B_p         ; prolog collapse


								; ===== 6 cycles of prolog collapsed

								* =========================== PIPE LOOP KERNEL ============================ *

								; The kernel is 12 execute packets long and produces eight outputs per
								; pass.  The ;[c,i] tags are the original schedule annotations
								; (presumably cycle within one iteration, and which of the two
								; overlapped iterations the instruction belongs to).
								;
								; First half (skipped on entry): counts A_i down, finishes the remaining
								; DOTPU4s of the current pass, starts adding the row results and, while
								; A_i is non-zero, issues the row-1/row-0/row-2 loads for the next pass.
								loop:

								        DOTPU4  .M2X    B_l0_9876,  A_h0_210_,  B_o7_r0             ;[12,1]

								||      DOTPU4  .M1X    B_l1_3210,  A_h1__210,  A_o0_r1             ;[12,1]

								||[ A_i]SUB             A_i,        1,          A_i


								        DOTPU4  .M2X    B_l1_7654,  A_h1__210,  B_o4_r1             ;[13,1]

								||      DOTPU4  .M1X    B_l2_3210,  A_h2__210,  A_o0_r2             ;[13,1]

								||[ A_i]LDNDW   .D1T2   *+A_row1(2),            B_l1_9876:B_l1_5432 ;[ 1,2]


								        ADD     .L1     A_o2_r2,    A_o2_r1,    A_o2_r21            ;[14,1]

								||      ADD     .D2     B_o7_r2,    B_o7_r1,    B_o7_r21            ;[14,1]

								||      DOTPU4  .M1X    B_l0_3210,  A_h0__210,  A_o0_r0             ;[14,1]

								||      DOTPU4  .M2X    B_l1_7654,  A_h1_210_,  B_o5_r1             ;[14,1]

								||[ A_i]LDNDW   .D1T2   *+A_row0(2),            B_l0_9876:B_l0_5432 ;[ 2,2]


								        ADD     .L1     A_o2_r21,   A_o2_r0,    A_o2                ;[15,1]

								||      ADD     .S1     A_o3_r2,    A_o3_r1,    A_o3_r21            ;[15,1]

								||      ADD     .D2     B_o6_r2,    B_o6_r1,    B_o6_r21            ;[15,1]

								||      DOTPU4  .M1X    B_l0_3210,  A_h0_210_,  A_o1_r0             ;[15,1]

								||      DOTPU4  .M2X    B_l0_7654,  A_h0__210,  B_o4_r0             ;[15,1]

								||[ A_i]LDNDW   .D1T2   *+A_row2(2),            B_l2_9876:B_l2_5432 ;[ 3,2]


								        ADD     .S1     A_o3_r21,   A_o3_r0,    A_o3                ;[16,1]

								||      ADD     .D2     B_o7_r21,   B_o7_r0,    B_o7                ;[16,1]

								||      DOTPU4  .M1X    B_l2_3210,  A_h2_210_,  A_o1_r2             ;[16,1]

								||      DOTPU4  .M2X    B_l2_7654,  A_h2_210_,  B_o5_r2             ;[16,1]

								||[ A_i]LDNDW   .D1T2   *A_row2++(8),           B_l2_7654:B_l2_3210 ;[ 4,2]


								        ADD     .S2     B_o6_r21,   B_o6_r0,    B_o6                ;[17,1]

								||      ADD     .L1     A_o0_r2,    A_o0_r1,    A_o0_r21            ;[17,1]

								||      ADD     .D2     B_o4_r2,    B_o4_r1,    B_o4_r21            ;[17,1]

								||      DOTPU4  .M2X    B_l0_7654,  A_h0_210_,  B_o5_r0             ;[17,1]

								||      DOTPU4  .M1X    B_l1_3210,  A_h1_210_,  A_o1_r1             ;[17,1]

								||[ A_i]LDNDW   .D1T2   *A_row1++(8),           B_l1_7654:B_l1_3210 ;[ 5,2]

								; Second half, and the entry point for the first pass.
								;   [ A_i] : work for the next pass (branch back, DOTPU4s, row-0 load);
								;            switched off once A_i is zero, when RET is issued instead.
								;   [!B_p] : final adds and stores of the pass that just completed;
								;            suppressed while B_p is 1 (first pass, nothing to store yet).
								; The five packets after the RET / B run in its delay slots, so the last
								; pass still stores its eight outputs before control leaves the routine.
								loop_6:

								  [!A_i]RET     .S2     B_ret_addr

								||[ A_i]B       .S1     loop                                        ;[18,1]

								||[!B_p]ADD     .L1     A_o0_r21,   A_o0_r0,    A_o0                ;[18,1]

								||[ A_i]DOTPU4  .M1X    B_l1_5432,  A_h1_210_,  A_o3_r1             ;[ 6,2]

								||[ A_i]DOTPU4  .M2X    B_l1_9876,  A_h1_210_,  B_o7_r1             ;[ 6,2]

								||[ A_i]LDNDW   .D1T2   *A_row0++(8),           B_l0_7654:B_l0_3210 ;[ 6,2]


								; Stores use double-word indices relative to B_o_ptr: [3] holds outputs
								; 6-7, [1] outputs 2-3, [2] outputs 4-5; the last store writes outputs
								; 0-1 and then advances B_o_ptr by four double words (eight outputs).
								  [!B_p]STNDW   .D2T2   B_o7:B_o6,  *+B_o_ptr[3]                    ;[19,1]

								||[!B_p]ADD     .S2     B_o4_r21,   B_o4_r0,    B_o4                ;[19,1]

								||[ A_i]DOTPU4  .M1X    B_l1_5432,  A_h1__210,  A_o2_r1             ;[ 7,2]

								||[ A_i]DOTPU4  .M2X    B_l1_9876,  A_h1__210,  B_o6_r1             ;[ 7,2]


								  [!B_p]STNDW   .D2T1   A_o3:A_o2,  *+B_o_ptr[1]                    ;[20,1]

								||[!B_p]ADD     .L2     B_o5_r2,    B_o5_r1,    B_o5_r21            ;[20,1]

								||[ A_i]DOTPU4  .M1X    B_l0_5432,  A_h0__210,  A_o2_r0             ;[ 8,2]

								||[ A_i]DOTPU4  .M2X    B_l2_9876,  A_h2_210_,  B_o7_r2             ;[ 8,2]


								  [!B_p]ADD     .L2     B_o5_r21,   B_o5_r0,    B_o5                ;[21,1]

								||[!B_p]ADD     .D1     A_o1_r2,    A_o1_r1,    A_o1_r21            ;[21,1]

								||[ A_i]DOTPU4  .M1X    B_l0_5432,  A_h0_210_,  A_o3_r0             ;[ 9,2]

								||[ A_i]DOTPU4  .M2X    B_l0_9876,  A_h0__210,  B_o6_r0             ;[ 9,2]


								  [!B_p]STNDW   .D2T2   B_o5:B_o4,  *+B_o_ptr[2]                    ;[22,1]

								||[!B_p]ADD     .D1     A_o1_r21,   A_o1_r0,    A_o1                ;[22,1]

								||[ A_i]DOTPU4  .M1X    B_l2_5432,  A_h2__210,  A_o2_r2             ;[10,2]

								||[ A_i]DOTPU4  .M2X    B_l2_9876,  A_h2__210,  B_o6_r2             ;[10,2]


								; ZERO B_p ends the first-pass suppression: from the next pass on, the
								; [!B_p] adds and stores above are active.
								  [!B_p]STNDW   .D2T1   A_o1:A_o0,  *B_o_ptr++[4]                   ;[23,1]

								||[ A_i]DOTPU4  .M1X    B_l2_5432,  A_h2_210_,  A_o3_r2             ;[11,2]

								||[ A_i]DOTPU4  .M2X    B_l2_7654,  A_h2__210,  B_o4_r2             ;[11,2]

								||      ZERO    .S2     B_p

								* =========================== PIPE LOOP EPILOG ============================ *

								; ===== epilog collapsed completely

								* ========================================================================= *

								*   End of file:  img_corr_3x3.asm                                          *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *