;* ======================================================================== *;
;*  TEXAS INSTRUMENTS, INC.                                                 *;
;*                                                                          *;
;*  IMGLIB  DSP Image/Video Processing Library                              *;
;*                                                                          *;
;*      Release:        Revision 1.04b                                      *;
;*      CVS Revision:   1.3     Sun Sep 29 03:32:28 2002 (UTC)              *;
;*      Snapshot date:  23-Oct-2003                                         *;
;*                                                                          *;
;*  This library contains proprietary intellectual property of Texas        *;
;*  Instruments, Inc.  The library and its source code are protected by     *;
;*  various copyrights, and portions may also be protected by patents or    *;
;*  other legal protections.                                                *;
;*                                                                          *;
;*  This software is licensed for use with Texas Instruments TMS320         *;
;*  family DSPs.  This license was provided to you prior to installing      *;
;*  the software.  You may review this license by consulting the file       *;
;*  TI_license.PDF which accompanies the files in this library.             *;
;* ------------------------------------------------------------------------ *;
;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;
;*                          All Rights Reserved.                            *;
;* ======================================================================== *;


;* ======================================================================== *;
;*  Assembler compatibility shim for assembling 4.30 and later code on      *;
;*  tools prior to 4.30.                                                    *;
;* ======================================================================== *;

        ; Record the assembler version in the substitution symbol $asmver.
        ; If the assembler does not define .ASSEMBLER_VERSION, $asmver is
        ; set to 0 so that the "< 430" test below is taken.
        .if $isdefed(".ASSEMBLER_VERSION")
        .asg    .ASSEMBLER_VERSION, $asmver
        .else
        .asg    0,    $asmver
        .endif

        ; For $asmver below 430 (presumably tools release 4.30, per the
        ; banner above), define the names that newer source relies on.
        .if ($asmver < 430)

        ; Call/return mnemonics are mapped onto the plain branch B.
        .asg    B,    CALL     ; Function Call
        .asg    B,    RET      ; Return from a Function
        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.

        ; The BNOP-based variants are only defined when assembling for
        ; the C6400 target.
        .if .TMS320C6400
        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call
        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return
        .asg    BNOP, CRNOP    ; C64x Fn call w/, Call/Ret chaining via BNOP.
        .endif

        ; .asmfunc / .endasmfunc are given empty substitutions, so any
        ; use of them in the source expands to nothing on these tools.
        .asg    , .asmfunc     ; .func equivalent for hand-assembly code
        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code

        .endif

;* ======================================================================== *;
;*  End of assembler compatibility shim.                                    *;
;* ======================================================================== *;


* ========================================================================= *
*   NAME                                                                    *
*       IMG_sad_16x16 -- Sum of Absolute Differences on single 16x16 block  *
*                                                                           *
*   USAGE                                                                   *
*       unsigned IMG_sad_16x16                                              *
*       (                                                                   *
*           const unsigned char *restrict srcImg,  /* 16x16 source block */ *
*           const unsigned char *restrict refImg,  /* Reference image    */ *
*           int pitch                              /* Width of ref image */ *
*       );                                                                  *
*                                                                           *
*       The code accepts a pointer to the 16x16 source block (srcImg),      *
*       and a pointer to the upper-left corner of a target position in      *
*       a reference image (refImg).  The width of the reference image       *
*       is given by the pitch argument.                                     *
*                                                                           *
*       The function returns the sum of the absolute differences            *
*       between the source block and the 16x16 region pointed to in the     *
*       reference image.                                                    *
*                                                                           *
*   DESCRIPTION                                                             *
*       The algorithm takes the difference between the pixel values in      *
*       the source image block and the corresponding pixels in the          *
*       reference image.  It then takes the absolute values of these        *
*       differences, and accumulates them over the entire 16x16 region.     *
*       It returns the final accumulation.                                  *
*                                                                           *
*   C CODE                                                                  *
*       The following is a C code description of the algorithm that lacks   *
*       restrictions.  The assembly code may have additional restrictions   *
*       as noted below.                                                     *
*                                                                           *
*           unsigned IMG_sad_16x16                                          *
*           (                                                               *
*               const unsigned char *restrict srcImg,                       *
*               const unsigned char *restrict refImg,                       *
*               int pitch                                                   *
*           )                                                               *
*           {                                                               *
*               int i, j;                                                   *
*               unsigned sad = 0;                                           *
*                                                                           *
*               for (i = 0; i < 16; i++)                                    *
*                   for (j = 0; j < 16; j++)                                *
*                       sad += abs(srcImg[j+i*16] - refImg[j+i*pitch]);     *
*                                                                           *
*               return sad;                                                 *
*           }                                                               *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       srcImg must be double-word aligned: this version reads it with      *
*       LDDW.  refImg may have any alignment (it is read with LDNDW).       *
*                                                                           *
*   MEMORY NOTE                                                             *
*       No bank conflicts occur.                                            *
*       Endian Neutral.                                                     *
*                                                                           *
*   NOTES                                                                   *
*       This kernel blocks interrupts for 61 cycles.                        *
*                                                                           *
*   CYCLES                                                                  *
*       67 cycles                                                           *
*                                                                           *
*   CODESIZE                                                                *
*       168 bytes                                                           *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *


        ; ------------------------------------------------------------------
        ; _IMG_sad_16x16 -- sum of absolute differences over a 16x16 block.
        ;
        ;   In :  A4 = srcImg   (16 bytes per row, read with LDDW)
        ;         B4 = refImg   (read with LDNDW, advanced by pitch per row)
        ;         A6 = pitch    (byte distance between reference rows)
        ;         B3 = return address
        ;   Out:  A4 = SAD
        ;
        ;   Registers written: A0-A9, B4-B9, B16-B19.  No stack is used.
        ;
        ;   Each row is handled as four 4-byte groups (3210, 7654, BA98,
        ;   FEDC).  SUBABS4 forms the four byte-wise absolute differences
        ;   of a group and DOTPU4 against 0x01010101 adds those four bytes
        ;   into one word.  Groups 3210/7654 are summed on the A side into
        ;   A_sad, groups BA98/FEDC on the B side into B_sad; the two
        ;   partial sums are combined in the last RET delay slot.
        ; ------------------------------------------------------------------
        .sect ".text:_sad_16x16"
        .global _IMG_sad_16x16
_IMG_sad_16x16:

* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
*   Several symbols deliberately share one physical register (for example   *
*   A_pitch/A_s3210 on A6, A_srcImg/A_retval on A4, A_r3210/A_s1 on A8).    *
*   The sharing relies on the exact cycle at which each value is produced   *
*   and consumed in the schedule below, so instructions must not be moved.  *
        .asg            A0,         A_p
        .asg            A1,         A_i
        .asg            A2,         A_s0
        .asg            A3,         A_k1
        .asg            A4,         A_srcImg
        .asg            A4,         A_retval
        .asg            A5,         A_sad
        .asg            A6,         A_pitch
        .asg            A6,         A_s3210
        .asg            A7,         A_d7654
        .asg            A7,         A_s7654
        .asg            A8,         A_r3210
        .asg            A8,         A_s1
        .asg            A9,         A_d3210
        .asg            A9,         A_r7654
        .asg            B3,         B_retaddr
        .asg            B4,         B_refImg
        .asg            B5,         B_pitch
        .asg            B6,         B_rBA98
        .asg            B7,         B_dFEDC
        .asg            B7,         B_rFEDC
        .asg            B8,         B_dBA98
        .asg            B8,         B_sBA98
        .asg            B9,         B_sFEDC
        .asg            B16,        B_sad
        .asg            B17,        B_srcImg
        .asg            B18,        B_s2
        .asg            B19,        B_s3
* ========================================================================= *
* =========================== PIPE LOOP PROLOG ============================ *
*   The prolog issues the three loads of the first row and sets up the      *
*   loop state.  The trailing ;[c,i] tags are the original scheduler        *
*   annotations (they read as cycle-within-iteration, iteration number).    *
*                                                                           *
*   Packet 1: load reference bytes 8..15 of the row; start a branch to      *
*   "loop" (it takes effect only after its delay slots, i.e. after the      *
*   code has fallen through into the kernel once); begin building the       *
*   DOTPU4 constant; B_srcImg = srcImg + 8 for the upper half of each row.  *
        LDNDW   .D2T2   *+B_refImg(8),          B_rFEDC:B_rBA98 ;[ 1,1]
||      B       .S2     loop
||      MVK     .S1     0x0101,     A_k1
||      ADD     .L2X    A_srcImg,   8,          B_srcImg

*   Packet 2: load source bytes 0..7 and 8..15, advancing both source       *
*   pointers by 16; copy pitch to the B side (A6 is read here in the same   *
*   packet that issues a load into A7:A6, which relies on the load delay    *
*   slots); PACK2 turns A_k1 into 0x01010101.                               *
        LDDW    .D1T1   *A_srcImg++(16),        A_s7654:A_s3210 ;[ 2,1]
||      LDDW    .D2T2   *B_srcImg++(16),        B_sFEDC:B_sBA98 ;[ 2,1]
||      MV      .L2X    A_pitch,    B_pitch
||      PACK2   .L1     A_k1,       A_k1,       A_k1

*   Packet 3: load reference bytes 0..7 into the A side and step refImg     *
*   to the next row; clear both accumulators; A_i = 15 is the BDEC loop     *
*   counter; A_p = 3 is the count of kernel passes during which the         *
*   accumulate ADDs stay disabled (the pipeline holds no valid sums yet).   *
        LDNDW   .D2T1   *B_refImg++(B_pitch),   A_r7654:A_r3210 ;[ 3,1]
||      ZERO    .L2     B_sad
||      ZERO    .L1     A_sad
||      MVK     .S1     15,         A_i
||      MVK     .D1     3,          A_p

* =========================== PIPE LOOP KERNEL ============================ *
*   Three-cycle kernel; every pass starts the loads for a new row while     *
*   finishing the arithmetic of earlier rows.  All ADDs into A_sad/B_sad    *
*   are predicated on A_p == 0, so they are skipped until A_p has counted   *
*   down from 3.                                                            *
*                                                                           *
*   NOTE(review): tracing A_i = 15 together with the extra branch issued    *
*   in the prolog, the kernel appears to run 17 times, so 18 rows are       *
*   loaded in total while only 16 are accumulated; the last two rows of     *
*   loads (source and reference) are read and discarded.  Confirm against   *
*   the BDEC definition if buffers sit at the end of mapped memory.         *
*                                                                           *
*   Cycle 1: add the 7654 and FEDC group sums; loop branch/decrement;       *
*   sum group 3210; abs-diff group FEDC; load ref bytes 8..15 of a new row. *
loop:
  [!A_p]ADD     .L1     A_sad,      A_s1,       A_sad           ;[13,1]
||[!A_p]ADD     .S2     B_sad,      B_s3,       B_sad           ;[13,1]
||[ A_i]BDEC    .S1     loop,       A_i                         ;[10,2]
||      DOTPU4  .M1     A_d3210,    A_k1,       A_s0            ;[10,2]
||      SUBABS4 .L2     B_sFEDC,    B_rFEDC,    B_dFEDC         ;[ 7,3]
||      LDNDW   .D2T2   *+B_refImg(8),          B_rFEDC:B_rBA98 ;[ 1,5]

*   Cycle 2: add the BA98 group sum; sum group FEDC; abs-diff groups 7654   *
*   and BA98; load both halves of a new source row.                         *
  [!A_p]ADD     .S2     B_sad,      B_s2,       B_sad           ;[14,1]
||      DOTPU4  .M2X    B_dFEDC,    A_k1,       B_s3            ;[ 8,3]
||      SUBABS4 .L1     A_s7654,    A_r7654,    A_d7654         ;[ 8,3]
||      SUBABS4 .L2     B_sBA98,    B_rBA98,    B_dBA98         ;[ 8,3]
||      LDDW    .D1T1   *A_srcImg++(16),        A_s7654:A_s3210 ;[ 2,5]
||      LDDW    .D2T2   *B_srcImg++(16),        B_sFEDC:B_sBA98 ;[ 2,5]

*   Cycle 3: count A_p down while it is non-zero; add the 3210 group sum;   *
*   sum groups 7654 and BA98; abs-diff group 3210; load ref bytes 0..7 of   *
*   a new row and step refImg by pitch.                                     *
  [ A_p]SUB     .D1     A_p,        1,          A_p             ;[15,1]
||[!A_p]ADD     .S1     A_sad,      A_s0,       A_sad           ;[15,1]
||      DOTPU4  .M1     A_d7654,    A_k1,       A_s1            ;[ 9,3]
||      DOTPU4  .M2X    B_dBA98,    A_k1,       B_s2            ;[ 9,3]
||      SUBABS4 .L1     A_s3210,    A_r3210,    A_d3210         ;[ 9,3]
||      LDNDW   .D2T1   *B_refImg++(B_pitch),   A_r7654:A_r3210 ;[ 3,5]

* =========================== PIPE LOOP EPILOG ============================ *
*   Drain: the group sums of the last two rows are still in flight when     *
*   the loop exits.  They are added here without the A_p predicate, and     *
*   the final group-3210 DOTPU4 is issued.                                  *
        ADD     .L1     A_sad,      A_s1,       A_sad           ;[13,4]
||      ADD     .S2     B_sad,      B_s3,       B_sad           ;[13,4]
||      DOTPU4  .M1     A_d3210,    A_k1,       A_s0            ;[10,5]

*   RET is issued here; the five execute packets that follow occupy its     *
*   delay slots, so the remaining accumulation costs no extra cycles.       *
        ADD     .L2     B_sad,      B_s2,       B_sad           ;[14,4]
||      RET     .S2     B_retaddr

        ADD     .S1     A_sad,      A_s0,       A_sad           ;[15,4]

        ADD     .L1     A_sad,      A_s1,       A_sad           ;[13,5]
||      ADD     .S2     B_sad,      B_s3,       B_sad           ;[13,5]

        ADD     .S2     B_sad,      B_s2,       B_sad           ;[14,5]

        ADD     .S1     A_sad,      A_s0,       A_sad           ;[15,5]

*   Last delay slot: combine the A-side and B-side partial sums into A4.    *
        ADD     .S1X    A_sad,      B_sad,      A_retval
* ========================================================================= *
* ========================================================================= *
*   End of file: img_sad_16x16.asm                                          *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *