;* ======================================================================== *;
;*  TEXAS INSTRUMENTS, INC.                                                 *;
;*                                                                          *;
;*  IMGLIB  DSP Image/Video Processing Library                              *;
;*                                                                          *;
;*      Release:        Revision 1.04b                                      *;
;*      CVS Revision:   1.10    Sun Sep 29 03:32:24 2002 (UTC)              *;
;*      Snapshot date:  23-Oct-2003                                         *;
;*                                                                          *;
;*  This library contains proprietary intellectual property of Texas        *;
;*  Instruments, Inc.  The library and its source code are protected by     *;
;*  various copyrights, and portions may also be protected by patents or    *;
;*  other legal protections.                                                *;
;*                                                                          *;
;*  This software is licensed for use with Texas Instruments TMS320         *;
;*  family DSPs.  This license was provided to you prior to installing      *;
;*  the software.  You may review this license by consulting the file       *;
;*  TI_license.PDF which accompanies the files in this library.             *;
;* ------------------------------------------------------------------------ *;
;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;
;*                          All Rights Reserved.                            *;
;* ======================================================================== *;


;* ======================================================================== *;
;*  Assembler compatibility shim for assembling 4.30 and later code on      *;
;*  tools prior to 4.30.                                                    *;
;* ======================================================================== *;

        .if $isdefed(".ASSEMBLER_VERSION")
        ; Tools that define .ASSEMBLER_VERSION: record the reported version.
        .asg    .ASSEMBLER_VERSION, $asmver
        .else
        ; Symbol not defined: use 0 so the aliases below are installed.
        .asg    0,    $asmver
        .endif

        .if ($asmver < 430)

        ; --- Function-boundary directives become empty strings ------------
        .asg    , .asmfunc     ; stands in for .func in hand-written assembly
        .asg    , .endasmfunc  ; stands in for .endfunc in hand-written assembly

        ; --- Call/return pseudo-ops all map onto a plain branch -----------
        .asg    B,    CALL     ; function call
        .asg    B,    CALLRET  ; function call with call/return chaining
        .asg    B,    RET      ; function return

        ; --- BNOP-based variants, only when assembling for C64x -----------
        .if .TMS320C6400
        .asg    BNOP, CALLNOP  ; function call via BNOP
        .asg    BNOP, CRNOP    ; call/return chaining via BNOP
        .asg    BNOP, RETNOP   ; function return via BNOP
        .endif

        .endif

;* ======================================================================== *;
;*  End of assembler compatibility shim.                                    *;
;* ======================================================================== *;


* ========================================================================= *
*                                                                           *
*   TEXAS INSTRUMENTS, INC.                                                 *
*                                                                           *
*   NAME                                                                    *
*       IMG_mad_16x16                                                       *
*                                                                           *
*   REVISION DATE                                                           *
*       18-Dec-2001                                                         *
*                                                                           *
*   USAGE                                                                   *
*       This routine is C-callable and can be called as:                    *
*                                                                           *
*       void IMG_mad_16x16                                                  *
*       (                                                                   *
*           const unsigned char *restrict refImg,                           *
*           const unsigned char *restrict srcImg,                           *
*           int pitch,                                                      *
*           int                 h,                                          *
*           int                 v,                                          *
*           unsigned            *restrict match                             *
*       )                                                                   *
*                                                                           *
*       refImg          Reference image.                                    *
*       srcImg[256]     16x16 block image to look for.                      *
*       pitch           Width of reference image.                           *
*       h               Horiz. size of search area.                         *
*       v               Vert.  size of search area. Must be multiple of 2.  *
*       match[2]        Result:                                             *
*                           match[0] is packed x, y.                        *
*                           match[1] is MAD value.                          *
*                                                                           *
*   DESCRIPTION                                                             *
*       This routine returns the location of the minimum absolute           *
*       difference between a 16x16 search block and some block in a         *
*       (h + 16) x (v + 16) search area. h and v are the sizes of the       *
*       search space for the top left coordinate of the search block.       *
*       refImg points to the top left pixel of the search area.             *
*                                                                           *
*            (0,0)          (h,0)      (h+16,0)                             *
*              ;--------------+--------;                                    *
*              ;    search    |        ;                                    *
*              ;    space     |        ;                                    *
*              ;              |        ;        search area                 *
*              ;--------------+        ;        within reference image      *
*            (0,v)          (h,v)      ;                                    *
*              ;                       ;                                    *
*              ;-----------------------;                                    *
*            (0, v+16)                 (h+16,v+16)                          *
*                                                                           *
*       The location is returned relative to the above coordinate system    *
*       as x and y packed in two 16-bit quantities in a 32-bit word:        *
*                                                                           *
*                   31             16 15             0                      *
*                   +----------------+----------------+                     *
*        match[0]:  |       x        |       y        |                     *
*                   +----------------+----------------+                     *
*                                                                           *
*                   31                               0                      *
*                   +---------------------------------+                     *
*        match[1]:  |   SAD value at location x, y    |                     *
*                   +---------------------------------+                     *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       srcImg and refImg do not alias in memory.                           *
*       The routine is written for Little Endian configuration.             *
*       Two MADS are performed together and hence it is assumed that v      *
*       the vertical dimension is a multiple of 2.                          *
*                                                                           *
*   MEMORY NOTE                                                             *
*       No special alignment of arrays is required.                         *
*                                                                           *
*   TECHNIQUES                                                              *
*       The two outer loops are merged, the two inner loops are merged.     *
*       The inner loop processes 2 lines of 2 search locations in parallel. *
*       The search is performed in top-to-bottom, left-to-right order,      *
*       with the earliest match taking precedence in the case of ties.      *
*       Further use is made of C64x specific instructions such as SUBABS4   *
*       and DOTPU4. The SUBABS4 takes the absolute difference on four 8     *
*       bit quantities packed into a 32 bit word. The DOTPU4 performs four  *
*       8 bit wide multiplies and adds the results together.                *
*                                                                           *
*   C CODE                                                                  *
*                                                                           *
*       void IMG_mad_16x16                                                  *
*       (                                                                   *
*           const unsigned char *restrict refImg,                           *
*           const unsigned char *restrict srcImg,                           *
*           int pitch, int h, int v,                                        *
*           unsigned int *restrict match                                    *
*       )                                                                   *
*       {                                                                   *
*           int i, j, x, y, matx, maty;                                     *
*           unsigned matpos, matval;                                        *
*                                                                           *
*           matval = ~0U;                                                   *
*           matx   = maty = 0;                                              *
*                                                                           *
*           for (x = 0; x < h; x++)                                         *
*               for (y = 0; y < v; y++)                                     *
*               {                                                           *
*                   unsigned acc = 0;                                       *
*                                                                           *
*                   for (i = 0; i < 16; i++)                                *
*                       for (j = 0; j < 16; j++)                            *
*                           acc += abs(srcImg[i*16 + j] -                   *
*                                      refImg[(i+y)*pitch + x + j]);        *
*                                                                           *
*                   if (acc < matval)                                       *
*                   {                                                       *
*                       matval = acc;                                       *
*                       matx   = x;                                         *
*                       maty   = y;                                         *
*                   }                                                       *
*               }                                                           *
*                                                                           *
*           matpos    = (0xffff0000 & (matx << 16)) |                       *
*                       (0x0000ffff & maty);                                *
*           match[0] = matpos;                                              *
*           match[1] = matval;                                              *
*       }                                                                   *
*                                                                           *
*   CYCLES                                                                  *
*       38 * h * v + 20                                                     *
*                                                                           *
*       e.g. h=v= 4:   628 cycles                                           *
*            h=v=32: 38932 cycles                                           *
*                                                                           *
*   CODESIZE                                                                *
*       776 bytes.                                                          *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *


; ====================== SYMBOLIC REGISTER ASSIGNMENTS ====================== ;
;  Aliases are grouped by datapath and listed in physical-register order.    ;
;  Several aliases on one register means that register is time-shared.       ;
; ========================================================================= ;

; ------------------------------ A datapath -------------------------------- ;
        ; A0
        .asg    A0,     A_best
        .asg    A0,     A_i
        .asg    A0,     A_sub
        .asg    A0,     A_w
        ; A1 .. A4
        .asg    A1,     A_vl
        .asg    A2,     A_vl1
        .asg    A3,     A_vptch
        .asg    A4,     A_ref_img
;       .asg    A5,     A_c40
        ; A6 (incoming pitch argument, later reused as A_hl)
        .asg    A6,     A_hl
        .asg    A6,     A_pitch
        ; A7 .. A9
        .asg    A7,     A_ffff
        .asg    A8,     A_v
        .asg    A9,     A_bptch
        ; A10, A11, A14 (the three registers the routine stores on the stack)
        .asg    A10,    A_ref_ri0d
        .asg    A11,    A_ref_ri0c
        .asg    A14,    A_ptch
        ; A16, A17 (register pair written out by the final STNDW)
        .asg    A16,    A_matpos
        .asg    A17,    A_matval
        ; A18
        .asg    A18,    A_2diffi_1c
        .asg    A18,    A_2sumi1d
        .asg    A18,    A_diffi_1d
        ; A19
        .asg    A19,    A_2sumi1c
        .asg    A19,    A_sumi1c
        ; A20
        .asg    A20,    A_2diffi_1d
        .asg    A20,    A_2sumi0d
        .asg    A20,    A_diffi_0d
        .asg    A20,    A_diffi_1c
        ; A21 .. A23
        .asg    A21,    A_matchi1
        .asg    A22,    A_k_ones
        .asg    A23,    A_pitch_8
        ; A24
        .asg    A24,    A_2sumi0c
        .asg    A24,    A_2sumi0cd
        .asg    A24,    A_srchid
        ; A25
        .asg    A25,    A_2sumi1cd
        .asg    A25,    A_diffi_0c
        .asg    A25,    A_srchic
        ; A26
        .asg    A26,    A_2srchid
        .asg    A26,    A_sumi1
        .asg    A26,    A_sumi1cd
        ; A27
        .asg    A27,    A_2diffi_0c
        .asg    A27,    A_2srchic
        .asg    A27,    A_sumi0cd
        .asg    A27,    A_sumi1d
        ; A28
        .asg    A28,    A_2diffi_0d
        .asg    A28,    A_ref_ri2d
        .asg    A28,    A_sumi0c
        ; A29
        .asg    A29,    A_ref_ri2c
        .asg    A29,    A_sumi0d
        ; A30 (A_csr: copy of CSR's value; shares the register with A_ref_ri1d)
        .asg    A30,    A_csr
        .asg    A30,    A_ref_ri1d
        ; A31
        .asg    A31,    A_2sumi1
        .asg    A31,    A_ref_ri1c

; ------------------------------ B datapath -------------------------------- ;
        ; B0
        .asg    B0,     B_best
        ; B1 (CSR with GIE cleared during setup, then B_ml)
        .asg    B1,     B_no_gie
        .asg    B1,     B_ml
        ; B2, B3 (CSR's value, return address)
        .asg    B2,     B_csr
        .asg    B3,     B_ret
        ; B4 .. B8
        .asg    B4,     B_src_img
        .asg    B5,     B_matval
        .asg    B6,     B_h
        .asg    B8,     B_match
        ; B9
        .asg    B9,     B_2diffi_1a
        .asg    B9,     B_2sumi0
        .asg    B9,     B_diffi_1a
        .asg    B9,     B_sumi0
        .asg    B9,     B_sumi0ab
        ; B15 (stack pointer)
        .asg    B15,    B_SP
        ; B16, B17
        .asg    B16,    B_ref_ri0b
        .asg    B17,    B_ref_ri0a
        ; B18
        .asg    B18,    B_2sumi0ab
        .asg    B18,    B_2sumi0b
        .asg    B18,    B_sumi0a
        ; B19
        .asg    B19,    B_diffi_0b
        .asg    B19,    B_sumi0b
        .asg    B19,    B_sumi1b
        ; B20
        .asg    B20,    B_src_imgcp
        ; B21
        .asg    B21,    B_2diffi_0b
        .asg    B21,    B_2sumi0a
        ; B22, B23
        .asg    B22,    B_matchi0
        .asg    B23,    B_k_ones
        ; B24
        .asg    B24,    B_2sumi1a
        .asg    B24,    B_srchib
        ; B25
        .asg    B25,    B_2diffi_0a
        .asg    B25,    B_diffi_0a
        .asg    B25,    B_srchia
        ; B26
        .asg    B26,    B_2srchib
        .asg    B26,    B_sumi1ab
        ; B27
        .asg    B27,    B_2srchia
        .asg    B27,    B_2sumi1ab
        .asg    B27,    B_diffi_1b
        ; B28
        .asg    B28,    B_2diffi_1b
        .asg    B28,    B_ref_ri2b
        ; B29
        .asg    B29,    B_ref_ri2a
        .asg    B29,    B_sumi1a
        ; B30
        .asg    B30,    B_ref_ri1b
        ; B31
        .asg    B31,    B_2sumi1b
        .asg    B31,    B_ref_ri1a
; ============================================================================

        .sect ".text:_mad_16x16"
        .global _IMG_mad_16x16
_IMG_mad_16x16:
; ---------------------------------------------------------------------------
; Searches for the 16x16 block position with the smallest accumulated
; absolute difference and writes { packed position, best value } to match[].
;
; Entry registers (symbolic name = physical register):
;       A_ref_img = A4   refImg
;       B_src_img = B4   srcImg
;       A_pitch   = A6   pitch   (A6 is reused as A_hl after setup)
;       B_h       = B6   h
;       A_v       = A8   v
;       B_match   = B8   match
;
; Register/state handling visible in this routine:
;   * A14, A11 and A10 are stored in a 3-word stack frame and reloaded on exit.
;   * CSR is read into B_csr, written back with bit 0 (GIE) cleared, and
;     restored from B_csr in the last delay slot of the return branch.
;
; NOTE(review): the code is hand-scheduled.  It relies on multiply, load and
; branch delay slots and on branches to "label + byte offset"; do not reorder
; instructions inside or between execute packets.
; ---------------------------------------------------------------------------

        STW     .D2T1   A14,        *B_SP--[3]               ; Save A14
||      MVC     .S2     CSR,        B_csr                    ; Remember CSR

        AND             B_csr,      -2,         B_no_gie     ; Clear GIE
||      STW     .D2T1   A11,        *+B_SP[2]                ; Save A11
||      ADD     .D1     A_v,        17,         A_w          ; w = v + 17
||      MPY     .M1     -16,        A_pitch,    A_bptch      ; -16 * pitch

        STW     .D2T1   A10,        *+B_SP[1]                ; Save A10
||      MVC     .S2     B_no_gie,   CSR                      ; Disable ints
||      MPY     .M1     A_w,        A_pitch,    A_vptch      ; vptch = (v + 17) * pitch
||      SUB     .L1     A_pitch,    8,          A_pitch_8    ; pitch - 8
||      MVKL    .S1     0000FFFFh,  A_ffff                   ; low half of 0000FFFFh mask
; ===== Interrupts masked here =====

        MVKL    .S1     01010101h,  A_k_ones                 ; constant
||      MV      .L1     A_v,        A_vl                     ; vl = v
||      LDNDW   .D      *A_ref_img++(8),        B_ref_ri0a:B_ref_ri0b  ; Load ref

        MVKH    .S1     01010101h,  A_k_ones                 ; constant
||      ADD     .L1     -1,         A_vptch,    A_vptch      ; vptch--
||      MPY     .M2X    A_v,        B_h,        B_ml         ; ml = h*v
||      LDNDW   .D      *A_ref_img++(A_pitch_8),A_ref_ri0c:A_ref_ri0d   ; Load
||      MV      .D2     B_src_img,  B_src_imgcp              ; srcimg

        MVKL    .S2     01010101h,  B_k_ones                 ; constant
||      MV      .L1     A_v,        A_vl1                    ; v
||      MVKH    .S1     0000FFFFh,  A_ffff                   ; high half: A_ffff = 0000FFFFh
||      LDNDW   .D2T2   *B_src_imgcp++,         B_srchia:B_srchib      ; Load src
||      SUB     .D1     A_pitch,    A_bptch,    A_bptch      ; bptch = pitch - (-16*pitch) = 17*pitch

        MVKH    .S2     01010101h,  B_k_ones                 ; constant
||      OR      .L2     -1,         B_matval,   B_matval     ; matval = all ones (largest unsigned)
||      MV      .S1     A_bptch,    A_ptch                   ; ptch
||      ZERO    .L1     A_hl                                 ; hl = 0 (A6 no longer holds pitch)
||      LDNDW   .D      *A_ref_img++(8),        B_ref_ri1a:B_ref_ri1b ; Load ref

; ============================ PIPE LOOP PROLOG ==============================
; The branches below target "label + byte offset" inside the kernel.
; NOTE(review): with 4-byte instructions, L_5 + 12, L_6 + 4, L_7 + 8 and
; L_8 + 8 appear to enter those execute packets after their leading ADDs,
; i.e. skipping accumulations that have no valid inputs yet -- confirm against
; the assembled layout before touching instruction order in L_5 .. L_8.

        LDNDW   .D2T1   *B_src_imgcp++,         A_srchic:A_srchid     ; Load
||      ZERO    .S1     A_matpos                             ; matpos

        LDNDW   .D      *A_ref_img++(A_pitch_8),A_ref_ri1c:A_ref_ri1d ;[ 4,1]

        LDNDW   .D      *A_ref_img++(8),        B_ref_ri2a:B_ref_ri2b ;[ 5,1]

        LDNDW   .D      *A_ref_img++(A_pitch_8),A_ref_ri2c:A_ref_ri2d ;[ 6,1]

        SUBABS4 .L2     B_ref_ri1b, B_srchib,   B_diffi_1b   ;[ 7,1]
||      LDNDW   .D2T2   *B_src_imgcp++,  B_2srchia:B_2srchib ;[ 7,1]
||      B       .S2     L_5 + 12                             ;

        LDNDW   .D2T1   *B_src_imgcp++,  A_2srchic:A_2srchid ;[ 8,1]
||      B       .S2     L_6 + 4                              ;

; M_LOOP: per-pass setup.  Steps vl by 2, clears both accumulators
; (matchi1 via MPY by 0, matchi0 via ZERO), sets the kernel counter i = 7,
; and when vl reaches 0 selects vptch as the next rewind and reloads vl = v.
M_LOOP:
        ADD     .D1     -2,         A_vl,       A_vl         ; vl-= 2
||      MPY     .M1     0, A_matchi1, A_matchi1              ; matchi1 = 0
||[!A_vl1 ]MV   .S1     A_v,        A_vl1                    ;[16,0]
||      SUBABS4 .L2     B_ref_ri1a, B_srchia,   B_diffi_1a   ;[ 9,1]
||      SUBABS4 .L1     A_ref_ri1d, A_srchid,   A_diffi_1d   ;[ 9,1]
||      LDNDW   .D2T2   *B_src_imgcp++,  B_srchia:B_srchib   ;[ 1,2]
||      B       .S2     L_7 + 8                              ;

        MVK     .S1     7,          A_i                      ; i = 7
||      MV      .D2     B_ref_ri2b, B_ref_ri0b               ;[10,1]
||      SUBABS4 .L2     B_ref_ri0b, B_srchib,   B_diffi_0b   ;[10,1]
||      DOTPU4  .M2     B_diffi_1a, B_k_ones,   B_sumi1a     ;[10,1]
||      SUBABS4 .L1     A_ref_ri1c, A_srchic,   A_diffi_1c   ;[10,1]
||      LDNDW   .D      *A_ref_img++(8), B_ref_ri1a:B_ref_ri1b;[ 2,2]
||      B       .S2     L_8 + 8                              ;[ 2,2]

        MV      .S2     B_ref_ri2a, B_ref_ri0a               ;[11,1]
||      DOTPU4  .M2     B_diffi_1b, B_k_ones,   B_sumi1b     ;[11,1]
||      SUBABS4 .L2     B_ref_ri0a, B_srchia,   B_diffi_0a   ;[11,1]
||      SUBABS4 .L1     A_ref_ri0d, A_srchid,   A_diffi_0d   ;[11,1]
||      DOTPU4  .M1     A_diffi_1c, A_k_ones,   A_sumi1c     ;[11,1]
||      LDNDW   .D2T1   *B_src_imgcp++,  A_srchic:A_srchid   ;[ 3,2]
||      B       .S1     LOOP_X                               ;
||[!A_vl]MV     .D1     A_vptch,    A_ptch                   ; vl == 0: ptch = vptch

        SUBABS4 .L1     A_ref_ri0c, A_srchic,   A_diffi_0c   ;[12,1]
||      DOTPU4  .M2     B_diffi_0a, B_k_ones,   B_sumi0a     ;[12,1]
||      SUBABS4 .L2     B_ref_ri1a, B_2srchia,  B_2diffi_0a  ;[12,1]
||      DOTPU4  .M1     A_diffi_0d, A_k_ones,   A_sumi0d     ;[12,1]
||      LDNDW   .D      *A_ref_img++(A_pitch_8),A_ref_ri1c:A_ref_ri1d ;[ 4,2]
||[!A_vl]MV     .S1     A_v,         A_vl                    ; vl = v
||      ZERO    .S2     B_matchi0                            ; matchi0 = 0

; ============================ PIPE LOOP KERNEL ==============================
; Eight execute packets per iteration; repeats while A_i is non-zero.
; SUBABS4 forms per-byte absolute differences, DOTPU4 with the 01010101h
; constant adds the four bytes of each difference word, and the ADDs fold the
; partial sums into B_matchi0 and A_matchi1.
; The trailing [n,m] tags are the original schedule annotations.
LOOP_X:

  [ A_i]ADD     .S1     A_i,        -1,         A_i           ;[17,1]
||      ADD     .S2     B_sumi1a,   B_sumi1b,   B_sumi1ab     ;[17,1]
||      DOTPU4  .M2     B_2diffi_1b,B_k_ones,   B_2sumi1b     ;[17,1]
||      DOTPU4  .M1     A_2diffi_0d,A_k_ones,   A_2sumi0d     ;[17,1]
||      ADD     .D1     A_sumi1c,   A_sumi1d,   A_sumi1cd     ;[17,1]
||      SUBABS4 .L2     B_ref_ri1a, B_srchia,   B_diffi_1a    ;[ 9,2]
||      SUBABS4 .L1     A_ref_ri1d, A_srchid,   A_diffi_1d    ;[ 9,2]
||      LDNDW   .D2T2   *B_src_imgcp++,  B_srchia:B_srchib    ;[ 1,3]

        ADD     .S2     B_sumi0a,   B_sumi0b,   B_sumi0ab     ;[18,1]
||      DOTPU4  .M1     A_2diffi_1d,A_k_ones,   A_2sumi1d     ;[18,1]
||      ADD     .S1     A_sumi0c,   A_sumi0d,   A_sumi0cd     ;[18,1]
||      ADD     .D2     B_ref_ri2b, 0,          B_ref_ri0b    ;[10,2]
||      SUBABS4 .L2     B_ref_ri0b, B_srchib,   B_diffi_0b    ;[10,2]
||      DOTPU4  .M2     B_diffi_1a, B_k_ones,   B_sumi1a      ;[10,2]
||      SUBABS4 .L1     A_ref_ri1c, A_srchic,   A_diffi_1c    ;[10,2]
||      LDNDW   .D      *A_ref_img++(8), B_ref_ri1a:B_ref_ri1b;[ 2,3]

  [ A_i]B       .S1     LOOP_X                                ;[19,1]
||      ADD     .D1X    A_sumi1cd,  B_sumi1ab,  A_sumi1       ;[19,1]
||      ADD     .S2     B_ref_ri2a, 0,          B_ref_ri0a    ;[11,2]
||      DOTPU4  .M2     B_diffi_1b, B_k_ones,   B_sumi1b      ;[11,2]
||      SUBABS4 .L2     B_ref_ri0a, B_srchia,   B_diffi_0a    ;[11,2]
||      SUBABS4 .L1     A_ref_ri0d, A_srchid,   A_diffi_0d    ;[11,2]
||      DOTPU4  .M1     A_diffi_1c, A_k_ones,   A_sumi1c      ;[11,2]
||      LDNDW   .D2T1   *B_src_imgcp++,  A_srchic:A_srchid    ;[ 3,3]

        ADD     .S2X    B_sumi0ab,  A_sumi0cd,  B_sumi0       ;[20,1]
||      ADD     .S1     A_matchi1,  A_sumi1,    A_matchi1     ;[20,1]
||      SUBABS4 .L1     A_ref_ri0c, A_srchic,   A_diffi_0c    ;[12,2]
||      DOTPU4  .M2     B_diffi_0a, B_k_ones,   B_sumi0a      ;[12,2]
||      SUBABS4 .L2     B_ref_ri1a, B_2srchia,  B_2diffi_0a   ;[12,2]
||      DOTPU4  .M1     A_diffi_0d, A_k_ones,   A_sumi0d      ;[12,2]
||      LDNDW   .D      *A_ref_img++(A_pitch_8),A_ref_ri1c:A_ref_ri1d   ;[ 4,3]

L_5:    ADD     .D2     B_matchi0,  B_sumi0,    B_matchi0     ;[21,1]
||      ADD     .S2     B_2sumi1a,  B_2sumi1b,  B_2sumi1ab    ;[21,1]
||      ADD     .S1     A_2sumi0c,  A_2sumi0d,  A_2sumi0cd    ;[21,1]
||      SUBABS4 .L2     B_ref_ri2a, B_2srchia,  B_2diffi_1a   ;[13,2]
||      DOTPU4  .M1     A_diffi_1d, A_k_ones,   A_sumi1d      ;[13,2]
||      SUBABS4 .L1     A_ref_ri2d, A_2srchid,  A_2diffi_1d   ;[13,2]
||      DOTPU4  .M2     B_2diffi_0a,B_k_ones,   B_2sumi0a     ;[13,2]
||      LDNDW   .D      *A_ref_img++(8), B_ref_ri2a:B_ref_ri2b;[ 5,3]

L_6:    ADD     .D2     B_2sumi0a,  B_2sumi0b,  B_2sumi0ab    ;[22,1]
||      ADD     .S1     A_2sumi1c,  A_2sumi1d,  A_2sumi1cd    ;[22,1]
||      DOTPU4  .M1     A_diffi_0c, A_k_ones,   A_sumi0c      ;[14,2]
||      SUBABS4 .L1     A_ref_ri2c, A_2srchic,  A_2diffi_1c   ;[14,2]
||      DOTPU4  .M2     B_diffi_0b, B_k_ones,   B_sumi0b      ;[14,2]
||      SUBABS4 .L2     B_ref_ri1b, B_2srchib,  B_2diffi_0b   ;[14,2]
||      LDNDW   .D      *A_ref_img++(A_pitch_8), A_ref_ri2c:A_ref_ri2d  ;[ 6,3]

L_7:    ADD     .S1X    A_2sumi1cd, B_2sumi1ab, A_2sumi1     ;[23,1]
||      ADD     .S2X    B_2sumi0ab, A_2sumi0cd, B_2sumi0     ;[23,1]
||      ADD     .D1     A_ref_ri2c, 0,          A_ref_ri0c   ;[15,2]
||      DOTPU4  .M1     A_2diffi_1c,A_k_ones,   A_2sumi1c    ;[15,2]
||      SUBABS4 .L1     A_ref_ri1c, A_2srchic,  A_2diffi_0c  ;[15,2]
||      DOTPU4  .M2     B_2diffi_1a,B_k_ones,   B_2sumi1a    ;[15,2]
||      SUBABS4 .L2     B_ref_ri1b, B_srchib,   B_diffi_1b   ;[ 7,3]
||      LDNDW   .D2T2   *B_src_imgcp++,  B_2srchia:B_2srchib ;[ 7,3]


L_8:    ADD     .S2     B_matchi0,  B_2sumi0,   B_matchi0    ;[24,1]
||      ADD     .S1     A_matchi1,  A_2sumi1,   A_matchi1    ;[24,1]
||      ADD     .D1     A_ref_ri2d, 0,          A_ref_ri0d   ;[16,2]
||      DOTPU4  .M2     B_2diffi_0b,B_k_ones,   B_2sumi0b    ;[16,2]
||      SUBABS4 .L2     B_ref_ri2b, B_2srchib,  B_2diffi_1b  ;[16,2]
||      SUBABS4 .L1     A_ref_ri1d, A_2srchid,  A_2diffi_0d  ;[16,2]
||      DOTPU4  .M1     A_2diffi_0c,A_k_ones,   A_2sumi0c    ;[16,2]
||      LDNDW   .D2T1   *B_src_imgcp++,  A_2srchic:A_2srchid ;[ 8,3]

; ============================ PIPE LOOP EPILOG ==============================
; Drains the remaining accumulations while restarting the loads for the next
; pass: A_ref_img is moved back by A_ptch, B_src_imgcp is reset to the start
; of srcImg, B_ml is stepped down by 2 and M_LOOP is re-entered while B_ml is
; non-zero.
; EPILOG:

        ADD     .S2     B_sumi1a,   B_sumi1b,   B_sumi1ab    ;[17,3]
||      DOTPU4  .M2     B_2diffi_1b,B_k_ones,   B_2sumi1b    ;[17,3]
||      DOTPU4  .M1     A_2diffi_0d,A_k_ones,   A_2sumi0d    ;[17,3]
||      ADD     .D1     A_sumi1c,   A_sumi1d,   A_sumi1cd    ;[17,3]
||      SUB     .S1     A_ref_img,  A_ptch,     A_ref_img    ; rewind ref pointer by ptch

        ADD     .S2     B_sumi0a,   B_sumi0b,   B_sumi0ab    ;[18,3]
||      DOTPU4  .M1     A_2diffi_1d,A_k_ones,   A_2sumi1d    ;[18,3]
||      ADD     .S1     A_sumi0c,   A_sumi0d,   A_sumi0cd    ;[18,3]
||      MV      .L2     B_src_img,  B_src_imgcp              ; restart at srcImg[0]

        ADD     .S1X    A_sumi1cd,  B_sumi1ab,  A_sumi1      ;[19,3]
||      LDNDW   .D      *A_ref_img++(8), B_ref_ri0a:B_ref_ri0b;

        ADD     .S2X    B_sumi0ab,  A_sumi0cd,  B_sumi0      ;[20,3]
||      ADD     .S1     A_matchi1,  A_sumi1,    A_matchi1    ;[20,3]
||      LDNDW   .D      *A_ref_img++(A_pitch_8), A_ref_ri0c:A_ref_ri0d;

        ADD     .L2     B_matchi0,  B_sumi0,    B_matchi0    ;[21,3]
||      ADD     .S2     B_2sumi1a,  B_2sumi1b,  B_2sumi1ab   ;[21,3]
||      ADD     .S1     A_2sumi0c,  A_2sumi0d,  A_2sumi0cd   ;[21,3]
||      LDNDW   .D2T2   *B_src_imgcp++,  B_srchia:B_srchib   ;

        ADD     .S2     B_2sumi0a,  B_2sumi0b,  B_2sumi0ab   ;[22,3]
||      ADD     .S1     A_2sumi1c,  A_2sumi1d,  A_2sumi1cd   ;[22,3]
|| [B_ml]ADD    .L2     B_ml,       -2,         B_ml         ; ml -= 2 (two positions per pass)
||      LDNDW   .D      *A_ref_img++(8),        B_ref_ri1a:B_ref_ri1b; Load ref

        ADD     .S1X    A_2sumi1cd, B_2sumi1ab, A_2sumi1     ;[23,3]
||      ADD     .L2X    B_2sumi0ab, A_2sumi0cd, B_2sumi0     ;[23,3]
||[ B_ml]B      .S2     M_LOOP                               ; more positions left
||      LDNDW   .D2T1   *B_src_imgcp++,         A_srchic:A_srchid ; Load

        ADD     .L2     B_matchi0,  B_2sumi0,   B_matchi0    ;[24,3]
||      ADD     .S1     A_matchi1,  A_2sumi1,   A_matchi1    ;[24,3]
||      LDNDW   .D      *A_ref_img++(A_pitch_8),  A_ref_ri1c:A_ref_ri1d ;[ 4,1]
||      MV      .L1X    B_matval,   A_matval

; ============================================================================
; Best-match update.  matchi0 is compared (unsigned) with the running minimum
; first, then matchi1, so on ties the earlier candidate is kept.  A_vl1 is
; decremented before each PACK2, and A_matpos records hl in the upper half and
; the current vl1 count in the lower half.  When vl1 reaches 0, hl is
; incremented.  The packets also carry the next pass's prolog loads and the
; conditional branches back into the kernel.
; END:

        CMPLTU  .L2     B_matchi0,  B_matval,   B_best       ; best = matchi0 < matval
||      ADD     .L1     -1,         A_vl1,      A_vl1        ; vl1--
||      MV      .S1     A_bptch,    A_ptch                   ; default rewind = bptch
||      LDNDW   .D      *A_ref_img++(8),        B_ref_ri2a:B_ref_ri2b ;[ 5,1]

  [ B_best] MV    .L2   B_matchi0,  B_matval                 ; new minimum
||[ B_best] MV    .L1X  B_matchi0,  A_matval                 ; keep A-side copy in step
||[ B_best] PACK2 .S1   A_hl,       A_vl1,      A_matpos     ; matpos = hl : vl1
||          LDNDW .D    *A_ref_img++(A_pitch_8),A_ref_ri2c:A_ref_ri2d ;[ 6,1]

        ADD     .S1     -1,         A_vl1,      A_vl1        ; vl1--
||      CMPLTU  .L1     A_matchi1,  A_matval,   A_best       ; XP stall
||      SUBABS4 .L2     B_ref_ri1b, B_srchib,   B_diffi_1b   ;[ 7,1]
||      LDNDW   .D2T2   *B_src_imgcp++,  B_2srchia:B_2srchib ;[ 7,1]
||[ B_ml]B      .S2     L_5 + 12                             ;

  [ A_best] PACK2 .L1   A_hl,       A_vl1,      A_matpos     ; matpos = hl : vl1
||[!A_vl1 ] ADD   .D1   A_hl,       1,          A_hl         ; vl1 == 0: hl++
||[ A_best] MV    .S2X  A_matchi1,  B_matval                 ; new minimum
||          LDNDW .D2T1 *B_src_imgcp++,  A_2srchic:A_2srchid ;[ 8,1]
||[ B_ml]   B     .S1   L_6 + 4                              ;

        ;==== Branch occurs

; ========================================================================= ;
; Exit: reload A10/A11/A14 and release the 3-word frame.  In the delay slots
; of the return branch, the lower half of A_matpos is replaced by
; (v - 1) - count, the upper half is kept, and { A_matpos, A_matval } is
; stored to match[0] and match[1] with one STNDW while CSR is restored.
        LDW     .D2T1   *+B_SP[1],  A10

        RET     .S2     B_ret                                ; Return to caller
||      LDW     .D2T1   *+B_SP[2],  A11

        MV      .S1     A_v,        A_sub                    ; sub = v
||      AND     .L1     A_matpos,   A_ffff,     A_vl         ; low half of matpos (A_vl as scratch)
||      LDW     .D2T1   *++B_SP[3],  A14

        SUB     .L1     A_sub,      1,          A_sub        ; sub = v - 1
||      SHL     .S1     A_ffff,     16,         A_ffff       ; mask becomes FFFF0000h

        SUB     .S1     A_sub,      A_vl,       A_vl         ; y = (v - 1) - count
||      AND     .L1     A_matpos,   A_ffff,     A_matpos     ; keep upper half (x)

        ADD     .S1     A_matpos,   A_vl,       A_matpos     ; matpos = (x << 16) + y
||      MV      .L1X    B_matval,   A_matval                 ; final minimum

        STNDW   .D2T1   A_matval:A_matpos,      *B_match     ; match[0] = matpos, match[1] = matval
||      MVC     .S2     B_csr,            CSR         ; Restore CSR
; ===== Interruptibility state restored here =====
; ===== Branch Occurs =====

* ========================================================================= *
*   End of file:  img_mad_16x16.asm                                         *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *