;* ======================================================================== *;
;*  TEXAS INSTRUMENTS, INC.                                                 *;
;*                                                                          *;
;*  IMGLIB  DSP Image/Video Processing Library                              *;
;*                                                                          *;
;*      Release:        Revision 1.04b                                      *;
;*      CVS Revision:   1.11    Sun Sep 29 03:32:26 2002 (UTC)              *;
;*      Snapshot date:  23-Oct-2003                                         *;
;*                                                                          *;
;*  This library contains proprietary intellectual property of Texas        *;
;*  Instruments, Inc.  The library and its source code are protected by     *;
;*  various copyrights, and portions may also be protected by patents or    *;
;*  other legal protections.                                                *;
;*                                                                          *;
;*  This software is licensed for use with Texas Instruments TMS320         *;
;*  family DSPs.  This license was provided to you prior to installing      *;
;*  the software.  You may review this license by consulting the file       *;
;*  TI_license.PDF which accompanies the files in this library.             *;
;* ------------------------------------------------------------------------ *;
;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;
;*                          All Rights Reserved.                            *;
;* ======================================================================== *;


;* ======================================================================== *;
;*  Assembler compatibility shim for assembling 4.30 and later code on      *;
;*  tools prior to 4.30.                                                    *;
;* ======================================================================== *;

        ; Bind $asmver to the assembler's version number when the assembler
        ; defines .ASSEMBLER_VERSION, otherwise to 0 so that the test below
        ; treats the tool as "older than 4.30".
        .if $isdefed(".ASSEMBLER_VERSION")
        .asg    .ASSEMBLER_VERSION, $asmver
        .else
        .asg    0,    $asmver
        .endif

        ; For tools older than 4.30, map the newer call/return mnemonics onto
        ; the plain branch instructions they stand for.
        .if ($asmver < 430)

        .asg    B,    CALL     ; Function Call
        .asg    B,    RET      ; Return from a Function
        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.

        ; The BNOP-based variants are only aliased when assembling for C6400.
        .if .TMS320C6400
        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call
        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return
        .asg    BNOP, CRNOP    ; C64x Fn call w/, Call/Ret chaining via BNOP.
        .endif

        ; Substitute the empty string for .asmfunc/.endasmfunc so that uses of
        ; these directives expand to nothing on the older tools.
        .asg    , .asmfunc     ; .func equivalent for hand-assembly code
        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code

        .endif

;* ======================================================================== *;
;*  End of assembler compatibility shim.                                    *;
;* ======================================================================== *;


* ========================================================================= *
*   TEXAS INSTRUMENTS, INC.                                                 *
*                                                                           *
*   NAME                                                                    *
*       IMG_mpeg2_vld_inter                                                 *
*                                                                           *
*   PLATFORM                                                                *
*       C6400                                                               *
*                                                                           *
*   REVISION DATE                                                           *
*       23-May-2002                                                         *
*                                                                           *
*   DESCRIPTION                                                             *
*       This routine takes a bitstream of an MPEG-2 non-intra coded         *
*       macroblock and returns the decoded IDCT coefficients. The routine   *
*       is implemented as specified in the MPEG-2 standard text (ISO/IEC    *
*       13818-2). The routine checks the coded block pattern (cbp),         *
*       performs coefficient decoding including variable length decode,     *
*       run-length expansion, inverse zigzag, dequantization, saturation    *
*       and mismatch control.                                               *
*                                                                           *
*   USAGE                                                                   *
*       This routine is C callable, and has the following C prototype:      *
*                                                                           *
*       void IMG_mpeg2_vld_inter                                            *
*       (                                                                   *
*           const short    *restrict Wptr,                                  *
*           short          *restrict outi,                                  *
*           IMG_mpeg2_vld  *restrict Mpeg2v,                                *
*           int            mode_12Q4,                                       *
*           int            num_blocks,                                      *
*           int            bsbuf_words                                      *
*       );                                                                  *
*                                                                           *
*         Wptr:   Pointer to array that contains quantization matrix. The   *
*                 elements of the quantization matrix in *Wptr must be      *
*                 ordered according to the scan pattern used (zigzag or     *
*                 alternate scan). Video format 4:2:0 requires one          *
*                 quantization matrix (64 array elements).  For formats     *
*                 4:2:2 and 4:4:4 two quantization matrices (one for luma   *
*                 and one for chroma) must be specified in the array (128   *
*                 array elements).                                          *
*                                                                           *
*         outi:   Pointer to the IDCT coefficients output array             *
*                 (6*64 elements), elements must be set to zero prior to    *
*                 function call.                                            *
*                                                                           *
*         Mpeg2v: Pointer to the context object containing the coding       *
*                 parameters of the MB to be decoded and the current state  *
*                 of the bitstream buffer. The structure is described       *
*                 below.                                                    *
*                                                                           *
*      mode_12Q4: 0: Coefficients are returned in normal 16-bit integer     *
*                 format.                                                   *
*                 Otherwise: Coefficients are returned in 12Q4 format       *
*                 (normal 16-bit integer format left shifted by 4). This    *
*                 mode is useful for directly passing the coefficients      *
*                 into the IMG_idct_8x8 routine.                            *
*                                                                           *
*     num_blocks: Number of blocks that the MB contains. Valid values are   *
*                 6 for 4:2:0, 8 for 4:2:2 and 12 for 4:4:4 format.         *
*                                                                           *
*    bsbuf_words: Size of bitstream buffer in words. Must be a power of 2.  *
*                 Bitstream buffer must be aligned at an address boundary   *
*                 equal to its size in bytes (bitstream buffer is           *
*                 addressed circularly by this routine.)                    *
*                                                                           *
*       The structure Mpeg2v is defined as follows:                         *
*                                                                           *
*C          #ifndef IMG_MPEG2_VLD_STRUCT_                                   *
*C          #define IMG_MPEG2_VLD_STRUCT_ 1                                 *
*C                                                                          *
*C          typedef struct {                                                *
*C            unsigned int  *bsbuf;      // pointer to bitstream buffer     *
*C            unsigned int  next_wptr;   // next word to read from buffer   *
*C            unsigned int  bptr;        // bit position within word        *
*C            unsigned int  word1;       // word aligned buffer             *
*C            unsigned int  word2;       // word aligned buffer             *
*C            unsigned int  top0;        // top 32 bits of bitstream        *
*C            unsigned int  top1;        // next 32 bits of bitstream       *
*C            const unsigned char *scan; // inverse zigzag scan matrix      *
*C            unsigned int  intravlc;    // intra_vlc_format                *
*C            unsigned int  quant_scale; // quant_scale                     *
*C            unsigned int  dc_prec;     // intra_dc_precision              *
*C            unsigned int  cbp;         // coded_block_pattern             *
*C            unsigned int  fault;       // fault condition (returned)      *
*C            unsigned int  reserved;    // reserved                        *
*C          } IMG_mpeg2_vld;                                                *
*C                                                                          *
*C          #endif                                                          *
*                                                                           *
*       The Mpeg2v variables should  have a fixed layout since they are     *
*       accessed by this routine. If the layout is changed, the             *
*       corresponding changes have to be made in the assembly code too.     *
*                                                                           *
*       The routine sets the fault flag Mpeg2v.fault to 1 if an invalid     *
*       VLC code was encountered or the total run went beyond 63. In        *
*       these cases the decoder has to resynchronize.                       *
*                                                                           *
*       The required lookup tables for this routine are provided in         *
*       IMGLIB and are linked in automatically when linking against         *
*       IMGLIB.                                                             *
*                                                                           *
*       Before calling the routine the bitstream variables in Mpeg2v        *
*       have to be initialized. If bsbuf is a circular buffer and bsptr     *
*       contains the number of bits in the buffer that already have         *
*       been consumed, then next_wptr, bptr, word1, word2, top0 and         *
*       top1 are initialized as follows:                                    *
*                                                                           *
*       1. next_wptr: bsptr may not be a multiple of 32, therefore obtain   *
*       the next lower multiple of 32.                                      *
*                                                                           *
*           next_wptr = (bsptr >> 5);                                       *
*                                                                           *
*       2. bptr: bptr is the bit pointer which points to the current        *
*       bit WITHIN the word pointed to by next_wptr.                        *
*                                                                           *
*           bptr = bsptr & 31;                                              *
*           bptr_cmpl = 32 - bptr;                                          *
*                                                                           *
*       3. word1 and word2: read next 3 words from the bitstream buffer     *
*       (word0 is a temporary variable). bsbuf_words is the size of the     *
*       bitstream buffer in words.                                          *
*                                                                           *
*           word0 = bsbuf[next_wptr];                                       *
*           next_wptr = (next_wptr + 1) & (bsbuf_words-1);                  *
*                                                                           *
*           word1 = bsbuf[next_wptr];                                       *
*           next_wptr = (next_wptr + 1) & (bsbuf_words-1);                  *
*                                                                           *
*           word2 = bsbuf[next_wptr];                                       *
*           next_wptr = (next_wptr + 1) & (bsbuf_words-1);                  *
*                                                                           *
*       4. top0 and top1: Shift words word0, word1, word2 by bptr to the    *
*       left so that the current bit becomes the MSB in word0. word0 can    *
*       simply be shifted by bptr; the then empty LSBs of word0 have to be  *
*       filled with the MSBs of word1. To do that the required MSBs are     *
*       brought into the position of empty LSBs of word0 by shifting word1  *
*       to the right by (32-bptr). The result is then copied into word0 by  *
*       an addition. Rather than overwriting word0, top0 is used to hold    *
*       the new bit aligned word. The same procedure is used to obtain      *
*       top1. top0 and top1 contain the next 64 bits of the bitstream.      *
*                                                                           *
*           s1 = word0 << bptr;                                             *
*           s2 = word1 >> bptr_cmpl;  /* unsigned right-shift */            *
*           top0 = s1 + s2;                                                 *
*                                                                           *
*           s3 = word1 << bptr;                                             *
*           s4 = word2 >> bptr_cmpl;  /* unsigned right-shift */            *
*           top1 = s3 + s4;                                                 *
*                                                                           *
*       Note that the routine returns the updated state of the bitstream    *
*       buffer variables, top0, top1, word1, word2, bptr and next_wptr. If  *
*       all other functions which access the bitstream in a decoder system  *
*       maintain the buffer variables in the same way, then the above       *
*       initialization procedure has to be performed only once at the       *
*       beginning.                                                          *
*                                                                           *
*                                                                           *
*   TECHNIQUES                                                              *
*       The instruction NORM is used to detect the number of leading zeros  *
*       or ones in a code word. This value together with additional bits    *
*       extracted from the codeword is then used as an index into look-up   *
*       tables to determine the length, run, level and sign. Escape code    *
*       sequences are directly extracted from the code word.                *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       The bitstream must be stored in memory in 32-bit words which are    *
*       in little endian byte order.                                        *
*                                                                           *
*       Wptr is allowed to overrun once (to detect total run overrun), so   *
*       maximum overrun that can occur is 66 (Error mark). Therefore,       *
*       in memory 66+1 halfwords behind the weighting matrix should be      *
*       valid (e.g. peripherals). No memory is overwritten,                 *
*       only loads occur.                                                   *
*                                                                           *
*       Note that the AMR register is set to zero on exit.                  *
*                                                                           *
*   NOTES                                                                   *
*       This code is little ENDIAN.                                         *
*       This code is interrupt-tolerant but not interruptible.              *
*                                                                           *
*   MEMORY NOTES                                                            *
*       No bank conflicts                                                   *
*                                                                           *
*   CYCLES                                                                  *
*       10 * (S - CB) + 37 * CB + 15 * NCB + 34                             *
*       where S:   Number of symbols in MB, CB: Number of coded blocks,     *
*             NCB: Number of not-coded blocks, and CB+NCB=6                 *
*                                                                           *
*   CODE SIZE                                                               *
*       1248 bytes                                                          *
*                                                                           *
*   MEMORY REQUIREMENTS                                                     *
*       1792 bytes for the lookup tables                                    *
*       (can be shared with mpeg2_vld_intra)                                *
*                                                                           *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *

* ===================== SYMBOLIC REGISTER ASSIGNMENTS ====================== *
        ; Symbolic names for the physical registers used by the routine.
        ;
        ; Several names are bound to the SAME physical register (for example
        ; A1 is A_bptr1, A_qw, A_test1 and A_test2; B9 is B_level, B_t12,
        ; B_t13, B_t15 and B_t16; B1 is B_run, B_test3 and B_12Q4).  Writing
        ; through one name therefore overwrites every other name bound to that
        ; register, so aliased names must never hold live values at the same
        ; time.
        ;
        ; Further .asg directives later in the file bind additional names to
        ; some of these registers (e.g. A16 is also A_SP, B18 is also
        ; B_Mpeg2v); check those before reusing a register.
        .asg            A0,         A_neg
        .asg            A1,         A_bptr1
        .asg            A1,         A_qw
        .asg            A1,         A_test1
        .asg            A1,         A_test2
        .asg            A16,        A_len_c
        .asg            A17,        A_bptr
        .asg            A18,        A_len_tbl_adr
        .asg            A19,        A_const31
        .asg            A2,         A_top0l
        .asg            A20,        A_const32
        .asg            A22,        A_const36
        .asg            A23,        A_qscl
        .asg            A24,        A_level4
        .asg            A24,        A_t2
        .asg            A24,        A_top0_bk
        .asg            A25,        A_empty
        .asg            A25,        A_len
        .asg            A26,        A_nrm
        .asg            A26,        A_t1l
        .asg            A26,        A_t4l
        .asg            A27,        A_t1h
        .asg            A27,        A_t4h
        .asg            A3,         A_top0h
        .asg            A4,         A_ptop0l
        .asg            A5,         A_level_f
        .asg            A5,         A_level5
        .asg            A5,         A_ptop0h
        .asg            A6,         A_W
        .asg            A7,         A_top1
        .asg            A8,         A_word1
        .asg            A9,         A_t3
        .asg            A9,         A_t7
        .asg            A9,         A_t8
        .asg            B0,         B_eob
        .asg            B1,         B_run
        .asg            B1,         B_test3
        .asg            B1,         B_12Q4      ; 12Q4 MERGE
        .asg            B16,        B_level2
        .asg            B16,        B_rld_left
        .asg            B17,        B_bptr_cmpl
        .asg            B17,        B_t14
        .asg            B17,        B_t9
        .asg            B19,        B_word2
        .asg            B20,        B_Wptr_end
        .asg            B21,        B_Zptr
        .asg            B22,        B_outi
        .asg            B23,        B_sum
        .asg            B24,        B_top0_bk
        .asg            B26,        B_level3
        .asg            B3,         B_const63
        .asg            B4,         B_rld_table_adr
        .asg            B5,         B_const32
        .asg            B6,         B_rld_table_adr_1
        .asg            B7,         B_bsbuf_circ
        .asg            B8,         B_Wptr
        .asg            B9,         B_level
        .asg            B9,         B_t12
        .asg            B9,         B_t13
        .asg            B9,         B_t15
        .asg            B9,         B_t16
        .asg            A11,        A_outi      ; 12Q4
        .asg            A12,        A_cnum      ; 12Q4
        .asg            A13,        A_const16   ; 12Q4
        .asg            B18,        B_constFFF0 ; 12Q4

* ========================================================================= *

        ; Lookup tables defined outside this file; their addresses are loaded
        ; with MVKL/MVKH during setup.
        .global _IMG_len_tbl0
        .global _IMG_rld_table0

; Mpeg2v structure:
; Each constant is a WORD index (not a byte offset) of a field in the
; IMG_mpeg2_vld structure shown in the header comment.  They are used as the
; scaled index of word loads, e.g. LDW *+B_Mpeg2v[BSBUF_M2OFF].
; Indices 0x8 (intravlc), 0xA (dc_prec) and 0xD (reserved) have no symbol
; here.  If the C structure layout changes, these values must change with it.
BSBUF_M2OFF      .set   0x0     ; bsbuf
NEXTWPTR_M2OFF   .set   0x1     ; next_wptr
BPTR_M2OFF       .set   0x2     ; bptr
WORD1_M2OFF      .set   0x3     ; word1
WORD2_M2OFF      .set   0x4     ; word2
TOP0_M2OFF       .set   0x5     ; top0
TOP1_M2OFF       .set   0x6     ; top1
ZPTR_M2OFF       .set   0x7     ; scan
QSCL_M2OFF       .set   0x9     ; quant_scale
CBP_M2OFF        .set   0xB     ; cbp
FAULT_M2OFF      .set   0xC     ; fault

        .sect ".text:_mpeg2_vld_inter"
        .global _IMG_mpeg2_vld_inter
_IMG_mpeg2_vld_inter:
; parameters: B_Wptr, B_outi, A_Mpeg2v, B_12Q4, A_num_blocks, B_bsbuf_words
;             A4,     B4,     A6,       B6,     A8,           B8
;
; Entry sequence (three execute packets).  It
;   - opens a 9-word stack frame and starts saving registers into it,
;   - copies the incoming arguments out of the argument registers into the
;     symbolic registers used by the rest of the routine,
;   - starts computing the AMR argument from bsbuf_words, and
;   - clears the GIE bit in CSR (interrupts are masked from the end of the
;     third packet onwards).
;
; Frame slots written in this section (word indices relative to the new
; B_SP): [9] = A10, [1] = CSR, [2] = B3 (return address), [6] = A14.
; The remaining slots are filled later in the routine.
;
; NOTE(review): the frame is 9 words (36 bytes), an odd word count.  If the
; caller's SP is 8-byte aligned, SP is only 4-byte aligned inside this
; routine -- confirm this is acceptable for the target ABI.

* ========================================================================= *
*  Setup
* ========================================================================= *
        .asg            B15,        B_SP        ; Stack pointer, B datapath
        .asg            A16,        A_SP        ; Stack pointer, A datapath
        .asg            B0,         B_csr       ; CSR's value
        .asg            B1,         B_no_gie    ; CSR w/ GIE bit cleared
        .asg            B3,         B_ret       ; Return address
        .asg            A29,        A_Mpeg2v
        .asg            B18,        B_Mpeg2v
        .asg            B2,         B_cnt
        .asg            A10,        A_amr_arg            ; AMR arg
        .asg            B9,         B_amr_arg            ; AMR arg

; Packet 1: store A10 at the current SP, then post-decrement SP by 9 words.
; A10 is overwritten in this same packet with bsbuf_words (B8); the store
; is what preserves the caller's A10.
        STW     .D2T1   A10,        *B_SP--[9]           ; RWD, MERGE, 2 W-mat
||      MVC     .S2     CSR,        B_csr                ; Get CSR's state
||      MV      .L2     B4,         B_outi
||      MV      .L1     A6,         A_Mpeg2v
||      MV      .S1X    B8,         A_amr_arg            ; AMR arg

; Packet 2: B_no_gie = CSR with bit 0 cleared (AND with -2).  A_SP mirrors
; B_SP so that stores can also be issued from the A side.  LMBD locates the
; leftmost 1 bit of bsbuf_words; the result feeds the SUB in packet 3.
        STW     .D2T2   B_csr,      *+B_SP[1]            ; Save CSR
||      AND     .L2     B_csr,      -2,         B_no_gie ; Clear GIE
||      MV      .S2X    A4,         B_Wptr
||      MV      .D1X    B_SP,       A_SP                 ; 12Q4 MERGE
||      LMBD    .L1     1,          A_amr_arg,  A_amr_arg; AMR arg
||      MVK     .S1     32,         A_const32            ; AMR arg

; Packet 3: A_amr_arg = 32 - LMBD(1, bsbuf_words).  Mpeg2v (A6) is copied to
; the B side as well so both datapaths can load structure fields.  Writing
; B_no_gie to CSR masks interrupts.
        STW     .D2T2   B_ret,      *+B_SP[2]            ; Save return addr.
||      STW     .D1T1   A14,        *+A_SP[6]            ; MERGE
||      MV      .L2X    A6,         B_Mpeg2v
||      MVC     .S2     B_no_gie,   CSR                  ; Disable ints.
||      SUB     .L1     A_const32,  A_amr_arg,  A_amr_arg; AMR arg
; ===== Interrupts masked here =====

* ========================================================================= *
*  Get bitstream info
*  Setup circular bitstream buffer
*  Load table addresses and constants
*  Block loop setup
* ========================================================================= *
        ; Loads the bitstream state and coding parameters from the Mpeg2v
        ; structure, finishes the AMR value and writes it to AMR, and sets up
        ; constants that depend on mode_12Q4.
        ;
        ; Register reuse in this section:
        ;   B3  held the return address (already saved to the stack) and is
        ;       now used for the constants 128 / 126.
        ;   B1  held B_no_gie (already written to CSR) and now holds the
        ;       mode_12Q4 argument copied from B6.
        ;   A8  held num_blocks; it is copied to B_cnt one packet before the
        ;       word1 load that targets A8 is issued.
        ;
        ; NOTE(review): the packets appear to be hand-scheduled so that each
        ; LDW result is first read five cycles after the load issues.  Do not
        ; insert or reorder packets without re-checking load latency.
        .asg     B31,    B_bsbuf
        .asg     B29,    B_next_wptr
        .asg     B27,    B_cbp
        .asg     B0,     B0_amr_config
        .asg     A21,    A_const1
        .asg     A9,     A_tbs1
        .asg     A4,     A_tbs2
        .asg     B17,    B_tbs3
        .asg     B3,     B_const126
        .asg     B3,     B_const128
        .asg     A14,    A_constSHR                             ; 12Q4 MERGE

        LDW     .D2T2    *+B_Mpeg2v[BSBUF_M2OFF],      B_bsbuf
||      LDW     .D1T1    *+A_Mpeg2v[TOP0_M2OFF],       A_top0_bk
||      MVK     .S2      128,          B_const128
||      MV      .L2      B6,           B_12Q4                   ; 12Q4 MERGE

; B_Wptr_end = B_Wptr + 128 bytes, i.e. 64 halfword entries past the start
; of the weighting matrix.  A_constSHR is 20 when mode_12Q4 == 0; the
; complementary "16" case is set in the last packet of this section.
; B_amr_arg = (32 - LMBD(1, bsbuf_words)) << 16.
        LDW     .D2T2    *+B_Mpeg2v[NEXTWPTR_M2OFF],   B_next_wptr
||      LDW     .D1T1    *+A_Mpeg2v[TOP1_M2OFF],       A_top1
||      ADD     .L2      B_Wptr,       B_const128,     B_Wptr_end
||[!B_12Q4]MVK  .S1      20,           A_constSHR             ; non-12Q4 MERGE
||      SHL     .S2X     A_amr_arg,    16,             B_amr_arg; AMR arg

; The base address of the weighting matrix is kept in frame slot [7] and
; num_blocks (A8) becomes the block counter.  SET turns on bit 14 of the AMR
; value.
; NOTE(review): per the C6000 AMR layout, bit 14 together with the field at
; bits 16 and up presumably selects circular addressing for B7
; (B_bsbuf_circ) with a block size equal to the bitstream buffer size --
; confirm against the CPU reference guide.
        LDW     .D1T1    *+A_Mpeg2v[BPTR_M2OFF],       A_bptr
||      STW     .D2T2    B_Wptr,         *+B_SP[7]              ; 2 W-mat
||      MV      .L2X     A8,             B_cnt                  ; 2 W-mat
||      SET     .S2      B_amr_arg, 14, 14,     B_amr_arg          ; AMR arg

        LDW     .D1T1    *+A_Mpeg2v[WORD1_M2OFF],      A_word1
||      LDW     .D2T2    *+B_Mpeg2v[WORD2_M2OFF],      B_word2
||      MVC     .S2      B_amr_arg,            AMR                 ; AMR arg
||      MVK     .S1      31,            A_const31

; This is the last use of B_Mpeg2v (B18) as a base address; the register is
; reused for B_constFFF0 immediately afterwards.
        LDW     .D1T1    *+A_Mpeg2v[QSCL_M2OFF],       A_qscl
||      LDW     .D2T2    *+B_Mpeg2v[CBP_M2OFF],        B_cbp
||[B_12Q4]MVK   .S1      16,           A_constSHR               ; 12Q4 MERGE
; B_constFFF0 and B_Mpeg2v share the same register

* ========================================================================= *
*  Setup bitstream pointers: top0h:top0l, top1 contain top bitstream
* ========================================================================= *
        ; Builds the working bitstream window and its backup copies:
        ;   A_top0h_bk:A_top0l_bk = top 40 bits of the stream, formed as
        ;       top0h = top0 >> 24,  top0l = (top0 << 8) + (top1 >> 24)
        ;   bptr is advanced by 8 (mod 32); when it passes 31 the word
        ;       registers are shifted along (word1 -> word1_rw,
        ;       word2 -> word1) and a new word2 is read through
        ;       B_bsbuf_circ
        ;   A_top1 = (word1 << bptr) + (word2 >> (32 - bptr))
        ; It also saves A11, A12 and A13 into frame slots [3], [4], [5],
        ; loads the two lookup-table addresses, and sets the mode-dependent
        ; constants B_constFFF0 and A_const16.
        ; The *_bk registers written here are read back at the start of the
        ; pipe-loop prolog ("restore").
        .asg    B25,    B_word2_bk
        .asg    A28,    A_word1_bk
        .asg    A10,    A_word1_rw      ; RWD
        .asg    A21,    A_word1_rw_bk   ; RWD
        .asg    A31,    A_top0h_bk
        .asg    A30,    A_top0l_bk
        .asg    B28,    B_bptr_bk
        .asg    B30,    B_bsbuf_circ_bk

; B18 stops being B_Mpeg2v here and becomes B_constFFF0.
; NOTE(review): MVKL loads a 16-bit constant; verify the resulting 32-bit
; value (sign extension) against how B_constFFF0 is used later.
        SHL     .S1      A_top0_bk,     8,          A_tbs1
||      STW     .D2T1    A11,           *+B_SP[3]
||[B_12Q4]MVKL  .S2      0xFFF0,        B_constFFF0            ; 12Q4 MERGE

        SHRU    .S1      A_top1,        24,         A_tbs2
||      STW     .D2T1    A12,           *+B_SP[4]
||[!B_12Q4]MVKL .S2      0xFFFF,        B_constFFF0           ; non-12Q4 MERGE

        ADD     .L1      A_tbs1,        A_tbs2,     A_top0l_bk
||      ADD     .S1      A_bptr,        8,          A_bptr1
||      STW     .D2T1    A13,           *+B_SP[5]

; A_bptr1 and A_test2 are both A1: CMPGT and AND read the old value
; (bptr + 8) in this packet, and CMPGT then replaces it with the wrap flag.
; B_bsbuf_circ = address of word number next_wptr of bsbuf (ADDAW scales the
; index by the word size).
        CMPGT   .L1      A_bptr1,       A_const31,  A_test2
||      AND     .S1      A_bptr1,       A_const31,  A_bptr
||      MVK     .S2      32,            B_const32
||      ADDAW   .D2      B_bsbuf,       B_next_wptr,B_bsbuf_circ

; Only when bptr wrapped past 31: shift the word registers along and fetch
; the next bitstream word.  The MV to A_word1 reads the old B_word2; the
; loaded value reaches B_word2 only after the load delay.
  [A_test2]MV   .S1      A_word1,                   A_word1_rw  ; RWD
||[A_test2]MV   .L1X     B_word2,                   A_word1
||[A_test2]LDW  .D2T2    *B_bsbuf_circ++,           B_word2
||      SUB     .S2      B_const32,     A_bptr,     B_bptr_cmpl

; The block counter is kept in frame slot [8]; block_loop reloads it from
; there as B_num_blocks.
        MVKL    .S1      _IMG_len_tbl0,     A_len_tbl_adr
||      MVKL    .S2      _IMG_rld_table0,   B_rld_table_adr
||      MV      .L2X     A_bptr,        B_bptr_bk
||      STW     .D2T2    B_cnt,         *+B_SP[8]               ; 2 W-mat

        MVKH    .S1      _IMG_len_tbl0,     A_len_tbl_adr
||      MVKH    .S2      _IMG_rld_table0,   B_rld_table_adr

; A_const16 = 16 in 12Q4 mode, 1 otherwise.
; NOTE(review): these two single-instruction packets presumably also fill
; the delay slots of the conditional word2 load above -- keep the packet
; count unchanged.
 [B_12Q4]MVK   .S1      16,            A_const16               ; 12Q4

 [!B_12Q4]MVK   .S1      1,             A_const16              ; non-12Q4

; B_tbs3 and B_bptr_cmpl are both B17: SHRU reads the shift count and then
; overwrites it with the shifted word.
        SHL     .S1      A_word1,       A_bptr,     A_tbs1
||      SHRU    .S2      B_word2,       B_bptr_cmpl,B_tbs3

        ADD     .L1X     A_tbs1,        B_tbs3,     A_top1
||      SHRU    .S1      A_top0_bk,     24,         A_top0h_bk
||      MV      .D1      A_word1,       A_word1_bk
||      MV      .D2      B_word2,       B_word2_bk
||      MV      .L2      B_bsbuf_circ,  B_bsbuf_circ_bk


block_loop:
* ------------------------------------------------------------------------- *
*  check cbp, etc.
* ------------------------------------------------------------------------- *
; Top of the per-block loop.  The block counter is decremented first, then
; bit number B_cnt of the coded block pattern is tested:
;       B_coded = cbp & (1 << B_cnt)
; If the bit is clear the block is not coded and control branches to
; 'mismatch' with the "not coded" defaults set below (B_sum = 0,
; B_run_bk = 0, A2_odd = 0, A_last_coeff = 0, B_Wptr = B_Wptr_end) and with
; B_outi advanced by 126 bytes.  If the bit is set, the original block count
; is reloaded from frame slot [8] into B_num_blocks.
;
; The branch is not immediate: the first packets of the pipe-loop prolog
; that follows still issue in its delay slots (see the "branch occurs if
; not coded MB" marker further down), so those packets execute for
; not-coded blocks too.
        .asg     B17,     B_cbp_mask
        .asg     B0,      B_coded
        .asg     A5,      A_last_coeff
        .asg     A2,      A2_odd
        .asg    B31,      B_run_bk
        .asg    B26,      B_num_blocks                        ; 2 W-mat

            SUB   .S2     B_cnt,        1,        B_cnt       ; cbp, cnt--
||          ZERO  .L2     B_sum
||          ZERO  .D2     B_run_bk                            ; not coded
||          ZERO  .L1     A2_odd                              ; not coded
||          MVK   .S1     1,            A_const1              ; cbp

; B_cbp_mask = 1 << B_cnt (A_const1 is the value shifted, B_cnt the count).
            SHL   .S2X    A_const1,     B_cnt,    B_cbp_mask  ; cbp
||          MV    .L2     B_Wptr_end,   B_Wptr                ; not coded
||          MVK   .S1     0,            A_last_coeff          ; not coded

            AND   .D2     B_cbp_mask,   B_cbp,    B_coded     ; cbp
||          MVK   .S2     126,          B_const126            ; const

  [!B_coded]B     .S1     mismatch                            ; not coded
||[!B_coded]ADD   .L2     B_outi,       B_const126, B_outi    ; not coded
||[B_coded]LDW  .D2T2   *+B_SP[8],      B_num_blocks          ; 2 W-mat

* =========================== PIPE LOOP PROLOG ============================ *
; Prolog of the software-pipelined run/level decode loop.  It primes the
; pipeline for the first iterations, selects the weighting matrix, loads
; loop constants and restores the saved bitstream state (the *_bk
; registers) before execution falls into "loop".
;
; The trailing [c,i] tags appear to give the schedule cycle c and the loop
; iteration i that an instruction belongs to (review: confirm against the
; file's header documentation).
;
; A_tm ("table mod") is A_top0h_bk >> 7.  When it is non-zero the first
; iteration uses fixed values instead of the table lookups: A_len = 2,
; A_len_c = 30, B_run = 0 and B_level = 1 (negated when B_tm_neg is set).
;
; B_tm_neg and B_flag are both assigned to B0.  B_flag is only used by the
; cc calculation below; B_tm_neg is first written (ZERO) after the last
; use of B_flag.
        .asg     A0,      A_tm
        .asg     B0,      B_tm_neg

; the added lines below calculate cc which is required for weighting
; matrix selection in 4:2:2 and 4:4:4 mode
; the following additional registers are required: B_block, B_flag, B_cc
        .asg    B31,      B_block
        .asg    B1,       B_cc
        .asg    B0,       B_flag

        NORM    .L1     A_top0h_bk:A_top0l_bk,  A_nrm           ;[ 1,1]
||      SHRU    .S1     A_top0h_bk,     7,      A_tm            ;table mod

        MPY     .M1     A_nrm,      -16,        A_t2            ;[ 2,1]
||      SHL     .S1     A_top0h_bk:A_top0l_bk,  A_nrm, A_t1h:A_t1l;[ 2,1]

        MVK     .S1     36,             A_const36               ;const

        SHRU    .S1     A_t1h:A_t1l,    A_const36,  A_t4h:A_t4l ;[ 4,1]
||      SUB     .L1     A_len_tbl_adr,  A_t2,       A_t3        ;[ 4,1]
||[B_coded]LDW  .D2T2   *+B_SP[7],      B_Wptr             ;get W-mat base adr

  [!A_tm]LDBU   .D1T1   *A_t3[A_t4l],   A_len                   ;[ 5,1]
||[B_coded] SUB .L2     B_num_blocks,   1,          B_num_blocks;2 W-mat
; branch occurs if not coded MB

; cc calculation, step 1: B_block = B_num_blocks - B_cnt and
; B_flag = (B_num_blocks > 6).  B_num_blocks was already decremented by one
; in the previous packet.
        SUB     .S2     B_num_blocks,   B_cnt,      B_block    ;cc for 2 W-mat
||      CMPGT   .L2     B_num_blocks,   6,          B_flag
;prevent 2 W-mat if 4:2:0

; cc calculation, step 2: when B_flag is set it is narrowed to
; (B_block > 3); B_cc defaults to 0.
        SHRU    .S1     A_top0h_bk:A_top0l_bk, 8, A_empty:A_top0_bk;[ 8,1]
||[B_flag]CMPGT .L2     B_block,        3,          B_flag     ;cc for 2 W-mat
||      ZERO    .S2     B_cc                                   ;cc for 2 W-mat

; cc calculation, step 3 (this packet and the next): when B_flag is set,
; B_cc = (B_block & 1) + 1; otherwise B_cc stays 0.
  [A_tm]MVK     .L1     2,          A_len                      ;table mod
||[B_flag] AND  .D2     B_block,        1,          B_cc       ;cc for 2 W-mat

        MV      .L1X    B_bptr_bk,  A_bptr                     ;restore
||      MVK     .S2     128,         B_const128                ;const
||[B_flag] ADD  .D2     B_cc,           1,          B_cc       ;cc for 2 W-mat

; A non-zero B_cc advances B_Wptr by B_const128 (128 bytes) so that the
; second weighting matrix is used.
        SUB     .S2X    A_len,      5,          B_rld_left      ;[10,1]
||      CMPLT   .L2X    A_len,      5,          B_test3         ;[10,1]
||      ADD     .L1     A_bptr,     A_len,      A_bptr1         ;[10,1]
||      SHL     .S1     A_top0h_bk:A_top0l_bk,  A_len, A_ptop0h:A_ptop0l;[10,1]
||[!A_tm]SUB    .D1     A_const32,  A_len,      A_len_c         ;[10,1]
||[B_cc]ADD     .D2     B_Wptr,     B_const128, B_Wptr
;if cc!=0 select 2nd W-mat

  [B_test3]MPY  .M2     B_rld_left, 0,          B_rld_left      ;[11,1]
||      MV      .L2X    A_top0_bk,  B_top0_bk                   ;[11,1]
||      AND     .S1     A_const31,  A_bptr1,    A_bptr          ;[11,1]
||      MV      .D1     A_ptop0h,   A_top0h                     ;[11,1]
||      NORM    .L1     A_ptop0h:A_ptop0l,      A_nrm           ;[ 1,2]

; B_Wptr_end = B_Wptr + 128: the bound that B_Wptr is compared against in
; the kernel and at "mismatch" to detect a weighting-matrix overrun.
        CMPGT   .L1     A_bptr1,    A_const31,  A_test2         ;[12,1]
||      MPY     .M1     A_nrm,      -16,        A_t2            ;[ 2,2]
||      SHL     .S1     A_ptop0h:A_ptop0l, A_nrm, A_t1h:A_t1l   ;[ 2,2]
||      ADD     .L2     B_Wptr,      B_const128, B_Wptr_end     ;reset

; Restore the saved bitstream words and circular buffer pointer from their
; *_bk copies.
        SHL     .S2     B_top0_bk,  B_rld_left, B_t13           ;[13,1]
||      MPY     .M2X    B_const32,  A_len,      B_t12           ;[13,1]
||[A_tm]MVK     .S1     30,                     A_len_c         ;table mod
||      MV      .L1     A_word1_bk,  A_word1                    ;restore
||      MV      .L2     B_word2_bk,  B_word2                    ;restore
||      MV      .D2     B_bsbuf_circ_bk, B_bsbuf_circ           ;restore

        SHRU    .S2     B_t13,      27,         B_t14           ;[14,1]
||[ A_test2]LDW .D2T2   *B_bsbuf_circ++,        B_word2         ;[14,1]
||      SHRU    .S1     A_t1h:A_t1l,            A_const36,  A_t4h:A_t4l ;[ 4,2]
||      SUB     .L1     A_len_tbl_adr,          A_t2,       A_t3;[ 4,2]

; From here on B0 is used as B_tm_neg (cleared first, then set from
; B_top0_bk in the next packet when A_tm is non-zero).
        ADD     .L2     B_t14,      B_t12,      B_t15           ;[15,1]
||      SUB     .S2X    B_const32,  A_bptr,     B_bptr_cmpl     ;[15,1]
||[ A_test2]MV  .L1X    B_word2,    A_word1                     ;[15,1]
|| [A_test2]MV  .S1     A_word1,    A_word1_rw                  ; RWD
||      LDBU    .D1T1   *A_t3[A_t4l],           A_len           ;[ 5,2]
||      ZERO    .D2     B_tm_neg                                ;table mod

        ADD     .L2     B_t15,      B_t15,      B_t16           ;[16,1]
||[!A_tm]SUB    .D1     A_len,      24,         A_test1         ;[16,1]
||[A_tm]ZERO    .L1     A_test1                                 ;table mod
||      SHRU    .S1     A_top1,     A_len_c,    A_t7            ;[16,1]
||[A_tm]EXTU    .S2     B_top0_bk,  1, 31,      B_tm_neg        ;table mod

  [ A_test1]LDB .D2T2   *B_rld_table_adr[B_t16],B_level         ;[17,1]
||      ADD     .D1     A_ptop0l,   A_t7,       A_top0l         ;[17,1]
||      ADD     .L2     B_rld_table_adr,     1, B_rld_table_adr_1;const
||[A_tm]MVK     .S2     1,          B_level                     ;table mod

  [ A_test1]LDB .D2T2   *B_rld_table_adr_1[B_t16], B_run        ;[18,1]
||[!A_tm]EXT    .S2     B_top0_bk,  12, 20,    B_level          ;[18,1]
||      SHRU    .S1     A_top0h:A_top0l, 8,    A_empty:A_top0_bk;[ 8,2]
||[B_tm_neg]NEG .L2     B_level,    B_level                     ;table mod

        SHRU    .S2     B_word2,    B_bptr_cmpl,B_t9            ;[19,1]
||      SHL     .S1     A_word1,    A_bptr,     A_t8            ;[19,1]

        SUB     .S2X    A_len,      5,          B_rld_left      ;[10,2]
||      CMPLT   .L2X    A_len,      5,          B_test3         ;[10,2]
||      ADD     .L1     A_bptr,     A_len,      A_bptr1         ;[10,2]
||      SHL     .S1     A_top0h:A_top0l, A_len, A_ptop0h:A_ptop0l;[10,2]
||      SUB     .D1     A_const32,  A_len,      A_len_c         ;[10,2]

  [!A_tm]EXTU   .S2     B_top0_bk,  6,  26,     B_run           ;[21,1]
||[A_tm] ZERO   .D2     B_run                                   ;table mod
||[B_test3]MPY  .M2     B_rld_left, 0,          B_rld_left      ;[11,2]
||      MV      .L2X    A_top0_bk,  B_top0_bk                   ;[11,2]
||      AND     .S1     A_const31,  A_bptr1,    A_bptr          ;[11,2]
||      MV      .D1     A_ptop0h,   A_top0h                     ;[11,2]
||      NORM    .L1     A_ptop0h:A_ptop0l,      A_nrm           ;[ 1,3]

        MPY     .M2     B_level,    2,          B_level2        ;[22,1]
||      CMPGT   .L1     A_bptr1,    A_const31,  A_test2         ;[12,2]
||      MPY     .M1     A_nrm,      -16,        A_t2            ;[ 2,3]
||      SHL     .S1     A_ptop0h:A_ptop0l, A_nrm, A_t1h:A_t1l   ;[ 2,3]
||      LDW     .D1T2   *+A_Mpeg2v[ZPTR_M2OFF], B_Zptr          ;reset
||      MVK     .S2     63,         B_const63                   ;const

        LDH     .D2T1   *++B_Wptr[B_run],       A_W             ;[23,1]
||      CMPLT   .L1X    B_level,    0,          A_neg           ;[23,1]
||      SHL     .S2     B_top0_bk,  B_rld_left, B_t13           ;[13,2]
||      MPY     .M2X    B_const32,  A_len,      B_t12           ;[13,2]

        ADD     .L2     B_Wptr,     2,          B_Wptr          ;[24,1]
||      ADD     .D1X    A_t8,       B_t9,       A_top1          ;[24,1]
||      SHRU    .S2     B_t13,      27,         B_t14           ;[14,2]
||[ A_test2]LDW .D2T2   *B_bsbuf_circ++,        B_word2         ;[14,2]
||      SHRU    .S1     A_t1h:A_t1l,            A_const36,  A_t4h:A_t4l;[ 4,3]
||      SUB     .L1     A_len_tbl_adr,          A_t2,       A_t3;[ 4,3]

  [ A_neg]SUB   .D2     B_level2,   1,          B_level3        ;[25,1]
||      ADD     .L2     B_t14,      B_t12,      B_t15           ;[15,2]
||      SUB     .S2X    B_const32,  A_bptr,     B_bptr_cmpl     ;[15,2]
||[ A_test2]MV  .L1X    B_word2,    A_word1                     ;[15,2]
|| [A_test2]MV  .S1     A_word1,    A_word1_rw                  ; RWD
||      LDBU    .D1T1   *A_t3[A_t4l],           A_len           ;[ 5,3]

  [!A_neg]ADD   .L2     B_level2,   1,          B_level3        ;[26,1]
||      ADD     .S2     B_t15,      B_t15,      B_t16           ;[16,2]
||      SUB     .D1     A_len,      24,         A_test1         ;[16,2]
||      SHRU    .S1     A_top1,     A_len_c,    A_t7            ;[16,2]

; First end-of-block test: B_eob = (B_run > 63).  A_outi is an A-side copy
; of the output pointer, used by the kernel's coefficient store.
        CMPGT   .L2     B_run,      B_const63,  B_eob           ;[27,1]
||[ A_test1]LDB .D2T2   *B_rld_table_adr[B_t16],B_level         ;[17,2]
||      ADD     .D1     A_ptop0l,   A_t7,       A_top0l         ;[17,2]
||      MV      .L1X    B_outi,     A_outi                      ; 12Q4

* =========================== PIPE LOOP KERNEL ============================ *
; Steady-state kernel of the decode loop: ten execute packets per pass,
; with several iterations overlapped (see the [c,i] tags).  The loop-back
; branch sits in the fifth packet, so the five packets that follow it fill
; its delay slots.
;
; Per coefficient the kernel
;   - forms A_qw = A_qscl * A_W and B_level3 = 2*B_level -/+ 1 (minus when
;     A_neg, plus otherwise), then A_level4 = A_qw * B_level3;
;   - adds A_const31 to negative products, saturating-shifts left by 15,
;     shifts right by A_constSHR and masks with B_constFFF0 (12Q4 result);
;   - stores the result at A_outi[A_cnum], where A_cnum is read through
;     B_Zptr, and accumulates it into B_sum.
;
; B_eob is set when B_run > B_const63 or when B_Wptr > B_Wptr_end.  While
; it is clear, the "preserve" instructions back up the bitstream state
; into the *_bk registers.  Once it is set, the result-producing
; instructions are predicated off and the "mismatch" instructions advance
; B_outi by 2 * B_const63 bytes and load the halfword found there into
; A_last_coeff.
        .asg    A2,     A2_top0l

loop:
; [B_eob] MPY by 0 clears B_Wptr.  Its predicate is the B_eob value from
; the B_run test, since the CMPGT in this same packet only writes B_eob at
; the end of the cycle.  NOTE(review): this presumably keeps the unsigned
; overrun compare at "mismatch" from firing on a normal end of block -
; confirm.
        MPY     .M1     A_qscl,     A_W,        A_qw            ;[28,1]
||[!B_eob]CMPGT .L2     B_Wptr,     B_Wptr_end, B_eob           ;[28,1]
||[A_test1]LDB  .D2T2   *B_rld_table_adr_1[B_t16],      B_run   ;[18,2]
||      EXT     .S2     B_top0_bk,  12, 20,    B_level          ;[18,2]
||      SHRU    .S1     A_top0h:A_top0l, 8,    A_empty:A_top0_bk;[ 8,3]
||[B_eob]MPY    .M2     0,          B_Wptr,     B_Wptr          ;err det

        LDB     .D2T1   *++B_Zptr[B_run],       A_cnum          ;[29,1]
||      SHRU    .S2     B_word2,    B_bptr_cmpl,B_t9            ;[19,2]
||      SHL     .S1     A_word1,    A_bptr,     A_t8            ;[19,2]
||[!B_eob]MV    .L1     A_top0h:A_top0l, A_top0h_bk:A_top0l_bk  ;preserve
||[B_eob]ADD    .L2     B_outi,     B_const63,  B_outi          ;mismatch
||[!B_eob]MPY   .M2X    1,          A_bptr,     B_bptr_bk       ;preserve

; B_run_bk keeps the run value that ended the block; "mismatch" compares
; it against 65 to flag an invalid VLC.
        ADD     .D2     B_Zptr,     1,          B_Zptr          ;[30,1]
||      MPY     .M1X    A_qw,       B_level3,   A_level4        ;[30,1]
||      SUB     .S2X    A_len,      5,          B_rld_left      ;[10,3]
||      CMPLT   .L2X    A_len,      5,          B_test3         ;[10,3]
||      ADD     .L1     A_bptr,     A_len,      A_bptr1         ;[10,3]
||      SHL     .S1     A_top0h:A_top0l, A_len, A_ptop0h:A_ptop0l;[10,3]
||      SUB     .D1     A_const32,  A_len,      A_len_c         ;[10,3]
||[B_eob]MPY    .M2     1,          B_run,      B_run_bk        ;preserve

        EXTU    .S2     B_top0_bk,  6,  26,     B_run           ;[21,2]
||[B_test3]MPY  .M2     B_rld_left, 0,          B_rld_left      ;[11,3]
||      MV      .L2X    A_top0_bk,  B_top0_bk                   ;[11,3]
||      AND     .S1     A_const31,  A_bptr1,    A_bptr          ;[11,3]
||      MV      .D1     A_ptop0h,   A_top0h                     ;[11,3]
||      NORM    .L1     A_ptop0h:A_ptop0l,      A_nrm           ;[ 1,4]
||[B_eob]ADD    .D2     B_outi,     B_const63,  B_outi          ;mismatch

; Loop-back branch; the remaining five packets execute in its delay slots
; whether or not it is taken.
  [!B_eob]B     .S2     loop                                    ;[32,1]
||[ A_neg]ADD   .D1     A_level4,   A_const31,  A_level4        ;[32,1]
||      MPY     .M2     B_level,    2,          B_level2        ;[22,2]
||      CMPGT   .L1     A_bptr1,    A_const31,  A_test2         ;[12,3]
||      MPY     .M1     A_nrm,      -16,        A_t2            ;[ 2,4]
||      SHL     .S1     A_ptop0h:A_ptop0l, A_nrm, A_t1h:A_t1l   ;[ 2,4]
||[!B_eob]MV    .L2     B_bsbuf_circ,      B_bsbuf_circ_bk      ;preserve
||[B_eob]LDH    .D2T1   *B_outi,    A_last_coeff                ;mismatch

        SSHL    .S1     A_level4,   15,         A_level5        ;[33,1]
||[!B_eob]LDH   .D2T1   *++B_Wptr[B_run],       A_W             ;[23,2]
||      CMPLT   .L1X    B_level,    0,          A_neg           ;[23,2]
||      SHL     .S2     B_top0_bk,  B_rld_left, B_t13           ;[13,3]
||      MPY     .M2X    B_const32,  A_len,      B_t12           ;[13,3]
||[!B_eob]MV    .L2     B_word2,    B_word2_bk                  ;preserve
||[!B_eob]MV    .D1     A_word1,    A_word1_bk                  ;preserve
||[!B_eob]MVD   .M1     A_word1_rw, A_word1_rw_bk               ;preserve

  [!B_eob]ADD   .L2     B_Wptr,     2,          B_Wptr          ;[24,2]
||[!B_eob]ADD   .D1X    A_t8,       B_t9,       A_top1          ;[24,2]
||      SHRU    .S2     B_t13,      27,         B_t14           ;[14,3]
||[ A_test2]LDW .D2T2   *B_bsbuf_circ++,        B_word2         ;[14,3]
||      SHRU    .S1     A_t1h:A_t1l, A_const36, A_t4h:A_t4l     ;[ 4,4]
||      SUB     .L1     A_len_tbl_adr,          A_t2,       A_t3;[ 4,4]

        SHR     .S1     A_level5,   A_constSHR, A_level_f       ;[35,1] 12Q4
||[ A_neg]SUB   .D2     B_level2,   1,          B_level3        ;[25,2]
||      ADD     .L2     B_t14,      B_t12,      B_t15           ;[15,3]
||      SUB     .S2X    B_const32,  A_bptr,     B_bptr_cmpl     ;[15,3]
||[ A_test2]MV  .L1X    B_word2,    A_word1                     ;[15,3]
|| [A_test2]MVD .M1     A_word1,    A_word1_rw                  ; RWD
||      LDBU    .D1T1   *A_t3[A_t4l],           A_len           ;[ 5,4]

  [!A_neg]ADD   .L2     B_level2,   1,          B_level3        ;[26,2]
||      ADD     .S2     B_t15,      B_t15,      B_t16           ;[16,3]
||      SUB     .D1     A_len,      24,         A_test1         ;[16,3]
||      SHRU    .S1     A_top1,     A_len_c,    A_t7            ;[16,3]
||[!B_eob]AND   .L1X    B_constFFF0,A_level_f,  A_level_f       ; 12Q4

; Last packet of the pass: store the coefficient and update the running
; sum while B_eob is clear.  When B_eob is set, toggle A_last_coeff with
; A_const16 and bring A_word1_rw back from its backup.  The unpredicated
; CMPGT re-evaluates B_eob for the next pass.
  [!B_eob]STH   .D1T1   A_level_f,  *+A_outi[A_cnum]            ;[36,1] BC
||[!B_eob]ADD   .S2X    B_sum,      A_level_f,  B_sum           ;[37,1]
||        CMPGT .L2     B_run,      B_const63,  B_eob           ;[27,2]
||[ A_test1]LDB .D2T2   *B_rld_table_adr[B_t16],B_level         ;[17,3]
||[!B_eob]ADD   .L1     A_ptop0l,   A_t7,       A2_top0l        ;[17,3]
||[B_eob] XOR   .S1     A_const16,  A_last_coeff, A_last_coeff  ;mismatch 12Q4
||[B_eob] MVD   .M1     A_word1_rw_bk,          A_word1_rw      ; RWD

* =========================== PIPE LOOP EPILOG ============================ *
* ========================================================================= *
; live-out: top0h:top0l, top1, word1, word2, bsbuf_circ, run, Wptr, Wptr_end,
;           sum, bptr

; End-of-block handling, reached when the kernel's loop-back branch is not
; taken and, for a block that is not coded, by the [!B_coded] branch ahead
; of the loop prolog.
;   - Error detection: B_err is set when B_Wptr is above B_Wptr_end
;     (unsigned compare) or, failing that, when B_run_bk > 65.
;   - Mismatch control: A2_odd = A_const16 & B_sum.  When it is zero, the
;     toggled A_last_coeff is stored at the address B_outi holds on entry
;     to that packet (the ADD in the same packet only takes effect
;     afterwards).  NOTE(review): A_const16 presumably holds 16, the LSB
;     of a 12Q4 value - confirm where it is initialised.
;   - B_outi is advanced by 2 bytes.
;   - Control flow: branch to block_loop while B_cnt is non-zero; branch
;     to exit when B_err is set; otherwise fall through into exit.
; The loads of B_bsbuf and B_ret and the restore of B_bsbuf_circ are issued
; here on behalf of exit.

    .asg    B0,     B_err       ; same reg as B_eob
    .asg    A29,    A_Mpeg2v
    .asg    B31,    B_bsbuf
    .asg    B3,     B_ret       ; Return address
    .asg    B15,    B_SP        ; Stack pointer, B datapath
    .asg    B1,     B_const65

mismatch:
 [B_cnt] B       .S1     block_loop                             ; -- BRANCH --
||       MVK     .S2     65,           B_const65                ; invalid VLC
||       CMPGTU  .L2     B_Wptr,       B_Wptr_end, B_err        ; overrun
||       AND     .L1X    A_const16,    B_sum,      A2_odd       ; mismatch 12Q4

 [!B_err]CMPGT   .L2     B_run_bk,     B_const65,  B_err        ; invalid VLC
||       ADD     .S2     B_outi,       2,          B_outi
||       LDW     .D1T2   *+A_Mpeg2v[BSBUF_M2OFF],  B_bsbuf      ; exit
||[!A2_odd]STH   .D2T1   A_last_coeff, *B_outi                  ; mismatch

  [B_err]B       .S2     exit                                   ; -- BRANCH --
||       LDW     .D2T2   *+B_SP[2],    B_ret                    ; exit
||       MV      .L2     B_bsbuf_circ_bk, B_bsbuf_circ          ; AMR arg

         NOP             3                                      ; MERGE
; branch to block_loop occurs here (if B_cnt != 0)
; the branch to exit, if taken, occurs 2 cycles later, i.e. after the
; first 2 cycles of block_loop have executed
; (preserve B0_err for exit)

; this will execute only if B_cnt was 0
        NOP              2                                      ; MERGE

* =================================== EXIT =============================== *
; Function exit.  Writes the decoder state back through A_Mpeg2v (TOP0,
; TOP1, BPTR, WORD1, WORD2, NEXTWPTR and FAULT offsets), reloads A10-A14
; and B_csr from the stack frame, releases the frame with the
; pre-incrementing load *++B_SP[9], writes zero to AMR, restores CSR and
; returns through B_ret.
;
; Before the write-back the bit window is moved back by 8 bits:
;   - A_top1 = (A_top1 >> 8) + (A_top0l_bk << 24), the 40-bit top0 pair is
;     shifted right by 8, and B_bptr_bk is reduced by 8.
;   - If B_bptr_bk becomes negative (B_lz), A_const32 is added to it, the
;     word registers step back by one word (A_word1_bk is reloaded from
;     A_word1_rw and B_word2_bk takes the previous A_word1_bk),
;     B_bsbuf_circ is decremented by one word, and B_byte_diff and
;     B_next_wptr are recomputed from it.
;
; B_next_wptr = (B_bsbuf_circ - B_bsbuf) >> 2.  It is stored twice: the
; second store, in parallel with the AMR write, comes after the [B_lz]
; recomputation.
;
; The RET is followed by five execute packets that fill its delay slots.
        .asg            B26,        B_csr        ; CSR value to restore
        .asg            B22,        B_byte_diff
        .asg            B29,        B_next_wptr
        .asg            B1,         B_lz
        .asg            B27,        B_amr_config
        .asg            B4,         B_constBUFMASK

exit:
        SUB     .L2      B_bsbuf_circ,  B_bsbuf, B_byte_diff
||      SHRU    .S1      A_top1,        8,         A_t2
||      SUB     .S2      B_bptr_bk,     8,         B_bptr_bk
||      LDW     .D2T1    *+B_SP[6],     A14                     ; MERGE

        SHR     .S2      B_byte_diff,   2,         B_next_wptr
||      SHL     .S1      A_top0l_bk,    24,        A_t3
||      CMPLT   .L2      B_bptr_bk,     0,         B_lz
||      LDW     .D2T1    *+B_SP[3],     A11

; B_err (computed at "mismatch") is written to the FAULT field here.  The
; MV below reads A_word1_bk before the MVD in the same packet replaces it.
        ADD     .L1      A_t2,          A_t3,      A_top1
||      SHRU    .S1      A_top0h_bk:A_top0l_bk, 8, A_empty:A_top0_bk
||      STW     .D1T2    B_err,      *+A_Mpeg2v[FAULT_M2OFF]
||[B_lz]MVD     .M1      A_word1_rw,               A_word1_bk   ; RWD
||[B_lz]MV      .L2X     A_word1_bk,               B_word2_bk
||      LDW     .D2T1    *+B_SP[4],     A12

; NOTE(review): A_const32 is used elsewhere as an A-side register (on .D1
; without a cross path), yet the ADD below names unit .L2 without the X
; cross-path suffix - confirm that the assembler resolves this as intended.
        LDW     .D2T2    *+B_SP[1],     B_csr       ; Get CSR's value
||      STW     .D1T1    A_top1,        *+A_Mpeg2v[TOP1_M2OFF]
||[B_lz]ADD     .L2      B_bptr_bk,     A_const32, B_bptr_bk

        STW     .D1T2    B_bptr_bk,     *+A_Mpeg2v[BPTR_M2OFF]
||      LDW     .D2T1    *+B_SP[5],     A13

        RET     .S2      B_ret                      ; Return to caller
||      STW     .D1T1    A_top0_bk,     *+A_Mpeg2v[TOP0_M2OFF]
||[B_lz]SUBAW   .D2      B_bsbuf_circ,  1, B_bsbuf_circ       ; AMR arg

; First NEXTWPTR store (value computed before any [B_lz] adjustment); the
; pre-incrementing load of A10 also releases the stack frame.
        STW     .D1T2    B_next_wptr,   *+A_Mpeg2v[NEXTWPTR_M2OFF]
||      ZERO    .L2      B_amr_config
||      LDW     .D2T1    *++B_SP[9],    A10                   ; MERGE, 2 W-mat
||[B_lz]SUB     .S2      B_bsbuf_circ,  B_bsbuf, B_byte_diff  ; AMR arg

        STW     .D1T1    A_word1_bk,    *+A_Mpeg2v[WORD1_M2OFF]
||[B_lz]SHR     .S2      B_byte_diff,   2,         B_next_wptr ; AMR arg

        STW     .D1T2    B_word2_bk,    *+A_Mpeg2v[WORD2_M2OFF]
;  ===== Interruptibility state restored here =====

; Second NEXTWPTR store (picks up the [B_lz] recomputation); AMR is written
; with zero.
        STW     .D1T2    B_next_wptr,   *+A_Mpeg2v[NEXTWPTR_M2OFF]
||      MVC     .S2      B_amr_config,  AMR

        MVC     .S2      B_csr,         CSR           ; Restore CSR
; Branch occurs
; Branch occurs

* ========================================================================= *
*   End of file:   img_mpeg2_vld_inter.asm                                  *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *