c6416_sdk/dsplib/mat_mul.asm


								;* ======================================================================== *;

								;*  TEXAS INSTRUMENTS, INC.                                                 *;

								;*                                                                          *;

								;*  DSPLIB  DSP Signal Processing Library                                   *;

								;*                                                                          *;

								;*      Release:        Revision 1.04b                                      *;

								;*      CVS Revision:   1.4     Sun Sep 29 03:32:25 2002 (UTC)              *;

								;*      Snapshot date:  23-Oct-2003                                         *;

								;*                                                                          *;

								;*  This library contains proprietary intellectual property of Texas        *;

								;*  Instruments, Inc.  The library and its source code are protected by     *;

								;*  various copyrights, and portions may also be protected by patents or    *;

								;*  other legal protections.                                                *;

								;*                                                                          *;

								;*  This software is licensed for use with Texas Instruments TMS320         *;

								;*  family DSPs.  This license was provided to you prior to installing      *;

								;*  the software.  You may review this license by consulting the file       *;

								;*  TI_license.PDF which accompanies the files in this library.             *;

								;* ------------------------------------------------------------------------ *;

								;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;

								;*                          All Rights Reserved.                            *;

								;* ======================================================================== *;


								;* ======================================================================== *;

								;*  Assembler compatibility shim for assembling 4.30 and later code on      *;

								;*  tools prior to 4.30.                                                    *;

								;* ======================================================================== *;


								        .if $isdefed(".ASSEMBLER_VERSION")

								;  $asmver = the assembler's version number when the tools define
								;  .ASSEMBLER_VERSION, otherwise 0 so that the "< 430" test below
								;  selects the compatibility definitions.

								        .asg    .ASSEMBLER_VERSION, $asmver

								        .else

								        .asg    0,    $asmver

								        .endif


								        .if ($asmver < 430)


								;  Map the call/return mnemonics onto a plain branch.

								        .asg    B,    CALL     ; Function Call

								        .asg    B,    RET      ; Return from a Function

								        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.


								;  The BNOP-based forms are only defined when .TMS320C6400 is set.

								        .if .TMS320C6400

								        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call

								        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return

								        .asg    BNOP, CRNOP    ; C64x Fn call w/ Call/Ret chaining via BNOP.

								        .endif


								;  Substitute empty text for the function-marker directives so that
								;  sources using them expand to nothing here.

								        .asg    , .asmfunc     ; .func equivalent for hand-assembly code

								        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code


								        .endif


								;* ======================================================================== *;

								;*  End of assembler compatibility shim.                                    *;

								;* ======================================================================== *;


								* ========================================================================= *

								*   TEXAS INSTRUMENTS, INC.                                                 *

								*                                                                           *

								*   NAME                                                                    *

								*       DSP_mat_mul -- Matrix Multiply, Little Endian                       *

								*                                                                           *

								*    REVISION DATE                                                          *

								*        10-Feb-2002                                                        *

								*                                                                           *

								*   USAGE                                                                   *

								*       This routine is C-callable and can be called as:                    *

								*                                                                           *

								*           void DSP_mat_mul                                                *

								*           (                                                               *

								*               const short *restrict x, int r1, int c1,                    *

								*               const short *restrict y,         int c2,                    *

								*               short       *restrict r,                                    *

								*               int                   qs                                    *

								*           );                                                              *

								*                                                                           *

								*       x  == Pointer to r1 by c1 input matrix.                             *

								*       y  == Pointer to c1 by c2 input matrix.                             *

								*       r  == Pointer to r1 by c2 output matrix.                            *

								*                                                                           *

								*       r1 == Number of rows in x.                                          *

								*       c1 == Number of columns in x.  Also number of rows in y.            *

								*       c2 == Number of columns in y.                                       *

								*                                                                           *

								*       qs == Final right-shift to apply to the result.                     *

								*                                                                           *

								*   DESCRIPTION                                                             *

								*       This function computes the expression "r = x * y" for the matrices  *

								*       x and y.  The columnar dimension of x must match the row dimension  *

								*       of y.  The resulting matrix has the same number of rows as x and    *

								*       the same number of columns as y.                                    *

								*                                                                           *

								*       The values stored in the matrices are assumed to be fixed-point     *

								*       or integer values.  All intermediate sums are retained to 32-bit    *

								*       precision.  No rounding or overflow checking is performed.  The     *

								*       results are right-shifted by the user-specified amount, and then    *

								*       truncated to 16 bits.                                               *

								*                                                                           *

								*       This code is suitable for dense matrices.  No optimizations are     *

								*       made for sparse matrices.                                           *

								*                                                                           *

								*       The following is a C description of the algorithm.  The assembly    *

								*       code may place restrictions on the inputs that the C code version   *

								*       does not.  These restrictions are noted under ASSUMPTIONS below.    *

								*                                                                           *

								*       void DSP_mat_mul                                                    *

								*       (                                                                   *

								*           const short *restrict x, int r1, int c1,                        *

								*           const short *restrict y,         int c2,                        *

								*           short       *restrict r,                                        *

								*           int                   qs                                        *

								*       )                                                                   *

								*       {                                                                   *

								*           int i, j, k;                                                    *

								*           int sum;                                                        *

								*                                                                           *

								*           /* ---------------------------------------------------- */      *

								*           /*  Multiply each row in x by each column in y.  The    */      *

								*           /*  product of row m in x and column n in y is placed   */      *

								*           /*  in position (m,n) in the result.                    */      *

								*           /* ---------------------------------------------------- */      *

								*           for (i = 0; i < r1; i++)                                        *

								*               for (j = 0; j < c2; j++)                                    *

								*               {                                                           *

								*                   sum = 0;                                                *

								*                                                                           *

								*                   for (k = 0; k < c1; k++)                                *

								*                       sum += x[k + i*c1] * y[j + k*c2];                   *

								*                                                                           *

								*                   r[j + i*c2] = sum >> qs;                                *

								*               }                                                           *

								*       }                                                                   *

								*                                                                           *

								*   ASSUMPTIONS                                                             *

								*       The arrays 'x', 'y', and 'r' are stored in distinct arrays.  That   *

								*       is, in-place processing is not allowed.                             *

								*                                                                           *

								*       The input matrices have minimum dimensions of at least 1 row and    *

								*       1 column, and maximum dimensions of 32767 rows and 32767 columns.   *

								*                                                                           *

								*   TECHNIQUES                                                              *

								*       The 'i' loop and 'k' loops are unrolled 2x.  The 'j' loop is        *

								*       unrolled 4x.  For dimensions that are not multiples of the          *

								*       various loops' unroll factors, this code calculates extra results   *

								*       beyond the edges of the matrix.  These extra results are            *

								*       ultimately discarded.  This allows the loops to be unrolled for     *

								*       efficient operation on large matrices while not losing              *

								*       flexibility.                                                        *

								*                                                                           *

								*       The outer two levels of loop nest are collapsed, further reducing   *

								*       the overhead of the looping structure.                              *

								*                                                                           *

								*   NOTES                                                                   *

								*       This code blocks interrupts during its innermost loop.  Interrupts  *

								*       are not blocked otherwise.  The inner loop runs 4 cycles per pair   *

								*       of 'k' values, so interrupts can be blocked for roughly             *

								*       2*c1' + 16 cycles at a time.                                        *

								*                                                                           *

								*       When calculating the loop trip counts, the values of r1 and c1      *

								*       are rounded up to the next even value.  The value of c2 is          *

								*       rounded up to the next multiple of 4.  This does not affect         *

								*       the memory layout of the input or output matrices.                  *

								*                                                                           *

								*   MEMORY NOTE                                                             *

								*       The load instructions in the inner loop are predicated to avoid     *

								*       significant over-fetching on the matrices.  However, since the      *

								*       outer loops are unrolled, this code may fetch approximately one     *

								*       full row beyond the end of the 'x' matrix and approximately one     *

								*       double-word beyond the end of the 'y' matrix.  The values read      *

								*       are discarded and do not affect the results of the computation.     *

								*                                                                           *

								*       This code has no memory alignment requirements, as non-aligned      *

								*       loads are used for accessing the inputs, and individual STHs are    *

								*       used for writing the results.                                       *

								*                                                                           *

								*       This is a LITTLE ENDIAN implementation.                             *

								*                                                                           *

								*   CYCLES                                                                  *

								*       cycles = 0.25 * (r1'*c2'*c1') + 2.25 * (r1'*c2') + 11, where:       *

								*                                                                           *

								*           r1' = 2 * ceil(r1/2.0)   // r1 rounded up to next even          *

								*           c1' = 2 * ceil(c1/2.0)   // c1 rounded up to next even          *

								*           c2' = 4 * ceil(c2/4.0);  // c2 rounded up to next mult of 4     *

								*                                                                           *

								*       For r1= 1, c1= 1, c2= 1,  cycles =    33.                           *

								*       For r1= 8, c1=20, c2= 8,  cycles =   475.                           *

								*       For r1=12, c1=14, c2=18,  cycles =  1391.                           *

								*       For r1=32, c1=32, c2=32,  cycles = 10507.                           *

								*                                                                           *

								*       The cycle count includes 6 cycles of function call overhead.  The   *

								*       exact overhead seen by a given application will depend on the       *

								*       compiler options used.                                              *

								*                                                                           *

								*   CODESIZE                                                                *

								*       416 bytes.                                                          *

								*                                                                           *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *


								        .sect ".text:_mat_mul"

								        .global _DSP_mat_mul

								_DSP_mat_mul:

								*           void DSP_mat_mul                                                *

								*           (                                                               *

								*               const short *restrict x, int r1, int c1,                    *

								*               const short *restrict y,         int c2,                    *

								*               short       *restrict r,                                    *

								*               int                   qs                                    *

								*           );                                                              *

								* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *

								          .asg          A4,         A_x         ; const short *restrict x

								          .asg          B4,         B_r1        ; int r1

								          .asg          A6,         A_c1        ; int c1

								          .asg          B6,         B_y         ; const short *restrict y

								          .asg          A8,         A_c2        ; int c2

								          .asg          B8,         B_r         ; short       *restrict r

								          .asg          A10,        A_qs        ; int qs


								          .asg          B1,         B_i         ; I loop counter

								          .asg          A0,         A_j         ; J loop counter

								          .asg          A7,         A_k         ; K loop counter


								          .asg          A27,        A_kc        ; K loop counter reload


								          .asg          B22,        B_ic_       ; temp value

								          .asg          A26,        A_kc_       ; temp value


								          .asg          A1,         A_lj        ; Flag: "Last J iteration"

								          .asg          B2,         B_lr        ; Flag: "Last row of output"

								          .asg          A2,         A_c1o       ; Flag: "c1 is odd"

								          .asg          B27,        B_r1o       ; Flag: "r1 is odd"

								          .asg          B0,         B_p         ; Prolog collapse predicate

								          .asg          A1,         A_e         ; Epilog collapse predicate


								          .asg          A5,         A_c1_2      ; c1 * 2

								          .asg          B26,        B_c2        ; c2 (twin copy)

								          .asg          B7,         B_c2_2      ; c2 * 2

								          .asg          B9,         B_qs        ; qs (twin copy)


								          .asg          B5,         B_na        ; unused value


								          .asg          A28,        A_x_sv      ; Saved copy of x (i-loop)

								          .asg          A29,        A_y_sv      ; Saved copy of y (j-loop)

								          .asg          B29,        B_y_sv2     ; Saved copy of y (i-loop)

								          .asg          B30,        B_r_sv      ; Saved copy of r (i-loop)


								          .asg          A9,         A_x00

								          .asg          A16,        A_y10

								          .asg          A17,        A_y32

								          .asg          A18,        A_s00

								          .asg          A18,        A_t00

								          .asg          A19,        A_s01

								          .asg          A19,        A_t01

								          .asg          A20,        A_s02

								          .asg          A20,        A_t02

								          .asg          A21,        A_s03

								          .asg          A21,        A_t03

								          .asg          A22,        A_p00

								          .asg          A22,        A_p01

								          .asg          A22,        A_p02

								          .asg          A22,        A_p03


								          .asg          B4,         B_x11

								          .asg          B5,         B_y00

								          .asg          B16,        B_y10

								          .asg          B17,        B_y32

								          .asg          B18,        B_s10

								          .asg          B18,        B_t10

								          .asg          B19,        B_s11

								          .asg          B19,        B_t11

								          .asg          B20,        B_s12

								          .asg          B20,        B_t12

								          .asg          B21,        B_s13

								          .asg          B21,        B_t13

								          .asg          B22,        B_p10

								          .asg          B22,        B_p11

								          .asg          B22,        B_y33

								          .asg          B23,        B_y22

								          .asg          B24,        B_p13

								          .asg          B24,        B_y11

								          .asg          B25,        B_p12


								;  Naming used above: s = running sum, t = shifted result (same
								;  register as its sum), p = DOTP2 product; the two digits are
								;  <row within the 2-row tile><column within the 4-column tile>.
								;
								;  Several symbols share one physical register (A_e/A_lj = A1,
								;  B_x11/B_r1 = B4, B_na/B_y00 = B5, A_p00..A_p03 = A22,
								;  B_p10/B_p11/B_y33/B_ic_ = B22, B_p13/B_y11 = B24).  Which value
								;  a register holds therefore depends on the exact cycle at which
								;  it is read, so instructions below must not be reordered.

								* ========================================================================= *

								;  Setup.  Computes the parity flags for c1 and r1, the K trip count
								;  A_kc = (c1 + c1o) / 2 and the row-pair count B_i = (r1 + r1o) / 2,
								;  makes B-side copies of c2 and qs, and saves the x / y / r bases.
								;  When c1 is odd, x is backed up one halfword and y one row so the
								;  2x-unrolled K loop starts on a padding term; the prolog zeroes the
								;  y half of that term so it contributes nothing to the sums.

								          AND   .S1     A_c1,       1,          A_c1o           ;[ 1,0]


								          MV    .L2X    A_c2,       B_c2                        ;[ 2,0]

								||        ADD   .L1     A_c1o,      A_c1,       A_kc_           ;[ 2,0]

								||        AND   .D2     B_r1,       1,          B_r1o           ;[ 2,0]

								||        SHL   .S2X    A_c2,       1,          B_c2_2          ;[ 3,0]


								          MV    .L2     B_r,        B_r_sv                      ;[ 3,0]

								||[ A_c1o]SUB   .S2     B_y,        B_c2_2,     B_y             ;[ 4,0]

								||        SUBAH .D1     A_x,        A_c1o,      A_x             ;[ 3,0]

								||        SHRU  .S1     A_kc_,      1,          A_kc            ;[ 3,0]

								||        ADD   .D2     B_r1o,      B_r1,       B_ic_           ;[ 3,0]


								          ROTL  .M2X    A_qs,       0,          B_qs            ;[ 4,0]

								||        MV    .D1     A_c2,       A_j                         ;[ 4,0]

								||        SHL   .S1     A_c1,       1,          A_c1_2          ;[ 4,0]

								||[!A_c1o]LDNDW .D2T1   *B_y,                   A_y32:A_y10     ; preload 1st y row, 4 cols (c1 even only)


								          ROTL  .M2     B_y,        0,          B_y_sv2         ;[ 5,0]

								||        ADD   .L2     B_y,        B_c2_2,     B_y             ; B_y -> next y row (c2*2 bytes on)

								||        MV    .L1     A_x,        A_x_sv                      ;[ 5,0]

								||        SHRU  .S2     B_ic_,      1,          B_i             ;[ 5,0]

								||        MV    .S1X    B_y,        A_y_sv                      ;[ 5,0]

								* =========================== PIPE LOOP PROLOG ============================ *

								;  One pass through loop_ij produces a 2-row by 4-column tile of r.
								;  The eight accumulators are cleared here and the first loads of
								;  x (both rows) and y are issued.

								loop_ij:

								          LDNDW .D2T2   *B_y++(B_c2_2),         B_y32:B_y10     ;[ 1,1]

								||        SUB   .S1     A_kc,       2,          A_k             ; K counter consumed by BDEC

								||        ZERO  .L2     B_s13:B_s12                             ; clear sums, row 1 cols 2-3

								||        ZERO  .L1     A_s03:A_s02                             ; clear sums, row 0 cols 2-3


								          LDNDW .D1T2   *A_x(A_c1_2),           B_na:B_x11      ;[ 2,1]

								||        B     .S1     loop_k                                  ; branch back into the kernel

								||        ZERO  .L2     B_s11:B_s10                             ; clear sums, row 1 cols 0-1

								||[ A_c1o]ZERO  .L1     A_y32:A_y10                             ; c1 odd: padding y row reads as 0


								          CMPGT .L1     A_k,        -1,         A_e             ;[ 3,1]

								||        LDNW  .D1T1   *A_x++[1], A_x00                        ;[ 3,1]

								||        SHR   .S1     A_s03:A_s02, 0,         A_s01:A_s00     ; copy zeroed pair: clears row 0 cols 0-1

								||        MVK           2,          B_p                         ; hold off ADDs for 2 kernel passes

								; ===== 2 prolog stages collapsed

								* =========================== PIPE LOOP KERNEL ============================ *

								;  4-cycle kernel; each pass consumes two 'k' values for the 2x4 tile.
								;  A_y*/B_y* hold 4 columns of two consecutive y rows; PACK2/PACKH2
								;  pair them per column so each DOTP2 yields
								;  x[k]*y[k][c] + x[k+1]*y[k+1][c].  A_e gates the loads; B_p gates
								;  the accumulates until it has counted down to zero.

								loop_k:

								  [!B_p ] ADD   .D1     A_p03,      A_s03,      A_s03           ;[12,1]

								||[!B_p ] ADD   .L2     B_p13,      B_s13,      B_s13           ;[12,1]

								||        DOTP2 .M1X    A_x00,      B_y33,      A_p03           ;[ 8,2]

								||        DOTP2 .M2     B_x11,      B_y33,      B_p13           ;[ 8,2]

								||        PACKH2.S2X    B_y10,      A_y10,      B_y11           ;[ 8,2]

								||[ A_e ] LDNDW .D2T1   *B_y++(B_c2_2),         A_y32:A_y10     ;[ 4,3]


								  [!B_p ] ADD   .S1     A_p02,      A_s02,      A_s02           ;[13,1]

								||[!B_p ] ADD   .S2     B_p10,      B_s10,      B_s10           ;[13,1]

								||[!B_p ] ADD   .L2     B_p12,      B_s12,      B_s12           ;[13,1]

								||        DOTP2 .M1X    A_x00,      B_y22,      A_p02           ;[ 9,2]

								||        DOTP2 .M2     B_x11,      B_y00,      B_p10           ;[ 9,2]

								||[ A_e ] LDNDW .D2T2   *B_y++(B_c2_2),         B_y32:B_y10     ;[ 1,4]

								loop_k2:

								  [!B_p ] ADD   .L1     A_p00,      A_s00,      A_s00           ;[14,1]

								||[!B_p ] ADD   .D2     B_p11,      B_s11,      B_s11           ;[14,1]

								||        BDEC  .S1     loop_k,     A_k                         ;[10,2]

								||        DOTP2 .M1X    A_x00,      B_y00,      A_p00           ;[10,2]

								||        DOTP2 .M2     B_x11,      B_y11,      B_p11           ;[10,2]

								||        PACK2 .L2X    B_y32,      A_y32,      B_y22           ;[ 6,3]

								||        PACKH2.S2X    B_y32,      A_y32,      B_y33           ;[ 6,3]

								||[ A_e ] LDNDW .D1T2   *A_x(A_c1_2),           B_na:B_x11      ;[ 2,4]

								loop_k3:

								  [ B_p ] SUB   .L2     B_p,        1,          B_p             ;[15,1]

								||[!B_p ] ADD   .S1     A_p01,      A_s01,      A_s01           ;[15,1]

								||        DOTP2 .M1X    A_x00,      B_y11,      A_p01           ;[11,2]

								||        DOTP2 .M2     B_x11,      B_y22,      B_p12           ;[ 7,3]

								||        PACK2 .S2X    B_y10,      A_y10,      B_y00           ;[ 7,3]

								||        CMPGT .L1     A_k,        -1,         A_e             ;[ 3,4]

								||[ A_e ] LDNW  .D1T1   *A_x++[1], A_x00                        ;[ 3,4]


								* =========================== PIPE LOOP EPILOG ============================ *

								;  Tile epilog: final accumulates, qs shifts and up to eight STHs,
								;  overlapped with the outer-loop bookkeeping.  A_j counts the
								;  columns still to be written in this row pair and predicates the
								;  1st-row stores after the first; B_lr predicates the 2nd-row
								;  stores, which go to *B_r[B_c2], i.e. one output row further on.

								; ===== 2 epilog stages collapsed

								          ADD   .D1     A_p03,      A_s03,      A_s03           ;[12,4]

								||        ADD   .D2     B_p13,      B_s13,      B_s13           ;[12,4]

								||        CMPGT .L2     B_i,        B_r1o,      B_lr            ; 0 only on the last row pair of an odd r1

								||        CMPLT .L1     A_j,        5,          A_lj            ; 4 or fewer columns left in this row pair

								||        ADD   .S1     A_y_sv,     8,          A_y_sv  ; Adv. y 4 cols.


								          ADD   .L1     A_p02,      A_s02,      A_s02           ;[13,4]

								||        ADD   .S2     B_p10,      B_s10,      B_s10           ;[13,4]

								||        ADD   .D2     B_p12,      B_s12,      B_s12           ;[13,4]

								||[ A_lj] MV    .D1X    B_y_sv2,    A_y_sv              ; Rewind y

								||[ A_lj] SUB   .L2     B_i,        1,          B_i             ; row pair finished


								          ADD   .L1     A_p00,      A_s00,      A_s00           ;[14,4]

								||        ADD   .L2     B_p11,      B_s11,      B_s11           ;[14,4]

								||        SHR   .S1     A_s02,      A_qs,       A_t02           ; sum >> qs, row 0 col 2

								||        SHR   .S2     B_s10,      B_qs,       B_t10           ; sum >> qs, row 1 col 0

								||[!A_c1o]LDNDW .D1T1   *A_y_sv,                A_y32:A_y10     ; preload 1st y row for the next tile

								||[ A_lj] ADDAH .D2     B_r_sv,     B_c2_2,     B_r_sv  ; Adv. r 2 rows


								          ADD   .L1     A_p01,      A_s01,      A_s01           ;[15,4]

								||        SHR   .S1     A_s00,      A_qs,       A_t00           ; sum >> qs, row 0 col 0

								||        SHR   .S2     B_s13,      B_qs,       B_t13           ; sum >> qs, row 1 col 3

								||[ B_lr] STH   .D2T2   B_t10,      *B_r  [B_c2]                ; store row 1, col 0

								||[ A_j ] SUB   .D1     A_j,        1,          A_j             ; col 0 accounted for


								;  NOTE(review): the row 0 / col 1 result is shifted with SSHVR on
								;  .M1 (.S1 is already used in this packet) while every other
								;  result uses SHR.  The two agree for qs in 0..31; behaviour for
								;  other qs values may differ for this one column -- confirm
								;  against the C64x instruction set reference if that matters.

								          SHR   .S1     A_s03,      A_qs,       A_t03           ; sum >> qs, row 0 col 3

								||        SHR   .S2     B_s11,      B_qs,       B_t11           ; sum >> qs, row 1 col 1

								||        STH   .D2T1   A_t00,      *B_r++[1]                   ; store row 0, col 0 (always written)

								||[!A_j ] ZERO  .L2     B_lr                                    ; no columns left: stop row 1 stores

								||        SSHVR .M1     A_s01,      A_qs,       A_t01           ; shift for row 0 col 1


								  [ B_lr] STH   .D2T2   B_t11,      *B_r  [B_c2]                ; store row 1, col 1

								||[ B_i ] B     .S1     loop_ij                                 ; row pairs remain: next tile

								||[!B_i ] RET   .S2     B3                                      ; all row pairs done: return


								  [ A_j ] STH   .D2T1   A_t01,      *B_r++[1]                   ; store row 0, col 1

								||[ A_j ] SUB   .D1     A_j,        1,          A_j             ; col 1 accounted for

								||[ B_lr] CMPGT .L2X    A_j,        1,          B_lr            ; keep row 1 stores only if col 2 exists

								||        SHR   .S2     B_s12,      B_qs,       B_t12           ; sum >> qs, row 1 col 2


								  [ B_lr] STH   .D2T2   B_t12,      *B_r  [B_c2]                ; store row 1, col 2


								  [ A_j ] STH   .D2T1   A_t02,      *B_r++[1]                   ; store row 0, col 2

								||[ A_j ] SUB   .D1     A_j,        1,          A_j             ; col 2 accounted for

								||[ B_lr] CMPGT .L2X    A_j,        1,          B_lr            ; keep row 1 stores only if col 3 exists


								  [ B_lr] STH   .D2T2   B_t13,      *B_r  [B_c2]                ; store row 1, col 3

								||[ A_lj] ADDAH .D1     A_x_sv,     A_c1_2,     A_x_sv  ; Adv. x 2 rows

								||[ A_j ] ADD   .L2     B_r,        2,          B_r             ; step past col 3 (written below at *-B_r[1])


								  [ A_j ] STH   .D2T1   A_t03,      *-B_r[1]                    ; store row 0, col 3

								||[!A_lj] SUB   .L1     A_j,        1,          A_j             ; col 3 accounted for

								||[ A_lj] MV    .S2     B_r_sv,     B_r                 ; Adv. r 2 rows

								||[ A_lj] MV    .D1     A_c2,       A_j                         ; new row pair: reload column count

								||[ B_i]  MV    .S1     A_x_sv,                 A_x             ; x back to the start of the row pair

								||[ B_i]  ADD   .L2X    A_y_sv,     B_c2_2,     B_y             ; B_y -> y row after the preloaded one


								* ========================================================================= *

								*   End of file:  dsp_mat_mul.asm                                           *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *