You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
424 lines
25 KiB
424 lines
25 KiB
;* ======================================================================== *;
|
|
;* TEXAS INSTRUMENTS, INC. *;
|
|
;* *;
|
|
;* DSPLIB DSP Signal Processing Library *;
|
|
;* *;
|
|
;* Release: Revision 1.04b *;
|
|
;* CVS Revision: 1.4 Sun Sep 29 03:32:25 2002 (UTC) *;
|
|
;* Snapshot date: 23-Oct-2003 *;
|
|
;* *;
|
|
;* This library contains proprietary intellectual property of Texas *;
|
|
;* Instruments, Inc. The library and its source code are protected by *;
|
|
;* various copyrights, and portions may also be protected by patents or *;
|
|
;* other legal protections. *;
|
|
;* *;
|
|
;* This software is licensed for use with Texas Instruments TMS320 *;
|
|
;* family DSPs. This license was provided to you prior to installing *;
|
|
;* the software. You may review this license by consulting the file *;
|
|
;* TI_license.PDF which accompanies the files in this library. *;
|
|
;* ------------------------------------------------------------------------ *;
|
|
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
|
|
;* All Rights Reserved. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
;* ======================================================================== *;
|
|
;* Assembler compatibility shim for assembling 4.30 and later code on *;
|
|
;* tools prior to 4.30. *;
|
|
;* ======================================================================== *;
|
|
|
|
.if $isdefed(".ASSEMBLER_VERSION")
|
|
.asg .ASSEMBLER_VERSION, $asmver
|
|
.else
|
|
.asg 0, $asmver
|
|
.endif
|
|
|
|
.if ($asmver < 430)
|
|
|
|
.asg B, CALL ; Function Call
|
|
.asg B, RET ; Return from a Function
|
|
.asg B, CALLRET ; Function call with Call / Ret chaining.
|
|
|
|
.if .TMS320C6400
|
|
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
|
|
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
|
|
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
|
|
.endif
|
|
|
|
.asg , .asmfunc ; .func equivalent for hand-assembly code
|
|
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
|
|
|
|
.endif
|
|
|
|
;* ======================================================================== *;
|
|
;* End of assembler compatibility shim. *;
|
|
;* ======================================================================== *;
|
|
|
|
|
|
* ========================================================================= *
|
|
* TEXAS INSTRUMENTS, INC. *
|
|
* *
|
|
* NAME *
|
|
* DSP_mat_mul -- Matrix Multiply, Little Endian *
|
|
* *
|
|
* REVISION DATE *
|
|
* 10-Feb-2002 *
|
|
* *
|
|
* USAGE *
|
|
* This routine is C-callable and can be called as: *
|
|
* *
|
|
* void DSP_mat_mul *
|
|
* ( *
|
|
* const short *restrict x, int r1, int c1, *
|
|
* const short *restrict y, int c2, *
|
|
* short *restrict r, *
|
|
* int qs *
|
|
* ); *
|
|
* *
|
|
* x == Pointer to r1 by c1 input matrix. *
|
|
* y == Pointer to c1 by c2 input matrix. *
|
|
* r == Pointer to r1 by c2 output matrix. *
|
|
* *
|
|
* r1 == Number of rows in x. *
|
|
* c1 == Number of columns in x. Also number of rows in y. *
|
|
* c2 == Number of columns in y. *
|
|
* *
|
|
* qs == Final right-shift to apply to the result. *
|
|
* *
|
|
* DESCRIPTION *
|
|
* This function computes the expression "r = x * y" for the matrices *
|
|
* x and y. The columnar dimension of x must match the row dimension *
|
|
* of y. The resulting matrix has the same number of rows as x and *
|
|
* the same number of columns as y. *
|
|
* *
|
|
* The values stored in the matrices are assumed to be fixed-point *
|
|
* or integer values. All intermediate sums are retained to 32-bit *
|
|
* precision. No rounding or overflow checking is performed. The *
|
|
* results are right-shifted by the user-specified amount, and then *
|
|
* truncated to 16 bits. *
|
|
* *
|
|
* This code is suitable for dense matrices. No optimizations are *
|
|
* made for sparse matrices. *
|
|
* *
|
|
* The following is a C description of the algorithm. The assembly *
|
|
* code may place restrictions on the inputs that the C code version *
|
|
* does not. These restrictions are noted under ASSUMPTIONS below. *
|
|
* *
|
|
* void DSP_mat_mul *
|
|
* ( *
|
|
* const short *restrict x, int r1, int c1, *
|
|
* const short *restrict y, int c2, *
|
|
* short *restrict r, *
|
|
* int qs *
|
|
* ) *
|
|
* { *
|
|
* int i, j, k; *
|
|
* int sum; *
|
|
* *
|
|
* /* ---------------------------------------------------- */ *
|
|
* /* Multiply each row in x by each column in y. The */ *
|
|
* /* product of row m in x and column n in y is placed */ *
|
|
* /* in position (m,n) in the result. */ *
|
|
* /* ---------------------------------------------------- */ *
|
|
* for (i = 0; i < r1; i++) *
|
|
* for (j = 0; j < c2; j++) *
|
|
* { *
|
|
* sum = 0; *
|
|
* *
|
|
* for (k = 0; k < c1; k++) *
|
|
* sum += x[k + i*c1] * y[j + k*c2]; *
|
|
* *
|
|
* r[j + i*c2] = sum >> qs; *
|
|
* } *
|
|
* } *
|
|
* *
|
|
* ASSUMPTIONS *
|
|
* The arrays 'x', 'y', and 'r' are stored in distinct arrays. That *
|
|
* is, in-place processing is not allowed. *
|
|
* *
|
|
* The input matrices have minimum dimensions of at least 1 row and *
|
|
* 1 column, and maximum dimensions of 32767 rows and 32767 columns. *
|
|
* *
|
|
* TECHNIQUES *
|
|
* The 'i' loop and 'k' loops are unrolled 2x. The 'j' loop is *
|
|
* unrolled 4x. For dimensions that are not multiples of the *
|
|
* various loops' unroll factors, this code calculates extra results *
|
|
* beyond the edges of the matrix. These extra results are *
|
|
* ultimately discarded. This allows the loops to be unrolled for *
|
|
* efficient operation on large matrices while not losing *
|
|
* flexibility. *
|
|
* *
|
|
* The outer two levels of loop nest are collapsed, further reducing *
|
|
* the overhead of the looping structure. *
|
|
* *
|
|
* NOTES *
|
|
* This code blocks interrupts during its innermost loop. Interrupts *
|
|
* are not blocked otherwise. As a result, interrupts can be blocked *
|
|
* for up to 0.25*c1' + 16 cycles at a time. *
|
|
* *
|
|
* When calculating the loop trip counts, the values of r1 and c1 *
|
|
* are rounded up to the next even value. The value of c2 is *
|
|
* rounded up to the next multiple of 4. This does not affect *
|
|
* the memory layout of the input or output matrices. *
|
|
* *
|
|
* MEMORY NOTE *
|
|
* The load instructions in the inner loop are predicated to avoid *
|
|
* significant over-fetching on the matrices. However, since the *
|
|
* outer loops are unrolled, this code may fetch approximately one *
|
|
* full row beyond the end of the 'x' matrix and approximately one *
|
|
* double-word beyond the end of the 'y' matrix. The values read *
|
|
* are discarded and do not affect the results of the computation. *
|
|
* *
|
|
* This code has no memory alignment requirements, as non-aligned *
|
|
* loads are used for accessing the inputs, and individual STHs are *
|
|
* used for writing the results. *
|
|
* *
|
|
* This is a LITTLE ENDIAN implementation. *
|
|
* *
|
|
* CYCLES *
|
|
* cycles = 0.25 * (r1'*c2'*c1') + 2.25 * (r1'*c2') + 11, where: *
|
|
* *
|
|
* r1' = 2 * ceil(r1/2.0) // r1 rounded up to next even *
|
|
* c1' = 2 * ceil(c1/2.0) // c1 rounded up to next even *
|
|
* c2' = 4 * ceil(c2/4.0); // c2 rounded up to next mult of 4 *
|
|
* *
|
|
* For r1= 1, c1= 1, c2= 1, cycles = 33. *
|
|
* For r1= 8, c1=20, c2= 8, cycles = 475. *
|
|
* For r1=12, c1=14, c2=18, cycles = 1391. *
|
|
* For r1=32, c1=32, c2=32, cycles = 10507. *
|
|
* *
|
|
* The cycle count includes 6 cycles of function call overhead. The *
|
|
* exact overhead seen by a given application will depend on the *
|
|
* compiler options used. *
|
|
* *
|
|
* CODESIZE *
|
|
* 416 bytes. *
|
|
* *
|
|
* ------------------------------------------------------------------------- *
|
|
* Copyright (c) 2003 Texas Instruments, Incorporated. *
|
|
* All Rights Reserved. *
|
|
* ========================================================================= *
|
|
|
|
|
|
.sect ".text:_mat_mul"
|
|
.global _DSP_mat_mul
|
|
_DSP_mat_mul:
|
|
* void DSP_mat_mul *
|
|
* ( *
|
|
* const short *restrict x, int r1, int c1, *
|
|
* const short *restrict y, int c2, *
|
|
* short *restrict r, *
|
|
* int qs *
|
|
* ); *
|
|
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
|
|
.asg A4, A_x ; const short *restrict x
|
|
.asg B4, B_r1 ; int r1
|
|
.asg A6, A_c1 ; int c1
|
|
.asg B6, B_y ; const short *restrict y
|
|
.asg A8, A_c2 ; int c2
|
|
.asg B8, B_r ; short *restrict r
|
|
.asg A10, A_qs ; int qs
|
|
|
|
.asg B1, B_i ; I loop counter
|
|
.asg A0, A_j ; J loop counter
|
|
.asg A7, A_k ; K loop counter
|
|
|
|
.asg A27, A_kc ; K loop counter reload
|
|
|
|
.asg B22, B_ic_ ; temp value
|
|
.asg A26, A_kc_ ; temp value
|
|
|
|
.asg A1, A_lj ; Flag: "Last J iteration"
|
|
.asg B2, B_lr ; Flag: "Last row of output"
|
|
.asg A2, A_c1o ; Flag: "c1 is odd"
|
|
.asg B27, B_r1o ; Flag: "r1 is odd"
|
|
.asg B0, B_p ; Prolog collapse predicate
|
|
.asg A1, A_e ; Epilog collapse predicate
|
|
|
|
.asg A5, A_c1_2 ; c1 * 2
|
|
.asg B26, B_c2 ; c2 (twin copy)
|
|
.asg B7, B_c2_2 ; c2 * 2
|
|
.asg B9, B_qs ; qs (twin copy)
|
|
|
|
.asg B5, B_na ; unused value
|
|
|
|
.asg A28, A_x_sv ; Saved copy of x (i-loop)
|
|
.asg A29, A_y_sv ; Saved copy of y (j-loop)
|
|
.asg B29, B_y_sv2 ; Saved copy of y (i-loop)
|
|
.asg B30, B_r_sv ; Saved copy of r (i-loop)
|
|
|
|
.asg A9, A_x00
|
|
.asg A16, A_y10
|
|
.asg A17, A_y32
|
|
.asg A18, A_s00
|
|
.asg A18, A_t00
|
|
.asg A19, A_s01
|
|
.asg A19, A_t01
|
|
.asg A20, A_s02
|
|
.asg A20, A_t02
|
|
.asg A21, A_s03
|
|
.asg A21, A_t03
|
|
.asg A22, A_p00
|
|
.asg A22, A_p01
|
|
.asg A22, A_p02
|
|
.asg A22, A_p03
|
|
|
|
.asg B4, B_x11
|
|
.asg B5, B_y00
|
|
.asg B16, B_y10
|
|
.asg B17, B_y32
|
|
.asg B18, B_s10
|
|
.asg B18, B_t10
|
|
.asg B19, B_s11
|
|
.asg B19, B_t11
|
|
.asg B20, B_s12
|
|
.asg B20, B_t12
|
|
.asg B21, B_s13
|
|
.asg B21, B_t13
|
|
.asg B22, B_p10
|
|
.asg B22, B_p11
|
|
.asg B22, B_y33
|
|
.asg B23, B_y22
|
|
.asg B24, B_p13
|
|
.asg B24, B_y11
|
|
.asg B25, B_p12
|
|
|
|
* ========================================================================= *
|
|
AND .S1 A_c1, 1, A_c1o ;[ 1,0]
|
|
|
|
MV .L2X A_c2, B_c2 ;[ 2,0]
|
|
|| ADD .L1 A_c1o, A_c1, A_kc_ ;[ 2,0]
|
|
|| AND .D2 B_r1, 1, B_r1o ;[ 2,0]
|
|
|| SHL .S2X A_c2, 1, B_c2_2 ;[ 3,0]
|
|
|
|
MV .L2 B_r, B_r_sv ;[ 3,0]
|
|
||[ A_c1o]SUB .S2 B_y, B_c2_2, B_y ;[ 4,0]
|
|
|| SUBAH .D1 A_x, A_c1o, A_x ;[ 3,0]
|
|
|| SHRU .S1 A_kc_, 1, A_kc ;[ 3,0]
|
|
|| ADD .D2 B_r1o, B_r1, B_ic_ ;[ 3,0]
|
|
|
|
ROTL .M2X A_qs, 0, B_qs ;[ 4,0]
|
|
|| MV .D1 A_c2, A_j ;[ 4,0]
|
|
|| SHL .S1 A_c1, 1, A_c1_2 ;[ 4,0]
|
|
||[!A_c1o]LDNDW .D2T1 *B_y, A_y32:A_y10
|
|
|
|
ROTL .M2 B_y, 0, B_y_sv2 ;[ 5,0]
|
|
|| ADD .L2 B_y, B_c2_2, B_y
|
|
|| MV .L1 A_x, A_x_sv ;[ 5,0]
|
|
|| SHRU .S2 B_ic_, 1, B_i ;[ 5,0]
|
|
|| MV .S1X B_y, A_y_sv ;[ 5,0]
|
|
* =========================== PIPE LOOP PROLOG ============================ *
|
|
loop_ij:
|
|
LDNDW .D2T2 *B_y++(B_c2_2), B_y32:B_y10 ;[ 1,1]
|
|
|| SUB .S1 A_kc, 2, A_k
|
|
|| ZERO .L2 B_s13:B_s12
|
|
|| ZERO .L1 A_s03:A_s02
|
|
|
|
LDNDW .D1T2 *A_x(A_c1_2), B_na:B_x11 ;[ 2,1]
|
|
|| B .S1 loop_k
|
|
|| ZERO .L2 B_s11:B_s10
|
|
||[ A_c1o]ZERO .L1 A_y32:A_y10
|
|
|
|
CMPGT .L1 A_k, -1, A_e ;[ 3,1]
|
|
|| LDNW .D1T1 *A_x++[1], A_x00 ;[ 3,1]
|
|
|| SHR .S1 A_s03:A_s02, 0, A_s01:A_s00
|
|
|| MVK 2, B_p
|
|
; ===== 2 prolog stages collapsed
|
|
* =========================== PIPE LOOP KERNEL ============================ *
|
|
loop_k:
|
|
[!B_p ] ADD .D1 A_p03, A_s03, A_s03 ;[12,1]
|
|
||[!B_p ] ADD .L2 B_p13, B_s13, B_s13 ;[12,1]
|
|
|| DOTP2 .M1X A_x00, B_y33, A_p03 ;[ 8,2]
|
|
|| DOTP2 .M2 B_x11, B_y33, B_p13 ;[ 8,2]
|
|
|| PACKH2.S2X B_y10, A_y10, B_y11 ;[ 8,2]
|
|
||[ A_e ] LDNDW .D2T1 *B_y++(B_c2_2), A_y32:A_y10 ;[ 4,3]
|
|
|
|
[!B_p ] ADD .S1 A_p02, A_s02, A_s02 ;[13,1]
|
|
||[!B_p ] ADD .S2 B_p10, B_s10, B_s10 ;[13,1]
|
|
||[!B_p ] ADD .L2 B_p12, B_s12, B_s12 ;[13,1]
|
|
|| DOTP2 .M1X A_x00, B_y22, A_p02 ;[ 9,2]
|
|
|| DOTP2 .M2 B_x11, B_y00, B_p10 ;[ 9,2]
|
|
||[ A_e ] LDNDW .D2T2 *B_y++(B_c2_2), B_y32:B_y10 ;[ 1,4]
|
|
loop_k2:
|
|
[!B_p ] ADD .L1 A_p00, A_s00, A_s00 ;[14,1]
|
|
||[!B_p ] ADD .D2 B_p11, B_s11, B_s11 ;[14,1]
|
|
|| BDEC .S1 loop_k, A_k ;[10,2]
|
|
|| DOTP2 .M1X A_x00, B_y00, A_p00 ;[10,2]
|
|
|| DOTP2 .M2 B_x11, B_y11, B_p11 ;[10,2]
|
|
|| PACK2 .L2X B_y32, A_y32, B_y22 ;[ 6,3]
|
|
|| PACKH2.S2X B_y32, A_y32, B_y33 ;[ 6,3]
|
|
||[ A_e ] LDNDW .D1T2 *A_x(A_c1_2), B_na:B_x11 ;[ 2,4]
|
|
loop_k3:
|
|
[ B_p ] SUB .L2 B_p, 1, B_p ;[15,1]
|
|
||[!B_p ] ADD .S1 A_p01, A_s01, A_s01 ;[15,1]
|
|
|| DOTP2 .M1X A_x00, B_y11, A_p01 ;[11,2]
|
|
|| DOTP2 .M2 B_x11, B_y22, B_p12 ;[ 7,3]
|
|
|| PACK2 .S2X B_y10, A_y10, B_y00 ;[ 7,3]
|
|
|| CMPGT .L1 A_k, -1, A_e ;[ 3,4]
|
|
||[ A_e ] LDNW .D1T1 *A_x++[1], A_x00 ;[ 3,4]
|
|
|
|
* =========================== PIPE LOOP EPILOG ============================ *
|
|
; ===== 2 epilog stages collapsed
|
|
ADD .D1 A_p03, A_s03, A_s03 ;[12,4]
|
|
|| ADD .D2 B_p13, B_s13, B_s13 ;[12,4]
|
|
|| CMPGT .L2 B_i, B_r1o, B_lr
|
|
|| CMPLT .L1 A_j, 5, A_lj
|
|
|| ADD .S1 A_y_sv, 8, A_y_sv ; Adv. y 4 cols.
|
|
|
|
ADD .L1 A_p02, A_s02, A_s02 ;[13,4]
|
|
|| ADD .S2 B_p10, B_s10, B_s10 ;[13,4]
|
|
|| ADD .D2 B_p12, B_s12, B_s12 ;[13,4]
|
|
||[ A_lj] MV .D1X B_y_sv2, A_y_sv ; Rewind y
|
|
||[ A_lj] SUB .L2 B_i, 1, B_i
|
|
|
|
ADD .L1 A_p00, A_s00, A_s00 ;[14,4]
|
|
|| ADD .L2 B_p11, B_s11, B_s11 ;[14,4]
|
|
|| SHR .S1 A_s02, A_qs, A_t02
|
|
|| SHR .S2 B_s10, B_qs, B_t10
|
|
||[!A_c1o]LDNDW .D1T1 *A_y_sv, A_y32:A_y10
|
|
||[ A_lj] ADDAH .D2 B_r_sv, B_c2_2, B_r_sv ; Adv. r 2 rows
|
|
|
|
ADD .L1 A_p01, A_s01, A_s01 ;[15,4]
|
|
|| SHR .S1 A_s00, A_qs, A_t00
|
|
|| SHR .S2 B_s13, B_qs, B_t13
|
|
||[ B_lr] STH .D2T2 B_t10, *B_r [B_c2]
|
|
||[ A_j ] SUB .D1 A_j, 1, A_j
|
|
|
|
SHR .S1 A_s03, A_qs, A_t03
|
|
|| SHR .S2 B_s11, B_qs, B_t11
|
|
|| STH .D2T1 A_t00, *B_r++[1]
|
|
||[!A_j ] ZERO .L2 B_lr
|
|
|| SSHVR .M1 A_s01, A_qs, A_t01
|
|
|
|
[ B_lr] STH .D2T2 B_t11, *B_r [B_c2]
|
|
||[ B_i ] B .S1 loop_ij
|
|
||[!B_i ] RET .S2 B3
|
|
|
|
[ A_j ] STH .D2T1 A_t01, *B_r++[1]
|
|
||[ A_j ] SUB .D1 A_j, 1, A_j
|
|
||[ B_lr] CMPGT .L2X A_j, 1, B_lr
|
|
|| SHR .S2 B_s12, B_qs, B_t12
|
|
|
|
[ B_lr] STH .D2T2 B_t12, *B_r [B_c2]
|
|
|
|
[ A_j ] STH .D2T1 A_t02, *B_r++[1]
|
|
||[ A_j ] SUB .D1 A_j, 1, A_j
|
|
||[ B_lr] CMPGT .L2X A_j, 1, B_lr
|
|
|
|
[ B_lr] STH .D2T2 B_t13, *B_r [B_c2]
|
|
||[ A_lj] ADDAH .D1 A_x_sv, A_c1_2, A_x_sv ; Adv. x 2 rows
|
|
||[ A_j ] ADD .L2 B_r, 2, B_r
|
|
|
|
[ A_j ] STH .D2T1 A_t03, *-B_r[1]
|
|
||[!A_lj] SUB .L1 A_j, 1, A_j
|
|
||[ A_lj] MV .S2 B_r_sv, B_r ; Adv. r 2 rows
|
|
||[ A_lj] MV .D1 A_c2, A_j
|
|
||[ B_i] MV .S1 A_x_sv, A_x
|
|
||[ B_i] ADD .L2X A_y_sv, B_c2_2, B_y
|
|
|
|
* ========================================================================= *
|
|
* End of file: dsp_mat_mul.asm *
|
|
* ------------------------------------------------------------------------- *
|
|
* Copyright (c) 2003 Texas Instruments, Incorporated. *
|
|
* All Rights Reserved. *
|
|
* ========================================================================= *
|
|
|