You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

424 lines
25 KiB

;* ======================================================================== *;
;* TEXAS INSTRUMENTS, INC. *;
;* *;
;* DSPLIB DSP Signal Processing Library *;
;* *;
;* Release: Revision 1.04b *;
;* CVS Revision: 1.4 Sun Sep 29 03:32:25 2002 (UTC) *;
;* Snapshot date: 23-Oct-2003 *;
;* *;
;* This library contains proprietary intellectual property of Texas *;
;* Instruments, Inc. The library and its source code are protected by *;
;* various copyrights, and portions may also be protected by patents or *;
;* other legal protections. *;
;* *;
;* This software is licensed for use with Texas Instruments TMS320 *;
;* family DSPs. This license was provided to you prior to installing *;
;* the software. You may review this license by consulting the file *;
;* TI_license.PDF which accompanies the files in this library. *;
;* ------------------------------------------------------------------------ *;
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
;* All Rights Reserved. *;
;* ======================================================================== *;
;* ======================================================================== *;
;* Assembler compatibility shim for assembling 4.30 and later code on *;
;* tools prior to 4.30. *;
;* ======================================================================== *;
.if $isdefed(".ASSEMBLER_VERSION")
.asg .ASSEMBLER_VERSION, $asmver
.else
.asg 0, $asmver
.endif
.if ($asmver < 430)
.asg B, CALL ; Function Call
.asg B, RET ; Return from a Function
.asg B, CALLRET ; Function call with Call / Ret chaining.
.if .TMS320C6400
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
.asg BNOP, CRNOP ; C64x Fn call w/, Call/Ret chaining via BNOP.
.endif
.asg , .asmfunc ; .func equivalent for hand-assembly code
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
.endif
;* ======================================================================== *;
;* End of assembler compatibility shim. *;
;* ======================================================================== *;
* ========================================================================= *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* DSP_mat_mul -- Matrix Multiply, Little Endian *
* *
* REVISION DATE *
* 10-Feb-2002 *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void DSP_mat_mul *
* ( *
* const short *restrict x, int r1, int c1, *
* const short *restrict y, int c2, *
* short *restrict r, *
* int qs *
* ); *
* *
* x == Pointer to r1 by c1 input matrix. *
* y == Pointer to c1 by c2 input matrix. *
* r == Pointer to r1 by c2 output matrix. *
* *
* r1 == Number of rows in x. *
* c1 == Number of columns in x. Also number of rows in y. *
* c2 == Number of columns in y. *
* *
* qs == Final right-shift to apply to the result. *
* *
* DESCRIPTION *
* This function computes the expression "r = x * y" for the matrices *
* x and y. The columnar dimension of x must match the row dimension *
* of y. The resulting matrix has the same number of rows as x and *
* the same number of columns as y. *
* *
* The values stored in the matrices are assumed to be fixed-point *
* or integer values. All intermediate sums are retained to 32-bit *
* precision. No rounding or overflow checking is performed. The *
* results are right-shifted by the user-specified amount, and then *
* truncated to 16 bits. *
* *
* This code is suitable for dense matrices. No optimizations are *
* made for sparse matrices. *
* *
* The following is a C description of the algorithm. The assembly *
* code may place restrictions on the inputs that the C code version *
* does not. These restrictions are noted under ASSUMPTIONS below. *
* *
* void DSP_mat_mul *
* ( *
* const short *restrict x, int r1, int c1, *
* const short *restrict y, int c2, *
* short *restrict r, *
* int qs *
* ) *
* { *
* int i, j, k; *
* int sum; *
* *
* /* ---------------------------------------------------- */ *
* /* Multiply each row in x by each column in y. The */ *
* /* product of row m in x and column n in y is placed */ *
* /* in position (m,n) in the result. */ *
* /* ---------------------------------------------------- */ *
* for (i = 0; i < r1; i++) *
* for (j = 0; j < c2; j++) *
* { *
* sum = 0; *
* *
* for (k = 0; k < c1; k++) *
* sum += x[k + i*c1] * y[j + k*c2]; *
* *
* r[j + i*c2] = sum >> qs; *
* } *
* } *
* *
* ASSUMPTIONS *
* The arrays 'x', 'y', and 'r' are stored in distinct arrays. That *
* is, in-place processing is not allowed. *
* *
* The input matrices have minimum dimensions of at least 1 row and *
* 1 column, and maximum dimensions of 32767 rows and 32767 columns. *
* *
* TECHNIQUES *
* The 'i' loop and 'k' loops are unrolled 2x. The 'j' loop is *
* unrolled 4x. For dimensions that are not multiples of the *
* various loops' unroll factors, this code calculates extra results *
* beyond the edges of the matrix. These extra results are *
* ultimately discarded. This allows the loops to be unrolled for *
* efficient operation on large matrices while not losing *
* flexibility. *
* *
* The outer two levels of loop nest are collapsed, further reducing *
* the overhead of the looping structure. *
* *
* NOTES *
* This code blocks interrupts during its innermost loop. Interrupts *
* are not blocked otherwise. As a result, interrupts can be blocked *
* for up to 0.25*c1' + 16 cycles at a time. *
* *
* When calculating the loop trip counts, the values of r1 and c1 *
* are rounded up to the next even value. The value of c2 is *
* rounded up to the next multiple of 4. This does not affect *
* the memory layout of the input or output matrices. *
* *
* MEMORY NOTE *
* The load instructions in the inner loop are predicated to avoid *
* significant over-fetching on the matrices. However, since the *
* outer loops are unrolled, this code may fetch approximately one *
* full row beyond the end of the 'x' matrix and approximately one *
* double-word beyond the end of the 'y' matrix. The values read *
* are discarded and do not affect the results of the computation. *
* *
* This code has no memory alignment requirements, as non-aligned *
* loads are used for accessing the inputs, and individual STHs are *
* used for writing the results. *
* *
* This is a LITTLE ENDIAN implementation. *
* *
* CYCLES *
* cycles = 0.25 * (r1'*c2'*c1') + 2.25 * (r1'*c2') + 11, where: *
* *
* r1' = 2 * ceil(r1/2.0) // r1 rounded up to next even *
* c1' = 2 * ceil(c1/2.0) // c1 rounded up to next even *
* c2' = 4 * ceil(c2/4.0); // c2 rounded up to next mult of 4 *
* *
* For r1= 1, c1= 1, c2= 1, cycles = 33. *
* For r1= 8, c1=20, c2= 8, cycles = 475. *
* For r1=12, c1=14, c2=18, cycles = 1391. *
* For r1=32, c1=32, c2=32, cycles = 10507. *
* *
* The cycle count includes 6 cycles of function call overhead. The *
* exact overhead seen by a given application will depend on the *
* compiler options used. *
* *
* CODESIZE *
* 416 bytes. *
* *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
.sect ".text:_mat_mul"
.global _DSP_mat_mul
_DSP_mat_mul:
* void DSP_mat_mul *
* ( *
* const short *restrict x, int r1, int c1, *
* const short *restrict y, int c2, *
* short *restrict r, *
* int qs *
* ); *
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
.asg A4, A_x ; const short *restrict x
.asg B4, B_r1 ; int r1
.asg A6, A_c1 ; int c1
.asg B6, B_y ; const short *restrict y
.asg A8, A_c2 ; int c2
.asg B8, B_r ; short *restrict r
.asg A10, A_qs ; int qs
.asg B1, B_i ; I loop counter
.asg A0, A_j ; J loop counter
.asg A7, A_k ; K loop counter
.asg A27, A_kc ; K loop counter reload
.asg B22, B_ic_ ; temp value
.asg A26, A_kc_ ; temp value
.asg A1, A_lj ; Flag: "Last J iteration"
.asg B2, B_lr ; Flag: "Last row of output"
.asg A2, A_c1o ; Flag: "c1 is odd"
.asg B27, B_r1o ; Flag: "r1 is odd"
.asg B0, B_p ; Prolog collapse predicate
.asg A1, A_e ; Epilog collapse predicate
.asg A5, A_c1_2 ; c1 * 2
.asg B26, B_c2 ; c2 (twin copy)
.asg B7, B_c2_2 ; c2 * 2
.asg B9, B_qs ; qs (twin copy)
.asg B5, B_na ; unused value
.asg A28, A_x_sv ; Saved copy of x (i-loop)
.asg A29, A_y_sv ; Saved copy of y (j-loop)
.asg B29, B_y_sv2 ; Saved copy of y (i-loop)
.asg B30, B_r_sv ; Saved copy of r (i-loop)
.asg A9, A_x00
.asg A16, A_y10
.asg A17, A_y32
.asg A18, A_s00
.asg A18, A_t00
.asg A19, A_s01
.asg A19, A_t01
.asg A20, A_s02
.asg A20, A_t02
.asg A21, A_s03
.asg A21, A_t03
.asg A22, A_p00
.asg A22, A_p01
.asg A22, A_p02
.asg A22, A_p03
.asg B4, B_x11
.asg B5, B_y00
.asg B16, B_y10
.asg B17, B_y32
.asg B18, B_s10
.asg B18, B_t10
.asg B19, B_s11
.asg B19, B_t11
.asg B20, B_s12
.asg B20, B_t12
.asg B21, B_s13
.asg B21, B_t13
.asg B22, B_p10
.asg B22, B_p11
.asg B22, B_y33
.asg B23, B_y22
.asg B24, B_p13
.asg B24, B_y11
.asg B25, B_p12
* ========================================================================= *
AND .S1 A_c1, 1, A_c1o ;[ 1,0]
MV .L2X A_c2, B_c2 ;[ 2,0]
|| ADD .L1 A_c1o, A_c1, A_kc_ ;[ 2,0]
|| AND .D2 B_r1, 1, B_r1o ;[ 2,0]
|| SHL .S2X A_c2, 1, B_c2_2 ;[ 3,0]
MV .L2 B_r, B_r_sv ;[ 3,0]
||[ A_c1o]SUB .S2 B_y, B_c2_2, B_y ;[ 4,0]
|| SUBAH .D1 A_x, A_c1o, A_x ;[ 3,0]
|| SHRU .S1 A_kc_, 1, A_kc ;[ 3,0]
|| ADD .D2 B_r1o, B_r1, B_ic_ ;[ 3,0]
ROTL .M2X A_qs, 0, B_qs ;[ 4,0]
|| MV .D1 A_c2, A_j ;[ 4,0]
|| SHL .S1 A_c1, 1, A_c1_2 ;[ 4,0]
||[!A_c1o]LDNDW .D2T1 *B_y, A_y32:A_y10
ROTL .M2 B_y, 0, B_y_sv2 ;[ 5,0]
|| ADD .L2 B_y, B_c2_2, B_y
|| MV .L1 A_x, A_x_sv ;[ 5,0]
|| SHRU .S2 B_ic_, 1, B_i ;[ 5,0]
|| MV .S1X B_y, A_y_sv ;[ 5,0]
* =========================== PIPE LOOP PROLOG ============================ *
loop_ij:
LDNDW .D2T2 *B_y++(B_c2_2), B_y32:B_y10 ;[ 1,1]
|| SUB .S1 A_kc, 2, A_k
|| ZERO .L2 B_s13:B_s12
|| ZERO .L1 A_s03:A_s02
LDNDW .D1T2 *A_x(A_c1_2), B_na:B_x11 ;[ 2,1]
|| B .S1 loop_k
|| ZERO .L2 B_s11:B_s10
||[ A_c1o]ZERO .L1 A_y32:A_y10
CMPGT .L1 A_k, -1, A_e ;[ 3,1]
|| LDNW .D1T1 *A_x++[1], A_x00 ;[ 3,1]
|| SHR .S1 A_s03:A_s02, 0, A_s01:A_s00
|| MVK 2, B_p
; ===== 2 prolog stages collapsed
* =========================== PIPE LOOP KERNEL ============================ *
loop_k:
[!B_p ] ADD .D1 A_p03, A_s03, A_s03 ;[12,1]
||[!B_p ] ADD .L2 B_p13, B_s13, B_s13 ;[12,1]
|| DOTP2 .M1X A_x00, B_y33, A_p03 ;[ 8,2]
|| DOTP2 .M2 B_x11, B_y33, B_p13 ;[ 8,2]
|| PACKH2.S2X B_y10, A_y10, B_y11 ;[ 8,2]
||[ A_e ] LDNDW .D2T1 *B_y++(B_c2_2), A_y32:A_y10 ;[ 4,3]
[!B_p ] ADD .S1 A_p02, A_s02, A_s02 ;[13,1]
||[!B_p ] ADD .S2 B_p10, B_s10, B_s10 ;[13,1]
||[!B_p ] ADD .L2 B_p12, B_s12, B_s12 ;[13,1]
|| DOTP2 .M1X A_x00, B_y22, A_p02 ;[ 9,2]
|| DOTP2 .M2 B_x11, B_y00, B_p10 ;[ 9,2]
||[ A_e ] LDNDW .D2T2 *B_y++(B_c2_2), B_y32:B_y10 ;[ 1,4]
loop_k2:
[!B_p ] ADD .L1 A_p00, A_s00, A_s00 ;[14,1]
||[!B_p ] ADD .D2 B_p11, B_s11, B_s11 ;[14,1]
|| BDEC .S1 loop_k, A_k ;[10,2]
|| DOTP2 .M1X A_x00, B_y00, A_p00 ;[10,2]
|| DOTP2 .M2 B_x11, B_y11, B_p11 ;[10,2]
|| PACK2 .L2X B_y32, A_y32, B_y22 ;[ 6,3]
|| PACKH2.S2X B_y32, A_y32, B_y33 ;[ 6,3]
||[ A_e ] LDNDW .D1T2 *A_x(A_c1_2), B_na:B_x11 ;[ 2,4]
loop_k3:
[ B_p ] SUB .L2 B_p, 1, B_p ;[15,1]
||[!B_p ] ADD .S1 A_p01, A_s01, A_s01 ;[15,1]
|| DOTP2 .M1X A_x00, B_y11, A_p01 ;[11,2]
|| DOTP2 .M2 B_x11, B_y22, B_p12 ;[ 7,3]
|| PACK2 .S2X B_y10, A_y10, B_y00 ;[ 7,3]
|| CMPGT .L1 A_k, -1, A_e ;[ 3,4]
||[ A_e ] LDNW .D1T1 *A_x++[1], A_x00 ;[ 3,4]
* =========================== PIPE LOOP EPILOG ============================ *
; ===== 2 epilog stages collapsed
ADD .D1 A_p03, A_s03, A_s03 ;[12,4]
|| ADD .D2 B_p13, B_s13, B_s13 ;[12,4]
|| CMPGT .L2 B_i, B_r1o, B_lr
|| CMPLT .L1 A_j, 5, A_lj
|| ADD .S1 A_y_sv, 8, A_y_sv ; Adv. y 4 cols.
ADD .L1 A_p02, A_s02, A_s02 ;[13,4]
|| ADD .S2 B_p10, B_s10, B_s10 ;[13,4]
|| ADD .D2 B_p12, B_s12, B_s12 ;[13,4]
||[ A_lj] MV .D1X B_y_sv2, A_y_sv ; Rewind y
||[ A_lj] SUB .L2 B_i, 1, B_i
ADD .L1 A_p00, A_s00, A_s00 ;[14,4]
|| ADD .L2 B_p11, B_s11, B_s11 ;[14,4]
|| SHR .S1 A_s02, A_qs, A_t02
|| SHR .S2 B_s10, B_qs, B_t10
||[!A_c1o]LDNDW .D1T1 *A_y_sv, A_y32:A_y10
||[ A_lj] ADDAH .D2 B_r_sv, B_c2_2, B_r_sv ; Adv. r 2 rows
ADD .L1 A_p01, A_s01, A_s01 ;[15,4]
|| SHR .S1 A_s00, A_qs, A_t00
|| SHR .S2 B_s13, B_qs, B_t13
||[ B_lr] STH .D2T2 B_t10, *B_r [B_c2]
||[ A_j ] SUB .D1 A_j, 1, A_j
SHR .S1 A_s03, A_qs, A_t03
|| SHR .S2 B_s11, B_qs, B_t11
|| STH .D2T1 A_t00, *B_r++[1]
||[!A_j ] ZERO .L2 B_lr
|| SSHVR .M1 A_s01, A_qs, A_t01
[ B_lr] STH .D2T2 B_t11, *B_r [B_c2]
||[ B_i ] B .S1 loop_ij
||[!B_i ] RET .S2 B3
[ A_j ] STH .D2T1 A_t01, *B_r++[1]
||[ A_j ] SUB .D1 A_j, 1, A_j
||[ B_lr] CMPGT .L2X A_j, 1, B_lr
|| SHR .S2 B_s12, B_qs, B_t12
[ B_lr] STH .D2T2 B_t12, *B_r [B_c2]
[ A_j ] STH .D2T1 A_t02, *B_r++[1]
||[ A_j ] SUB .D1 A_j, 1, A_j
||[ B_lr] CMPGT .L2X A_j, 1, B_lr
[ B_lr] STH .D2T2 B_t13, *B_r [B_c2]
||[ A_lj] ADDAH .D1 A_x_sv, A_c1_2, A_x_sv ; Adv. x 2 rows
||[ A_j ] ADD .L2 B_r, 2, B_r
[ A_j ] STH .D2T1 A_t03, *-B_r[1]
||[!A_lj] SUB .L1 A_j, 1, A_j
||[ A_lj] MV .S2 B_r_sv, B_r ; Adv. r 2 rows
||[ A_lj] MV .D1 A_c2, A_j
||[ B_i] MV .S1 A_x_sv, A_x
||[ B_i] ADD .L2X A_y_sv, B_c2_2, B_y
* ========================================================================= *
* End of file: dsp_mat_mul.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *