You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

206 lines
13 KiB

;* ======================================================================== *;
;* TEXAS INSTRUMENTS, INC. *;
;* *;
;* DSPLIB DSP Signal Processing Library *;
;* *;
;* Release: Revision 1.04b *;
;* CVS Revision: 1.18 Tue Oct 14 19:58:57 2003 (UTC) *;
;* Snapshot date: 23-Oct-2003 *;
;* *;
;* This library contains proprietary intellectual property of Texas *;
;* Instruments, Inc. The library and its source code are protected by *;
;* various copyrights, and portions may also be protected by patents or *;
;* other legal protections. *;
;* *;
;* This software is licensed for use with Texas Instruments TMS320 *;
;* family DSPs. This license was provided to you prior to installing *;
;* the software. You may review this license by consulting the file *;
;* TI_license.PDF which accompanies the files in this library. *;
;* ------------------------------------------------------------------------ *;
;* Copyright (C) 2003 Texas Instruments, Incorporated. *;
;* All Rights Reserved. *;
;* ======================================================================== *;
;* ======================================================================== *;
;* Assembler compatibility shim for assembling 4.30 and later code on *;
;* tools prior to 4.30. *;
;* ======================================================================== *;
; Capture the assembler version in the substitution symbol $asmver.  When the
; tools do not define .ASSEMBLER_VERSION, $asmver is set to 0 so that the
; "< 430" test below selects the compatibility aliases.
.if $isdefed(".ASSEMBLER_VERSION")
.asg .ASSEMBLER_VERSION, $asmver
.else
.asg 0, $asmver
.endif
; Only when $asmver is below 430: map the newer call/return mnemonics onto
; plain branch mnemonics and make .asmfunc/.endasmfunc expand to nothing.
.if ($asmver < 430)
.asg B, CALL ; Function Call
.asg B, RET ; Return from a Function
.asg B, CALLRET ; Function call with Call / Ret chaining.
; The *NOP forms are aliased to BNOP, and only when .TMS320C6400 is true.
.if .TMS320C6400
.asg BNOP, CALLNOP ; C64x BNOP as a Fn. Call
.asg BNOP, RETNOP ; C64x BNOP as a Fn. Return
.asg BNOP, CRNOP ; C64x Fn. call w/ Call/Ret chaining via BNOP.
.endif
; Both symbols are assigned an empty string (nothing precedes the comma).
.asg , .asmfunc ; .func equivalent for hand-assembly code
.asg , .endasmfunc ; .endfunc equivalent for hand-assembly code
.endif
;* ======================================================================== *;
;* End of assembler compatibility shim. *;
;* ======================================================================== *;
* ========================================================================= *
* *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* DSP_dotprod *
* *
* REVISION DATE *
* 10-Oct-2003 *
* *
* USAGE *
* This routine is C callable, and has the following C prototype: *
* *
* int DSP_dotprod *
* ( *
* const short *m, /* Pointer to first vector */ *
* const short *n, /* Pointer to second vector */ *
* int count /* Length of vectors. */ *
* ); *
* *
* This routine returns the dot product as its return value. *
* *
* DESCRIPTION *
* The "DSP_dotprod" function implements a dot product of two input *
* vectors, returning the scalar result. Each element of the *
* first array is multiplied with the corresponding element of the *
* second array, and the products are summed. The sum is returned. *
* *
* int DSP_dotprod *
* ( *
* const short *m, /* Pointer to first vector */ *
* const short *n, /* Pointer to second vector */ *
* int count /* Length of vectors. */ *
* ) *
* { *
* int i, sum = 0; *
* *
* for (i = 0; i < count; i++) *
* sum += m[i] * n[i]; *
* *
* return sum; *
* } *
* *
* The above C code is a general implementation without *
* restrictions. The assembly code has some restrictions, as *
* noted below. *
* *
* TECHNIQUES *
* The code is unrolled 4 times to enable full memory and multiplier *
* bandwidth to be utilized. *
* *
* Interrupts are masked by branch delay slots only. *
* *
* Prolog collapsing has been performed to reduce codesize. *
* *
* ASSUMPTIONS *
* The input length is a multiple of 4 and greater than 0. *
* *
* The input data and coefficients are stored on double-word *
* aligned boundaries. *
* *
* This code is not interruptible. Interrupts are masked by *
* branch delay slots during the entire duration of this *
* function. *
* *
* MEMORY NOTE *
* To avoid bank conflicts, the input arrays 'm' and 'n' must *
* be offset by 4 half-words (8 bytes). *
* *
* The code is ENDIAN NEUTRAL. *
* *
* CODESIZE *
* 160 bytes *
* *
* CYCLES *
* cycles = count/4 + 16 *
* For count = 720, cycles = 196. *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
* ------------------------------------------------------------------------- *
* _DSP_dotprod
*
* Sums the products of corresponding elements of two vectors, four elements
* per kernel pass: each pass issues one LDDW per vector, two DOTP2s and two
* accumulating ADDs, all in a single execute packet.
*
* In:   A4 = m (first vector), B4 = n (second vector), A6 = count
* Out:  A4 = A_sum + B_sum (the two partial sums combined in the epilog)
* Ret:  branches to the address in B3
* Registers written: A0, A4, A8, A9, A16, A17, B4, B6, B7, B16, B17
*       (A4 and B4 are post-incremented by every LDDW that executes).
*
* In the register notes below, "a" refers to vector m and "b" to vector n.
*
* NOTE(review): the loads in the prolog/kernel are unconditional for the
* first two packets and afterwards run for as long as A_i is non-zero.  By
* my count that issues a few more doubleword loads than count/4, i.e. the
* routine appears to read a little past the end of m and n (the extra data
* is never accumulated).  Confirm against the BDEC/LDDW timing in the CPU
* reference before relying on this either way.
* ------------------------------------------------------------------------- *
.sect ".text:_dotprod"
.global _DSP_dotprod
_DSP_dotprod:
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
.asg A4, A_m ; pointer to vector m
.asg B4, B_n ; pointer to vector n
.asg A6, A_count ; number of elements in each vector
.asg A0, A_i ; loop count (count >> 2); also the predicate register
.asg A16, A_sum ; partial sum a (accumulates A_prod)
.asg A17, A_prod ; DOTP2 of the reg1 pair: a[i+2]*b[i+2]+a[i+3]*b[i+3]
.asg B16, B_sum ; partial sum b (accumulates B_prod)
.asg B17, B_prod ; DOTP2 of the reg0 pair: a[i]*b[i]+a[i+1]*b[i+1]
.asg A9, A_reg1 ; elements a[i+3] a[i+2]
.asg A8, A_reg0 ; elements a[i+1] a[i]
.asg B7, B_reg1 ; elements b[i+3] b[i+2]
.asg B6, B_reg0 ; elements b[i+1] b[i]
.asg A4, A_sumt ; total sum a + b returned to caller (same register as A_m)
* ========================== PIPE LOOP PROLOG ============================= *
; Packet 1: first pair of loads and the first unconditional branch to 'loop'.
B .S2 loop ; prime loop
|| LDDW .D2T2 *B_n++, B_reg1:B_reg0 ; load b[i+3]...b[i]
|| LDDW .D1T1 *A_m++, A_reg1:A_reg0 ; load a[i+3]...a[i]
; Packet 2: second pair of loads; A_i = count >> 2 (one pass per 4 elements);
; both accumulators and both product registers are cleared.
B .S2 loop ; prime loop
|| LDDW .D2T2 *B_n++, B_reg1:B_reg0 ; load b[i+3]...b[i]
|| LDDW .D1T1 *A_m++, A_reg1:A_reg0 ; load a[i+3]...a[i]
|| SHRU .S1 A_count, 2, A_i ; calc loop count
|| ZERO .L1 A_prod:A_sum
|| ZERO .L2 B_prod:B_sum
; Packet 3: last unconditional branch; from here on the loads are predicated
; on A_i.  The ZEROs repeat the clearing done in packet 2.
B .S1 loop ; prime loop
||[A_i] LDDW .D2T2 *B_n++, B_reg1:B_reg0 ; load b[i+3]...b[i]
||[A_i] LDDW .D1T1 *A_m++, A_reg1:A_reg0 ; load a[i+3]...a[i]
|| ZERO .L1 A_prod:A_sum ; added for branch-
|| ZERO .L2 B_prod:B_sum ; target-not-span
; Packets 4 and 5: the branch becomes BDEC on A_i, itself predicated on A_i,
; so loop-back branches are only issued while A_i is non-zero.
[A_i] BDEC .S1 loop, A_i ; prime loop
||[A_i] LDDW .D2T2 *B_n++, B_reg1:B_reg0 ; load b[i+3]...b[i]
||[A_i] LDDW .D1T1 *A_m++, A_reg1:A_reg0 ; load a[i+3]...a[i]
|| ZERO .L1 A_prod:A_sum ; added for branch-
|| ZERO .L2 B_prod:B_sum ; target-not-span
[A_i] BDEC .S1 loop, A_i ; prime loop
||[A_i] LDDW .D2T2 *B_n++, B_reg1:B_reg0 ; load b[i+3]...b[i]
||[A_i] LDDW .D1T1 *A_m++, A_reg1:A_reg0 ; load a[i+3]...a[i]
|| ZERO .L1 A_prod:A_sum ; added for branch-
|| ZERO .L2 B_prod:B_sum ; target-not-span
* ========================== PIPE LOOP KERNEL ============================= *
; One execute packet per pass: accumulate the product registers into the
; partial sums, issue the next pair of loads, start two new DOTP2s (each uses
; one cross-path operand) and conditionally branch back to 'loop'.
; The product registers were zeroed in the prolog, presumably so that ADDs
; which run before the first DOTP2 result arrives add 0 -- TODO confirm
; against the DOTP2/LDDW latencies.
loop:
ADD .L2 B_sum, B_prod, B_sum ; sum += productb
|| ADD .L1 A_sum, A_prod, A_sum ; sum += producta
||[A_i] LDDW .D2T2 *B_n++, B_reg1:B_reg0 ; load b[i+3]...b[i]
||[A_i] LDDW .D1T1 *A_m++, A_reg1:A_reg0 ; load a[i+3]...a[i]
|| DOTP2 .M2X A_reg0, B_reg0, B_prod ; a[i]*b[i]+a[i+1]*b[i+1]
|| DOTP2 .M1X A_reg1, B_reg1, A_prod ; a[i+2]*b[i+2]+a[i+3]*b[i+3]
||[A_i] BDEC .S1 loop, A_i ; iterate loop
* ========================== PIPE LOOP EPILOG ============================= *
; Reached by fall-through once no loop-back branch is pending.  The final ADD
; sits after RETNOP and before the point marked "Branch Occurs", so the two
; partial sums are combined into A_sumt (A4) before control returns.
RETNOP.S2 B3, 4 ; Return to caller
ADD .L1X A_sum, B_sum, A_sumt ; final sum = A_sum + B_sum
; ===== Branch Occurs
* ========================================================================= *
* End of file: dsp_dotprod.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *