;* ======================================================================== *;
;*  TEXAS INSTRUMENTS, INC.                                                 *;
;*                                                                          *;
;*  DSPLIB  DSP Signal Processing Library                                   *;
;*                                                                          *;
;*      Release:        Revision 1.04b                                      *;
;*      CVS Revision:   1.18    Tue Oct 14 19:58:57 2003 (UTC)              *;
;*      Snapshot date:  23-Oct-2003                                         *;
;*                                                                          *;
;*  This library contains proprietary intellectual property of Texas        *;
;*  Instruments, Inc.  The library and its source code are protected by     *;
;*  various copyrights, and portions may also be protected by patents or    *;
;*  other legal protections.                                                *;
;*                                                                          *;
;*  This software is licensed for use with Texas Instruments TMS320         *;
;*  family DSPs.  This license was provided to you prior to installing      *;
;*  the software.  You may review this license by consulting the file       *;
;*  TI_license.PDF which accompanies the files in this library.             *;
;* ------------------------------------------------------------------------ *;
;*           Copyright (C) 2003 Texas Instruments, Incorporated.            *;
;*                           All Rights Reserved.                           *;
;* ======================================================================== *;


;* ======================================================================== *;
;*  Assembler compatibility shim for assembling 4.30 and later code on      *;
;*  tools prior to 4.30.                                                    *;
;* ======================================================================== *;

        ; Capture the assembler version in $asmver.  When the tool does not
        ; define .ASSEMBLER_VERSION at all, fall back to 0 so that the
        ; "older than 4.30" branch below is taken.
        .if $isdefed(".ASSEMBLER_VERSION")
        .asg    .ASSEMBLER_VERSION, $asmver
        .else
        .asg    0, $asmver
        .endif

        ; On pre-4.30 tools, alias the call/return mnemonics used by the
        ; library sources to the plain branch instructions.
        .if ($asmver < 430)

        .asg    B,    CALL     ; Function Call
        .asg    B,    RET      ; Return from a Function
        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.

        ; The BNOP-based aliases are only set up when assembling for C64x.
        .if .TMS320C6400
        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call
        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return
        .asg    BNOP, CRNOP    ; C64x Fn call w/, Call/Ret chaining via BNOP.
        .endif

        ; NOTE(review): these two assign an empty substitution string,
        ; presumably so that the function-boundary directives vanish on
        ; tools that do not know them -- confirm against the tools manual.
        .asg    , .asmfunc     ; .func equivalent for hand-assembly code
        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code

        .endif

;* ======================================================================== *;
;*  End of assembler compatibility shim.
*; ;* ======================================================================== *; * ========================================================================= * * * * TEXAS INSTRUMENTS, INC. * * * * NAME * * DSP_dotprod * * * * REVISION DATE * * 10-Oct-2003 * * * * USAGE * * This routine is C callable, and has the following C prototype: * * * * int DSP_dotprod * * ( * * const short *m, /* Pointer to first vector */ * * const short *n, /* Pointer to second vector */ * * int count /* Length of vectors. */ * * ); * * * * This routine returns the dot product as its return value. * * * * DESCRIPTION * * The "DSP_dotprod" function implements a dot product of two input * * vectors, returning the scalar result. Each element of the * * first array is multiplied with the corresponding element of the * * second array, and the products are summed. The sum is returned. * * * * int DSP_dotprod * * ( * * const short *m, /* Pointer to first vector */ * * const short *n, /* Pointer to second vector */ * * int count /* Length of vectors. */ * * ) * * { * * int i, sum = 0; * * * * for (i = 0; i < count; i++) * * sum += m[i] * n[i]; * * * * return sum; * * } * * * * The above C code is a general implementation without * * restrictions. The assembly code has some restrictions, as * * noted below. * * * * TECHNIQUES * * The code is unrolled 4 times to enable full memory and multiplier * * bandwidth to be utilized. * * * * Interrupts are masked by branch delay slots only. * * * * Prolog collapsing has been performed to reduce codesize. * * * * ASSUMPTIONS * * The input length is a multiple of 4 and greater than 0. * * * * The input data and coefficients are stored on double word * * aligned boundaries. * * * * This code is not interruptible. Interrupts are masked by * * branch delay slots during the entire duration of this * * function. * * * * MEMORY NOTE * * To avoid bank conflicts, the input arrays 'm' and 'n' must * * be offset by 4 half-words (8 bytes). 
*                                                                           *
*      The code is ENDIAN NEUTRAL.                                          *
*                                                                           *
*  CODESIZE                                                                 *
*      160 bytes                                                            *
*                                                                           *
*  CYCLES                                                                   *
*      cycles = count/4 + 16                                                *
*      For count = 720, cycles = 196.                                       *
* ------------------------------------------------------------------------- *
*            Copyright (c) 2003 Texas Instruments, Incorporated.            *
*                           All Rights Reserved.                            *
* ========================================================================= *

; DSP_dotprod entry point.
;   In  : A4 = m, B4 = n, A6 = count   (see the register map below)
;   Out : A4 = A_sum + B_sum           (written by the final ADD)
;   Returns through B3.  Registers written: A0, A4, A8, A9, A16, A17,
;   B4, B6, B7, B16, B17 (both pointers are post-incremented in place).

        .sect ".text:_dotprod"
        .global _DSP_dotprod
_DSP_dotprod:
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
        .asg            A4,         A_m     ; pointer to vector m
        .asg            B4,         B_n     ; pointer to vector n
        .asg            A6,         A_count ; number of elements in each vector
        .asg            A0,         A_i     ; loop count (count / 4), also the
                                            ; predicate for loads and BDEC
        .asg            A16,        A_sum   ; partial sum a
        .asg            A17,        A_prod  ; DOTP2(A_reg1, B_reg1):
                                            ; a[i+2]*b[i+2]+a[i+3]*b[i+3]
        .asg            B16,        B_sum   ; partial sum b
        .asg            B17,        B_prod  ; DOTP2(A_reg0, B_reg0):
                                            ; a[i]*b[i]+a[i+1]*b[i+1]
        .asg            A9,         A_reg1  ; elements a[i+3] a[i+2]
        .asg            A8,         A_reg0  ; elements a[i+1] a[i]
        .asg            B7,         B_reg1  ; elements b[i+3] b[i+2]
        .asg            B6,         B_reg0  ; elements b[i+1] b[i]
        .asg            A4,         A_sumt  ; total sum a + b returned to caller
                                            ; (same register as A_m)
* ========================== PIPE LOOP PROLOG ============================= *
; Five consecutive execute packets.  Each one issues a branch to `loop` and
; a pair of doubleword loads (four halfwords from each vector).
;   packets 1-2 : unconditional B, unconditional loads
;   packet  3   : unconditional B, loads predicated on A_i
;   packets 4-5 : BDEC and loads both predicated on A_i
; A_i = count >> 2 is computed in packet 2.  The sum/product pairs are
; zeroed in packets 2-5.
;
; NOTE(review): the loads in packets 1 and 2 are unconditional, so for
; count == 4 the second pair already reads 8 bytes beyond each vector.
; Confirm that callers can tolerate reads past the end of m and n.
;
; NOTE(review): the repeated ZEROs carry the original remark "added for
; branch-target-not-span"; presumably they pad the packets so that `loop`
; does not straddle a fetch packet -- confirm before removing them.

        B       .S2     loop                        ; prime loop
||      LDDW    .D2T2   *B_n++,     B_reg1:B_reg0   ; load b[i+3]...b[i]
||      LDDW    .D1T1   *A_m++,     A_reg1:A_reg0   ; load a[i+3]...a[i]

        B       .S2     loop                        ; prime loop
||      LDDW    .D2T2   *B_n++,     B_reg1:B_reg0   ; load b[i+3]...b[i]
||      LDDW    .D1T1   *A_m++,     A_reg1:A_reg0   ; load a[i+3]...a[i]
||      SHRU    .S1     A_count,    2,      A_i     ; calc loop count
||      ZERO    .L1     A_prod:A_sum                ; clear A-side accumulators
||      ZERO    .L2     B_prod:B_sum                ; clear B-side accumulators

        B       .S1     loop                        ; prime loop
||[A_i] LDDW    .D2T2   *B_n++,     B_reg1:B_reg0   ; load b[i+3]...b[i]
||[A_i] LDDW    .D1T1   *A_m++,     A_reg1:A_reg0   ; load a[i+3]...a[i]
||      ZERO    .L1     A_prod:A_sum                ; added for branch-
||      ZERO    .L2     B_prod:B_sum                ; target-not-span

  [A_i] BDEC    .S1     loop,       A_i             ; prime loop
||[A_i] LDDW    .D2T2   *B_n++,     B_reg1:B_reg0   ; load b[i+3]...b[i]
||[A_i] LDDW    .D1T1   *A_m++,     A_reg1:A_reg0   ; load a[i+3]...a[i]
||      ZERO    .L1     A_prod:A_sum                ; added for branch-
||      ZERO    .L2     B_prod:B_sum                ; target-not-span

  [A_i] BDEC    .S1     loop,       A_i             ; prime loop
||[A_i] LDDW    .D2T2   *B_n++,     B_reg1:B_reg0   ; load b[i+3]...b[i]
||[A_i] LDDW    .D1T1   *A_m++,     A_reg1:A_reg0   ; load a[i+3]...a[i]
||      ZERO    .L1     A_prod:A_sum                ; added for branch-
||      ZERO    .L2     B_prod:B_sum                ; target-not-span

* ========================== PIPE LOOP KERNEL ============================= *
; The whole kernel is one execute packet.  In each pass it:
;   - adds the current product registers into the two running sums,
;   - issues the next pair of loads while A_i is non-zero,
;   - forms two DOTP2 partial dot products using the cross paths, so the
;     low register pair is reduced on the B side and the high pair on the
;     A side,
;   - re-issues the decrement-and-branch while A_i is non-zero.
loop:
        ADD     .L2     B_sum,      B_prod,     B_sum       ; sum += productb
||      ADD     .L1     A_sum,      A_prod,     A_sum       ; sum += producta
||[A_i] LDDW    .D2T2   *B_n++,     B_reg1:B_reg0   ; load b[i+3]...b[i]
||[A_i] LDDW    .D1T1   *A_m++,     A_reg1:A_reg0   ; load a[i+3]...a[i]
||      DOTP2   .M2X    A_reg0,     B_reg0,     B_prod      ; a[0]*b[0]+a[1]*b[1]
||      DOTP2   .M1X    A_reg1,     B_reg1,     A_prod      ; a[2]*b[2]+a[3]*b[3]
||[A_i] BDEC    .S1     loop,       A_i             ; iterate loop

* ========================== PIPE LOOP EPILOG ============================= *
; Return through B3.  The two partial sums are combined into the return
; register by the ADD that follows RETNOP, before the branch takes effect.
        RETNOP.S2       B3,         4               ; Return to caller

        ADD     .L1X    A_sum,      B_sum,      A_sumt      ; final sum
        ; ===== Branch Occurs

* ========================================================================= *
*   End of file:  dsp_dotprod.asm                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *