c6416_sdk/dsplib/dotprod.asm


								;* ======================================================================== *;

								;*  TEXAS INSTRUMENTS, INC.                                                 *;

								;*                                                                          *;

								;*  DSPLIB  DSP Signal Processing Library                                   *;

								;*                                                                          *;

								;*      Release:        Revision 1.04b                                      *;

								;*      CVS Revision:   1.18    Tue Oct 14 19:58:57 2003 (UTC)              *;

								;*      Snapshot date:  23-Oct-2003                                         *;

								;*                                                                          *;

								;*  This library contains proprietary intellectual property of Texas        *;

								;*  Instruments, Inc.  The library and its source code are protected by     *;

								;*  various copyrights, and portions may also be protected by patents or    *;

								;*  other legal protections.                                                *;

								;*                                                                          *;

								;*  This software is licensed for use with Texas Instruments TMS320         *;

								;*  family DSPs.  This license was provided to you prior to installing      *;

								;*  the software.  You may review this license by consulting the file       *;

								;*  TI_license.PDF which accompanies the files in this library.             *;

								;* ------------------------------------------------------------------------ *;

								;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;

								;*                          All Rights Reserved.                            *;

								;* ======================================================================== *;


								;* ======================================================================== *;

								;*  Assembler compatibility shim for assembling 4.30 and later code on      *;

								;*  tools prior to 4.30.                                                    *;

								;* ======================================================================== *;


								        ; $asmver is set to .ASSEMBLER_VERSION when that symbol is defined
								        ; and to 0 otherwise, so an assembler that does not define the
								        ; symbol takes the "< 430" compatibility path below.
								        .if $isdefed(".ASSEMBLER_VERSION")

								        .asg    .ASSEMBLER_VERSION, $asmver

								        .else

								        .asg    0,    $asmver

								        .endif


								        ; Everything inside this conditional is assembled only when
								        ; $asmver compares below 430.
								        .if ($asmver < 430)


								        ; Alias the call/return mnemonics to the plain branch instruction.
								        .asg    B,    CALL     ; Function Call

								        .asg    B,    RET      ; Return from a Function

								        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.


								        ; The BNOP-based aliases are defined only when .TMS320C6400 is set.
								        .if .TMS320C6400

								        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call

								        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return

								        .asg    BNOP, CRNOP    ; C64x Fn call w/ Call/Ret chaining via BNOP.

								        .endif


								        ; Both markers are assigned an empty string, so they expand to
								        ; nothing wherever they appear.
								        .asg    , .asmfunc     ; .func equivalent for hand-assembly code

								        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code


								        .endif


								;* ======================================================================== *;

								;*  End of assembler compatibility shim.                                    *;

								;* ======================================================================== *;


								* ========================================================================= *

								*                                                                           *

								*   TEXAS INSTRUMENTS, INC.                                                 *

								*                                                                           *

								*   NAME                                                                    *

								*       DSP_dotprod                                                         *

								*                                                                           *

								*   REVISION DATE                                                           *

								*       10-Oct-2003                                                         *

								*                                                                           *

								*   USAGE                                                                   *

								*       This routine is C callable, and has the following C prototype:      *

								*                                                                           *

								*       int DSP_dotprod                                                     *

								*       (                                                                   *

								*           const short *m,       /* Pointer to first vector  */            *

								*           const short *n,       /* Pointer to second vector */            *

								*           int          count    /* Length of vectors.       */            *

								*       );                                                                  *

								*                                                                           *

								*       This routine returns the dot product as its return value.           *

								*                                                                           *

								*   DESCRIPTION                                                             *

								*       The "DSP_dotprod" function implements a dot product of two input    *

								*       vectors, returning the scalar result.  Each element of the          *

								*       first array is multiplied with the corresponding element of the     *

								*       second array, and the products are summed.  The sum is returned.    *

								*                                                                           *

								*       int DSP_dotprod                                                     *

								*       (                                                                   *

								*           const short *m,       /* Pointer to first vector  */            *

								*           const short *n,       /* Pointer to second vector */            *

								*           int          count    /* Length of vectors.       */            *

								*       )                                                                   *

								*       {                                                                   *

								*           int i, sum = 0;                                                 *

								*                                                                           *

								*           for (i = 0; i < count; i++)                                     *

								*               sum += m[i] * n[i];                                         *

								*                                                                           *

								*           return sum;                                                     *

								*       }                                                                   *

								*                                                                           *

								*       The above C code is a general implementation without                *

								*       restrictions.  The assembly code has some restrictions, as          *

								*       noted below.                                                        *

								*                                                                           *

								*   TECHNIQUES                                                              *

								*       The code is unrolled 4 times to enable full memory and multiplier   *

								*       bandwidth to be utilized.                                           *

								*                                                                           *

								*       Interrupts are masked by branch delay slots only.                   *

								*                                                                           *

								*       Prolog collapsing has been performed to reduce codesize.            *

								*                                                                           *

								*   ASSUMPTIONS                                                             *

								*       The input length is a multiple of 4 and greater than 0.             *

								*                                                                           *

								*       The input data and coefficients are stored on double-word-          *

								*       aligned boundaries.                                                 *

								*                                                                           *

								*       This code is not interruptible.  Interrupts are masked by           *

								*       branch delay slots during the entire duration of this               *

								*       function.                                                           *

								*                                                                           *

								*   MEMORY NOTE                                                             *

								*       To avoid bank conflicts, the input arrays 'm' and 'n' must          *

								*       be offset by 4 half-words (8 bytes).                                *

								*                                                                           *

								*       The code is ENDIAN NEUTRAL.                                         *

								*                                                                           *

								*   CODESIZE                                                                *

								*       160 bytes                                                           *

								*                                                                           *

								*   CYCLES                                                                  *

								*       cycles = count/4 + 16                                               *

								*       For count = 720, cycles = 196.                                      *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *


								        .sect ".text:_dotprod"

								        .global _DSP_dotprod

								; -------------------------------------------------------------------------
								;  _DSP_dotprod
								;      int DSP_dotprod(const short *m, const short *n, int count)
								;
								;  Returns the sum of m[i] * n[i] over 'count' 16-bit elements.
								;
								;  In:     A4 = m, B4 = n, A6 = count   (see the .asg list below)
								;  Out:    A4 = dot product
								;  Return: branch through B3
								;  Writes: A0, A4, A9:A8, A17:A16, B4, B7:B6, B17:B16
								;  Stack:  none; the stack pointer is never referenced.
								;
								;  Structure: a one-packet loop kernel.  Each pass through 'loop'
								;  issues, in parallel, one LDDW pair, one DOTP2 pair and one ADD pair.
								;  Because the results of LDDW and DOTP2 arrive several cycles after
								;  issue, the three stages in any one pass belong to different groups
								;  of four elements.  Two accumulators (A_sum, B_sum) are kept, one per
								;  register file, and are combined once at the end.
								;
								;  The loop count is count >> 2, so any remainder of count / 4 is
								;  discarded and those trailing elements are not included in the sum.
								;
								;  NOTE(review): the first two LDDW pairs are unconditional, so at
								;  least two doublewords are read from each array whatever 'count' is.
								;  By my reading of the schedule the predicated loads also keep issuing
								;  for a few passes after the last needed one, so reads of up to three
								;  doublewords past the end of each array look possible; the extra
								;  data never reaches the sum.  Confirm before placing an array at the
								;  very end of mapped memory.
								; -------------------------------------------------------------------------
								_DSP_dotprod:

								* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *

								        .asg   A4,    A_m     ; pointer to vector m

								        .asg   B4,    B_n     ; pointer to vector n

								        .asg   A6,    A_count ; number of elements in each vector

								        .asg   A0,    A_i     ; loop count

								        .asg   A16,   A_sum   ; partial sum a

								        .asg   A17,   A_prod  ; DOTP2 of A_reg1,B_reg1: a[i+2]*b[i+2]+a[i+3]*b[i+3]

								        .asg   B16,   B_sum   ; partial sum b

								        .asg   B17,   B_prod  ; DOTP2 of A_reg0,B_reg0: a[i]*b[i]+a[i+1]*b[i+1]

								        .asg   A9,    A_reg1  ; elements a[i+3] a[i+2]

								        .asg   A8,    A_reg0  ; elements a[i+1] a[i]

								        .asg   B7,    B_reg1  ; elements b[i+3] b[i+2]

								        .asg   B6,    B_reg0  ; elements b[i+1] b[i]

								        .asg   A4,    A_sumt  ; total sum a + b returned to caller

								* ========================== PIPE LOOP PROLOG ============================= *

								        ; Five prolog packets, each issuing one branch to 'loop' and one
								        ; load pair.  The first three branches are unconditional; the last
								        ; two are BDECs predicated on A_i.
								        ;
								        ; Packet 1: first branch and first (unconditional) load pair.
								        B     .S2     loop                             ; prime loop

								||      LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]

								||      LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]


								        ; Packet 2: second unconditional load pair; also computes the
								        ; loop count and clears both accumulator/product register pairs.
								        B     .S2     loop                             ; prime loop

								||      LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]

								||      LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]

								||      SHRU  .S1     A_count,    2,          A_i      ; A_i = count / 4

								||      ZERO  .L1     A_prod:A_sum

								||      ZERO  .L2     B_prod:B_sum


								        ; Packet 3: from here on the loads are predicated on A_i.  The
								        ; ZEROs in packets 3-5 repeat the clear already done in packet 2;
								        ; per the original comment they were added for
								        ; "branch-target-not-span" (presumably packet padding -- confirm).
								        B     .S1     loop                             ; prime loop

								||[A_i] LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]

								||[A_i] LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]

								||      ZERO  .L1     A_prod:A_sum                     ; added for branch-

								||      ZERO  .L2     B_prod:B_sum                     ;  target-not-span


								        ; Packet 4: first counted branch; BDEC also decrements A_i.
								  [A_i] BDEC  .S1     loop,       A_i                  ; prime loop

								||[A_i] LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]

								||[A_i] LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]

								||      ZERO  .L1     A_prod:A_sum                     ; added for branch-

								||      ZERO  .L2     B_prod:B_sum                     ;  target-not-span


								        ; Packet 5: second counted branch.
								  [A_i] BDEC  .S1     loop,       A_i                  ; prime loop

								||[A_i] LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]

								||[A_i] LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]

								||      ZERO  .L1     A_prod:A_sum                     ; added for branch-

								||      ZERO  .L2     B_prod:B_sum                     ;  target-not-span

								* ========================== PIPE LOOP KERNEL ============================= *

								        ; One execute packet per pass.  The ADDs consume the A_prod/B_prod
								        ; values currently in the registers (zero until the first DOTP2
								        ; result lands), the DOTP2s consume the most recently arrived load
								        ; data, and the loads/BDEC stop issuing once A_i reaches zero.
								loop:

								        ADD   .L2     B_sum,      B_prod,     B_sum    ; sum += productb

								||      ADD   .L1     A_sum,      A_prod,     A_sum    ; sum += producta

								||[A_i] LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]

								||[A_i] LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]

								||      DOTP2 .M2X    A_reg0,     B_reg0,     B_prod   ; a[0]*b[0]+a[1]*b[1]

								||      DOTP2 .M1X    A_reg1,     B_reg1,     A_prod   ; a[2]*b[2]+a[3]*b[3]

								||[A_i] BDEC  .S1     loop,       A_i                  ; iterate loop

								* ========================== PIPE LOOP EPILOG ============================= *


								        ; Return through B3.  The ADD after RETNOP is issued before the
								        ; branch takes effect (see "Branch Occurs" below) and merges the
								        ; two partial sums into the return register A4.
								        RETNOP.S2     B3,         4                    ; Return to caller

								        ADD   .L1X    A_sum,      B_sum,      A_sumt   ; A4 = A_sum + B_sum

								; ===== Branch Occurs


								* ========================================================================= *

								*   End of file:  dsp_dotprod.asm                                           *

								* ------------------------------------------------------------------------- *

								*             Copyright (c) 2003 Texas Instruments, Incorporated.           *

								*                            All Rights Reserved.                           *

								* ========================================================================= *