;* ======================================================================== *;
;*  TEXAS INSTRUMENTS, INC.                                                 *;
;*                                                                          *;
;*  DSPLIB  DSP Signal Processing Library                                   *;
;*                                                                          *;
;*      Release:        Revision 1.04b                                      *;
;*      CVS Revision:   1.18    Tue Oct 14 19:58:57 2003 (UTC)              *;
;*      Snapshot date:  23-Oct-2003                                         *;
;*                                                                          *;
;*  This library contains proprietary intellectual property of Texas        *;
;*  Instruments, Inc.  The library and its source code are protected by     *;
;*  various copyrights, and portions may also be protected by patents or    *;
;*  other legal protections.                                                *;
;*                                                                          *;
;*  This software is licensed for use with Texas Instruments TMS320         *;
;*  family DSPs.  This license was provided to you prior to installing      *;
;*  the software.  You may review this license by consulting the file       *;
;*  TI_license.PDF which accompanies the files in this library.             *;
;* ------------------------------------------------------------------------ *;
;*          Copyright (C) 2003 Texas Instruments, Incorporated.             *;
;*                          All Rights Reserved.                            *;
;* ======================================================================== *;


;* ======================================================================== *;
;*  Assembler compatibility shim for assembling 4.30 and later code on      *;
;*  tools prior to 4.30.                                                    *;
;* ======================================================================== *;

        ; Capture the assembler version in the substitution symbol $asmver.
        ; If the assembler does not define .ASSEMBLER_VERSION at all, $asmver
        ; is set to 0 so that the ($asmver < 430) test below is true.
        .if $isdefed(".ASSEMBLER_VERSION")
        .asg    .ASSEMBLER_VERSION, $asmver
        .else
        .asg    0,    $asmver
        .endif

        ; For assembler versions below 430, map the newer call/return
        ; mnemonics onto the plain branch mnemonics they stand for.
        .if ($asmver < 430)

        .asg    B,    CALL     ; Function Call
        .asg    B,    RET      ; Return from a Function
        .asg    B,    CALLRET  ; Function call with Call / Ret chaining.

        ; The BNOP-based aliases are only defined when .TMS320C6400 is true.
        .if .TMS320C6400
        .asg    BNOP, CALLNOP  ; C64x BNOP as a Fn. Call
        .asg    BNOP, RETNOP   ; C64x BNOP as a Fn. Return
        .asg    BNOP, CRNOP    ; C64x Fn call w/ Call/Ret chaining via BNOP.
        .endif

        ; Substitute the function-marking directives with empty text so that
        ; any use of them assembles to nothing.
        .asg    , .asmfunc     ; .func equivalent for hand-assembly code
        .asg    , .endasmfunc  ; .endfunc equivalent for hand-assembly code

        .endif

;* ======================================================================== *;
;*  End of assembler compatibility shim.                                    *;
;* ======================================================================== *;


* ========================================================================= *
*                                                                           *
*   TEXAS INSTRUMENTS, INC.                                                 *
*                                                                           *
*   NAME                                                                    *
*       DSP_dotprod                                                         *
*                                                                           *
*   REVISION DATE                                                           *
*       10-Oct-2003                                                         *
*                                                                           *
*   USAGE                                                                   *
*       This routine is C callable, and has the following C prototype:      *
*                                                                           *
*       int DSP_dotprod                                                     *
*       (                                                                   *
*           const short *m,       /* Pointer to first vector  */            *
*           const short *n,       /* Pointer to second vector */            *
*           int          count    /* Length of vectors.       */            *
*       );                                                                  *
*                                                                           *
*       This routine returns the dot product as its return value.           *
*                                                                           *
*   DESCRIPTION                                                             *
*       The "DSP_dotprod" function implements a dot product of two input    *
*       vectors, returning the scalar result.  Each element of the          *
*       first array is multiplied with the corresponding element of the     *
*       second array, and the products are summed.  The sum is returned.    *
*                                                                           *
*       int DSP_dotprod                                                     *
*       (                                                                   *
*           const short *m,       /* Pointer to first vector  */            *
*           const short *n,       /* Pointer to second vector */            *
*           int          count    /* Length of vectors.       */            *
*       )                                                                   *
*       {                                                                   *
*           int i, sum = 0;                                                 *
*                                                                           *
*           for (i = 0; i < count; i++)                                     *
*               sum += m[i] * n[i];                                         *
*                                                                           *
*           return sum;                                                     *
*       }                                                                   *
*                                                                           *
*       The above C code is a general implementation without                *
*       restrictions.  The assembly code has some restrictions, as          *
*       noted below.                                                        *
*                                                                           *
*   TECHNIQUES                                                              *
*       The code is unrolled 4 times to enable full memory and multiplier   *
*       bandwidth to be utilized.                                           *
*                                                                           *
*       Interrupts are masked by branch delay slots only.                   *
*                                                                           *
*       Prolog collapsing has been performed to reduce codesize.            *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       The input length is a multiple of 4 and greater than 0.             *
*                                                                           *
*       The input data and coefficients are stored on double word           *
*       aligned boundaries.                                                 *
*                                                                           *
*       This code is not interruptible.  Interrupts are masked by           *
*       branch delay slots during the entire duration of this               *
*       function.                                                           *
*                                                                           *
*   MEMORY NOTE                                                             *
*       To avoid bank conflicts, the input arrays 'm' and 'n' must          *
*       be offset by 4 half-words (8 bytes).                                *
*                                                                           *
*       The code is ENDIAN NEUTRAL.                                         *
*                                                                           *
*   CODESIZE                                                                *
*       160 bytes                                                           *
*                                                                           *
*   CYCLES                                                                  *
*       cycles = count/4 + 16                                               *
*       For count = 720, cycles = 196.                                      *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *

        .sect ".text:_dotprod"
        .global _DSP_dotprod
; ------------------------------------------------------------------------- ;
;  _DSP_dotprod: dot product of two vectors of 16-bit elements.
;
;  In:   A4 (A_m)     pointer to vector m
;        B4 (B_n)     pointer to vector n
;        A6 (A_count) number of elements in each vector
;  Out:  A4 (A_sumt)  A_sum + B_sum, the total returned to the caller
;  Return address is taken from B3 (see RETNOP at the end).
;
;  Registers written: A0, A4, A8, A9, A16, A17, B4, B6, B7, B16, B17.
;
;  Structure: a single-cycle software-pipelined loop.  Every pass of the
;  'loop' execute packet loads 4 elements from each vector (one LDDW per
;  side), multiplies previously loaded data with two DOTP2s, and adds
;  previously computed DOTP2 results into two running sums.  The five
;  prolog packets each issue one branch to 'loop', so that one branch
;  lands per cycle once the kernel starts.
;
;  The schedule depends on the C64x delay slots (5 for branches, 4 for
;  LDDW, 3 for DOTP2 per the C64x instruction set reference).  Do not
;  reorder, insert or remove execute packets without re-deriving it.
;
;  NOTE(review): by my trace, with N = count/4 the 'loop' packet executes
;  N + 4 times (1 fall-through + 3 unconditional B + N taken BDECs) and
;  N + 3 LDDWs are issued per array: 2 unconditional, 2 predicated while
;  A_i is still N, then N - 1 more.  That reads up to 3 doublewords
;  (24 bytes) past the end of each input.  The extra data is multiplied
;  but its DOTP2 results should land after the last accumulate, so it
;  never reaches the sums.  Confirm callers can tolerate the over-read.
; ------------------------------------------------------------------------- ;
_DSP_dotprod:
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
        .asg   A4,    A_m     ; pointer to vector m (post-incremented by LDDW)
        .asg   B4,    B_n     ; pointer to vector n (post-incremented by LDDW)
        .asg   A6,    A_count ; number of elements in each vector
        .asg   A0,    A_i     ; count >> 2; loop counter and [A_i] predicate
        .asg   A16,   A_sum   ; running sum of A_prod (A side)
        .asg   A17,   A_prod  ; DOTP2 result of A_reg1, B_reg1 (.M1X)
        .asg   B16,   B_sum   ; running sum of B_prod (B side)
        .asg   B17,   B_prod  ; DOTP2 result of A_reg0, B_reg0 (.M2X)
        .asg   A9,    A_reg1  ; elements a[i+3] a[i+2]
        .asg   A8,    A_reg0  ; elements a[i+1] a[i]
        .asg   B7,    B_reg1  ; elements b[i+3] b[i+2]
        .asg   B6,    B_reg0  ; elements b[i+1] b[i]
        .asg   A4,    A_sumt  ; total sum a + b returned to caller (reuses A_m)
* ========================== PIPE LOOP PROLOG ============================= *
; Packet 1: first branch to 'loop' and first pair of loads.  A_i is not yet
; computed, so nothing here is predicated.
        B     .S2     loop                             ; prime loop
||      LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]
||      LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]

; Packet 2: second branch and second (still unconditional) pair of loads.
; A_i = A_count >> 2 is computed here, and both sum/product register pairs
; are cleared so the first accumulates in the kernel add zeros.
        B     .S2     loop                             ; prime loop
||      LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]
||      LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]
||      SHRU  .S1     A_count,    2,          A_i      ; calc loop count
||      ZERO  .L1     A_prod:A_sum
||      ZERO  .L2     B_prod:B_sum

; Packet 3: third unconditional branch.  From here on the loads are
; predicated on A_i being nonzero.  The ZEROs repeat the clearing already
; done in packet 2; per the original comment they were "added for
; branch-target-not-span" -- presumably padding that controls how the
; 'loop' packet falls relative to a fetch-packet boundary (TODO confirm).
        B     .S1     loop                             ; prime loop
||[A_i] LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]
||[A_i] LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]
||      ZERO  .L1     A_prod:A_sum                     ; added for branch-
||      ZERO  .L2     B_prod:B_sum                     ;  target-not-span

; Packet 4: first counted branch.  BDEC both branches to 'loop' and
; decrements A_i; it is itself predicated on A_i being nonzero.
  [A_i] BDEC  .S1     loop,       A_i                  ; prime loop
||[A_i] LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]
||[A_i] LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]
||      ZERO  .L1     A_prod:A_sum                     ; added for branch-
||      ZERO  .L2     B_prod:B_sum                     ;  target-not-span

; Packet 5: second counted branch; same shape as packet 4.
  [A_i] BDEC  .S1     loop,       A_i                  ; prime loop
||[A_i] LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]
||[A_i] LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]
||      ZERO  .L1     A_prod:A_sum                     ; added for branch-
||      ZERO  .L2     B_prod:B_sum                     ;  target-not-span
* ========================== PIPE LOOP KERNEL ============================= *
; One execute packet per pass.  Each DOTP2 reads one operand through the
; cross path (.M1X/.M2X) because the m data lives in the A file and the n
; data in the B file.  Once A_i reaches zero the loads and the BDEC stop
; executing, and the remaining passes only drain branches already issued.
loop:
        ADD   .L2     B_sum,      B_prod,     B_sum    ; sum += productb
||      ADD   .L1     A_sum,      A_prod,     A_sum    ; sum += producta
||[A_i] LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; load b[i+3]...b[i]
||[A_i] LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; load a[i+3]...a[i]
||      DOTP2 .M2X    A_reg0,     B_reg0,     B_prod   ; a[0]*b[0]+a[1]*b[1]
||      DOTP2 .M1X    A_reg1,     B_reg1,     A_prod   ; a[2]*b[2]+a[3]*b[3]
||[A_i] BDEC  .S1     loop,       A_i                  ; iterate loop
* ========================== PIPE LOOP EPILOG ============================= *

; Return through B3.  The 4 NOP cycles built into RETNOP plus the ADD below
; occupy the branch delay slots, so the two partial sums are combined into
; A_sumt (A4) before control reaches the caller.
        RETNOP.S2     B3,         4                    ; Return to caller
        ADD   .L1X    A_sum,      B_sum,      A_sumt   ; final sum
; ===== Branch Occurs

* ========================================================================= *
*   End of file:  dsp_dotprod.asm                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *