/* dsp_mat_mul.h -- C interface for DSP_mat_mul (DSPLIB matrix multiply). */


/* ======================================================================== */
/* TEXAS INSTRUMENTS, INC. */
/* */
/* DSPLIB DSP Signal Processing Library */
/* */
/* Release: Revision 1.04b */
/* CVS Revision: 1.4 Sun Sep 29 03:32:25 2002 (UTC) */
/* Snapshot date: 23-Oct-2003 */
/* */
/* This library contains proprietary intellectual property of Texas */
/* Instruments, Inc. The library and its source code are protected by */
/* various copyrights, and portions may also be protected by patents or */
/* other legal protections. */
/* */
/* This software is licensed for use with Texas Instruments TMS320 */
/* family DSPs. This license was provided to you prior to installing */
/* the software. You may review this license by consulting the file */
/* TI_license.PDF which accompanies the files in this library. */
/* ------------------------------------------------------------------------ */
/* Copyright (C) 2003 Texas Instruments, Incorporated. */
/* All Rights Reserved. */
/* ======================================================================== */
/* ======================================================================== */
/* Assembler compatibility shim for assembling 4.30 and later code on */
/* tools prior to 4.30. */
/* ======================================================================== */
/* ======================================================================== */
/* End of assembler compatibility shim. */
/* ======================================================================== */
/* ======================================================================== */
/* TEXAS INSTRUMENTS, INC. */
/* */
/* NAME */
/* DSP_mat_mul -- Matrix Multiply, Little Endian */
/* */
/* REVISION DATE */
/* 10-Feb-2002 */
/* */
/* USAGE */
/* This routine is C-callable and can be called as: */
/* */
/* void DSP_mat_mul */
/* ( */
/* const short *restrict x, int r1, int c1, */
/* const short *restrict y, int c2, */
/* short *restrict r, */
/* int qs */
/* ); */
/* */
/* x == Pointer to r1 by c1 input matrix. */
/* y == Pointer to c1 by c2 input matrix. */
/* r == Pointer to r1 by c2 output matrix. */
/* */
/* r1 == Number of rows in x. */
/* c1 == Number of columns in x. Also number of rows in y. */
/* c2 == Number of columns in y. */
/* */
/* qs == Final right-shift to apply to the result. */
/* */
/* DESCRIPTION */
/* This function computes the expression "r = x * y" for the matrices */
/* x and y. The columnar dimension of x must match the row dimension */
/* of y. The resulting matrix has the same number of rows as x and */
/* the same number of columns as y. */
/* */
/* The values stored in the matrices are assumed to be fixed-point */
/* or integer values. All intermediate sums are retained to 32-bit */
/* precision. No rounding or overflow checking is performed. The */
/* results are right-shifted by the user-specified amount, and then */
/* truncated to 16 bits. */
/* */
/* This code is suitable for dense matrices. No optimizations are */
/* made for sparse matrices. */
/* */
/* The following is a C description of the algorithm. The assembly */
/* code may place restrictions on the inputs that the C code version */
/* does not. These restrictions are noted under ASSUMPTIONS below. */
/* */
/* void DSP_mat_mul */
/* ( */
/* const short *restrict x, int r1, int c1, */
/* const short *restrict y, int c2, */
/* short *restrict r, */
/* int qs */
/* ) */
/* { */
/* int i, j, k; */
/* int sum; */
/* */
/* // ---------------------------------------------------- // */
/* // Multiply each row in x by each column in y. The // */
/* // product of row m in x and column n in y is placed // */
/* // in position (m,n) in the result. // */
/* // ---------------------------------------------------- // */
/* for (i = 0; i < r1; i++) */
/* for (j = 0; j < c2; j++) */
/* { */
/* sum = 0; */
/* */
/* for (k = 0; k < c1; k++) */
/* sum += x[k + i*c1] * y[j + k*c2]; */
/* */
/* r[j + i*c2] = sum >> qs; */
/* } */
/* } */
/* */
/* ASSUMPTIONS */
/* The matrices 'x', 'y', and 'r' are stored in distinct, non-overlapping */
/* arrays. That is, in-place processing is not allowed. */
/* */
/* Each input matrix must have at least 1 row and 1 column, and at most */
/* 32767 rows and 32767 columns. */
/* */
/* TECHNIQUES */
/* The 'i' loop and 'k' loops are unrolled 2x. The 'j' loop is */
/* unrolled 4x. For dimensions that are not multiples of the */
/* various loops' unroll factors, this code calculates extra results */
/* beyond the edges of the matrix. These extra results are */
/* ultimately discarded. This allows the loops to be unrolled for */
/* efficient operation on large matrices while not losing */
/* flexibility. */
/* */
/* The outer two levels of loop nest are collapsed, further reducing */
/* the overhead of the looping structure. */
/* */
/* NOTES */
/* This code blocks interrupts during its innermost loop. Interrupts */
/* are not blocked otherwise. As a result, interrupts can be blocked */
/* for up to 0.25*c1' + 16 cycles at a time. */
/* */
/* When calculating the loop trip counts, the values of r1 and c1 */
/* are rounded up to the next even value. The value of c2 is */
/* rounded up to the next multiple of 4. This does not affect */
/* the memory layout of the input or output matrices. */
/* */
/* MEMORY NOTE */
/* The load instructions in the inner loop are predicated to avoid */
/* significant over-fetching on the matrices. However, since the */
/* outer loops are unrolled, this code may fetch approximately one */
/* full row beyond the end of the 'x' matrix and approximately one */
/* double-word beyond the end of the 'y' matrix. The values read */
/* are discarded and do not affect the results of the computation. */
/* */
/* This code has no memory alignment requirements, as non-aligned */
/* loads are used for accessing the inputs, and individual STHs are */
/* used for writing the results. */
/* */
/* This is a LITTLE ENDIAN implementation. */
/* */
/* CYCLES */
/* cycles = 0.25 * (r1'*c2'*c1') + 2.25 * (r1'*c2') + 11, where: */
/* */
/* r1' = 2 * ceil(r1/2.0) // r1 rounded up to next even */
/* c1' = 2 * ceil(c1/2.0) // c1 rounded up to next even */
/* c2' = 4 * ceil(c2/4.0) // c2 rounded up to next mult of 4 */
/* */
/* For r1= 1, c1= 1, c2= 1, cycles = 33. */
/* For r1= 8, c1=20, c2= 8, cycles = 475. */
/* For r1=12, c1=14, c2=18, cycles = 1391. */
/* For r1=32, c1=32, c2=32, cycles = 10507. */
/* */
/* The cycle count includes 6 cycles of function call overhead. The */
/* exact overhead seen by a given application will depend on the */
/* compiler options used. */
/* */
/* CODESIZE */
/* 416 bytes. */
/* */
/* ------------------------------------------------------------------------ */
/* Copyright (c) 2003 Texas Instruments, Incorporated. */
/* All Rights Reserved. */
/* ======================================================================== */
#ifndef DSP_MAT_MUL_H_
#define DSP_MAT_MUL_H_ 1

/* ------------------------------------------------------------------------ */
/* DSP_mat_mul -- dense fixed-point matrix multiply, r = (x * y) >> qs.     */
/*                                                                          */
/* Each 32-bit row-by-column sum is right-shifted by 'qs' and truncated to  */
/* 16 bits before being stored. No rounding or overflow checking is done.   */
/* 'x', 'y' and 'r' must be distinct arrays (no in-place operation).        */
/* ------------------------------------------------------------------------ */
void DSP_mat_mul(
    const short *restrict x,    /* Input matrix, r1 rows by c1 columns.     */
    int                   r1,   /* Number of rows in x.                     */
    int                   c1,   /* Number of columns in x == rows in y.     */
    const short *restrict y,    /* Input matrix, c1 rows by c2 columns.     */
    int                   c2,   /* Number of columns in y.                  */
    short       *restrict r,    /* Output matrix, r1 rows by c2 columns.    */
    int                   qs    /* Final right-shift applied to each sum.   */
);

#endif /* DSP_MAT_MUL_H_ */
/* ======================================================================== */
/* End of file: dsp_mat_mul.h */
/* ------------------------------------------------------------------------ */
/* Copyright (c) 2003 Texas Instruments, Incorporated. */
/* All Rights Reserved. */
/* ======================================================================== */