7 #ifndef TBCI_MATRIX_KERNELS_H
8 #define TBCI_MATRIX_KERNELS_H
// Forward declarations of the six class templates this header names. Only the
// names are introduced here; the class definitions are not part of this
// section of the file.
// Matrix<T>: passed as `const Matrix<T> *` input to the kernels
// (e.g. do_mat_vec_mult, do_mat_mat_mult).
10 template <
typename T>
class Matrix;
// TMatrix<T>: type of the `res` argument of do_mat_mat_mult and
// do_old_mat_mat_mult.
11 template <
typename T>
class TMatrix;
// TSMatrix<T>: declared here; not referenced in the visible part of the file.
12 template <
typename T>
class TSMatrix;
// Vector<T>: passed as `const Vector<T> *` input to the matrix-vector kernels
// (do_mat_vec_mult, do_mat_vec_transmult, do_mat_vec_transmult_exact).
13 template <
typename T>
class Vector;
// TVector<T>: type of the `res` argument that the matrix-vector kernels
// write into.
14 template <
typename T>
class TVector;
// TSVector<T>: passed as `const TSVector<T> *` input to do_mat_tsv_mult.
15 template <
typename T>
class TSVector;
24 const unsigned bc = b->
columns();
25 const unsigned ac = a->
columns();
27 for (
unsigned i=start;
i<end; ++
i) {
28 for (
unsigned j=0; j<bc; ++j) {
31 register T val (a->
get(
i,0) * b->
get(0,j));
33 for (
register unsigned l=1; l<ac; ++l) {
34 const register T y = a->
get(
i,l) * b->
get(l,j);
35 const register T t = val +
y;
39 res->
set(val-comp,
i,j);
43 for (
unsigned i=start;
i<end; ++
i) {
44 for (
unsigned j=0; j<bc; ++j) {
47 register T val (a->
get(
i,0) * b->
get(0,j));
48 for (
register unsigned l=1; l<ac; ++l)
49 val += a->
get(
i,l) * b->
get(l,j);
64 #if defined(USE_PREFETCH) || defined(USE_UNR_VEC_KERNELS)
70 #ifdef OLD_MAT_MAT_MULT
77 const unsigned bc = b->
columns();
78 const unsigned ac = a->
columns();
79 for (
unsigned i=start;
i<end; ++
i) {
82 for (
unsigned l=0; l<ac; ++l) {
96 for (; n > 3; n -= 4) {
99 *(rptr+1) += tmp * *(bptr+1);
101 *(rptr+2) += tmp * *(bptr+2);
107 *(rptr-1) += tmp * *(bptr+3);
114 *rptr++ += tmp * *bptr++;
121 template <
typename T>
129 const unsigned bc = b->
columns();
130 const unsigned ac = a->
columns();
132 for (
unsigned i=start;
i<end; ++
i) {
134 for (
unsigned l=0; l<ac; ++l) {
137 const register T tmp = *aptr;
138 for (
int n = bc; n; --n)
139 *rptr++ += tmp * *bptr++;
147 #define COST_MATVEC(r,c) (r*(COST_UNIT_STORE+COST_LOOP \
148 +c*(3*COST_UNIT_LOAD+COST_CALL+COST_ADD+COST_MULT+COST_LOOP)))
150 template <
typename T>
156 fprintf (stderr,
"do_mat_vec_mult (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
159 const unsigned mc = mat->
col;
160 #if 0 //def TBCI_SIMD_SUM
165 for (
unsigned rw = start; rw < end; ++rw) {
169 do_vec_mult_exact<T> (mc, mat->
mat[rw], vec->
vec, val);
171 do_vec_mult_quick<T> (mc, mat->
mat[rw], vec->
vec, val);
176 template <
typename T>
182 const unsigned mc = mat->
col;
184 #if 0 //def TBCI_SIMD_SUM
186 for (
unsigned rw = start; rw < end; ++rw) {
189 do_vec_mult_exact<T>(mc, mat->
mat[rw], tsv->
vec, val);
191 do_vec_mult_quick<T>(mc, mat->
mat[rw], tsv->
vec, val);
196 for (
unsigned rw = start; rw < end; ++rw) {
201 do_vec_mult_exact<T> (mc, mat->
mat[rw], tsv->
vec, val);
203 do_vec_mult_quick<T> (mc, mat->
mat[rw], tsv->
vec, val);
208 template <
typename T>
214 fprintf (stderr,
"do_mat_vec_transmult_exact (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
217 const unsigned mr = mat->
row;
219 for (
unsigned cl = start; cl < end; ++cl)
221 for (
unsigned rw = 0; rw < mr; ++rw) {
222 const register T fac = vec->
get(rw);
224 for (
unsigned off = start; off < end; ++off) {
225 const register T y = mat->
get(rw, off) *
fac;
226 const register T t = res->
get(off) +
y;
227 corr(off-start) += (t - res->
get(off)) - y;
231 for (
unsigned cl = start; cl < end; ++cl)
232 res->
setval(cl) -= corr(cl-start);
236 template <
typename T>
246 fprintf (stderr,
"do_mat_vec_transmult (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
249 const unsigned mr = mat->
row;
250 for (
unsigned cl = start; cl < end; ++cl)
252 for (
unsigned rw = 0; rw < mr; ++rw) {
253 const register T fac = vec->
get(rw);
254 do_vec_add_svc<T> (end-start, &res->
setval(start), mat->
mat[rw],
fac);
259 template <
typename T>
266 const unsigned mr = mat->
row;
268 const unsigned mstr = mat->
mat[1] - mat->
mat[0];
272 for (
unsigned cl = start; cl < end; ++cl) {
276 do_vec_mult_stride_exact<T> (mr, mat->
mat[0]+cl, vec->
vec, val, mstr);
278 do_vec_mult_stride_quick<T> (mr, mat->
mat[0]+cl, vec->
vec, val, mstr);
284 const unsigned mr = mat->
row;
285 for (
unsigned cl = start; cl < end; cl++)
289 register int i = mr - 9;
304 for (; i > 3; i -= 4) {
307 el += mat->
mat[r+1][cl] * *(vecptr+1);
309 el += mat->
mat[r+2][cl] * *(vecptr+2);
311 el += mat->
mat[r+3][cl] * *(vecptr+3);
319 for (i += 8;
i; --
i, r++)
320 el += mat->
mat[r][cl] * *vecptr++;
T ** mat
C storage layout: mat[row][col].
T & setval(const unsigned long i) const
#define CACHELINE_SZ
(L1) Cache line size in bytes.
long double fact(const double x)
void do_mat_vec_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
T *const & vecptr() const
tbci_traits< T >::const_refval_type get(const unsigned long i) const
void do_mat_tsv_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const TSVector< T > *tsv)
T & setval(const T &val, const unsigned int r, const unsigned int c)
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
unsigned int columns() const
number of columns
void do_old_mat_mat_mult(const unsigned start, const unsigned end, TMatrix< T > *res, const Matrix< T > *a, const Matrix< T > *b)
const T & getcref(const unsigned long i) const
#define PREFETCH_W(addr, loc)
Temporary Base Class Idiom: Class TVector is used for temporary variables.
unsigned int do_exactsum2()
void do_mat_mat_mult(const unsigned start, const unsigned end, TMatrix< T > *res, const Matrix< T > *a, const Matrix< T > *b)
TODO: Provide plain version of mat-mat and mat-vec mult!
void do_mat_vec_transmult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
const T * getrowptr(const unsigned r) const
Helpers for matvecmul.
T & set(const T &val, const unsigned r, const unsigned c)
void do_mat_vec_transmult_exact(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
#define LIKELY(expr)
Branch prediction hint. Note that we sometimes, on purpose, mark the unlikely possibility as likely and vice versa...
tbci_traits< T >::const_refval_type get(const unsigned r, const unsigned c) const
get, set and getcref are used internally and not for public consumption