TBCI Numerical high perf. C++ Library  2.8.0
matrix_kernels.h
Go to the documentation of this file.
1 
7 #ifndef TBCI_MATRIX_KERNELS_H
8 #define TBCI_MATRIX_KERNELS_H
9 
10 template <typename T> class Matrix;
11 template <typename T> class TMatrix;
12 template <typename T> class TSMatrix;
13 template <typename T> class Vector;
14 template <typename T> class TVector;
15 template <typename T> class TSVector;
16 
17 //#include "vec_kern_unr_pref.h"
18 
19 template <typename T>
20 void do_old_mat_mat_mult (const unsigned start, const unsigned end,
21  TMatrix<T> *res, const Matrix<T> *a,
22  const Matrix<T> *b)
23 {
24  const unsigned bc = b->columns();
25  const unsigned ac = a->columns();
26  if (do_exactsum2()) {
27  for (unsigned i=start; i<end; ++i) {
28  for (unsigned j=0; j<bc; ++j) {
29  PREFETCH_R(a->getrowptr(i), 2);
30  PREFETCH_W(&res->setval(i,j), 2);
31  register T val (a->get(i,0) * b->get(0,j));
32  register T comp(0.0);
33  for (register unsigned l=1; l<ac; ++l) {
34  const register T y = a->get(i,l) * b->get(l,j);
35  const register T t = val + y;
36  comp += (t-val) - y;
37  val = t;
38  }
39  res->set(val-comp,i,j);
40  }
41  }
42  } else {
43  for (unsigned i=start; i<end; ++i) {
44  for (unsigned j=0; j<bc; ++j) {
45  PREFETCH_R(a->getrowptr(i), 2);
46  PREFETCH_W(&res->setval(i,j), 2);
47  register T val (a->get(i,0) * b->get(0,j));
48  for (register unsigned l=1; l<ac; ++l)
49  val += a->get(i,l) * b->get(l,j);
50  res->set(val,i,j);
51  }
52  }
53  }
54 }
55 
57 
58 /* Following Dowd/Severance, ch. 8: Friendlier mem access pattern */
59 /* Should be faster in general. However, on iPII, it's faster (+)
60  * or slower (-), depending on the data type T:
61  * int: -, float: +, double: -, long double: +, cplx<double>: +
62  * AXP: Generally faster
63  * (KG, 99/07/28) */
64 #if defined(USE_PREFETCH) || defined(USE_UNR_VEC_KERNELS)
/** Matrix-matrix multiplication kernel, prefetching/unrolled variant,
 *  built when USE_PREFETCH or USE_UNR_VEC_KERNELS is defined.
 *
 *  Uses the cache-friendlier i,l,j loop order (Dowd/Severance, ch. 8):
 *  for each a(i,l), row l of b is scaled and accumulated into row i of
 *  res with unit-stride pointer walks.  The inner loop is 4-way unrolled
 *  with hand-scheduled PREFETCH_R hints; the extra prefetches under
 *  `sizeof(T)*2 >= CACHELINE_SZ` cover large element types (two elements
 *  or fewer per cache line).
 *
 *  NOTE(review): the kernel only accumulates (`+=`) into res, so it
 *  presumably relies on the caller zero-initializing the result rows
 *  [start,end) — verify at the call sites.
 *
 *  Compensated summation cannot be done in this loop order, so the
 *  exact-sum case (and OLD_MAT_MAT_MULT builds) delegate to
 *  do_old_mat_mat_mult().
 */
template <typename T>
void do_mat_mat_mult (const unsigned start, const unsigned end,
                      TMatrix<T> *res,
                      const Matrix<T> *a, const Matrix<T> *b)
{
#ifdef OLD_MAT_MAT_MULT
  do_old_mat_mat_mult(start, end, res, a, b);
#else
  if (do_exactsum2()) {
    /* exact (Kahan) summation needs the j-innermost order */
    do_old_mat_mat_mult(start, end, res, a, b);
    return;
  }
  const unsigned bc = b->columns();
  const unsigned ac = a->columns();
  for (unsigned i=start; i<end; ++i) {
    const T* RESTRICT aptr = a->getrowptr(i);
    PREFETCH_R(aptr,2);
    for (unsigned l=0; l<ac; ++l) {
      T* RESTRICT rptr = res->getrowptr(i);
      const T* RESTRICT bptr = b->getrowptr(l);
      /* n tracks remaining elements minus 8; the unrolled loop runs
       * while more than 3 (+8) remain, the tail loop mops up the rest */
      int n = bc - 8;
      PREFETCH_R(rptr,2); PREFETCH_R(bptr,1);
      ALIGN3(const register T tmp, *aptr, MIN_ALIGN2);  /* tmp = a(i,l) */
      if (LIKELY(n >= 0)) {
        if (sizeof(T)*2 >= CACHELINE_SZ) {
          PREFETCH_R(rptr+2,2); PREFETCH_R(bptr+2,1);
        }
        PREFETCH_R(rptr+4,2); PREFETCH_R(bptr+4,1);
        if (sizeof(T)*2 >= CACHELINE_SZ) {
          PREFETCH_R(rptr+6,2); PREFETCH_R(bptr+6,1);
        }
        /* 4-way unrolled: res[i][j..j+3] += a(i,l) * b[l][j..j+3],
         * prefetches interleaved 8 elements ahead */
        for (; n > 3; n -= 4) {
          *rptr += tmp * *bptr;
          PREFETCH_R(rptr+ 8,2);
          *(rptr+1) += tmp * *(bptr+1);
          PREFETCH_R(bptr+ 8,1);
          *(rptr+2) += tmp * *(bptr+2);
          if (sizeof(T)*2 >= CACHELINE_SZ)
            PREFETCH_R(rptr+10,2);
          rptr += 4;
          if (sizeof(T)*2 >= CACHELINE_SZ)
            PREFETCH_R(bptr+10,1);
          *(rptr-1) += tmp * *(bptr+3);   /* rptr already advanced */
          bptr += 4;
        }
      }
      /* prefetch the next a-row elements unless we are near the end */
      if (LIKELY(l+4 < ac-CACHELINE_SZ/sizeof(T)))
        PREFETCH_R(aptr+4, 2);
      /* tail: n+8 elements remain (all of them when bc < 8) */
      for (n += 8; n; --n)
        *rptr++ += tmp * *bptr++;
      aptr++;
    }
  }
#endif
}
120 #else
121 template <typename T>
122 void do_mat_mat_mult (const unsigned start, const unsigned end,
123  TMatrix<T> *res, const Matrix<T> *a, const Matrix<T> *b)
124 {
125  if (do_exactsum2()) {
126  do_old_mat_mat_mult(start, end, res, a, b);
127  return;
128  }
129  const unsigned bc = b->columns();
130  const unsigned ac = a->columns();
131  Vector<T> corr((T)0, end-start);
132  for (unsigned i=start; i<end; ++i) {
133  const T* RESTRICT aptr = a->getrowptr(i);
134  for (unsigned l=0; l<ac; ++l) {
135  T* RESTRICT rptr = res->getrowptr(i);
136  const T* RESTRICT bptr = b->getrowptr(l);
137  const register T tmp = *aptr;
138  for (int n = bc; n; --n)
139  *rptr++ += tmp * *bptr++;
140  aptr++;
141  }
142  }
143 }
144 #endif
145 
/* Cost estimate of a matrix-vector multiply; r = rows, c = columns.
 * Parameters are fully parenthesized so that expression arguments
 * (e.g. COST_MATVEC(n+1, m)) expand correctly. */
#define COST_MATVEC(r,c) ((r)*(COST_UNIT_STORE+COST_LOOP \
 +(c)*(3*COST_UNIT_LOAD+COST_CALL+COST_ADD+COST_MULT+COST_LOOP)))
149 
150 template <typename T>
151 /*inline*/ void do_mat_vec_mult (const unsigned start, const unsigned end,
152  TVector<T> *res,
153  const Matrix<T> *mat, const Vector<T> *vec)
154 {
155 #ifdef DEBUG_THREAD
156  fprintf (stderr, "do_mat_vec_mult (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
157 #endif
158  PREFETCH_R(vec->vec,3);
159  const unsigned mc = mat->col;
160 #if 0 //def TBCI_SIMD_SUM
161  if (UNLIKELY(sizeof(T) < TBCI_SIMD_ALIGN && mc % (TBCI_SIMD_ALIGN/sizeof(T)))) {
162  /* OK, so we can not ensure alignment -- should do something clever ? */
163  }
164 #endif
165  for (unsigned rw = start; rw < end; ++rw) {
166  //PREFETCH_W(res->vec+rw,3);
167  T val(0.0);
168  if (do_exactsum2())
169  do_vec_mult_exact<T> (mc, mat->mat[rw], vec->vec, val);
170  else
171  do_vec_mult_quick<T> (mc, mat->mat[rw], vec->vec, val);
172  res->vec[rw] = val;
173  }
174 }
175 
/** Matrix * scaled-temporary-vector multiplication kernel:
 *  res[rw] = <row rw of mat, tsv> * tsv->getfac() for rw in [start,end).
 *
 *  A TSVector carries a common scalar factor (getfac()); each row's
 *  inner product with the raw vector data is computed first and the
 *  factor is applied once per row.  Per-row inner products delegate to
 *  do_vec_mult_exact (compensated, when do_exactsum2() is set) or
 *  do_vec_mult_quick.
 */
template <typename T>
/*inline*/ void do_mat_tsv_mult (const unsigned start, const unsigned end,
                                 TVector<T> *res,
                                 const Matrix<T> *mat, const TSVector<T> *tsv)
{
  PREFETCH_R(tsv->vec,3);
  const unsigned mc = mat->col;
  const T fact = tsv->getfac();   /* common scale factor of the TSVector */
#if 0 //def TBCI_SIMD_SUM
  /* disabled draft: separate path for rows whose length breaks SIMD
   * alignment; note the dangling `else` below binds to the live loop */
  if (UNLIKELY(sizeof(T) < TBCI_SIMD_ALIGN && mc % (TBCI_SIMD_ALIGN/sizeof(T)))) {
    for (unsigned rw = start; rw < end; ++rw) {
      T val(0.0);
      if (do_exactsum2())
        do_vec_mult_exact<T>(mc, mat->mat[rw], tsv->vec, val);
      else
        do_vec_mult_quick<T>(mc, mat->mat[rw], tsv->vec, val);
      res->vec[rw] = val*fact;
    }
  } else
#endif
  for (unsigned rw = start; rw < end; ++rw) {
    //PREFETCH_W(res->vec+rw,3);
    T val(0.0);
    // TODO: May be unaligned!
    if (do_exactsum2())
      do_vec_mult_exact<T> (mc, mat->mat[rw], tsv->vec, val);
    else
      do_vec_mult_quick<T> (mc, mat->mat[rw], tsv->vec, val);
    res->vec[rw] = val*fact;
  }
}
207 
208 template <typename T>
209 /*inline*/ void do_mat_vec_transmult_exact (const unsigned start, const unsigned end,
210  TVector<T> *res,
211  const Matrix<T> *mat, const Vector<T> *vec)
212 {
213 #ifdef DEBUG_THREAD
214  fprintf (stderr, "do_mat_vec_transmult_exact (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
215 #endif
216  PREFETCH_R(&vec->getcref(0),3);
217  const unsigned mr = mat->row;
218  Vector<T> corr((T)0, end-start);
219  for (unsigned cl = start; cl < end; ++cl)
220  res->setval(cl) = (T)0;
221  for (unsigned rw = 0; rw < mr; ++rw) {
222  const register T fac = vec->get(rw);
223  //do_vec_add_svc<T> (end-start, &res->setval(start), mat->mat[rw], fac);
224  for (unsigned off = start; off < end; ++off) {
225  const register T y = mat->get(rw, off) * fac;
226  const register T t = res->get(off) + y;
227  corr(off-start) += (t - res->get(off)) - y;
228  res->setval(off) = t;
229  }
230  }
231  for (unsigned cl = start; cl < end; ++cl)
232  res->setval(cl) -= corr(cl-start);
233 }
234 
235 #if 1
236 template <typename T>
237 /*inline*/ void do_mat_vec_transmult (const unsigned start, const unsigned end,
238  TVector<T> *res,
239  const Matrix<T> *mat, const Vector<T> *vec)
240 {
241  if (do_exactsum2()) {
242  do_mat_vec_transmult_exact(start, end, res, mat, vec);
243  return;
244  }
245 #ifdef DEBUG_THREAD
246  fprintf (stderr, "do_mat_vec_transmult (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
247 #endif
248  PREFETCH_R(vec->vec,3);
249  const unsigned mr = mat->row;
250  for (unsigned cl = start; cl < end; ++cl)
251  res->setval(cl) = (T)0;
252  for (unsigned rw = 0; rw < mr; ++rw) {
253  const register T fac = vec->get(rw);
254  do_vec_add_svc<T> (end-start, &res->setval(start), mat->mat[rw], fac);
255  }
256 }
257 
258 #else
/** Transposed matrix * vector multiplication, column-wise alternative.
 *  COMPILED OUT: this definition sits in the dead branch of an `#if 1`
 *  and is kept for reference only.
 *
 *  Computes res[cl] = sum_rw mat(rw,cl) * vec[rw] by walking each result
 *  column down the matrix (stride = row length), instead of the row-wise
 *  scheme of the live implementation.
 */
template <typename T>
/*inline*/ void do_mat_vec_transmult (const unsigned start, const unsigned end,
                                      TVector<T> *res,
                                      const Matrix<T> *mat, const Vector<T> *vec)
{
#if 1
  PREFETCH_R(vec->vec,3);
  const unsigned mr = mat->row;
  //const unsigned mstr = mr;
  /* derive the row stride from adjacent row pointers;
   * NOTE(review): assumes uniform row spacing and mat->row >= 2 —
   * verify against Matrix' storage layout */
  const unsigned mstr = mat->mat[1] - mat->mat[0];
  //fprintf(stderr, "m-v-transmult: stride %i (%ix%i)\n",
  // mstr, mr, mat->col);
  /* We don't have SIMD routines for non=unit strides */
  for (unsigned cl = start; cl < end; ++cl) {
    //PREFETCH_W(res->vec+rw,3);
    T val(0.0);
    if (do_exactsum2())
      do_vec_mult_stride_exact<T> (mr, mat->mat[0]+cl, vec->vec, val, mstr);
    else
      do_vec_mult_stride_quick<T> (mr, mat->mat[0]+cl, vec->vec, val, mstr);
    res->vec[cl] = val;
  }
#else /* OLD IMPLEMENTATION */

  PREFETCH_R(vec->vec,3);
  const unsigned mr = mat->row;
  for (unsigned cl = start; cl < end; cl++)
  {
    PREFETCH_R(mat->mat[0]+cl,1);
    register T* vecptr = vec->vec;
    /* i = remaining rows minus 9; the 4-way unrolled loop below runs
     * while more than 3 (+8) remain, the tail loop finishes the rest */
    register int i = mr - 9;
    int r = 1;   /* next row index (row 0 seeds `el`) */
    ALIGN3(T el, mat->mat[0][cl] * *vecptr++, MIN_ALIGN2);
    /* FIXME: Exact summing needs to be implemented
     * -- and a SIMD version would be nice as well ...
     * This calls for a do_vec_mult_stride kernel */
    //T comp(0.0);
    if (LIKELY(i >= 0)) {
      /* prime the prefetch pipeline for the next 7 rows */
      PREFETCH_R(mat->mat[1]+cl,1);
      PREFETCH_R(mat->mat[2]+cl,1);
      PREFETCH_R(mat->mat[3]+cl,1);
      PREFETCH_R(mat->mat[4]+cl,1);
      PREFETCH_R(mat->mat[5]+cl,1);
      PREFETCH_R(mat->mat[6]+cl,1);
      PREFETCH_R(mat->mat[7]+cl,1);
      /* 4 rows per iteration, prefetching 8 rows ahead */
      for (; i > 3; i -= 4) {
        el += mat->mat[r][cl] * *vecptr;
        PREFETCH_R(mat->mat[r+ 8]+cl, 1);
        el += mat->mat[r+1][cl] * *(vecptr+1);
        PREFETCH_R(mat->mat[r+ 9]+cl, 1);
        el += mat->mat[r+2][cl] * *(vecptr+2);
        PREFETCH_R(mat->mat[r+10]+cl, 1);
        el += mat->mat[r+3][cl] * *(vecptr+3);
        PREFETCH_R(mat->mat[r+11]+cl, 1);
        r += 4; vecptr += 4;
      }
    }
    if (LIKELY(cl < end-CACHELINE_SZ/sizeof(T))) {
      PREFETCH_W(res->vec + cl, 2);
    }
    /* tail: i+8 rows remain */
    for (i += 8; i; --i, r++)
      el += mat->mat[r][cl] * *vecptr++;

    res->vec[cl] = el;
  }
#endif
}
326 #endif
327 
328 /* TODO:
329  * Divide and Conquer algorithm for matrix-matrix multiplication:
330  * (Strassen's algorithm)
331  *
332  * [C11 C12 ] [A11 A12 ] [B11 B12 ]
333  * | | = | | × | |
334  * [C21 C22 ] [A21 A22 ] [B21 B22 ]
335  *
336  * P1 = (A11 + A22) (B11 + B22)
337  * P2 = (A21 + A22) B11
338  * P3 = A11 (B12 - B22)
339  * P4 = A22( B21 - B11)
340  * P5 = (A11 + A12) B22
341  * P6 = (A21 - A11) (B11 + B12)
342  * P7 = (A12 - A22) (B21 + B22)
343  *
344  * C11 = P1 + P4 - P5 + P7
345  * C12 = P3 + P5
346  * C21 = P2 + P4
347  * C22 = P1 + P3 - P2 + P6
348  *
349  * O(n^(log2 7)) ≈ O(n^2.807) multiplications
350  *
351  * http://www.cse.ohio-state.edu/~gurari/course/cse693s04/cse693s04su61.html#x84-7200011.4
352  * http://www.brpreiss.com/books/opus5/html/page457.html
353  */
354 
355 #endif /* TBCI_MATRIX_KERNELS_H */
T ** mat
C storage layout: mat[row][col].
Definition: matrix.h:119
Matrix< T > a(10, 10)
tm fac
Definition: f_matrix.h:1051
T & setval(const unsigned long i) const
Definition: vector.h:192
#define CACHELINE_SZ
(L1) Cache line size in bytes.
Definition: perf_opt.h:148
long double fact(const double x)
Definition: mathplus.h:101
void do_mat_vec_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
T *const & vecptr() const
Definition: vector.h:1163
tbci_traits< T >::const_refval_type get(const unsigned long i) const
Definition: vector.h:190
#define MIN_ALIGN2
Definition: basics.h:417
const T & getfac() const
Definition: vector.h:1073
unsigned int col
Definition: matrix.h:116
#define UNLIKELY(expr)
Definition: basics.h:101
void do_mat_tsv_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const TSVector< T > *tsv)
T & setval(const T &val, const unsigned int r, const unsigned int c)
Definition: matrix.h:281
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:741
unsigned int columns() const
number of columns
Definition: matrix.h:315
#define TBCI_SIMD_ALIGN
Definition: malloc_cache.h:122
void do_old_mat_mat_mult(const unsigned start, const unsigned end, TMatrix< T > *res, const Matrix< T > *a, const Matrix< T > *b)
F_TMatrix< T > b
Definition: f_matrix.h:736
T * vec
Definition: vector.h:1029
const T & getcref(const unsigned long i) const
Definition: vector.h:188
Definition: bvector.h:49
#define PREFETCH_W(addr, loc)
Definition: basics.h:742
int i
Definition: LM_fit.h:71
doublereal y
Definition: TOMS_707.C:27
Temporary Base Class Idiom: Class TVector is used for temporary variables.
Definition: bvector.h:52
Definition: bvector.h:54
T * vec
Definition: bvector.h:73
#define ALIGN3(v, i, x)
Definition: basics.h:435
#define T
Definition: bdmatlib.cc:20
unsigned int do_exactsum2()
Definition: tbci_param.h:151
void do_mat_mat_mult(const unsigned start, const unsigned end, TMatrix< T > *res, const Matrix< T > *a, const Matrix< T > *b)
TODO: Provide plain version of mat-mat and mat-vec mult!
void do_mat_vec_transmult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
const T * getrowptr(const unsigned r) const
Helpers for matvecmul.
Definition: matrix.h:296
T & set(const T &val, const unsigned r, const unsigned c)
Definition: matrix.h:290
#define RESTRICT
Definition: basics.h:89
void do_mat_vec_transmult_exact(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100
unsigned int row
Definition: matrix.h:116
tbci_traits< T >::const_refval_type get(const unsigned r, const unsigned c) const
get, set and getcref are used internally and not for public consumption
Definition: matrix.h:288