MueLu  Version of the Day
MueLu_PerfModels_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #include <cstdio>
47 #include <cmath>
48 #include <numeric>
49 #include <utility>
50 #include <chrono>
51 #include <iomanip>
52 #include <Teuchos_ScalarTraits.hpp>
53 #include <Kokkos_ArithTraits.hpp>
55 
56 
57 #ifdef HAVE_MPI
58 #include <mpi.h>
59 #endif
60 
61 namespace MueLu {
62 
63  namespace PerfDetails {
64  template<class Scalar,class Node>
65  double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE) {
66  // PerfDetails' STREAM routines need to be instantiatiated on impl_scalar_type, not Scalar
67  using impl_scalar_type = typename Kokkos::Details::ArithTraits<Scalar>::val_type;
68 
69  using exec_space = typename Node::execution_space;
70  using memory_space = typename Node::memory_space;
71  using range_policy = Kokkos::RangePolicy<exec_space>;
72 
73  Kokkos::View<impl_scalar_type*,memory_space> a("a", VECTOR_SIZE);
74  Kokkos::View<impl_scalar_type*,memory_space> b("b", VECTOR_SIZE);
75  Kokkos::View<impl_scalar_type*,memory_space> c("c", VECTOR_SIZE);
76  double total_test_time = 0.0;
77 
78  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
79 
80  Kokkos::parallel_for("stream/fill",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t i) {
81  a(i) = ONE * (double)i;
82  b(i) = a(i);
83  });
84  exec_space().fence();
85 
86  using clock = std::chrono::high_resolution_clock;
87 
88  clock::time_point start, stop;
89 
90  for(int i = 0; i < KERNEL_REPEATS; i++) {
91  start = clock::now();
92  Kokkos::parallel_for("stream/add",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t j) { //Vector Addition
93  c(j) = a(j) + b(j);
94  });
95 
96  exec_space().fence();
97  stop = clock::now();
98  double my_test_time = std::chrono::duration<double>(stop - start).count();
99  total_test_time += my_test_time;
100  }
101 
102  return total_test_time / KERNEL_REPEATS;
103  }
104 
105  template<class Scalar,class Node>
106  double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE) {
107  // PerfDetails' STREAM routines need to be instantiatiated on impl_scalar_type, not Scalar
108  using impl_scalar_type = typename Kokkos::Details::ArithTraits<Scalar>::val_type;
109 
110  using exec_space = typename Node::execution_space;
111  using memory_space = typename Node::memory_space;
112  using range_policy = Kokkos::RangePolicy<exec_space>;
113 
114  Kokkos::View<impl_scalar_type*,memory_space> a("a", VECTOR_SIZE);
115  Kokkos::View<impl_scalar_type*,memory_space> b("b", VECTOR_SIZE);
116  double total_test_time = 0.0;
117 
118  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
119 
120  Kokkos::parallel_for("stream/fill",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t i) {
121  a(i) = ONE;
122  });
123  exec_space().fence();
124 
125  using clock = std::chrono::high_resolution_clock;
126  clock::time_point start, stop;
127 
128  for(int i = 0; i < KERNEL_REPEATS; i++) {
129  start = clock::now();
130  Kokkos::parallel_for("stream/copy",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t j) { //Vector Addition
131  b(j) = a(j);
132  });
133 
134  exec_space().fence();
135  stop = clock::now();
136  double my_test_time = std::chrono::duration<double>(stop - start).count();
137  total_test_time += my_test_time;
138  }
139 
140  return total_test_time / KERNEL_REPEATS;
141  }
142 
143 
144 
145  double table_lookup(const std::vector<int> & x, const std::vector<double> & y, int value) {
146  // If there's no table, nan
147  if(x.size() == 0) return Teuchos::ScalarTraits<double>::nan();
148 
149  // NOTE: This should probably be a binary search, but this isn't performance sensitive, so we'll go simple
150  int N = (int) x.size();
151  int hi = 0;
152  for( ; hi < N; hi++) {
153  if (x[hi] > value)
154  break;
155  }
156 
157  if(hi == 0) {
158  // Lower end (return the min time)
159  //printf("Lower end: %d < %d\n",value,x[0]);
160  return y[0];
161  }
162  else if (hi == N) {
163  // Higher end (extrapolate from the last two points)
164  //printf("Upper end: %d > %d\n",value,x[N-1]);
165  hi = N-1;
166  int run = x[hi] - x[hi-1];
167  double rise = y[hi] - y[hi-1];
168  double slope = rise / run;
169  int diff = value - x[hi-1];
170 
171  return y[hi-1] + slope * diff;
172  }
173  else {
174  // Interpolate
175  //printf("Middle: %d < %d < %d\n",x[hi-1],value,x[hi]);
176  int run = x[hi] - x[hi-1];
177  double rise = y[hi] - y[hi-1];
178  double slope = rise / run;
179  int diff = value - x[hi-1];
180 
181  return y[hi-1] + slope * diff;
182  }
183  }
184 
185  // Report bandwidth in GB / sec
186  const double GB = 1024.0 * 1024.0 * 1024.0;
187  double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes) {
188  double time_per_call = time / num_calls;
189  return memory_per_call_bytes / GB / time_per_call;
190  }
191 
192 
193  template <class exec_space, class memory_space>
194  void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE,const Teuchos::Comm<int> &comm, std::vector<int> & sizes, std::vector<double> & times) {
195  int rank = comm.getRank();
196  int nproc = comm.getSize();
197 
198  if(nproc < 2) return;
199 
200 #ifdef HAVE_MPI
201  using range_policy = Kokkos::RangePolicy<exec_space>;
202  const int buff_size = (int) pow(2,MAX_SIZE);
203 
204  sizes.resize(MAX_SIZE+1);
205  times.resize(MAX_SIZE+1);
206 
207  // Allocate memory for the buffers (and fill send)
208  Kokkos::View<char*,memory_space> r_buf("recv",buff_size), s_buf("send",buff_size);
209  Kokkos::deep_copy(s_buf,1);
210 
211  //Send and recieve.
212  // NOTE: Do consectutive pair buddies here for simplicity. We should be smart later
213  int odd = rank % 2;
214  int buddy = odd ? rank - 1 : rank + 1;
215 
216  for(int i = 0; i < MAX_SIZE + 1 ;i ++) {
217  int msg_size = (int) pow(2,i);
218  comm.barrier();
219 
220  double t0 = MPI_Wtime();
221  for(int j = 0; j < KERNEL_REPEATS; j++) {
222  if (buddy < nproc) {
223  if (odd) {
224  comm.send(msg_size, (char*)s_buf.data(), buddy);
225  comm.receive(buddy, msg_size, (char*)r_buf.data());
226  }
227  else {
228  comm.receive(buddy, msg_size,(char*)r_buf.data());
229  comm.send(msg_size, (char*)s_buf.data(), buddy);
230  }
231  }
232  }
233 
234  double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
235  sizes[i] = msg_size;
236  times[i] = time_per_call;
237  }
238 #endif
239  }
240 
241 
242 
243  }// end namespace PerfDetails
244 
245 
246  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
248 
249 
250  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
251  void
253 
254  // We need launch/waits latency estimates for corrected stream
255  launch_latency_make_table(KERNEL_REPEATS);
256  double latency = launch_latency_lookup();
257 
258  if(LOG_MAX_SIZE < 2)
259  LOG_MAX_SIZE=20;
260 
261  stream_sizes_.resize(LOG_MAX_SIZE+1);
262  stream_copy_times_.resize(LOG_MAX_SIZE+1);
263  stream_add_times_.resize(LOG_MAX_SIZE+1);
264  latency_corrected_stream_copy_times_.resize(LOG_MAX_SIZE+1);
265  latency_corrected_stream_add_times_.resize(LOG_MAX_SIZE+1);
266 
267  for(int i=0; i<LOG_MAX_SIZE+1; i++) {
268  int size = (int) pow(2,i);
269  double c_time = PerfDetails::stream_vector_copy<Scalar,Node>(KERNEL_REPEATS,size);
270  double a_time = PerfDetails::stream_vector_add<Scalar,Node>(KERNEL_REPEATS,size);
271 
272  stream_sizes_[i] = size;
273 
274  // Correct for the difference in memory transactions per element
275  stream_copy_times_[i] = c_time / 2.0;
276  stream_add_times_[i] = a_time / 3.0;
277 
278  // Correct for launch latency too. We'll note that sometimes the latency estimate
279  // is higher than the actual copy/add time estimate. If so, we don't correct
280  latency_corrected_stream_copy_times_[i] = (c_time - latency <= 0.0) ? c_time / 2.0 : ( (c_time-latency)/2.0 );
281  latency_corrected_stream_add_times_[i] = (a_time - latency <= 0.0) ? a_time / 3.0 : ( (a_time-latency)/3.0 );
282 
283 
284  }
285  }
286 
287  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
288  double
290  return PerfDetails::table_lookup(stream_sizes_,stream_copy_times_,SIZE_IN_BYTES/sizeof(Scalar));
291  }
292 
293  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
294  double
296  return PerfDetails::table_lookup(stream_sizes_,stream_add_times_,SIZE_IN_BYTES/sizeof(Scalar));
297  }
298 
299  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
300  double
302  return std::min(stream_vector_copy_lookup(SIZE_IN_BYTES),stream_vector_add_lookup(SIZE_IN_BYTES));
303  }
304 
305 
306  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
307  double
309  return PerfDetails::table_lookup(stream_sizes_,latency_corrected_stream_copy_times_,SIZE_IN_BYTES/sizeof(Scalar));
310  }
311 
312  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
313  double
315  return PerfDetails::table_lookup(stream_sizes_,latency_corrected_stream_add_times_,SIZE_IN_BYTES/sizeof(Scalar));
316  }
317 
318  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
319  double
321  return std::min(latency_corrected_stream_vector_copy_lookup(SIZE_IN_BYTES),latency_corrected_stream_vector_add_lookup(SIZE_IN_BYTES));
322  }
323 
324 
325  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
326  void
328  print_stream_vector_table_impl(out,false);
329  }
330 
331  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
332  void
334  print_stream_vector_table_impl(out,true);
335  }
336 
337 
338  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
339  void
341  using namespace std;
342  std::ios old_format(NULL);
343  old_format.copyfmt(out);
344 
345  out << setw(20) << "Length in Scalars" << setw(1) << " "
346  << setw(20) << "COPY (us)" << setw(1) << " "
347  << setw(20) << "ADD (us)" << setw(1) << " "
348  << setw(20) << "COPY (GB/s)" << setw(1) << " "
349  << setw(20) << "ADD (GB/s)" << std::endl;
350 
351  out << setw(20) << "-----------------" << setw(1) << " "
352  << setw(20) << "---------" << setw(1) << " "
353  << setw(20) << "--------" << setw(1) << " "
354  << setw(20) << "-----------" << setw(1) << " "
355  << setw(20) << "----------" << std::endl;
356 
357 
358  for(int i=0; i<(int)stream_sizes_.size(); i++) {
359  int size = stream_sizes_[i];
360  double c_time = use_latency_correction ? latency_corrected_stream_copy_times_[i] : stream_copy_times_[i];
361  double a_time = use_latency_correction ? latency_corrected_stream_add_times_[i] : stream_add_times_[i];
362  // We've already corrected for the transactions per element difference
363  double c_bw = PerfDetails::convert_time_to_bandwidth_gbs(c_time,1,size*sizeof(Scalar));
364  double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time,1,size*sizeof(Scalar));
365 
366 
367  out << setw(20) << size << setw(1) << " "
368  << setw(20) << fixed << setprecision(4) << (c_time*1e6) << setw(1) << " "
369  << setw(20) << fixed << setprecision(4) << (a_time*1e6) << setw(1) << " "
370  << setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " "
371  << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
372  }
373 
374  out.copyfmt(old_format);
375  }
376 
377 
378 
379 
380  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
381  void
382  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm) {
383 
384  PerfDetails::pingpong_basic<Kokkos::HostSpace::execution_space,Kokkos::HostSpace::memory_space>(KERNEL_REPEATS,LOG_MAX_SIZE,*comm,pingpong_sizes_,pingpong_host_times_);
385 
386  PerfDetails::pingpong_basic<typename Node::execution_space,typename Node::memory_space>(KERNEL_REPEATS,LOG_MAX_SIZE,*comm,pingpong_sizes_,pingpong_device_times_);
387 
388  }
389 
390  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
391  double
393  return PerfDetails::table_lookup(pingpong_sizes_,pingpong_host_times_,SIZE_IN_BYTES);
394  }
395 
396  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
397  double
399  return PerfDetails::table_lookup(pingpong_sizes_,pingpong_device_times_,SIZE_IN_BYTES);
400  }
401 
402 
403  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
404  void
406  if(pingpong_sizes_.size() == 0) return;
407 
408  using namespace std;
409  std::ios old_format(NULL);
410  old_format.copyfmt(out);
411 
412  out << setw(20) << "Message Size" << setw(1) << " "
413  << setw(20) << "Host (us)" << setw(1) << " "
414  << setw(20) << "Device (us)" << std::endl;
415 
416  out << setw(20) << "------------" << setw(1) << " "
417  << setw(20) << "---------" << setw(1) << " "
418  << setw(20) << "-----------" << std::endl;
419 
420 
421  for(int i=0; i<(int)pingpong_sizes_.size(); i++) {
422  int size = pingpong_sizes_[i];
423  double h_time = pingpong_host_times_[i];
424  double d_time = pingpong_device_times_[i];
425 
426 
427  out << setw(20) << size << setw(1) << " "
428  << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " "
429  << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
430  }
431 
432  out.copyfmt(old_format);
433  }
434 
435  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
436  void
438  using exec_space = typename Node::execution_space;
439  using range_policy = Kokkos::RangePolicy<exec_space>;
440  using clock = std::chrono::high_resolution_clock;
441 
442  double total_test_time = 0;
443  clock::time_point start, stop;
444  for(int i = 0; i < KERNEL_REPEATS; i++) {
445  start = clock::now();
446  Kokkos::parallel_for("empty kernel",range_policy(0,1), KOKKOS_LAMBDA (const size_t j) {
447  ;
448  });
449  exec_space().fence();
450  stop = clock::now();
451  double my_test_time = std::chrono::duration<double>(stop - start).count();
452  total_test_time += my_test_time;
453  }
454 
455  launch_and_wait_latency_ = total_test_time / KERNEL_REPEATS;
456  }
457 
458  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
459  double
461  return launch_and_wait_latency_;
462  }
463 
464 
465  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
466  void
468  using namespace std;
469  std::ios old_format(NULL);
470  old_format.copyfmt(out);
471 
472  out << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " "
473  << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_*1e6) << std::endl;
474 
475  out.copyfmt(old_format);
476  }
477 
478 
479 } //namespace MueLu
double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes)
double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE)
void print_launch_latency_table(std::ostream &out)
double pingpong_device_lookup(int SIZE_IN_BYTES)
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
void print_pingpong_table(std::ostream &out)
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
Namespace for MueLu classes and methods.
void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm< int > &comm, std::vector< int > &sizes, std::vector< double > &times)
double table_lookup(const std::vector< int > &x, const std::vector< double > &y, int value)
double stream_vector_lookup(int SIZE_IN_BYTES)
MueLu::DefaultScalar Scalar
void print_stream_vector_table(std::ostream &out)
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
void print_latency_corrected_stream_vector_table(std::ostream &out)
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
double pingpong_host_lookup(int SIZE_IN_BYTES)
void launch_latency_make_table(int KERNEL_REPEATS)
double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE)
double stream_vector_add_lookup(int SIZE_IN_BYTES)
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction)
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)