MueLu  Version of the Day
MueLu_PerfModels_decl.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_PERFMODELS_HPP
47 #define MUELU_PERFMODELS_HPP
48 
49 #include "MueLu_ConfigDefs.hpp"
50 
51 #include <vector>
52 #include <ostream>
53 #include <Teuchos_DefaultComm.hpp>
54 
55 #include "MueLu_PerfModels_fwd.hpp"
56 
57 namespace MueLu {
58 
// PerfModels declares routines that build, query and print timing tables for
// three kinds of measurement: STREAM-style vector kernels, ping-pong message
// exchange between processes, and kernel launch latency.
// NOTE(review): the embedded line numbering jumps from 59 to 62 below, so some
// template parameters may be missing from this listing -- confirm against the
// original header.
59  template <class Scalar,
62  class Node = DefaultNode>
63  class PerfModels {
64  public:
  // Default constructor; takes no arguments.
65  PerfModels();
66 
67  /* Single-node tests based upon the STREAM benchmark for measuring memory
68  * bandwidth and computation rate. These processes compute either the addition
69  * of two vectors or the multiplication of dense matrices of any given size.
70  * Many iterations occur, which then yield a vector containing the individual
71  * lengths of time per iteration.
   *
   * NOTE(review): the functions declared below return void (make_table) or a
   * single double (lookups), not a vector; the per-size results are presumably
   * kept in the stream_* member tables -- confirm in the implementation.
72  *
73  * See further here:
74  * - https://www.cs.virginia.edu/stream/ref.html
75  * - https://github.com/UoB-HPC/BabelStream
76  */
77 
78  /* This version is for table interpolation and works on chars, so the LOG_MAX_SIZE is for bytes.
   * KERNEL_REPEATS is presumably the number of timed repetitions per size and
   * LOG_MAX_SIZE (default 20) presumably the log2 of the largest size in bytes -- TODO confirm. */
79  void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20);
80 
81  /* Lookup in the stream_vector table. Each function takes a size in bytes and
   * returns a double (presumably a time taken from the COPY or ADD table, with or
   * without the launch-latency correction as the name indicates -- confirm). */
82  double stream_vector_copy_lookup(int SIZE_IN_BYTES);
83  double stream_vector_add_lookup(int SIZE_IN_BYTES);
84  double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES);
85  double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES);
86 
87  // Uses the faster of the tables. The time is then divided by the number of memory transactions
88  // per element in the kernel (e.g. 2 for COPY and 3 for ADD).
89  double stream_vector_lookup(int SIZE_IN_BYTES);
90  double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES);
91 
92  /* Print table to the given output stream */
93  void print_stream_vector_table(std::ostream & out);
94  void print_latency_corrected_stream_vector_table(std::ostream & out);
95 
96  /* A latency test between two processes based upon the MVAPICH OSU Micro-Benchmarks.
97  * The sender process sends a message and then waits for confirmation of reception.
98  * Many iterations occur with various message sizes and the average latency values
99  * are returned within a map. Utilizes blocking send and receive.
   *
   * NOTE(review): the declaration below returns void and takes the communicator
   * as an argument; the results are presumably stored in the pingpong_* member
   * tables rather than returned in a map -- confirm in the implementation.
100  *
101  * See further: https://mvapich.cse.ohio-state.edu/benchmarks/
102  */
103  void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm);
104 
105  /* Lookup in the pingpong table (separate host and device variants) */
106  double pingpong_host_lookup(int SIZE_IN_BYTES);
107  double pingpong_device_lookup(int SIZE_IN_BYTES);
108 
109  /* Print table to the given output stream */
110  void print_pingpong_table(std::ostream & out);
111 
112 
113  /* Estimate launch latency based on the cost of submitting an empty Kokkos::parallel_for.
114  * This is necessary to correct the memory bandwidth costs for models on high latency platforms,
115  * e.g., GPUs.
116  */
117  void launch_latency_make_table(int KERNEL_REPEATS);
118 
119  /* Lookup launch latency; takes no size argument and returns a single double */
120  double launch_latency_lookup();
121 
122  /* Print table to the given output stream */
123  void print_launch_latency_table(std::ostream & out);
124 
125  private:
  // Helper taking the output stream plus a flag; presumably shared by the two
  // public stream-table print functions, with use_latency_correction selecting
  // which variant is printed -- TODO confirm.
126  void print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction);
127 
128 
  // Stream test storage: sizes and the COPY / ADD times (presumably parallel
  // arrays indexed together -- confirm).
129  std::vector<int> stream_sizes_;
130  std::vector<double> stream_copy_times_;
131  std::vector<double> stream_add_times_;
  // NOTE(review): the embedded numbering jumps from 131 to 134 here and from 138
  // to 140 below, so some member declarations may be missing from this listing.
134 
  // Ping-pong test storage: message sizes and the host / device times
  // (presumably parallel arrays indexed together -- confirm).
135  std::vector<int> pingpong_sizes_;
136  std::vector<double> pingpong_host_times_;
137  std::vector<double> pingpong_device_times_;
138 
140 
141 
142  }; //class PerfModels
143 
144 } //namespace MueLu
145 
146 #endif //ifndef MUELU_PERFMODELS_HPP
std::vector< double > latency_corrected_stream_copy_times_
MueLu::DefaultLocalOrdinal LocalOrdinal
KokkosClassic::DefaultNode::DefaultNodeType DefaultNode
void print_launch_latency_table(std::ostream &out)
double pingpong_device_lookup(int SIZE_IN_BYTES)
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
void print_pingpong_table(std::ostream &out)
std::vector< double > latency_corrected_stream_add_times_
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
Namespace for MueLu classes and methods.
MueLu::DefaultNode Node
std::vector< double > stream_add_times_
double stream_vector_lookup(int SIZE_IN_BYTES)
MueLu::DefaultScalar Scalar
void print_stream_vector_table(std::ostream &out)
MueLu::DefaultGlobalOrdinal GlobalOrdinal
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
void print_latency_corrected_stream_vector_table(std::ostream &out)
std::vector< int > pingpong_sizes_
std::vector< int > stream_sizes_
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
std::vector< double > pingpong_device_times_
double pingpong_host_lookup(int SIZE_IN_BYTES)
void launch_latency_make_table(int KERNEL_REPEATS)
std::vector< double > stream_copy_times_
double stream_vector_add_lookup(int SIZE_IN_BYTES)
std::vector< double > pingpong_host_times_
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction)
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)