// Tpetra parallel linear algebra — Version of the Day
// TpetraExt_MatrixMatrix_def.hpp
// (See the Tpetra Doxygen documentation for this file.)
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 #include "Tpetra_ConfigDefs.hpp"
45 #include "Teuchos_VerboseObject.hpp"
46 #include "Teuchos_Array.hpp"
47 #include "Tpetra_Util.hpp"
48 #include "Tpetra_CrsMatrix.hpp"
50 #include "Tpetra_RowMatrixTransposer.hpp"
53 #include "Tpetra_Details_makeColMap.hpp"
54 #include "Tpetra_ConfigDefs.hpp"
55 #include "Tpetra_Map.hpp"
56 #include "Tpetra_Export.hpp"
57 #include "Tpetra_Import_Util.hpp"
58 #include "Tpetra_Import_Util2.hpp"
59 #include <algorithm>
60 #include <type_traits>
61 #include "Teuchos_FancyOStream.hpp"
62 
63 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
65 
66 #include "KokkosSparse_spgemm.hpp"
67 #include "KokkosSparse_spadd.hpp"
68 #include "Kokkos_Bitset.hpp"
69 
70 #include <MatrixMarket_Tpetra.hpp>
71 
79 /*********************************************************************************************************/
80 // Include the architecture-specific kernel partial specializations here
81 // NOTE: This needs to be outside all namespaces
82 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
83 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
84 #include "TpetraExt_MatrixMatrix_HIP.hpp"
85 
86 namespace Tpetra {
87 
88 namespace MatrixMatrix{
89 
//
// This method forms the matrix-matrix product C = op(A) * op(B), where
// op(A) == A   if transposeA is false,
// op(A) == A^T if transposeA is true,
// and similarly for op(B).
//
96 template <class Scalar,
97  class LocalOrdinal,
98  class GlobalOrdinal,
99  class Node>
100 void Multiply(
102  bool transposeA,
104  bool transposeB,
106  bool call_FillComplete_on_result,
107  const std::string& label,
108  const Teuchos::RCP<Teuchos::ParameterList>& params)
109 {
110  using Teuchos::null;
111  using Teuchos::RCP;
112  using Teuchos::rcp;
113  typedef Scalar SC;
114  typedef LocalOrdinal LO;
115  typedef GlobalOrdinal GO;
116  typedef Node NO;
117  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
118  typedef Import<LO,GO,NO> import_type;
119  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
120  typedef Map<LO,GO,NO> map_type;
121  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
122 
123 #ifdef HAVE_TPETRA_MMM_TIMINGS
124  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
125  using Teuchos::TimeMonitor;
126  //MM is used to time setup, and then multiply.
127 
128  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
129 #endif
130 
131  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
132 
133  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
134 
135  // The input matrices A and B must both be fillComplete.
136  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
137  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
138 
139  // If transposeA is true, then Aprime will be the transpose of A
140  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
141  // will just be a pointer to A.
142  RCP<const crs_matrix_type> Aprime = null;
143  // If transposeB is true, then Bprime will be the transpose of B
144  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
145  // will just be a pointer to B.
146  RCP<const crs_matrix_type> Bprime = null;
147 
148  // Is this a "clean" matrix?
149  //
150  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
151  // locally nor globally indexed, then it was empty. I don't like
152  // this, because the most straightforward implementation presumes
153  // lazy allocation of indices. However, historical precedent
154  // demands that we keep around this predicate as a way to test
155  // whether the matrix is empty.
156  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
157 
158  bool use_optimized_ATB = false;
159  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
160  use_optimized_ATB = true;
161 
162 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
163  use_optimized_ATB = false;
164 #endif
165 
166  using Teuchos::ParameterList;
167  RCP<ParameterList> transposeParams (new ParameterList);
168  transposeParams->set ("sort", false);
169 
170  if (!use_optimized_ATB && transposeA) {
171  transposer_type transposer (rcpFromRef (A));
172  Aprime = transposer.createTranspose (transposeParams);
173  }
174  else {
175  Aprime = rcpFromRef(A);
176  }
177 
178  if (transposeB) {
179  transposer_type transposer (rcpFromRef (B));
180  Bprime = transposer.createTranspose (transposeParams);
181  }
182  else {
183  Bprime = rcpFromRef(B);
184  }
185 
186  // Check size compatibility
187  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
188  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
189  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
190  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
191  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
192  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
193  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
194  prefix << "ERROR, inner dimensions of op(A) and op(B) "
195  "must match for matrix-matrix product. op(A) is "
196  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
197 
198  // The result matrix C must at least have a row-map that reflects the correct
199  // row-size. Don't check the number of columns because rectangular matrices
200  // which were constructed with only one map can still end up having the
201  // correct capacity and dimensions when filled.
202  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
203  prefix << "ERROR, dimensions of result C must "
204  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
205  << " rows, should have at least " << Aouter << std::endl);
206 
207  // It doesn't matter whether C is already Filled or not. If it is already
208  // Filled, it must have space allocated for the positions that will be
209  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
210  // we'll error out later when trying to store result values.
211 
212  // CGB: However, matrix must be in active-fill
213  if (!C.isFillActive()) C.resumeFill();
214 
215  // We're going to need to import remotely-owned sections of A and/or B if
216  // more than one processor is performing this run, depending on the scenario.
217  int numProcs = A.getComm()->getSize();
218 
219  // Declare a couple of structs that will be used to hold views of the data
220  // of A and B, to be used for fast access during the matrix-multiplication.
221  crs_matrix_struct_type Aview;
222  crs_matrix_struct_type Bview;
223 
224  RCP<const map_type> targetMap_A = Aprime->getRowMap();
225  RCP<const map_type> targetMap_B = Bprime->getRowMap();
226 
227 #ifdef HAVE_TPETRA_MMM_TIMINGS
228  {
229  TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
230 #endif
231 
232  // Now import any needed remote rows and populate the Aview struct
233  // NOTE: We assert that an import isn't needed --- since we do the transpose
234  // above to handle that.
235  if (!use_optimized_ATB) {
236  RCP<const import_type> dummyImporter;
237  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
238  }
239 
240  // We will also need local access to all rows of B that correspond to the
241  // column-map of op(A).
242  if (numProcs > 1)
243  targetMap_B = Aprime->getColMap();
244 
245  // Import any needed remote rows and populate the Bview struct.
246  if (!use_optimized_ATB)
247  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
248 
249 #ifdef HAVE_TPETRA_MMM_TIMINGS
250  } //stop MM_importExtract here
251  //stop the setup timer, and start the multiply timer
252  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
253 #endif
254 
255  // Call the appropriate method to perform the actual multiplication.
256  if (use_optimized_ATB) {
257  MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
258 
259  } else if (call_FillComplete_on_result && newFlag) {
260  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
261 
262  } else if (call_FillComplete_on_result) {
263  MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
264 
265  } else {
266  // mfh 27 Sep 2016: Is this the "slow" case? This
267  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
268  // thread-parallel inserts, but that may take some effort.
269  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
270 
271  MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
272  }
273 
274 #ifdef HAVE_TPETRA_MMM_TIMINGS
275  TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
276 #endif
277  if (call_FillComplete_on_result && !C.isFillComplete()) {
278  // We'll call FillComplete on the C matrix before we exit, and give it a
279  // domain-map and a range-map.
280  // The domain-map will be the domain-map of B, unless
281  // op(B)==transpose(B), in which case the range-map of B will be used.
282  // The range-map will be the range-map of A, unless op(A)==transpose(A),
283  // in which case the domain-map of A will be used.
284  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
285  }
286 }
287 
288 
289 template <class Scalar,
290  class LocalOrdinal,
291  class GlobalOrdinal,
292  class Node>
293 void Jacobi(Scalar omega,
298  bool call_FillComplete_on_result,
299  const std::string& label,
300  const Teuchos::RCP<Teuchos::ParameterList>& params)
301 {
302  using Teuchos::RCP;
303  typedef Scalar SC;
304  typedef LocalOrdinal LO;
305  typedef GlobalOrdinal GO;
306  typedef Node NO;
307  typedef Import<LO,GO,NO> import_type;
308  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
309  typedef Map<LO,GO,NO> map_type;
310  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
311 
312 #ifdef HAVE_TPETRA_MMM_TIMINGS
313  std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
314  using Teuchos::TimeMonitor;
315  TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
316 #endif
317 
318  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
319 
320  // A and B should already be Filled.
321  // Should we go ahead and call FillComplete() on them if necessary or error
322  // out? For now, we choose to error out.
323  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
324  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
325 
326  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
327  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
328 
329  // Now check size compatibility
330  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
331  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
332  global_size_t Aouter = A.getGlobalNumRows();
333  global_size_t Bouter = numBCols;
334  global_size_t Ainner = numACols;
335  global_size_t Binner = B.getGlobalNumRows();
336  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
337  prefix << "ERROR, inner dimensions of op(A) and op(B) "
338  "must match for matrix-matrix product. op(A) is "
339  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
340 
341  // The result matrix C must at least have a row-map that reflects the correct
342  // row-size. Don't check the number of columns because rectangular matrices
343  // which were constructed with only one map can still end up having the
344  // correct capacity and dimensions when filled.
345  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
346  prefix << "ERROR, dimensions of result C must "
347  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
348  << " rows, should have at least "<< Aouter << std::endl);
349 
350  // It doesn't matter whether C is already Filled or not. If it is already
351  // Filled, it must have space allocated for the positions that will be
352  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
353  // we'll error out later when trying to store result values.
354 
355  // CGB: However, matrix must be in active-fill
356  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
357 
358  // We're going to need to import remotely-owned sections of A and/or B if
359  // more than one processor is performing this run, depending on the scenario.
360  int numProcs = A.getComm()->getSize();
361 
362  // Declare a couple of structs that will be used to hold views of the data of
363  // A and B, to be used for fast access during the matrix-multiplication.
364  crs_matrix_struct_type Aview;
365  crs_matrix_struct_type Bview;
366 
367  RCP<const map_type> targetMap_A = Aprime->getRowMap();
368  RCP<const map_type> targetMap_B = Bprime->getRowMap();
369 
370 #ifdef HAVE_TPETRA_MMM_TIMINGS
371  TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
372  {
373 #endif
374 
375  // Enable globalConstants by default
376  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
377  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
378  if(!params.is_null()) {
379  importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
380  int mm_optimization_core_count=0;
381  auto slist = params->sublist("matrixmatrix: kernel params",false);
382  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ());
383  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
384  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
385  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
386  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
387  auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
388  ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
389  ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
390  ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
391  }
392 
393  //Now import any needed remote rows and populate the Aview struct.
394  RCP<const import_type> dummyImporter;
395  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
396 
397  // We will also need local access to all rows of B that correspond to the
398  // column-map of op(A).
399  if (numProcs > 1)
400  targetMap_B = Aprime->getColMap();
401 
402  // Now import any needed remote rows and populate the Bview struct.
403  // Enable globalConstants by default
404  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
405  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
406  if(!params.is_null()) {
407  importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
408 
409  auto slist = params->sublist("matrixmatrix: kernel params",false);
410  int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount () );
411  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
412  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
413  auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
414  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
415  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
416  ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
417  ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
418  ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
419  }
420 
421  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
422 
423 #ifdef HAVE_TPETRA_MMM_TIMINGS
424  }
425  TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
426 #endif
427 
428  // Now call the appropriate method to perform the actual multiplication.
429  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
430 
431  // Is this a "clean" matrix
432  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
433 
434  if (call_FillComplete_on_result && newFlag) {
435  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
436 
437  } else if (call_FillComplete_on_result) {
438  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
439 
440  } else {
441  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
442  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
443  // commenting it out.
444 // #ifdef HAVE_TPETRA_MMM_TIMINGS
445 // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
446 // #endif
447  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
448  // commenting it out.
449  // if (call_FillComplete_on_result) {
450  // //We'll call FillComplete on the C matrix before we exit, and give
451  // //it a domain-map and a range-map.
452  // //The domain-map will be the domain-map of B, unless
453  // //op(B)==transpose(B), in which case the range-map of B will be used.
454  // //The range-map will be the range-map of A, unless
455  // //op(A)==transpose(A), in which case the domain-map of A will be used.
456  // if (!C.isFillComplete()) {
457  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
458  // }
459  // }
460  }
461 }
462 
463 
464 template <class Scalar,
465  class LocalOrdinal,
466  class GlobalOrdinal,
467  class Node>
468 void Add(
470  bool transposeA,
471  Scalar scalarA,
473  Scalar scalarB )
474 {
475  using Teuchos::Array;
476  using Teuchos::RCP;
477  using Teuchos::null;
478  typedef Scalar SC;
479  typedef LocalOrdinal LO;
480  typedef GlobalOrdinal GO;
481  typedef Node NO;
482  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
483  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
484 
485  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
486 
487  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
488  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
489  "(Result matrix B is not required to be isFillComplete()).");
490  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
491  prefix << "ERROR, input matrix B must not be fill complete!");
492  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
493  prefix << "ERROR, input matrix B must not have static graph!");
494  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
495  prefix << "ERROR, input matrix B must not be locally indexed!");
496 
497  using Teuchos::ParameterList;
498  RCP<ParameterList> transposeParams (new ParameterList);
499  transposeParams->set ("sort", false);
500 
501  RCP<const crs_matrix_type> Aprime = null;
502  if (transposeA) {
503  transposer_type transposer (rcpFromRef (A));
504  Aprime = transposer.createTranspose (transposeParams);
505  }
506  else {
507  Aprime = rcpFromRef(A);
508  }
509 
510  size_t a_numEntries;
511  typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds",A.getNodeMaxNumRowEntries());
512  typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals",A.getNodeMaxNumRowEntries());
513  GO row;
514 
515  if (scalarB != Teuchos::ScalarTraits<SC>::one())
516  B.scale(scalarB);
517 
518  bool bFilled = B.isFillComplete();
519  size_t numMyRows = B.getNodeNumRows();
520  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
521  for (LO i = 0; (size_t)i < numMyRows; ++i) {
522  row = B.getRowMap()->getGlobalElement(i);
523  Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
524 
525  if (scalarA != Teuchos::ScalarTraits<SC>::one())
526  for (size_t j = 0; j < a_numEntries; ++j)
527  a_vals[j] *= scalarA;
528 
529  if (bFilled)
530  B.sumIntoGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
531  else
532  B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
533  }
534  }
535 }
536 
537 template <class Scalar,
538  class LocalOrdinal,
539  class GlobalOrdinal,
540  class Node>
541 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
542 add (const Scalar& alpha,
543  const bool transposeA,
545  const Scalar& beta,
546  const bool transposeB,
548  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
549  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
550  const Teuchos::RCP<Teuchos::ParameterList>& params)
551 {
552  using Teuchos::RCP;
553  using Teuchos::rcpFromRef;
554  using Teuchos::rcp;
555  using Teuchos::ParameterList;
557  if(!params.is_null())
558  {
559  TEUCHOS_TEST_FOR_EXCEPTION(
560  params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
561  std::invalid_argument,
562  "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
563  "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
564  params->set("Call fillComplete", true);
565  }
566  //If transposeB, must compute B's explicit transpose to
567  //get the correct row map for C.
568  RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
569  if(transposeB)
570  {
572  Brcp = transposer.createTranspose();
573  }
574  //Check that A,B are fillComplete before getting B's column map
575  TEUCHOS_TEST_FOR_EXCEPTION
576  (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
577  "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
578  RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
579  //this version of add() always fill completes the result, no matter what is in params on input
580  add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
581  return C;
582 }
583 
// This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
// but since the spadd() output is always packed there is no need for a separate
// numRowEntries here.
//
588 template<class LO, class GO, class LOView, class GOView, class LocalMap>
589 struct ConvertGlobalToLocalFunctor
590 {
591  ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
592  : lids(lids_), gids(gids_), localColMap(localColMap_)
593  {}
594 
595  KOKKOS_FUNCTION void operator() (const GO i) const
596  {
597  lids(i) = localColMap.getLocalElement(gids(i));
598  }
599 
600  LOView lids;
601  const GOView gids;
602  const LocalMap localColMap;
603 };
604 
605 template <class Scalar,
606  class LocalOrdinal,
607  class GlobalOrdinal,
608  class Node>
609 void
610 add (const Scalar& alpha,
611  const bool transposeA,
613  const Scalar& beta,
614  const bool transposeB,
617  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
618  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
619  const Teuchos::RCP<Teuchos::ParameterList>& params)
620 {
621  using Teuchos::RCP;
622  using Teuchos::rcp;
623  using Teuchos::rcpFromRef;
624  using Teuchos::rcp_implicit_cast;
625  using Teuchos::rcp_dynamic_cast;
626  using Teuchos::TimeMonitor;
627  using SC = Scalar;
628  using LO = LocalOrdinal;
629  using GO = GlobalOrdinal;
630  using NO = Node;
631  using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
632  using crs_graph_type = CrsGraph<LO,GO,NO>;
633  using map_type = Map<LO,GO,NO>;
634  using transposer_type = RowMatrixTransposer<SC,LO,GO,NO>;
635  using import_type = Import<LO,GO,NO>;
636  using export_type = Export<LO,GO,NO>;
637  using exec_space = typename crs_graph_type::execution_space;
638  using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
639  const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
640  constexpr bool debug = false;
641 
642 #ifdef HAVE_TPETRA_MMM_TIMINGS
643  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
644 #endif
645 
646  if (debug) {
647  std::ostringstream os;
648  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
649  << "TpetraExt::MatrixMatrix::add" << std::endl;
650  std::cerr << os.str ();
651  }
652 
653  TEUCHOS_TEST_FOR_EXCEPTION
654  (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
655  prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
656  TEUCHOS_TEST_FOR_EXCEPTION
657  (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
658  prefix_mmm << "A and B must both be fill complete.");
659 #ifdef HAVE_TPETRA_DEBUG
660  // The matrices don't have domain or range Maps unless they are fill complete.
661  if (A.isFillComplete () && B.isFillComplete ()) {
662  const bool domainMapsSame =
663  (! transposeA && ! transposeB &&
664  ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
665  (! transposeA && transposeB &&
666  ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
667  ( transposeA && ! transposeB &&
668  ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
669  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
670  prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
671 
672  const bool rangeMapsSame =
673  (! transposeA && ! transposeB &&
674  ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
675  (! transposeA && transposeB &&
676  ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
677  ( transposeA && ! transposeB &&
678  ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
679  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
680  prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
681  }
682 #endif // HAVE_TPETRA_DEBUG
683 
684  using Teuchos::ParameterList;
685  // Form the explicit transpose of A if necessary.
686  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
687  if (transposeA) {
688  transposer_type transposer (Aprime);
689  Aprime = transposer.createTranspose ();
690  }
691 
692 #ifdef HAVE_TPETRA_DEBUG
693  TEUCHOS_TEST_FOR_EXCEPTION
694  (Aprime.is_null (), std::logic_error,
695  prefix_mmm << "Failed to compute Op(A). "
696  "Please report this bug to the Tpetra developers.");
697 #endif // HAVE_TPETRA_DEBUG
698 
699  // Form the explicit transpose of B if necessary.
700  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
701  if (transposeB) {
702  if (debug) {
703  std::ostringstream os;
704  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
705  << "Form explicit xpose of B" << std::endl;
706  std::cerr << os.str ();
707  }
708  transposer_type transposer (Bprime);
709  Bprime = transposer.createTranspose ();
710  }
711 #ifdef HAVE_TPETRA_DEBUG
712  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
713  prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
714  TEUCHOS_TEST_FOR_EXCEPTION(
715  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
716  prefix_mmm << "Aprime and Bprime must both be fill complete. "
717  "Please report this bug to the Tpetra developers.");
718 #endif // HAVE_TPETRA_DEBUG
719  RCP<const map_type> CDomainMap = domainMap;
720  RCP<const map_type> CRangeMap = rangeMap;
721  if(CDomainMap.is_null())
722  {
723  CDomainMap = Bprime->getDomainMap();
724  }
725  if(CRangeMap.is_null())
726  {
727  CRangeMap = Bprime->getRangeMap();
728  }
729  assert(!(CDomainMap.is_null()));
730  assert(!(CRangeMap.is_null()));
731  typedef typename AddKern::values_array values_array;
732  typedef typename AddKern::row_ptrs_array row_ptrs_array;
733  typedef typename AddKern::col_inds_array col_inds_array;
734  bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
735  bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
736  values_array vals;
737  row_ptrs_array rowptrs;
738  col_inds_array colinds;
739 #ifdef HAVE_TPETRA_MMM_TIMINGS
740  MM = Teuchos::null;
741  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
742 #endif
743  if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
744  {
745  //import Aprime into Bprime's row map so the local matrices have same # of rows
746  auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
747  // cbl do not set
748  // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
749  // this import _may_ take the form of a transfer. In practice it would be unlikely,
750  // but the general case is not so forgiving.
751  Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
752  }
753  bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
754  bool sorted = AGraphSorted && BGraphSorted;
755  RCP<const import_type> Cimport = Teuchos::null;
756  RCP<export_type> Cexport = Teuchos::null;
757  bool doFillComplete = true;
758  if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
759  {
760  doFillComplete = params->get<bool>("Call fillComplete");
761  }
762  auto Alocal = Aprime->getLocalMatrixDevice();
763  auto Blocal = Bprime->getLocalMatrixDevice();
764  LO numLocalRows = Alocal.numRows();
765  if(numLocalRows == 0)
766  {
767  //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
768  //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
769  //Handle this case now
770  //(without interfering with collective operations, since it's possible for
771  //some ranks to have 0 local rows and others not).
772  rowptrs = row_ptrs_array("C rowptrs", 0);
773  }
774  auto Acolmap = Aprime->getColMap();
775  auto Bcolmap = Bprime->getColMap();
776  if(!matchingColMaps)
777  {
778  using global_col_inds_array = typename AddKern::global_col_inds_array;
779 #ifdef HAVE_TPETRA_MMM_TIMINGS
780  MM = Teuchos::null;
781  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
782 #endif
783  //use kernel that converts col indices in both A and B to common domain map before adding
784  auto AlocalColmap = Acolmap->getLocalMap();
785  auto BlocalColmap = Bcolmap->getLocalMap();
786  global_col_inds_array globalColinds;
787  if (debug) {
788  std::ostringstream os;
789  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
790  << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
791  std::cerr << os.str ();
792  }
793  AddKern::convertToGlobalAndAdd(
794  Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
795  vals, rowptrs, globalColinds);
796  if (debug) {
797  std::ostringstream os;
798  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
799  << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
800  std::cerr << os.str ();
801  }
802 #ifdef HAVE_TPETRA_MMM_TIMINGS
803  MM = Teuchos::null;
804  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Constructing graph"))));
805 #endif
806  RCP<const map_type> CcolMap;
807  Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
808  (CcolMap, CDomainMap, globalColinds);
809  C.replaceColMap(CcolMap);
810  col_inds_array localColinds("C colinds", globalColinds.extent(0));
811  Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
812  ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
813  col_inds_array, global_col_inds_array,
814  typename map_type::local_map_type>
815  (localColinds, globalColinds, CcolMap->getLocalMap()));
816  KokkosKernels::Impl::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
817  C.setAllValues(rowptrs, localColinds, vals);
818  C.fillComplete(CDomainMap, CRangeMap, params);
819  if(!doFillComplete)
820  C.resumeFill();
821  }
822  else
823  {
824  //Aprime, Bprime and C all have the same column maps
825  auto Avals = Alocal.values;
826  auto Bvals = Blocal.values;
827  auto Arowptrs = Alocal.graph.row_map;
828  auto Browptrs = Blocal.graph.row_map;
829  auto Acolinds = Alocal.graph.entries;
830  auto Bcolinds = Blocal.graph.entries;
831  if(sorted)
832  {
833  //use sorted kernel
834 #ifdef HAVE_TPETRA_MMM_TIMINGS
835  MM = Teuchos::null;
836  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
837 #endif
838  if (debug) {
839  std::ostringstream os;
840  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
841  << "Call AddKern::addSorted(...)" << std::endl;
842  std::cerr << os.str ();
843  }
844  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
845  }
846  else
847  {
848  //use unsorted kernel
849 #ifdef HAVE_TPETRA_MMM_TIMINGS
850  MM = Teuchos::null;
851  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
852 #endif
853  if (debug) {
854  std::ostringstream os;
855  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
856  << "Call AddKern::addUnsorted(...)" << std::endl;
857  std::cerr << os.str ();
858  }
859  AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
860  }
861  //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
862  RCP<const map_type> Ccolmap = Bcolmap;
863  C.replaceColMap(Ccolmap);
864  C.setAllValues(rowptrs, colinds, vals);
865  if(doFillComplete)
866  {
867  #ifdef HAVE_TPETRA_MMM_TIMINGS
868  MM = Teuchos::null;
869  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
870  #endif
871  if(!CDomainMap->isSameAs(*Ccolmap))
872  {
873  if (debug) {
874  std::ostringstream os;
875  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
876  << "Create Cimport" << std::endl;
877  std::cerr << os.str ();
878  }
879  Cimport = rcp(new import_type(CDomainMap, Ccolmap));
880  }
881  if(!C.getRowMap()->isSameAs(*CRangeMap))
882  {
883  if (debug) {
884  std::ostringstream os;
885  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
886  << "Create Cexport" << std::endl;
887  std::cerr << os.str ();
888  }
889  Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
890  }
891 
892  if (debug) {
893  std::ostringstream os;
894  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
895  << "Call C->expertStaticFillComplete(...)" << std::endl;
896  std::cerr << os.str ();
897  }
898  C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
899  }
900  }
901 }
902 
903 template <class Scalar,
904  class LocalOrdinal,
905  class GlobalOrdinal,
906  class Node>
907 void Add(
909  bool transposeA,
910  Scalar scalarA,
912  bool transposeB,
913  Scalar scalarB,
915 {
916  using Teuchos::Array;
917  using Teuchos::ArrayRCP;
918  using Teuchos::ArrayView;
919  using Teuchos::RCP;
920  using Teuchos::rcp;
921  using Teuchos::rcp_dynamic_cast;
922  using Teuchos::rcpFromRef;
923  using Teuchos::tuple;
924  using std::endl;
925  // typedef typename ArrayView<const Scalar>::size_type size_type;
926  typedef Teuchos::ScalarTraits<Scalar> STS;
928  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
929  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
930  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
933 
934  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
935 
936  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
937  prefix << "The case C == null does not actually work. Fixing this will require an interface change.");
938 
939  TEUCHOS_TEST_FOR_EXCEPTION(
940  ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
941  prefix << "Both input matrices must be fill complete before calling this function.");
942 
943 #ifdef HAVE_TPETRA_DEBUG
944  {
945  const bool domainMapsSame =
946  (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
947  (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
948  (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
949  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
950  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
951 
952  const bool rangeMapsSame =
953  (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
954  (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
955  (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
956  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
957  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
958  }
959 #endif // HAVE_TPETRA_DEBUG
960 
961  using Teuchos::ParameterList;
962  RCP<ParameterList> transposeParams (new ParameterList);
963  transposeParams->set ("sort", false);
964 
965  // Form the explicit transpose of A if necessary.
966  RCP<const crs_matrix_type> Aprime;
967  if (transposeA) {
968  transposer_type theTransposer (rcpFromRef (A));
969  Aprime = theTransposer.createTranspose (transposeParams);
970  }
971  else {
972  Aprime = rcpFromRef (A);
973  }
974 
975 #ifdef HAVE_TPETRA_DEBUG
976  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
977  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
978 #endif // HAVE_TPETRA_DEBUG
979 
980  // Form the explicit transpose of B if necessary.
981  RCP<const crs_matrix_type> Bprime;
982  if (transposeB) {
983  transposer_type theTransposer (rcpFromRef (B));
984  Bprime = theTransposer.createTranspose (transposeParams);
985  }
986  else {
987  Bprime = rcpFromRef (B);
988  }
989 
990 #ifdef HAVE_TPETRA_DEBUG
991  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
992  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
993 #endif // HAVE_TPETRA_DEBUG
994 
995  // Allocate or zero the entries of the result matrix.
996  if (! C.is_null ()) {
997  C->setAllToScalar (STS::zero ());
998  } else {
999  // FIXME (mfh 08 May 2013) When I first looked at this method, I
1000  // noticed that C was being given the row Map of Aprime (the
1001  // possibly transposed version of A). Is this what we want?
1002  C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
1003  }
1004 
1005 #ifdef HAVE_TPETRA_DEBUG
1006  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1007  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1008  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1009  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1010  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1011  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1012 #endif // HAVE_TPETRA_DEBUG
1013 
1014  Array<RCP<const crs_matrix_type> > Mat =
1015  tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1016  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1017 
1018  // do a loop over each matrix to add: A reordering might be more efficient
1019  for (int k = 0; k < 2; ++k) {
1020  typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1021  typename crs_matrix_type::nonconst_values_host_view_type Values;
1022 
1023  // Loop over each locally owned row of the current matrix (either
1024  // Aprime or Bprime), and sum its entries into the corresponding
1025  // row of C. This works regardless of whether Aprime or Bprime
1026  // has the same row Map as C, because both sumIntoGlobalValues and
1027  // insertGlobalValues allow summing resp. inserting into nonowned
1028  // rows of C.
1029 #ifdef HAVE_TPETRA_DEBUG
1030  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1031  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1032 #endif // HAVE_TPETRA_DEBUG
1033  RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1034 #ifdef HAVE_TPETRA_DEBUG
1035  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1036  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1037 #endif // HAVE_TPETRA_DEBUG
1038 
1039  const size_t localNumRows = Mat[k]->getNodeNumRows ();
1040  for (size_t i = 0; i < localNumRows; ++i) {
1041  const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1042  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1043  if (numEntries > 0) {
1044  Kokkos::resize(Indices,numEntries);
1045  Kokkos::resize(Values,numEntries);
1046  Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1047 
1048  if (scalar[k] != STS::one ()) {
1049  for (size_t j = 0; j < numEntries; ++j) {
1050  Values[j] *= scalar[k];
1051  }
1052  }
1053 
1054  if (C->isFillComplete ()) {
1055  C->sumIntoGlobalValues (globalRow, numEntries,
1056  reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1057  } else {
1058  C->insertGlobalValues (globalRow, numEntries,
1059  reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1060  }
1061  }
1062  }
1063  }
1064 }
1065 
1066 } //End namespace MatrixMatrix
1067 
1068 namespace MMdetails{
1069 
1070 /*********************************************************************************************************/
1071 // Prints MMM-style statistics on communication done with an Import or Export object
1072 template <class TransferType>
1073 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1074  if (Transfer.is_null())
1075  return;
1076 
1077  const Distributor & Distor = Transfer->getDistributor();
1078  Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1079 
1080  size_t rows_send = Transfer->getNumExportIDs();
1081  size_t rows_recv = Transfer->getNumRemoteIDs();
1082 
1083  size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1084  size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1085  size_t num_send_neighbors = Distor.getNumSends();
1086  size_t num_recv_neighbors = Distor.getNumReceives();
1087  size_t round2_send, round2_recv;
1088  Distor.getLastDoStatistics(round2_send,round2_recv);
1089 
1090  int myPID = Comm->getRank();
1091  int NumProcs = Comm->getSize();
1092 
1093  // Processor by processor statistics
1094  // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1095  // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1096 
1097  // Global statistics
1098  size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1099  size_t gstats_min[8], gstats_max[8];
1100 
1101  double lstats_avg[8], gstats_avg[8];
1102  for(int i=0; i<8; i++)
1103  lstats_avg[i] = ((double)lstats[i])/NumProcs;
1104 
1105  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1106  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1107  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1108 
1109  if(!myPID) {
1110  printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1111  (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1112  (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1113  printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1114  (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1115  (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1116  }
1117 }
1118 
// Kernel method for computing the local portion of C = A^T*B
1120 template<class Scalar,
1121  class LocalOrdinal,
1122  class GlobalOrdinal,
1123  class Node>
1124 void mult_AT_B_newmatrix(
1125  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1126  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1127  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1128  const std::string & label,
1129  const Teuchos::RCP<Teuchos::ParameterList>& params)
1130 {
1131  using Teuchos::RCP;
1132  using Teuchos::rcp;
1133  typedef Scalar SC;
1134  typedef LocalOrdinal LO;
1135  typedef GlobalOrdinal GO;
1136  typedef Node NO;
1137  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1138  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1139 
1140 #ifdef HAVE_TPETRA_MMM_TIMINGS
1141  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1142  using Teuchos::TimeMonitor;
1143  RCP<TimeMonitor> MM = rcp (new TimeMonitor
1144  (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1145 #endif
1146 
1147  /*************************************************************/
1148  /* 1) Local Transpose of A */
1149  /*************************************************************/
1150  transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1151 
1152  using Teuchos::ParameterList;
1153  RCP<ParameterList> transposeParams (new ParameterList);
1154  transposeParams->set ("sort", false);
1155  if(! params.is_null ()) {
1156  transposeParams->set ("compute global constants",
1157  params->get ("compute global constants: temporaries",
1158  false));
1159  }
1160  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1161  transposer.createTransposeLocal (transposeParams);
1162 
1163  /*************************************************************/
1164  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1165  /*************************************************************/
1166 #ifdef HAVE_TPETRA_MMM_TIMINGS
1167  MM = Teuchos::null;
1168  MM = rcp (new TimeMonitor
1169  (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1170 #endif
1171 
1172  // Get views, asserting that no import is required to speed up computation
1173  crs_matrix_struct_type Aview;
1174  crs_matrix_struct_type Bview;
1175  RCP<const Import<LO, GO, NO> > dummyImporter;
1176 
1177  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1178  RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1179  if (! params.is_null ()) {
1180  importParams1->set ("compute global constants",
1181  params->get ("compute global constants: temporaries",
1182  false));
1183  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1184  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1185  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1186  int mm_optimization_core_count =
1188  mm_optimization_core_count =
1189  slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1190  int mm_optimization_core_count2 =
1191  params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1192  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1193  mm_optimization_core_count = mm_optimization_core_count2;
1194  }
1195  auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1196  sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1197  sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1198  sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1199  }
1200 
1201  MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1202  Aview, dummyImporter, true,
1203  label, importParams1);
1204 
1205  RCP<ParameterList> importParams2 (new ParameterList);
1206  if (! params.is_null ()) {
1207  importParams2->set ("compute global constants",
1208  params->get ("compute global constants: temporaries",
1209  false));
1210  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1211  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1212  bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1213  int mm_optimization_core_count =
1215  mm_optimization_core_count =
1216  slist.get ("MM_TAFC_OptimizationCoreCount",
1217  mm_optimization_core_count);
1218  int mm_optimization_core_count2 =
1219  params->get ("MM_TAFC_OptimizationCoreCount",
1220  mm_optimization_core_count);
1221  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1222  mm_optimization_core_count = mm_optimization_core_count2;
1223  }
1224  auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1225  sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1226  sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1227  sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1228  }
1229 
1230  if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1231  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1232  }
1233  else {
1234  MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1235  }
1236 
1237 #ifdef HAVE_TPETRA_MMM_TIMINGS
1238  MM = Teuchos::null;
1239  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1240 #endif
1241 
1242  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1243 
1244  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1245  bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1246  if (needs_final_export) {
1247  Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1248  }
1249  else {
1250  Ctemp = rcp (&C, false);
1251  }
1252 
1253  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1254 
1255  /*************************************************************/
1256  /* 4) exportAndFillComplete matrix */
1257  /*************************************************************/
1258 #ifdef HAVE_TPETRA_MMM_TIMINGS
1259  MM = Teuchos::null;
1260  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1261 #endif
1262 
1263  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1264 
1265  if (needs_final_export) {
1266  ParameterList labelList;
1267  labelList.set("Timer Label", label);
1268  if(!params.is_null()) {
1269  ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1270  ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1271  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1272  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1273  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1274  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1275  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1276  bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1277  bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1278 
1279  labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1280  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1281  labelList.set("compute global constants",params->get("compute global constants",true));
1282  labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1283  }
1284  Ctemp->exportAndFillComplete (Crcp,
1285  *Ctemp->getGraph ()->getExporter (),
1286  B.getDomainMap (),
1287  A.getDomainMap (),
1288  rcp (&labelList, false));
1289  }
1290 #ifdef HAVE_TPETRA_MMM_STATISTICS
1291  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1292 #endif
1293 }
1294 
1295 /*********************************************************************************************************/
1296 // Kernel method for computing the local portion of C = A*B
// Serial fallback kernel: computes the local portion of C = A*B using a
// sparse-accumulator ("SPA") scheme.  Aview/Bview are pre-extracted views of
// A and B (Bview may carry an imported ("ghosted") part of B for columns of
// A whose matching B rows are off-process).  C may be fill complete (entries
// are summed in) or not (entries are inserted).  label/params are unused here.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void mult_A_B(
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
  const std::string& /* label */,
  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
{
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::OrdinalTraits;
  using Teuchos::null;

  typedef Teuchos::ScalarTraits<Scalar> STS;
  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
  // Local column-index range of B's (owned) column Map.
  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();

  // Defaults chosen so that (last - first + 1) == 0 when there is no
  // import column Map (invalid() == -1 for signed ordinals — see below).
  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();

  // Local-to-global column translations for the owned and imported parts.
  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
  ArrayView<const GlobalOrdinal> bcols_import = null;
  if (Bview.importColMap != null) {
    C_firstCol_import = Bview.importColMap->getMinLocalIndex();
    C_lastCol_import = Bview.importColMap->getMaxLocalIndex();

    bcols_import = Bview.importColMap->getNodeElementList();
  }

  // Scratch width: the larger of the two local column counts, so the same
  // accumulator arrays serve both the owned and the imported pass.
  size_t C_numCols = C_lastCol - C_firstCol +
                     OrdinalTraits<LocalOrdinal>::one();
  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
                            OrdinalTraits<LocalOrdinal>::one();

  if (C_numCols_import > C_numCols)
    C_numCols = C_numCols_import;

  // Workspace for one row of C.  Note: Teuchos::Array copies here are deep
  // (dwork/iwork/iwork2 are just the allocation step; C_row_i etc. are the
  // working aliases-by-copy actually used below).
  Array<Scalar> dwork = Array<Scalar>(C_numCols);
  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
  Array<size_t> iwork2 = Array<size_t>(C_numCols);

  Array<Scalar> C_row_i = dwork;      // accumulated values for the current row
  Array<GlobalOrdinal> C_cols = iwork; // column ids (local, then globalized)
  Array<size_t> c_index = iwork2;      // col -> position in C_row_i, or invalid()
  // Sized 2*C_numCols: holds the owned-pass entries followed by the
  // imported-pass entries for one row before a single call into C.
  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);

  size_t C_row_i_length, j, k, last_index;

  // Run through all the hash table lookups once and for all
  // Acol2Brow: local column of A -> local row of B (owned part);
  // Acol2Irow: local column of A -> local row of the imported part of B.
  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
    // Maps are the same: Use local IDs as the hash
    for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
          Aview.colMap->getMaxLocalIndex(); i++)
      Acol2Brow[i]=i;
  }
  else {
    // Maps are not the same: Use the map's hash
    for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
          Aview.colMap->getMaxLocalIndex(); i++) {
      GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
      LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
      if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
      else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
    }
  }

  // To form C = A*B we're going to execute this expression:
  //
  // C(i,j) = sum_k( A(i,k)*B(k,j) )
  //
  // Our goal, of course, is to navigate the data in A and B once, without
  // performing searches for column-indices, etc.
  // Raw CRS views of A, B, and (if present) the imported part of B ("I").
  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
  ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
  ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
  ArrayView<const Scalar> Avals, Bvals, Ivals;
  Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
  Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
  if(!Bview.importMatrix.is_null()) {
    Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
    Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
  }

  bool C_filled = C.isFillComplete();

  // Invariant: c_index[col] == invalid() between rows; each pass restores it.
  for (size_t i = 0; i < C_numCols; i++)
    c_index[i] = OrdinalTraits<size_t>::invalid();

  // Loop over the rows of A.
  size_t Arows = Aview.rowMap->getNodeNumElements();
  for(size_t i=0; i<Arows; ++i) {

    // Only navigate the local portion of Aview... which is, thankfully, all of
    // A since this routine doesn't do transpose modes
    GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);

    // Loop across the i-th row of A and for each corresponding row in B, loop
    // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
    // quantities C_row_i. In other words, as we stride across B(k,:) we're
    // calculating updates for row i of the result matrix C.
    // --- Pass 1: contributions from the locally owned part of B. ---
    C_row_i_length = OrdinalTraits<size_t>::zero();

    for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
      LocalOrdinal Ak = Acol2Brow[Acolind[k]];
      const Scalar Aval = Avals[k];
      if (Aval == STS::zero())
        continue;

      // LO_INVALID here means this column's B row lives in the imported
      // part; it is handled in pass 2 below.
      if (Ak == LO_INVALID)
        continue;

      for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
        LocalOrdinal col = Bcolind[j];
        //assert(col >= 0 && col < C_numCols);

        if (c_index[col] == OrdinalTraits<size_t>::invalid()){
          //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
          // This has to be a += so insertGlobalValue goes out
          C_row_i[C_row_i_length] = Aval*Bvals[j];
          C_cols[C_row_i_length] = col;
          c_index[col] = C_row_i_length;
          C_row_i_length++;

        } else {
          C_row_i[c_index[col]] += Aval*Bvals[j];
        }
      }
    }

    // Reset c_index for the next pass, translate local -> global column ids,
    // and stage pass-1 results at the front of the combined buffers.
    for (size_t ii = 0; ii < C_row_i_length; ii++) {
      c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
      C_cols[ii] = bcols[C_cols[ii]];
      combined_index[ii] = C_cols[ii];
      combined_values[ii] = C_row_i[ii];
    }
    last_index = C_row_i_length;

    //
    //Now put the C_row_i values into C.
    //
    // We might have to revamp this later.
    // --- Pass 2: contributions from the imported ("ghost") part of B. ---
    C_row_i_length = OrdinalTraits<size_t>::zero();

    for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
      LocalOrdinal Ak = Acol2Brow[Acolind[k]];
      const Scalar Aval = Avals[k];
      if (Aval == STS::zero())
        continue;

      // Skip entries already handled in pass 1 (those with a valid B row).
      if (Ak!=LO_INVALID) continue;

      Ak = Acol2Irow[Acolind[k]];
      for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
        LocalOrdinal col = Icolind[j];
        //assert(col >= 0 && col < C_numCols);

        if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
          //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
          // This has to be a += so insertGlobalValue goes out
          C_row_i[C_row_i_length] = Aval*Ivals[j];
          C_cols[C_row_i_length] = col;
          c_index[col] = C_row_i_length;
          C_row_i_length++;

        } else {
          // This has to be a += so insertGlobalValue goes out
          C_row_i[c_index[col]] += Aval*Ivals[j];
        }
      }
    }

    // Append pass-2 results (globalized via the import column Map) after
    // the pass-1 entries in the combined buffers.
    for (size_t ii = 0; ii < C_row_i_length; ii++) {
      c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
      C_cols[ii] = bcols_import[C_cols[ii]];
      combined_index[last_index] = C_cols[ii];
      combined_values[last_index] = C_row_i[ii];
      last_index++;
    }

    // Now put the C_row_i values into C.
    // We might have to revamp this later.
    // Both calls return void, so the conditional expression is legal; sum
    // into a fill-complete C, insert otherwise.
    C_filled ?
      C.sumIntoGlobalValues(
          global_row,
          combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
          combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
      :
      C.insertGlobalValues(
          global_row,
          combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
          combined_values.view(OrdinalTraits<size_t>::zero(), last_index));

  }
}
1504 
1505 /*********************************************************************************************************/
1506 template<class Scalar,
1507  class LocalOrdinal,
1508  class GlobalOrdinal,
1509  class Node>
1510 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1511  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1512  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1513 
1514  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1515  Mview.maxNumRowEntries = Mview.indices[0].size();
1516 
1517  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1518  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1519  Mview.maxNumRowEntries = Mview.indices[i].size();
1520  }
1521 }
1522 
1523 /*********************************************************************************************************/
template<class CrsMatrixType>
size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
  // Follows the NZ estimate in ML's ml_matmatmult.c: estimate the average
  // entries-per-row of A and of B, combine them as (sqrt(a)+sqrt(b)-1)^2,
  // and scale by A's local row count with ~75% assumed fill plus slack.
  const size_t defaultPerRow = 100;

  size_t Aest = defaultPerRow;
  if (A.getNodeNumEntries() >= A.getNodeNumRows()) {
    Aest = (A.getNodeNumRows() > 0) ? (A.getNodeNumEntries() / A.getNodeNumRows()) : defaultPerRow;
  }

  size_t Best = defaultPerRow;
  if (B.getNodeNumEntries() >= B.getNodeNumRows()) {
    Best = (B.getNodeNumRows() > 0) ? (B.getNodeNumEntries() / B.getNodeNumRows()) : defaultPerRow;
  }

  // Truncate to an integer before squaring, exactly as the heuristic intends.
  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
  nnzperrow *= nnzperrow;

  return (size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
}
1538 
1539 
1540 /*********************************************************************************************************/
1541 // Kernel method for computing the local portion of C = A*B
1542 //
1543 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1544 // function, so this is probably the function we want to
1545 // thread-parallelize.
1546 template<class Scalar,
1547  class LocalOrdinal,
1548  class GlobalOrdinal,
1549  class Node>
1550 void mult_A_B_newmatrix(
1551  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1552  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1553  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1554  const std::string& label,
1555  const Teuchos::RCP<Teuchos::ParameterList>& params)
1556 {
1557  using Teuchos::Array;
1558  using Teuchos::ArrayRCP;
1559  using Teuchos::ArrayView;
1560  using Teuchos::RCP;
1561  using Teuchos::rcp;
1562 
1563  // Tpetra typedefs
1564  typedef LocalOrdinal LO;
1565  typedef GlobalOrdinal GO;
1566  typedef Node NO;
1567  typedef Import<LO,GO,NO> import_type;
1568  typedef Map<LO,GO,NO> map_type;
1569 
1570  // Kokkos typedefs
1571  typedef typename map_type::local_map_type local_map_type;
1573  typedef typename KCRS::StaticCrsGraphType graph_t;
1574  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1575  typedef typename NO::execution_space execution_space;
1576  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1577  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1578 
1579 #ifdef HAVE_TPETRA_MMM_TIMINGS
1580  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1581  using Teuchos::TimeMonitor;
1582  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1583 #endif
1584  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1585 
1586  // Build the final importer / column map, hash table lookups for C
1587  RCP<const import_type> Cimport;
1588  RCP<const map_type> Ccolmap;
1589  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1590  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1591  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1592  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1593  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1594  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1595  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1596  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1597 
1598  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1599  // indices of B, to local column indices of C. (B and C have the
1600  // same number of columns.) The kernel uses this, instead of
1601  // copying the entire input matrix B and converting its column
1602  // indices to those of C.
1603  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1604 
1605  if (Bview.importMatrix.is_null()) {
1606  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1607  Cimport = Bimport;
1608  Ccolmap = Bview.colMap;
1609  const LO colMapSize = static_cast<LO>(Bview.colMap->getNodeNumElements());
1610  // Bcol2Ccol is trivial
1611  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1612  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1613  KOKKOS_LAMBDA(const LO i) {
1614  Bcol2Ccol(i) = i;
1615  });
1616  }
1617  else {
1618  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1619  // column Map of C, as well as C's Import object (from its domain
1620  // Map to its column Map). C's column Map is the union of the
1621  // column Maps of (the local part of) B, and the "remote" part of
1622  // B. Ditto for the Import. We have optimized this "setUnion"
1623  // operation on Import objects and Maps.
1624 
1625  // Choose the right variant of setUnion
1626  if (!Bimport.is_null() && !Iimport.is_null()) {
1627  Cimport = Bimport->setUnion(*Iimport);
1628  }
1629  else if (!Bimport.is_null() && Iimport.is_null()) {
1630  Cimport = Bimport->setUnion();
1631  }
1632  else if (Bimport.is_null() && !Iimport.is_null()) {
1633  Cimport = Iimport->setUnion();
1634  }
1635  else {
1636  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1637  }
1638  Ccolmap = Cimport->getTargetMap();
1639 
1640  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1641  // in general. We should get rid of it in order to reduce
1642  // communication costs of sparse matrix-matrix multiply.
1643  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1644  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1645 
1646  // NOTE: This is not efficient and should be folded into setUnion
1647  //
1648  // mfh 27 Sep 2016: What the above comment means, is that the
1649  // setUnion operation on Import objects could also compute these
1650  // local index - to - local index look-up tables.
1651  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1652  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1653  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1654  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1655  });
1656  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1657  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1658  });
1659 
1660  }
1661 
1662  // Replace the column map
1663  //
1664  // mfh 27 Sep 2016: We do this because C was originally created
1665  // without a column Map. Now we have its column Map.
1666  C.replaceColMap(Ccolmap);
1667 
1668  // mfh 27 Sep 2016: Construct tables that map from local column
1669  // indices of A, to local row indices of either B_local (the locally
1670  // owned part of B), or B_remote (the "imported" remote part of B).
1671  //
1672  // For column index Aik in row i of A, if the corresponding row of B
1673  // exists in the local part of B ("orig") (which I'll call B_local),
1674  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1675  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1676  //
1677  // For column index Aik in row i of A, if the corresponding row of B
1678  // exists in the remote part of B ("Import") (which I'll call
1679  // B_remote), then targetMapToImportRow[Aik] is the local index of
1680  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1681  // (a flag value).
1682 
1683  // Run through all the hash table lookups once and for all
1684  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
1685  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
1686 
1687  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1688  GO aidx = Acolmap_local.getGlobalElement(i);
1689  LO B_LID = Browmap_local.getLocalElement(aidx);
1690  if (B_LID != LO_INVALID) {
1691  targetMapToOrigRow(i) = B_LID;
1692  targetMapToImportRow(i) = LO_INVALID;
1693  } else {
1694  LO I_LID = Irowmap_local.getLocalElement(aidx);
1695  targetMapToOrigRow(i) = LO_INVALID;
1696  targetMapToImportRow(i) = I_LID;
1697 
1698  }
1699  });
1700 
1701  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1702  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1703  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1704 
1705 }
1706 
1707 
1708 /*********************************************************************************************************/
1709 // AB NewMatrix Kernel wrappers (Default non-threaded version)
1710 template<class Scalar,
1711  class LocalOrdinal,
1712  class GlobalOrdinal,
1713  class Node,
1714  class LocalOrdinalViewType>
1715 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1716  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1717  const LocalOrdinalViewType & targetMapToOrigRow,
1718  const LocalOrdinalViewType & targetMapToImportRow,
1719  const LocalOrdinalViewType & Bcol2Ccol,
1720  const LocalOrdinalViewType & Icol2Ccol,
1721  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1722  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
1723  const std::string& label,
1724  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1725 #ifdef HAVE_TPETRA_MMM_TIMINGS
1726  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1727  using Teuchos::TimeMonitor;
1728  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
1729 #endif
1730 
1731  using Teuchos::Array;
1732  using Teuchos::ArrayRCP;
1733  using Teuchos::ArrayView;
1734  using Teuchos::RCP;
1735  using Teuchos::rcp;
1736 
1737  // Lots and lots of typedefs
1738  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
1739  typedef typename KCRS::StaticCrsGraphType graph_t;
1740  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1741  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1742  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1743  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1744 
1745  typedef Scalar SC;
1746  typedef LocalOrdinal LO;
1747  typedef GlobalOrdinal GO;
1748  typedef Node NO;
1749  typedef Map<LO,GO,NO> map_type;
1750  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1751  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1752  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1753 
1754  // Sizes
1755  RCP<const map_type> Ccolmap = C.getColMap();
1756  size_t m = Aview.origMatrix->getNodeNumRows();
1757  size_t n = Ccolmap->getNodeNumElements();
1758  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
1759 
1760  // Grab the Kokkos::SparseCrsMatrices & inner stuff
1761  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
1762  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
1763 
1764  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
1765  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
1766  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
1767 
1768  c_lno_view_t Irowptr;
1769  lno_nnz_view_t Icolind;
1770  scalar_view_t Ivals;
1771  if(!Bview.importMatrix.is_null()) {
1772  auto lclB = Bview.importMatrix->getLocalMatrixHost();
1773  Irowptr = lclB.graph.row_map;
1774  Icolind = lclB.graph.entries;
1775  Ivals = lclB.values;
1776  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
1777  }
1778 
1779 #ifdef HAVE_TPETRA_MMM_TIMINGS
1780  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
1781 #endif
1782 
1783  // Classic csr assembly (low memory edition)
1784  //
1785  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
1786  // The method loops over rows of A, and may resize after processing
1787  // each row. Chris Siefert says that this reflects experience in
1788  // ML; for the non-threaded case, ML found it faster to spend less
1789  // effort on estimation and risk an occasional reallocation.
1790  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1791  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
1792  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
1793  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
1794 
1795  // mfh 27 Sep 2016: The c_status array is an implementation detail
1796  // of the local sparse matrix-matrix multiply routine.
1797 
1798  // The status array will contain the index into colind where this entry was last deposited.
1799  // c_status[i] < CSR_ip - not in the row yet
1800  // c_status[i] >= CSR_ip - this is the entry where you can find the data
1801  // We start with this filled with INVALID's indicating that there are no entries yet.
1802  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1803  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1804  std::vector<size_t> c_status(n, ST_INVALID);
1805 
1806  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1807  // routine. The routine computes C := A * (B_local + B_remote).
1808  //
1809  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
1810  // you whether the corresponding row of B belongs to B_local
1811  // ("orig") or B_remote ("Import").
1812 
1813  // For each row of A/C
1814  size_t CSR_ip = 0, OLD_ip = 0;
1815  for (size_t i = 0; i < m; i++) {
1816  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
1817  // on the calling process.
1818  Crowptr[i] = CSR_ip;
1819 
1820  // mfh 27 Sep 2016: For each entry of A in the current row of A
1821  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1822  LO Aik = Acolind[k]; // local column index of current entry of A
1823  const SC Aval = Avals[k]; // value of current entry of A
1824  if (Aval == SC_ZERO)
1825  continue; // skip explicitly stored zero values in A
1826 
1827  if (targetMapToOrigRow[Aik] != LO_INVALID) {
1828  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1829  // corresponding to the current entry of A is populated, then
1830  // the corresponding row of B is in B_local (i.e., it lives on
1831  // the calling process).
1832 
1833  // Local matrix
1834  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
1835 
1836  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
1837  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
1838  LO Bkj = Bcolind[j];
1839  LO Cij = Bcol2Ccol[Bkj];
1840 
1841  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
1842  // New entry
1843  c_status[Cij] = CSR_ip;
1844  Ccolind[CSR_ip] = Cij;
1845  Cvals[CSR_ip] = Aval*Bvals[j];
1846  CSR_ip++;
1847 
1848  } else {
1849  Cvals[c_status[Cij]] += Aval*Bvals[j];
1850  }
1851  }
1852 
1853  } else {
1854  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1855  // corresponding to the current entry of A NOT populated (has
1856  // a flag "invalid" value), then the corresponding row of B is
1857  // in B_local (i.e., it lives on the calling process).
1858 
1859  // Remote matrix
1860  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
1861  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
1862  LO Ikj = Icolind[j];
1863  LO Cij = Icol2Ccol[Ikj];
1864 
1865  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
1866  // New entry
1867  c_status[Cij] = CSR_ip;
1868  Ccolind[CSR_ip] = Cij;
1869  Cvals[CSR_ip] = Aval*Ivals[j];
1870  CSR_ip++;
1871  } else {
1872  Cvals[c_status[Cij]] += Aval*Ivals[j];
1873  }
1874  }
1875  }
1876  }
1877 
1878  // Resize for next pass if needed
1879  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
1880  CSR_alloc *= 2;
1881  Kokkos::resize(Ccolind,CSR_alloc);
1882  Kokkos::resize(Cvals,CSR_alloc);
1883  }
1884  OLD_ip = CSR_ip;
1885  }
1886 
1887  Crowptr[m] = CSR_ip;
1888 
1889  // Downward resize
1890  Kokkos::resize(Ccolind,CSR_ip);
1891  Kokkos::resize(Cvals,CSR_ip);
1892 
1893 #ifdef HAVE_TPETRA_MMM_TIMINGS
1894  {
1895  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
1896 #endif
1897 
1898  // Final sort & set of CRS arrays
1899  if (params.is_null() || params->get("sort entries",true))
1900  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
1901  C.setAllValues(Crowptr,Ccolind, Cvals);
1902 
1903 
1904 #ifdef HAVE_TPETRA_MMM_TIMINGS
1905  }
1906  auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
1907  {
1908 #endif
1909 
1910  // Final FillComplete
1911  //
1912  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1913  // Import (from domain Map to column Map) construction (which costs
1914  // lots of communication) by taking the previously constructed
1915  // Import object. We should be able to do this without interfering
1916  // with the implementation of the local part of sparse matrix-matrix
1917  // multply above.
1918  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1919  labelList->set("Timer Label",label);
1920  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1921  RCP<const Export<LO,GO,NO> > dummyExport;
1922  C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
1923 #ifdef HAVE_TPETRA_MMM_TIMINGS
1924  }
1925  MM2 = Teuchos::null;
1926 #endif
1927 
1928 }
1929 
1930 
1931 
1932 
1933 /*********************************************************************************************************/
1934 // Kernel method for computing the local portion of C = A*B (reuse)
1935 template<class Scalar,
1936  class LocalOrdinal,
1937  class GlobalOrdinal,
1938  class Node>
1939 void mult_A_B_reuse(
1940  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1941  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1942  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1943  const std::string& label,
1944  const Teuchos::RCP<Teuchos::ParameterList>& params)
1945 {
1946  using Teuchos::Array;
1947  using Teuchos::ArrayRCP;
1948  using Teuchos::ArrayView;
1949  using Teuchos::RCP;
1950  using Teuchos::rcp;
1951 
1952  // Tpetra typedefs
1953  typedef LocalOrdinal LO;
1954  typedef GlobalOrdinal GO;
1955  typedef Node NO;
1956  typedef Import<LO,GO,NO> import_type;
1957  typedef Map<LO,GO,NO> map_type;
1958 
1959  // Kokkos typedefs
1960  typedef typename map_type::local_map_type local_map_type;
1962  typedef typename KCRS::StaticCrsGraphType graph_t;
1963  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1964  typedef typename NO::execution_space execution_space;
1965  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1966  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1967 
1968 #ifdef HAVE_TPETRA_MMM_TIMINGS
1969  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1970  using Teuchos::TimeMonitor;
1971  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
1972 #endif
1973  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1974 
1975  // Grab all the maps
1976  RCP<const import_type> Cimport = C.getGraph()->getImporter();
1977  RCP<const map_type> Ccolmap = C.getColMap();
1978  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1979  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1980  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1981  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1982  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1983  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1984 
1985  // Build the final importer / column map, hash table lookups for C
1986  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1987  {
1988  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
1989  // So, column map of C may be a strict subset of the column map of B
1990  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1991  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1992  });
1993 
1994  if (!Bview.importMatrix.is_null()) {
1995  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1996  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1997 
1998  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1999  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2000  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2001  });
2002  }
2003  }
2004 
2005  // Run through all the hash table lookups once and for all
2006  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2007  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2008  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2009  GO aidx = Acolmap_local.getGlobalElement(i);
2010  LO B_LID = Browmap_local.getLocalElement(aidx);
2011  if (B_LID != LO_INVALID) {
2012  targetMapToOrigRow(i) = B_LID;
2013  targetMapToImportRow(i) = LO_INVALID;
2014  } else {
2015  LO I_LID = Irowmap_local.getLocalElement(aidx);
2016  targetMapToOrigRow(i) = LO_INVALID;
2017  targetMapToImportRow(i) = I_LID;
2018 
2019  }
2020  });
2021 
2022  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2023  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2024  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2025 }
2026 
2027 /*********************************************************************************************************/
2028 template<class Scalar,
2029  class LocalOrdinal,
2030  class GlobalOrdinal,
2031  class Node,
2032  class LocalOrdinalViewType>
2033 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2034  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2035  const LocalOrdinalViewType & targetMapToOrigRow,
2036  const LocalOrdinalViewType & targetMapToImportRow,
2037  const LocalOrdinalViewType & Bcol2Ccol,
2038  const LocalOrdinalViewType & Icol2Ccol,
2039  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2040  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2041  const std::string& label,
2042  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2043 #ifdef HAVE_TPETRA_MMM_TIMINGS
2044  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2045  using Teuchos::TimeMonitor;
2046  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2047  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2048 #else
2049  (void)label;
2050 #endif
2051  using Teuchos::RCP;
2052  using Teuchos::rcp;
2053 
2054 
2055  // Lots and lots of typedefs
2056  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2057  typedef typename KCRS::StaticCrsGraphType graph_t;
2058  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2059  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2060  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2061 
2062  typedef Scalar SC;
2063  typedef LocalOrdinal LO;
2064  typedef GlobalOrdinal GO;
2065  typedef Node NO;
2066  typedef Map<LO,GO,NO> map_type;
2067  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2068  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2069  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2070 
2071  // Sizes
2072  RCP<const map_type> Ccolmap = C.getColMap();
2073  size_t m = Aview.origMatrix->getNodeNumRows();
2074  size_t n = Ccolmap->getNodeNumElements();
2075 
2076  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2077  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2078  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2079  const KCRS & Cmat = C.getLocalMatrixHost();
2080 
2081  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2082  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2083  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2084  scalar_view_t Cvals = Cmat.values;
2085 
2086  c_lno_view_t Irowptr;
2087  lno_nnz_view_t Icolind;
2088  scalar_view_t Ivals;
2089  if(!Bview.importMatrix.is_null()) {
2090  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2091  Irowptr = lclB.graph.row_map;
2092  Icolind = lclB.graph.entries;
2093  Ivals = lclB.values;
2094  }
2095 
2096 #ifdef HAVE_TPETRA_MMM_TIMINGS
2097  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2098 #endif
2099 
2100  // Classic csr assembly (low memory edition)
2101  // mfh 27 Sep 2016: The c_status array is an implementation detail
2102  // of the local sparse matrix-matrix multiply routine.
2103 
2104  // The status array will contain the index into colind where this entry was last deposited.
2105  // c_status[i] < CSR_ip - not in the row yet
2106  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2107  // We start with this filled with INVALID's indicating that there are no entries yet.
2108  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2109  std::vector<size_t> c_status(n, ST_INVALID);
2110 
2111  // For each row of A/C
2112  size_t CSR_ip = 0, OLD_ip = 0;
2113  for (size_t i = 0; i < m; i++) {
2114  // First fill the c_status array w/ locations where we're allowed to
2115  // generate nonzeros for this row
2116  OLD_ip = Crowptr[i];
2117  CSR_ip = Crowptr[i+1];
2118  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2119  c_status[Ccolind[k]] = k;
2120 
2121  // Reset values in the row of C
2122  Cvals[k] = SC_ZERO;
2123  }
2124 
2125  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2126  LO Aik = Acolind[k];
2127  const SC Aval = Avals[k];
2128  if (Aval == SC_ZERO)
2129  continue;
2130 
2131  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2132  // Local matrix
2133  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2134 
2135  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2136  LO Bkj = Bcolind[j];
2137  LO Cij = Bcol2Ccol[Bkj];
2138 
2139  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2140  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2141  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2142 
2143  Cvals[c_status[Cij]] += Aval * Bvals[j];
2144  }
2145 
2146  } else {
2147  // Remote matrix
2148  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2149  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2150  LO Ikj = Icolind[j];
2151  LO Cij = Icol2Ccol[Ikj];
2152 
2153  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2154  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2155  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2156 
2157  Cvals[c_status[Cij]] += Aval * Ivals[j];
2158  }
2159  }
2160  }
2161  }
2162 
2163 #ifdef HAVE_TPETRA_MMM_TIMINGS
2164  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2165 #endif
2166 
2167  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2168 }
2169 
2170 
2171 /*********************************************************************************************************/
2172 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
/// \brief Driver for C = (I - omega * D^{-1} * A) * B when C is a
///   brand-new matrix (no graph to reuse).
///
/// Builds C's column Map and Import object (via Import::setUnion of the
/// local and remote parts of B), precomputes the local-index translation
/// tables the kernel needs, and then dispatches to the
/// KernelWrappers2::jacobi_A_B_newmatrix_kernel_wrapper specialization
/// for this Node type.
///
/// \param omega  Jacobi damping factor.
/// \param Dinv   Inverse of the matrix diagonal, as a Vector.
/// \param Aview  Gathered view of A; its colMap is used to build lookup tables.
/// \param Bview  Gathered view of B: origMatrix is the locally owned part,
///               importMatrix (possibly null) is the remote ("imported") part.
/// \param C      Output matrix; its column Map is replaced here.
/// \param label  Timer-name prefix (used only with HAVE_TPETRA_MMM_TIMINGS).
/// \param params Optional parameter list, forwarded to the kernel.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void jacobi_A_B_newmatrix(
  Scalar omega,
  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
  const std::string& label,
  const Teuchos::RCP<Teuchos::ParameterList>& params)
{
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;
  // typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;

  typedef Import<LO,GO,NO> import_type;
  typedef Map<LO,GO,NO> map_type;
  typedef typename map_type::local_map_type local_map_type;

  // All of the Kokkos typedefs
  // NOTE(review): KCRS is referenced below without a visible typedef;
  // presumably a "typedef typename Tpetra::CrsMatrix<...>::local_matrix_*_type
  // KCRS;" line was lost in extraction -- confirm against the repository copy.
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;


#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
#endif
  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

  // Build the final importer / column map, hash table lookups for C
  RCP<const import_type> Cimport;
  RCP<const map_type> Ccolmap;
  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
    Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
  // Kernel-usable ("local") versions of the Maps we translate between below.
  // The I* maps exist only if B has a remote ("import") part.
  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();

  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
  // indices of B, to local column indices of C.  (B and C have the
  // same number of columns.)  The kernel uses this, instead of
  // copying the entire input matrix B and converting its column
  // indices to those of C.
  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;

  if (Bview.importMatrix.is_null()) {
    // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
    Cimport = Bimport;
    Ccolmap = Bview.colMap;
    // Bcol2Ccol is trivial: fill it with the identity permutation.
    Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getNodeNumElements ()));
    Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
        Bcol2Ccol(i) = static_cast<LO> (i);
      });
  } else {
    // mfh 27 Sep 2016: B has "remotes," so we need to build the
    // column Map of C, as well as C's Import object (from its domain
    // Map to its column Map).  C's column Map is the union of the
    // column Maps of (the local part of) B, and the "remote" part of
    // B.  Ditto for the Import.  We have optimized this "setUnion"
    // operation on Import objects and Maps.

    // Choose the right variant of setUnion
    if (!Bimport.is_null() && !Iimport.is_null()){
      Cimport = Bimport->setUnion(*Iimport);
      // NOTE(review): this assignment is redundant; Ccolmap is assigned
      // unconditionally right after the if/else chain below.
      Ccolmap = Cimport->getTargetMap();

    } else if (!Bimport.is_null() && Iimport.is_null()) {
      Cimport = Bimport->setUnion();

    } else if(Bimport.is_null() && !Iimport.is_null()) {
      Cimport = Iimport->setUnion();

    } else
      throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");

    Ccolmap = Cimport->getTargetMap();

    // setUnion must not have changed the source (domain) Map.
    TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
      std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");

    // NOTE: This is not efficient and should be folded into setUnion
    //
    // mfh 27 Sep 2016: What the above comment means, is that the
    // setUnion operation on Import objects could also compute these
    // local index - to - local index look-up tables.
    Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
    local_map_type Ccolmap_local = Ccolmap->getLocalMap();
    // Translate B's (and the import matrix's) local column indices into
    // C's local column indices by round-tripping through the global index.
    Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
        Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
      });
    Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
        Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
      });

  }

  // Replace the column map
  //
  // mfh 27 Sep 2016: We do this because C was originally created
  // without a column Map.  Now we have its column Map.
  C.replaceColMap(Ccolmap);

  // mfh 27 Sep 2016: Construct tables that map from local column
  // indices of A, to local row indices of either B_local (the locally
  // owned part of B), or B_remote (the "imported" remote part of B).
  //
  // For column index Aik in row i of A, if the corresponding row of B
  // exists in the local part of B ("orig") (which I'll call B_local),
  // then targetMapToOrigRow[Aik] is the local index of that row of B.
  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
  //
  // For column index Aik in row i of A, if the corresponding row of B
  // exists in the remote part of B ("Import") (which I'll call
  // B_remote), then targetMapToImportRow[Aik] is the local index of
  // that row of B.  Otherwise, targetMapToImportRow[Aik] is "invalid"
  // (a flag value).

  // Run through all the hash table lookups once and for all
  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
      GO aidx = Acolmap_local.getGlobalElement(i);
      LO B_LID = Browmap_local.getLocalElement(aidx);
      if (B_LID != LO_INVALID) {
        targetMapToOrigRow(i) = B_LID;
        targetMapToImportRow(i) = LO_INVALID;
      } else {
        // Not a locally owned row of B; look it up in the remote part.
        LO I_LID = Irowmap_local.getLocalElement(aidx);
        targetMapToOrigRow(i) = LO_INVALID;
        targetMapToImportRow(i) = I_LID;

      }
    });

  // Call the actual kernel.  We'll rely on partial template specialization to call the correct one ---
  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);

}
2332 
2333 
2334 /*********************************************************************************************************/
2335 // Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2336 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2337 
2338 template<class Scalar,
2339  class LocalOrdinal,
2340  class GlobalOrdinal,
2341  class Node,
2342  class LocalOrdinalViewType>
2343 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2344  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2345  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2346  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2347  const LocalOrdinalViewType & targetMapToOrigRow,
2348  const LocalOrdinalViewType & targetMapToImportRow,
2349  const LocalOrdinalViewType & Bcol2Ccol,
2350  const LocalOrdinalViewType & Icol2Ccol,
2351  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2352  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2353  const std::string& label,
2354  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2355 
2356 #ifdef HAVE_TPETRA_MMM_TIMINGS
2357  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2358  using Teuchos::TimeMonitor;
2359  auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
2360 #endif
2361 
2362  using Teuchos::Array;
2363  using Teuchos::ArrayRCP;
2364  using Teuchos::ArrayView;
2365  using Teuchos::RCP;
2366  using Teuchos::rcp;
2367 
2368  // Lots and lots of typedefs
2369  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2370  typedef typename KCRS::StaticCrsGraphType graph_t;
2371  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2372  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2373  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2374  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2375 
2376  // Jacobi-specific
2377  typedef typename scalar_view_t::memory_space scalar_memory_space;
2378 
2379  typedef Scalar SC;
2380  typedef LocalOrdinal LO;
2381  typedef GlobalOrdinal GO;
2382  typedef Node NO;
2383 
2384  typedef Map<LO,GO,NO> map_type;
2385  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2386  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2387 
2388  // Sizes
2389  RCP<const map_type> Ccolmap = C.getColMap();
2390  size_t m = Aview.origMatrix->getNodeNumRows();
2391  size_t n = Ccolmap->getNodeNumElements();
2392  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2393 
2394  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2395  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2396  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2397 
2398  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2399  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2400  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2401 
2402  c_lno_view_t Irowptr;
2403  lno_nnz_view_t Icolind;
2404  scalar_view_t Ivals;
2405  if(!Bview.importMatrix.is_null()) {
2406  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2407  Irowptr = lclB.graph.row_map;
2408  Icolind = lclB.graph.entries;
2409  Ivals = lclB.values;
2410  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2411  }
2412 
2413  // Jacobi-specific inner stuff
2414  auto Dvals =
2415  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2416 
2417  // Teuchos::ArrayView::operator[].
2418  // The status array will contain the index into colind where this entry was last deposited.
2419  // c_status[i] < CSR_ip - not in the row yet.
2420  // c_status[i] >= CSR_ip, this is the entry where you can find the data
2421  // We start with this filled with INVALID's indicating that there are no entries yet.
2422  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2423  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2424  Array<size_t> c_status(n, ST_INVALID);
2425 
2426  // Classic csr assembly (low memory edition)
2427  //
2428  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2429  // The method loops over rows of A, and may resize after processing
2430  // each row. Chris Siefert says that this reflects experience in
2431  // ML; for the non-threaded case, ML found it faster to spend less
2432  // effort on estimation and risk an occasional reallocation.
2433  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2434  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2435  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2436  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2437  size_t CSR_ip = 0, OLD_ip = 0;
2438 
2439  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2440 
2441  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2442  // routine. The routine computes
2443  //
2444  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2445  //
2446  // This corresponds to one sweep of (weighted) Jacobi.
2447  //
2448  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2449  // you whether the corresponding row of B belongs to B_local
2450  // ("orig") or B_remote ("Import").
2451 
2452  // For each row of A/C
2453  for (size_t i = 0; i < m; i++) {
2454  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2455  // on the calling process.
2456  Crowptr[i] = CSR_ip;
2457  SC minusOmegaDval = -omega*Dvals(i,0);
2458 
2459  // Entries of B
2460  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2461  Scalar Bval = Bvals[j];
2462  if (Bval == SC_ZERO)
2463  continue;
2464  LO Bij = Bcolind[j];
2465  LO Cij = Bcol2Ccol[Bij];
2466 
2467  // Assume no repeated entries in B
2468  c_status[Cij] = CSR_ip;
2469  Ccolind[CSR_ip] = Cij;
2470  Cvals[CSR_ip] = Bvals[j];
2471  CSR_ip++;
2472  }
2473 
2474  // Entries of -omega * Dinv * A * B
2475  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2476  LO Aik = Acolind[k];
2477  const SC Aval = Avals[k];
2478  if (Aval == SC_ZERO)
2479  continue;
2480 
2481  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2482  // Local matrix
2483  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2484 
2485  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2486  LO Bkj = Bcolind[j];
2487  LO Cij = Bcol2Ccol[Bkj];
2488 
2489  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2490  // New entry
2491  c_status[Cij] = CSR_ip;
2492  Ccolind[CSR_ip] = Cij;
2493  Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2494  CSR_ip++;
2495 
2496  } else {
2497  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2498  }
2499  }
2500 
2501  } else {
2502  // Remote matrix
2503  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2504  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2505  LO Ikj = Icolind[j];
2506  LO Cij = Icol2Ccol[Ikj];
2507 
2508  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2509  // New entry
2510  c_status[Cij] = CSR_ip;
2511  Ccolind[CSR_ip] = Cij;
2512  Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2513  CSR_ip++;
2514  } else {
2515  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2516  }
2517  }
2518  }
2519  }
2520 
2521  // Resize for next pass if needed
2522  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2523  CSR_alloc *= 2;
2524  Kokkos::resize(Ccolind,CSR_alloc);
2525  Kokkos::resize(Cvals,CSR_alloc);
2526  }
2527  OLD_ip = CSR_ip;
2528  }
2529  Crowptr[m] = CSR_ip;
2530 
2531  // Downward resize
2532  Kokkos::resize(Ccolind,CSR_ip);
2533  Kokkos::resize(Cvals,CSR_ip);
2534 
2535  {
2536 #ifdef HAVE_TPETRA_MMM_TIMINGS
2537  auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2538 #endif
2539 
2540  // Replace the column map
2541  //
2542  // mfh 27 Sep 2016: We do this because C was originally created
2543  // without a column Map. Now we have its column Map.
2544  C.replaceColMap(Ccolmap);
2545 
2546  // Final sort & set of CRS arrays
2547  //
2548  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2549  // matrix-matrix multiply routine sort the entries for us?
2550  // Final sort & set of CRS arrays
2551  if (params.is_null() || params->get("sort entries",true))
2552  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2553  C.setAllValues(Crowptr,Ccolind, Cvals);
2554  }
2555  {
2556 #ifdef HAVE_TPETRA_MMM_TIMINGS
2557  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2558 #endif
2559 
2560  // Final FillComplete
2561  //
2562  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2563  // Import (from domain Map to column Map) construction (which costs
2564  // lots of communication) by taking the previously constructed
2565  // Import object. We should be able to do this without interfering
2566  // with the implementation of the local part of sparse matrix-matrix
2567  // multply above
2568  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2569  labelList->set("Timer Label",label);
2570  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2571  RCP<const Export<LO,GO,NO> > dummyExport;
2572  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2573 
2574  }
2575 }
2576 
2577 
2578 /*********************************************************************************************************/
2579 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
/// \brief Driver for C = (I - omega * D^{-1} * A) * B when C already has a
///   fill-complete graph that is being reused.
///
/// Unlike the "newmatrix" path, C's column Map and Importer already exist;
/// this routine only rebuilds the local-index translation tables and then
/// dispatches to KernelWrappers2::jacobi_A_B_reuse_kernel_wrapper.
///
/// \param omega  Jacobi damping factor.
/// \param Dinv   Inverse of the matrix diagonal, as a Vector.
/// \param Aview  Gathered view of A; its colMap is used for lookup tables.
/// \param Bview  Gathered view of B: origMatrix is the locally owned part,
///               importMatrix (possibly null) is the remote part.
/// \param C      Output matrix with an existing (static) graph.
/// \param label  Timer-name prefix (used only with HAVE_TPETRA_MMM_TIMINGS).
/// \param params Optional parameter list, forwarded to the kernel.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void jacobi_A_B_reuse(
  Scalar omega,
  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
  const std::string& label,
  const Teuchos::RCP<Teuchos::ParameterList>& params)
{
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;

  typedef Import<LO,GO,NO> import_type;
  typedef Map<LO,GO,NO> map_type;

  // Kokkos typedefs
  typedef typename map_type::local_map_type local_map_type;
  // NOTE(review): KCRS is referenced below without a visible typedef;
  // presumably a "typedef typename Tpetra::CrsMatrix<...>::local_matrix_*_type
  // KCRS;" line was lost in extraction -- confirm against the repository copy.
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
#endif
  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

  // Grab all the maps
  RCP<const import_type> Cimport = C.getGraph()->getImporter();
  RCP<const map_type> Ccolmap = C.getColMap();
  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
  local_map_type Ccolmap_local = Ccolmap->getLocalMap();

  // Build the final importer / column map, hash table lookups for C
  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
  {
    // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
    // So, column map of C may be a strict subset of the column map of B
    Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
        Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
      });

    if (!Bview.importMatrix.is_null()) {
      // The reused Importer must still have B's domain Map as its source.
      TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
        std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");

      Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
      Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
          Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
        });
    }
  }

  // Run through all the hash table lookups once and for all:
  // for each local column of A, find the matching local row in either
  // B's local part (targetMapToOrigRow) or its remote part
  // (targetMapToImportRow); the other entry gets the invalid flag.
  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
      GO aidx = Acolmap_local.getGlobalElement(i);
      LO B_LID = Browmap_local.getLocalElement(aidx);
      if (B_LID != LO_INVALID) {
        targetMapToOrigRow(i) = B_LID;
        targetMapToImportRow(i) = LO_INVALID;
      } else {
        LO I_LID = Irowmap_local.getLocalElement(aidx);
        targetMapToOrigRow(i) = LO_INVALID;
        targetMapToImportRow(i) = I_LID;

      }
    });

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
#endif

  // Call the actual kernel.  We'll rely on partial template specialization to call the correct one ---
  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
}
2677 
2678 
2679 
2680 /*********************************************************************************************************/
/// \brief Serial (non-threaded) kernel computing the local part of
///   C = (I - omega * D^{-1} * A) * B into an existing, fill-complete C.
///
/// Because C's graph is static, no new structural entries may be created:
/// each row's admissible slots are pre-registered in the c_status
/// scoreboard, values are zeroed, and any attempt to write outside the
/// existing pattern throws std::runtime_error.
///
/// \param omega  Jacobi damping factor.
/// \param Dinv   Inverse diagonal; local values are read as Dvals(i,0).
/// \param Aview  Gathered view of A (locally owned rows are multiplied).
/// \param Bview  Gathered view of B: local part plus optional remote part.
/// \param targetMapToOrigRow   Local column of A -> local row of B's local
///   part, or LO-invalid.
/// \param targetMapToImportRow Local column of A -> local row of B's remote
///   part, or LO-invalid.
/// \param Bcol2Ccol Local column indices of B -> local column indices of C.
/// \param Icol2Ccol Local column indices of B's remote part -> C.
/// \param C      Output matrix; existing values are overwritten in place and
///               fillComplete is re-called with its own domain/range Maps.
/// \param label  Timer-name prefix (only with HAVE_TPETRA_MMM_TIMINGS).
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
    const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
    const LocalOrdinalViewType & targetMapToOrigRow,
    const LocalOrdinalViewType & targetMapToImportRow,
    const LocalOrdinalViewType & Bcol2Ccol,
    const LocalOrdinalViewType & Icol2Ccol,
    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
    Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
    const std::string& label,
    const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
#else
  (void)label;
#endif
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;
  typedef typename scalar_view_t::memory_space scalar_memory_space;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO,GO,NO> map_type;
  // NOTE(review): ST_INVALID is the LO invalid value converted to size_t;
  // for signed LO (-1) it sign-extends to the size_t all-ones flag value.
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();
  size_t n = Ccolmap->getNodeNumElements();

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
  const KCRS & Cmat = C.getLocalMatrixHost();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
  scalar_view_t Cvals = Cmat.values;

  // Remote ("imported") part of B, if any.
  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if(!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixHost();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals = lclB.values;
  }

  // Jacobi-specific inner stuff: host view of the inverse diagonal.
  auto Dvals =
    Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
#endif

  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] < CSR_ip  - not in the row yet
  //   c_status[i] >= CSR_ip - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  std::vector<size_t> c_status(n, ST_INVALID);

  // For each row of A/C
  size_t CSR_ip = 0, OLD_ip = 0;
  for (size_t i = 0; i < m; i++) {

    // First fill the c_status array w/ locations where we're allowed to
    // generate nonzeros for this row
    OLD_ip = Crowptr[i];
    CSR_ip = Crowptr[i+1];
    for (size_t k = OLD_ip; k < CSR_ip; k++) {
      c_status[Ccolind[k]] = k;

      // Reset values in the row of C
      Cvals[k] = SC_ZERO;
    }

    SC minusOmegaDval = -omega*Dvals(i,0);

    // Entries of B: the "I * B" term; each must land in an existing slot.
    for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
      Scalar Bval = Bvals[j];
      if (Bval == SC_ZERO)
        continue;
      LO Bij = Bcolind[j];
      LO Cij = Bcol2Ccol[Bij];

      // c_status outside [OLD_ip, CSR_ip) means column Cij is not in
      // C's static graph for this row.
      TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
        std::runtime_error, "Trying to insert a new entry into a static graph");

      Cvals[c_status[Cij]] = Bvals[j];
    }

    // Entries of -omega * Dinv * A * B
    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik = Acolind[k];
      const SC Aval = Avals[k];
      if (Aval == SC_ZERO)
        continue;

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // Local matrix
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
            std::runtime_error, "Trying to insert a new entry into a static graph");

          Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
        }

      } else {
        // Remote matrix
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
            std::runtime_error, "Trying to insert a new entry into a static graph");

          Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
        }
      }
    }
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = Teuchos::null;
  MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
#endif

  // Graph is unchanged, so a plain fillComplete with C's own Maps suffices.
  C.fillComplete(C.getDomainMap(), C.getRangeMap());
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = Teuchos::null;
  MM = Teuchos::null;
#endif

}
2846 
2847 
2848 
2849 /*********************************************************************************************************/
2850 template<class Scalar,
2851  class LocalOrdinal,
2852  class GlobalOrdinal,
2853  class Node>
2854 void import_and_extract_views(
2855  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
2856  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
2857  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2858  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
2859  bool userAssertsThereAreNoRemotes,
2860  const std::string& label,
2861  const Teuchos::RCP<Teuchos::ParameterList>& params)
2862 {
2863  using Teuchos::Array;
2864  using Teuchos::ArrayView;
2865  using Teuchos::RCP;
2866  using Teuchos::rcp;
2867  using Teuchos::null;
2868 
2869  typedef Scalar SC;
2870  typedef LocalOrdinal LO;
2871  typedef GlobalOrdinal GO;
2872  typedef Node NO;
2873 
2874  typedef Map<LO,GO,NO> map_type;
2875  typedef Import<LO,GO,NO> import_type;
2876  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
2877 
2878 #ifdef HAVE_TPETRA_MMM_TIMINGS
2879  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2880  using Teuchos::TimeMonitor;
2881  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
2882 #endif
2883  // The goal of this method is to populate the 'Aview' struct with views of the
2884  // rows of A, including all rows that correspond to elements in 'targetMap'.
2885  //
2886  // If targetMap includes local elements that correspond to remotely-owned rows
2887  // of A, then those remotely-owned rows will be imported into
2888  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
2889  Aview.deleteContents();
2890 
2891  Aview.origMatrix = rcp(&A, false);
2892  Aview.origRowMap = A.getRowMap();
2893  Aview.rowMap = targetMap;
2894  Aview.colMap = A.getColMap();
2895  Aview.domainMap = A.getDomainMap();
2896  Aview.importColMap = null;
2897  RCP<const map_type> rowMap = A.getRowMap();
2898  const int numProcs = rowMap->getComm()->getSize();
2899 
2900  // Short circuit if the user swears there are no remotes (or if we're in serial)
2901  if (userAssertsThereAreNoRemotes || numProcs < 2)
2902  return;
2903 
2904  RCP<const import_type> importer;
2905  if (params != null && params->isParameter("importer")) {
2906  importer = params->get<RCP<const import_type> >("importer");
2907 
2908  } else {
2909 #ifdef HAVE_TPETRA_MMM_TIMINGS
2910  MM = Teuchos::null;
2911  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
2912 #endif
2913 
2914  // Mark each row in targetMap as local or remote, and go ahead and get a view
2915  // for the local rows
2916  RCP<const map_type> remoteRowMap;
2917  size_t numRemote = 0;
2918  int mode = 0;
2919  if (!prototypeImporter.is_null() &&
2920  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
2921  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
2922  // We have a valid prototype importer --- ask it for the remotes
2923 #ifdef HAVE_TPETRA_MMM_TIMINGS
2924  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode1"));
2925 #endif
2926  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
2927  numRemote = prototypeImporter->getNumRemoteIDs();
2928 
2929  Array<GO> remoteRows(numRemote);
2930  for (size_t i = 0; i < numRemote; i++)
2931  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
2932 
2933  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2934  rowMap->getIndexBase(), rowMap->getComm()));
2935  mode = 1;
2936 
2937  } else if (prototypeImporter.is_null()) {
2938  // No prototype importer --- count the remotes the hard way
2939 #ifdef HAVE_TPETRA_MMM_TIMINGS
2940  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode2"));
2941 #endif
2942  ArrayView<const GO> rows = targetMap->getNodeElementList();
2943  size_t numRows = targetMap->getNodeNumElements();
2944 
2945  Array<GO> remoteRows(numRows);
2946  for(size_t i = 0; i < numRows; ++i) {
2947  const LO mlid = rowMap->getLocalElement(rows[i]);
2948 
2949  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
2950  remoteRows[numRemote++] = rows[i];
2951  }
2952  remoteRows.resize(numRemote);
2953  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2954  rowMap->getIndexBase(), rowMap->getComm()));
2955  mode = 2;
2956 
2957  } else {
2958  // PrototypeImporter is bad. But if we're in serial that's OK.
2959  mode = 3;
2960  }
2961 
2962  if (numProcs < 2) {
2963  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
2964  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
2965  // If only one processor we don't need to import any remote rows, so return.
2966  return;
2967  }
2968 
2969  //
2970  // Now we will import the needed remote rows of A, if the global maximum
2971  // value of numRemote is greater than 0.
2972  //
2973 #ifdef HAVE_TPETRA_MMM_TIMINGS
2974  MM = Teuchos::null;
2975  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
2976 #endif
2977 
2978  global_size_t globalMaxNumRemote = 0;
2979  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
2980 
2981  if (globalMaxNumRemote > 0) {
2982 #ifdef HAVE_TPETRA_MMM_TIMINGS
2983  MM = Teuchos::null;
2984  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
2985 #endif
2986  // Create an importer with target-map remoteRowMap and source-map rowMap.
2987  if (mode == 1)
2988  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
2989  else if (mode == 2)
2990  importer = rcp(new import_type(rowMap, remoteRowMap));
2991  else
2992  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
2993  }
2994 
2995  if (params != null)
2996  params->set("importer", importer);
2997  }
2998 
2999  if (importer != null) {
3000 #ifdef HAVE_TPETRA_MMM_TIMINGS
3001  MM = Teuchos::null;
3002  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
3003 #endif
3004 
3005  // Now create a new matrix into which we can import the remote rows of A that we need.
3006  Teuchos::ParameterList labelList;
3007  labelList.set("Timer Label", label);
3008  auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3009 
3010  bool isMM = true;
3011  bool overrideAllreduce = false;
3012  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3013  // Minor speedup tweak - avoid computing the global constants
3014  Teuchos::ParameterList params_sublist;
3015  if(!params.is_null()) {
3016  labelList.set("compute global constants", params->get("compute global constants",false));
3017  params_sublist = params->sublist("matrixmatrix: kernel params",false);
3018  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3019  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3020  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3021  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3022  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3023  }
3024  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3025  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3026  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3027 
3028  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3029  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3030 
3031 #if 0
3032  // Disabled code for dumping input matrices
3033  static int count=0;
3034  char str[80];
3035  sprintf(str,"import_matrix.%d.dat",count);
3037  count++;
3038 #endif
3039 
3040 #ifdef HAVE_TPETRA_MMM_STATISTICS
3041  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3042 #endif
3043 
3044 
3045 #ifdef HAVE_TPETRA_MMM_TIMINGS
3046  MM = Teuchos::null;
3047  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3048 #endif
3049 
3050  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3051  Aview.importColMap = Aview.importMatrix->getColMap();
3052 #ifdef HAVE_TPETRA_MMM_TIMINGS
3053  MM = Teuchos::null;
3054 #endif
3055  }
3056 }
3057 
3058 
3059 
3060 
3061 
/*********************************************************************************************************/
// Merge the locally-owned part of B (Bview.origMatrix) with the imported
// remote rows (Bview.importMatrix) into a single local Kokkos CRS matrix
// whose row i corresponds to local column i of A.
// This only merges matrices that look like B & Bimport, aka, they have no overlapping rows.
//
// Parameters (the Acol2*/[BI]col2* arguments are device views of local ordinals):
//   Aview/Bview       - views of the A and B operands of the multiply
//   Acol2Brow         - for local column i of A, the local row of B holding
//                       that row, or LO_INVALID if the row was imported
//   Acol2Irow         - for local column i of A, the local row of
//                       Bview.importMatrix (used when Acol2Brow(i) is invalid)
//   Bcol2Ccol         - maps local columns of B into the merged column space
//   Icol2Ccol         - maps local columns of Bview.importMatrix into the
//                       merged column space
//   mergedNodeNumCols - number of columns of the merged matrix
//
// Returns the merged local matrix, or B's local matrix unchanged when no
// merge is needed (no import matrix and A's colMap *is* B's rowMap).
template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
 const LocalOrdinalViewType & Acol2Brow,
 const LocalOrdinalViewType & Acol2Irow,
 const LocalOrdinalViewType & Bcol2Ccol,
 const LocalOrdinalViewType & Icol2Ccol,
 const size_t mergedNodeNumCols) {

 using Teuchos::RCP;
 // KCRS is the local (device) Kokkos sparse-matrix type of the inputs (see
 // the assignments from getLocalMatrixDevice() below); graph_t and the view
 // typedefs name its rowptr, column-index, and value view types.
 typedef typename KCRS::StaticCrsGraphType graph_t;
 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
 typedef typename KCRS::values_type::non_const_type scalar_view_t;
 // Grab the Kokkos::SparseCrsMatrices
 const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
 const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();

 // We need to do this dance if either (a) we have a Bimport, or (b) we don't
 // but A's colMap is not the same object as B's rowMap.
 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
 // We do have a Bimport
 // NOTE: We're going to merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
 // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
 RCP<const KCRS> Ik_;
 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
 // Iks is a by-value (shallow) copy so it can be captured by the device lambdas below.
 KCRS Iks;
 if(Ik!=0) Iks = *Ik;
 // The merged matrix has one row per local column of A.
 size_t merge_numrows = Ak.numCols();
 // At least the last entry of this needs to be initialized
 lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);

 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();

 // Use a Kokkos::parallel_scan to build the rowptr
 typedef typename Node::execution_space execution_space;
 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
 KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
 if(final) Mrowptr(i) = update;
 // Get the row count: row i comes from B when Acol2Brow(i) is valid,
 // otherwise from the import matrix via Acol2Irow(i).
 size_t ct=0;
 if(Acol2Brow(i)!=LO_INVALID)
 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
 else
 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
 update+=ct;

 // The scan pass only writes entry i, so the final iteration must also
 // write the trailing end-of-rowptr entry.
 if(final && i+1==merge_numrows)
 Mrowptr(i+1)=update;
 });

 // Allocate nnz (the total entry count is the last rowptr entry).
 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);

 // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
 // NOTE(review): this typedef redeclares range_type identically to the one
 // above; legal and harmless, but redundant.
 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
 if(Acol2Brow(i)!=LO_INVALID) {
 // Copy row Acol2Brow(i) of B, remapping its column ids via Bcol2Ccol.
 size_t row = Acol2Brow(i);
 size_t start = Bk.graph.row_map(row);
 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
 }
 }
 else {
 // Copy row Acol2Irow(i) of the import matrix, remapping via Icol2Ccol.
 size_t row = Acol2Irow(i);
 size_t start = Iks.graph.row_map(row);
 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
 }
 }
 });

 KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
 return newmat;
 }
 else {
 // We don't have a Bimport (the easy case)
 return Bk;
 }
}//end merge_matrices
3152 
3153 template<typename SC, typename LO, typename GO, typename NO>
3154 void AddKernels<SC, LO, GO, NO>::
3155 addSorted(
3156  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3157  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3158  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3159  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3160  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3161  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3162  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3163  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3164  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3165  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3166  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3167 {
3168  using Teuchos::TimeMonitor;
3169  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3170  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3171  auto nrows = Arowptrs.extent(0) - 1;
3172  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3173  typename AddKern::KKH handle;
3174  handle.create_spadd_handle(true);
3175  auto addHandle = handle.get_spadd_handle();
3176 #ifdef HAVE_TPETRA_MMM_TIMINGS
3177  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3178 #endif
3179  KokkosSparse::Experimental::spadd_symbolic
3180  <KKH,
3181  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3182  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3183  row_ptrs_array, col_inds_array>
3184  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3185  //KokkosKernels requires values to be zeroed
3186  Cvals = values_array("C values", addHandle->get_c_nnz());
3187  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3188 #ifdef HAVE_TPETRA_MMM_TIMINGS
3189  MM = Teuchos::null;
3190  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
3191 #endif
3192  KokkosSparse::Experimental::spadd_numeric(&handle,
3193  Arowptrs, Acolinds, Avals, scalarA,
3194  Browptrs, Bcolinds, Bvals, scalarB,
3195  Crowptrs, Ccolinds, Cvals);
3196 }
3197 
3198 template<typename SC, typename LO, typename GO, typename NO>
3199 void AddKernels<SC, LO, GO, NO>::
3200 addUnsorted(
3201  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3202  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3203  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3204  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3205  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3206  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3207  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3208  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3209  GO /* numGlobalCols */,
3210  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3211  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3212  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3213 {
3214  using Teuchos::TimeMonitor;
3215  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3216  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3217  auto nrows = Arowptrs.extent(0) - 1;
3218  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3219  typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3220  typename AddKern::KKH handle;
3221  handle.create_spadd_handle(false);
3222  auto addHandle = handle.get_spadd_handle();
3223 #ifdef HAVE_TPETRA_MMM_TIMINGS
3224  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3225 #endif
3226  KokkosSparse::Experimental::spadd_symbolic
3227  <KKH,
3228  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3229  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3230  row_ptrs_array, col_inds_array>
3231  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3232  //Cvals must be zeroed out
3233  Cvals = values_array("C values", addHandle->get_c_nnz());
3234  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3235 #ifdef HAVE_TPETRA_MMM_TIMINGS
3236  MM = Teuchos::null;
3237  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3238 #endif
3239  KokkosSparse::Experimental::spadd_numeric(&handle,
3240  Arowptrs, Acolinds, Avals, scalarA,
3241  Browptrs, Bcolinds, Bvals, scalarB,
3242  Crowptrs, Ccolinds, Cvals);
3243 }
3244 
3245 template<typename GO,
3246  typename LocalIndicesType,
3247  typename GlobalIndicesType,
3248  typename ColMapType>
3249 struct ConvertLocalToGlobalFunctor
3250 {
3251  ConvertLocalToGlobalFunctor(
3252  const LocalIndicesType& colindsOrig_,
3253  const GlobalIndicesType& colindsConverted_,
3254  const ColMapType& colmap_) :
3255  colindsOrig (colindsOrig_),
3256  colindsConverted (colindsConverted_),
3257  colmap (colmap_)
3258  {}
3259  KOKKOS_INLINE_FUNCTION void
3260  operator() (const GO i) const
3261  {
3262  colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3263  }
3264  LocalIndicesType colindsOrig;
3265  GlobalIndicesType colindsConverted;
3266  ColMapType colmap;
3267 };
3268 
// Compute C = scalarA*A + scalarB*B when A and B have *different* column
// maps: both matrices' local column indices are first converted to global
// indices via their local column maps, then the unsorted spadd kernel adds
// them.  C's column indices (Ccolinds) are therefore GLOBAL ordinals.
// Throws nothing itself; Kokkos kernels abort on internal errors.
template<typename SC, typename LO, typename GO, typename NO>
void MMdetails::AddKernels<SC, LO, GO, NO>::
convertToGlobalAndAdd(
 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
{
 using Teuchos::TimeMonitor;
 //Need to use a different KokkosKernelsHandle type than other versions,
 //since the ordinals are now GO
 using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
 typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;

 // Shallow copies of the input views (no data is copied here).
 const values_array Avals = A.values;
 const values_array Bvals = B.values;
 const col_inds_array Acolinds = A.graph.entries;
 const col_inds_array Bcolinds = B.graph.entries;
 auto Arowptrs = A.graph.row_map;
 auto Browptrs = B.graph.row_map;
 // Scratch arrays holding the global-index versions of A's and B's colinds.
 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
#ifdef HAVE_TPETRA_MMM_TIMINGS
 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
#endif
 // Local-to-global conversion: one parallel pass over each matrix's entries.
 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
 // Global indices are not sorted within rows, so use the unsorted path.
 KKH_GO handle;
 handle.create_spadd_handle(false);
 auto addHandle = handle.get_spadd_handle();
#ifdef HAVE_TPETRA_MMM_TIMINGS
 MM = Teuchos::null;
 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
#endif
 // Symbolic phase: fills Crowptrs and computes C's entry count.
 auto nrows = Arowptrs.extent(0) - 1;
 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
 KokkosSparse::Experimental::spadd_symbolic
 <KKH_GO, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
 (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
 // Cvals must start zeroed (KokkosKernels requirement); Ccolinds need not.
 Cvals = values_array("C values", addHandle->get_c_nnz());
 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
#ifdef HAVE_TPETRA_MMM_TIMINGS
 MM = Teuchos::null;
 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
#endif
 // Numeric phase: fills Ccolinds (global ids) and Cvals with the scaled sum.
 KokkosSparse::Experimental::spadd_numeric(&handle,
 Arowptrs, AcolindsConverted, Avals, scalarA,
 Browptrs, BcolindsConverted, Bvals, scalarB,
 Crowptrs, Ccolinds, Cvals);
}
3326 
3327 
} //End namespace MMdetails
3329 
3330 } //End namespace Tpetra
3331 
/*********************************************************************************************************/
//
// Explicit instantiation macro
//
// Must be expanded from within the Tpetra namespace!
//
namespace Tpetra {

// TPETRA_MATRIXMATRIX_INSTANT(SCALAR, LO, GO, NODE) explicitly instantiates,
// for one (Scalar, LocalOrdinal, GlobalOrdinal, Node) combination:
//   - MatrixMatrix::Multiply and MatrixMatrix::Jacobi,
//   - both MatrixMatrix::Add overloads and both MatrixMatrix::add overloads,
//   - the MMdetails::AddKernels helper struct.
// NOTE: no comments may appear inside the macro body itself; the trailing
// backslashes splice lines before comments are stripped, which would
// truncate the macro.
#define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
 template \
 void MatrixMatrix::Multiply( \
 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
 bool transposeA, \
 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
 bool transposeB, \
 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
 bool call_FillComplete_on_result, \
 const std::string & label, \
 const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
template \
 void MatrixMatrix::Jacobi( \
 SCALAR omega, \
 const Vector< SCALAR, LO, GO, NODE > & Dinv, \
 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
 bool call_FillComplete_on_result, \
 const std::string & label, \
 const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
 template \
 void MatrixMatrix::Add( \
 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
 bool transposeA, \
 SCALAR scalarA, \
 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
 bool transposeB, \
 SCALAR scalarB, \
 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
\
 template \
 void MatrixMatrix::Add( \
 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
 bool transposeA, \
 SCALAR scalarA, \
 CrsMatrix<SCALAR, LO, GO, NODE>& B, \
 SCALAR scalarB ); \
\
 template \
 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
 MatrixMatrix::add<SCALAR, LO, GO, NODE> \
 (const SCALAR & alpha, \
 const bool transposeA, \
 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
 const SCALAR & beta, \
 const bool transposeB, \
 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
 const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
 const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
 const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
 template \
 void \
 MatrixMatrix::add< SCALAR , LO, GO , NODE > \
 (const SCALAR & alpha, \
 const bool transposeA, \
 const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
 const SCALAR& beta, \
 const bool transposeB, \
 const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
 CrsMatrix< SCALAR , LO, GO , NODE >& C, \
 const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
 const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
 const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
 template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>;

} //End namespace Tpetra
3411 
3412 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
bool isFillActive() const
Whether the matrix is not fill complete.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
size_t global_size_t
Global size_t object.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
Matrix Market file readers and writers for Tpetra objects.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.