Tpetra parallel linear algebra  Version of the Day
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 #include "Tpetra_ConfigDefs.hpp"
45 #include "Teuchos_VerboseObject.hpp"
46 #include "Teuchos_Array.hpp"
47 #include "Tpetra_Util.hpp"
48 #include "Tpetra_CrsMatrix.hpp"
50 #include "Tpetra_RowMatrixTransposer.hpp"
53 #include "Tpetra_Details_makeColMap.hpp"
54 #include "Tpetra_ConfigDefs.hpp"
55 #include "Tpetra_Map.hpp"
56 #include "Tpetra_Export.hpp"
57 #include "Tpetra_Import_Util.hpp"
58 #include "Tpetra_Import_Util2.hpp"
59 #include <algorithm>
60 #include <type_traits>
61 #include "Teuchos_FancyOStream.hpp"
62 
63 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
65 
66 #include "KokkosSparse_spgemm.hpp"
67 #include "KokkosSparse_spadd.hpp"
68 #include "Kokkos_Bitset.hpp"
69 
70 #include <MatrixMarket_Tpetra.hpp>
71 
79 /*********************************************************************************************************/
80 // Include the architecture-specific kernel partial specializations here
81 // NOTE: This needs to be outside all namespaces
82 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
83 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
84 
85 
86 namespace Tpetra {
87 
88 namespace MatrixMatrix{
89 
90 //
91 // This method forms the matrix-matrix product C = op(A) * op(B), where
92 // op(A) == A if transposeA is false,
93 // op(A) == A^T if transposeA is true,
94 // and similarly for op(B).
95 //
96 template <class Scalar,
97  class LocalOrdinal,
98  class GlobalOrdinal,
99  class Node>
100 void Multiply(
102  bool transposeA,
104  bool transposeB,
106  bool call_FillComplete_on_result,
107  const std::string& label,
108  const Teuchos::RCP<Teuchos::ParameterList>& params)
109 {
110  using Teuchos::null;
111  using Teuchos::RCP;
112  using Teuchos::rcp;
113  typedef Scalar SC;
114  typedef LocalOrdinal LO;
115  typedef GlobalOrdinal GO;
116  typedef Node NO;
117  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
118  typedef Import<LO,GO,NO> import_type;
119  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
120  typedef Map<LO,GO,NO> map_type;
121  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
122 
123 #ifdef HAVE_TPETRA_MMM_TIMINGS
124  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
125  using Teuchos::TimeMonitor;
126  //MM is used to time setup, and then multiply.
127 
128  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
129 #endif
130 
131  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
132 
133  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
134 
135  // The input matrices A and B must both be fillComplete.
136  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
137  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
138 
139  // If transposeA is true, then Aprime will be the transpose of A
140  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
141  // will just be a pointer to A.
142  RCP<const crs_matrix_type> Aprime = null;
143  // If transposeB is true, then Bprime will be the transpose of B
144  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
145  // will just be a pointer to B.
146  RCP<const crs_matrix_type> Bprime = null;
147 
148  // Is this a "clean" matrix?
149  //
150  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
151  // locally nor globally indexed, then it was empty. I don't like
152  // this, because the most straightforward implementation presumes
153  // lazy allocation of indices. However, historical precedent
154  // demands that we keep around this predicate as a way to test
155  // whether the matrix is empty.
156  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
157 
158  bool use_optimized_ATB = false;
159  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
160  use_optimized_ATB = true;
161 
162 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
163  use_optimized_ATB = false;
164 #endif
165 
166  using Teuchos::ParameterList;
167  RCP<ParameterList> transposeParams (new ParameterList);
168  transposeParams->set ("sort", false);
169 
170  if (!use_optimized_ATB && transposeA) {
171  transposer_type transposer (rcpFromRef (A));
172  Aprime = transposer.createTranspose (transposeParams);
173  }
174  else {
175  Aprime = rcpFromRef(A);
176  }
177 
178  if (transposeB) {
179  transposer_type transposer (rcpFromRef (B));
180  Bprime = transposer.createTranspose (transposeParams);
181  }
182  else {
183  Bprime = rcpFromRef(B);
184  }
185 
186  // Check size compatibility
187  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
188  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
189  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
190  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
191  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
192  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
193  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
194  prefix << "ERROR, inner dimensions of op(A) and op(B) "
195  "must match for matrix-matrix product. op(A) is "
196  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
197 
198  // The result matrix C must at least have a row-map that reflects the correct
199  // row-size. Don't check the number of columns because rectangular matrices
200  // which were constructed with only one map can still end up having the
201  // correct capacity and dimensions when filled.
202  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
203  prefix << "ERROR, dimensions of result C must "
204  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
205  << " rows, should have at least " << Aouter << std::endl);
206 
207  // It doesn't matter whether C is already Filled or not. If it is already
208  // Filled, it must have space allocated for the positions that will be
209  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
210  // we'll error out later when trying to store result values.
211 
212  // CGB: However, matrix must be in active-fill
213  if (!C.isFillActive()) C.resumeFill();
214 
215  // We're going to need to import remotely-owned sections of A and/or B if
216  // more than one processor is performing this run, depending on the scenario.
217  int numProcs = A.getComm()->getSize();
218 
219  // Declare a couple of structs that will be used to hold views of the data
220  // of A and B, to be used for fast access during the matrix-multiplication.
221  crs_matrix_struct_type Aview;
222  crs_matrix_struct_type Bview;
223 
224  RCP<const map_type> targetMap_A = Aprime->getRowMap();
225  RCP<const map_type> targetMap_B = Bprime->getRowMap();
226 
227 #ifdef HAVE_TPETRA_MMM_TIMINGS
228  {
229  TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
230 #endif
231 
232  // Now import any needed remote rows and populate the Aview struct
233  // NOTE: We assert that an import isn't needed --- since we do the transpose
234  // above to handle that.
235  if (!use_optimized_ATB) {
236  RCP<const import_type> dummyImporter;
237  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
238  }
239 
240  // We will also need local access to all rows of B that correspond to the
241  // column-map of op(A).
242  if (numProcs > 1)
243  targetMap_B = Aprime->getColMap();
244 
245  // Import any needed remote rows and populate the Bview struct.
246  if (!use_optimized_ATB)
247  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label, params);
248 
249 #ifdef HAVE_TPETRA_MMM_TIMINGS
250  } //stop MM_importExtract here
251  //stop the setup timer, and start the multiply timer
252  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
253 #endif
254 
255  // Call the appropriate method to perform the actual multiplication.
256  if (use_optimized_ATB) {
257  MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
258 
259  } else if (call_FillComplete_on_result && newFlag) {
260  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
261 
262  } else if (call_FillComplete_on_result) {
263  MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
264 
265  } else {
266  // mfh 27 Sep 2016: Is this the "slow" case? This
267  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
268  // thread-parallel inserts, but that may take some effort.
269  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
270 
271  MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
272  }
273 
274 #ifdef HAVE_TPETRA_MMM_TIMINGS
275  TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
276 #endif
277  if (call_FillComplete_on_result && !C.isFillComplete()) {
278  // We'll call FillComplete on the C matrix before we exit, and give it a
279  // domain-map and a range-map.
280  // The domain-map will be the domain-map of B, unless
281  // op(B)==transpose(B), in which case the range-map of B will be used.
282  // The range-map will be the range-map of A, unless op(A)==transpose(A),
283  // in which case the domain-map of A will be used.
284  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
285  }
286 }
287 
288 
289 template <class Scalar,
290  class LocalOrdinal,
291  class GlobalOrdinal,
292  class Node>
293 void Jacobi(Scalar omega,
298  bool call_FillComplete_on_result,
299  const std::string& label,
300  const Teuchos::RCP<Teuchos::ParameterList>& params)
301 {
302  using Teuchos::RCP;
303  typedef Scalar SC;
304  typedef LocalOrdinal LO;
305  typedef GlobalOrdinal GO;
306  typedef Node NO;
307  typedef Import<LO,GO,NO> import_type;
308  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
309  typedef Map<LO,GO,NO> map_type;
310  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
311 
312 #ifdef HAVE_TPETRA_MMM_TIMINGS
313  std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
314  using Teuchos::TimeMonitor;
315  TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
316 #endif
317 
318  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
319 
320  // A and B should already be Filled.
321  // Should we go ahead and call FillComplete() on them if necessary or error
322  // out? For now, we choose to error out.
323  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
324  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
325 
326  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
327  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
328 
329  // Now check size compatibility
330  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
331  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
332  global_size_t Aouter = A.getGlobalNumRows();
333  global_size_t Bouter = numBCols;
334  global_size_t Ainner = numACols;
335  global_size_t Binner = B.getGlobalNumRows();
336  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
337  prefix << "ERROR, inner dimensions of op(A) and op(B) "
338  "must match for matrix-matrix product. op(A) is "
339  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
340 
341  // The result matrix C must at least have a row-map that reflects the correct
342  // row-size. Don't check the number of columns because rectangular matrices
343  // which were constructed with only one map can still end up having the
344  // correct capacity and dimensions when filled.
345  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
346  prefix << "ERROR, dimensions of result C must "
347  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
348  << " rows, should have at least "<< Aouter << std::endl);
349 
350  // It doesn't matter whether C is already Filled or not. If it is already
351  // Filled, it must have space allocated for the positions that will be
352  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
353  // we'll error out later when trying to store result values.
354 
355  // CGB: However, matrix must be in active-fill
356  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
357 
358  // We're going to need to import remotely-owned sections of A and/or B if
359  // more than one processor is performing this run, depending on the scenario.
360  int numProcs = A.getComm()->getSize();
361 
362  // Declare a couple of structs that will be used to hold views of the data of
363  // A and B, to be used for fast access during the matrix-multiplication.
364  crs_matrix_struct_type Aview;
365  crs_matrix_struct_type Bview;
366 
367  RCP<const map_type> targetMap_A = Aprime->getRowMap();
368  RCP<const map_type> targetMap_B = Bprime->getRowMap();
369 
370 #ifdef HAVE_TPETRA_MMM_TIMINGS
371  TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
372  {
373 #endif
374 
375  // Enable globalConstants by default
376  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
377  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
378  if(!params.is_null()) {
379  importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
380  int mm_optimization_core_count=0;
381  auto slist = params->sublist("matrixmatrix: kernel params",false);
382  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ());
383  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
384  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
385  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
386  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
387  auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
388  ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
389  ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
390  ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
391  }
392 
393  //Now import any needed remote rows and populate the Aview struct.
394  RCP<const import_type> dummyImporter;
395  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
396 
397  // We will also need local access to all rows of B that correspond to the
398  // column-map of op(A).
399  if (numProcs > 1)
400  targetMap_B = Aprime->getColMap();
401 
402  // Now import any needed remote rows and populate the Bview struct.
403  // Enable globalConstants by default
404  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
405  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
406  if(!params.is_null()) {
407  importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
408 
409  auto slist = params->sublist("matrixmatrix: kernel params",false);
410  int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount () );
411  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
412  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
413  auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
414  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
415  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
416  ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
417  ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
418  ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
419  }
420 
421  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label,importParams2);
422 
423 #ifdef HAVE_TPETRA_MMM_TIMINGS
424  }
425  TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
426 #endif
427 
428  // Now call the appropriate method to perform the actual multiplication.
429  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
430 
431  // Is this a "clean" matrix
432  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
433 
434  if (call_FillComplete_on_result && newFlag) {
435  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
436 
437  } else if (call_FillComplete_on_result) {
438  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
439 
440  } else {
441  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
442  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
443  // commenting it out.
444 // #ifdef HAVE_TPETRA_MMM_TIMINGS
445 // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
446 // #endif
447  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
448  // commenting it out.
449  // if (call_FillComplete_on_result) {
450  // //We'll call FillComplete on the C matrix before we exit, and give
451  // //it a domain-map and a range-map.
452  // //The domain-map will be the domain-map of B, unless
453  // //op(B)==transpose(B), in which case the range-map of B will be used.
454  // //The range-map will be the range-map of A, unless
455  // //op(A)==transpose(A), in which case the domain-map of A will be used.
456  // if (!C.isFillComplete()) {
457  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
458  // }
459  // }
460  }
461 }
462 
463 
464 template <class Scalar,
465  class LocalOrdinal,
466  class GlobalOrdinal,
467  class Node>
468 void Add(
470  bool transposeA,
471  Scalar scalarA,
473  Scalar scalarB )
474 {
475  using Teuchos::Array;
476  using Teuchos::RCP;
477  using Teuchos::null;
478  typedef Scalar SC;
479  typedef LocalOrdinal LO;
480  typedef GlobalOrdinal GO;
481  typedef Node NO;
482  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
483  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
484 
485  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
486 
487  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
488  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
489  "(Result matrix B is not required to be isFillComplete()).");
490  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
491  prefix << "ERROR, input matrix B must not be fill complete!");
492  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
493  prefix << "ERROR, input matrix B must not have static graph!");
494  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
495  prefix << "ERROR, input matrix B must not be locally indexed!");
496 
497  using Teuchos::ParameterList;
498  RCP<ParameterList> transposeParams (new ParameterList);
499  transposeParams->set ("sort", false);
500 
501  RCP<const crs_matrix_type> Aprime = null;
502  if (transposeA) {
503  transposer_type transposer (rcpFromRef (A));
504  Aprime = transposer.createTranspose (transposeParams);
505  }
506  else {
507  Aprime = rcpFromRef(A);
508  }
509 
510  size_t a_numEntries;
511  Array<GO> a_inds(A.getNodeMaxNumRowEntries());
512  Array<SC> a_vals(A.getNodeMaxNumRowEntries());
513  GO row;
514 
515  if (scalarB != Teuchos::ScalarTraits<SC>::one())
516  B.scale(scalarB);
517 
518  bool bFilled = B.isFillComplete();
519  size_t numMyRows = B.getNodeNumRows();
520  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
521  for (LO i = 0; (size_t)i < numMyRows; ++i) {
522  row = B.getRowMap()->getGlobalElement(i);
523  Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
524 
525  if (scalarA != Teuchos::ScalarTraits<SC>::one())
526  for (size_t j = 0; j < a_numEntries; ++j)
527  a_vals[j] *= scalarA;
528 
529  if (bFilled)
530  B.sumIntoGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
531  else
532  B.insertGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
533  }
534  }
535 }
536 
537 template <class Scalar,
538  class LocalOrdinal,
539  class GlobalOrdinal,
540  class Node>
541 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
542 add (const Scalar& alpha,
543  const bool transposeA,
545  const Scalar& beta,
546  const bool transposeB,
548  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
549  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
550  const Teuchos::RCP<Teuchos::ParameterList>& params)
551 {
552  using Teuchos::RCP;
553  using Teuchos::rcpFromRef;
554  using Teuchos::rcp;
555  using Teuchos::ParameterList;
557  if(!params.is_null())
558  {
559  TEUCHOS_TEST_FOR_EXCEPTION(
560  params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
561  std::invalid_argument,
562  "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
563  "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
564  params->set("Call fillComplete", true);
565  }
566  //If transposeB, must compute B's explicit transpose to
567  //get the correct row map for C.
568  RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
569  if(transposeB)
570  {
572  Brcp = transposer.createTranspose();
573  }
574  //Check that A,B are fillComplete before getting B's column map
575  TEUCHOS_TEST_FOR_EXCEPTION
576  (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
577  "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
578  RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
579  //this version of add() always fill completes the result, no matter what is in params on input
580  add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
581  return C;
582 }
583 
584 //This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
585 //but since the spadd() output is always packed there is no need for a separate
586 //numRowEntries here.
587 //
588 template<class LO, class GO, class LOView, class GOView, class LocalMap>
589 struct ConvertGlobalToLocalFunctor
590 {
591  ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
592  : lids(lids_), gids(gids_), localColMap(localColMap_)
593  {}
594 
595  KOKKOS_FUNCTION void operator() (const GO i) const
596  {
597  lids(i) = localColMap.getLocalElement(gids(i));
598  }
599 
600  LOView lids;
601  const GOView gids;
602  const LocalMap localColMap;
603 };
604 
605 template <class Scalar,
606  class LocalOrdinal,
607  class GlobalOrdinal,
608  class Node>
609 void
610 add (const Scalar& alpha,
611  const bool transposeA,
613  const Scalar& beta,
614  const bool transposeB,
617  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
618  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
619  const Teuchos::RCP<Teuchos::ParameterList>& params)
620 {
621  using Teuchos::RCP;
622  using Teuchos::rcp;
623  using Teuchos::rcpFromRef;
624  using Teuchos::rcp_implicit_cast;
625  using Teuchos::rcp_dynamic_cast;
626  using Teuchos::TimeMonitor;
627  using SC = Scalar;
628  using LO = LocalOrdinal;
629  using GO = GlobalOrdinal;
630  using NO = Node;
631  using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
632  using crs_graph_type = CrsGraph<LO,GO,NO>;
633  using map_type = Map<LO,GO,NO>;
634  using transposer_type = RowMatrixTransposer<SC,LO,GO,NO>;
635  using import_type = Import<LO,GO,NO>;
636  using export_type = Export<LO,GO,NO>;
637  using exec_space = typename crs_graph_type::execution_space;
638  using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
639  const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
640  constexpr bool debug = false;
641 
642 #ifdef HAVE_TPETRA_MMM_TIMINGS
643  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
644 #endif
645 
646  if (debug) {
647  std::ostringstream os;
648  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
649  << "TpetraExt::MatrixMatrix::add" << std::endl;
650  std::cerr << os.str ();
651  }
652 
653  TEUCHOS_TEST_FOR_EXCEPTION
654  (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
655  prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
656  TEUCHOS_TEST_FOR_EXCEPTION
657  (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
658  prefix_mmm << "A and B must both be fill complete.");
659 #ifdef HAVE_TPETRA_DEBUG
660  // The matrices don't have domain or range Maps unless they are fill complete.
661  if (A.isFillComplete () && B.isFillComplete ()) {
662  const bool domainMapsSame =
663  (! transposeA && ! transposeB &&
664  ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
665  (! transposeA && transposeB &&
666  ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
667  ( transposeA && ! transposeB &&
668  ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
669  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
670  prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
671 
672  const bool rangeMapsSame =
673  (! transposeA && ! transposeB &&
674  ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
675  (! transposeA && transposeB &&
676  ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
677  ( transposeA && ! transposeB &&
678  ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
679  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
680  prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
681  }
682 #endif // HAVE_TPETRA_DEBUG
683 
684  using Teuchos::ParameterList;
685  // Form the explicit transpose of A if necessary.
686  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
687  if (transposeA) {
688  transposer_type transposer (Aprime);
689  Aprime = transposer.createTranspose ();
690  }
691 
692 #ifdef HAVE_TPETRA_DEBUG
693  TEUCHOS_TEST_FOR_EXCEPTION
694  (Aprime.is_null (), std::logic_error,
695  prefix_mmm << "Failed to compute Op(A). "
696  "Please report this bug to the Tpetra developers.");
697 #endif // HAVE_TPETRA_DEBUG
698 
699  // Form the explicit transpose of B if necessary.
700  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
701  if (transposeB) {
702  if (debug) {
703  std::ostringstream os;
704  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
705  << "Form explicit xpose of B" << std::endl;
706  std::cerr << os.str ();
707  }
708  transposer_type transposer (Bprime);
709  Bprime = transposer.createTranspose ();
710  }
711 #ifdef HAVE_TPETRA_DEBUG
712  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
713  prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
714  TEUCHOS_TEST_FOR_EXCEPTION(
715  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
716  prefix_mmm << "Aprime and Bprime must both be fill complete. "
717  "Please report this bug to the Tpetra developers.");
718 #endif // HAVE_TPETRA_DEBUG
719  RCP<const map_type> CDomainMap = domainMap;
720  RCP<const map_type> CRangeMap = rangeMap;
721  if(CDomainMap.is_null())
722  {
723  CDomainMap = Bprime->getDomainMap();
724  }
725  if(CRangeMap.is_null())
726  {
727  CRangeMap = Bprime->getRangeMap();
728  }
729  assert(!(CDomainMap.is_null()));
730  assert(!(CRangeMap.is_null()));
731  typedef typename AddKern::values_array values_array;
732  typedef typename AddKern::row_ptrs_array row_ptrs_array;
733  typedef typename AddKern::col_inds_array col_inds_array;
734  bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
735  bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
736  values_array vals;
737  row_ptrs_array rowptrs;
738  col_inds_array colinds;
739 #ifdef HAVE_TPETRA_MMM_TIMINGS
740  MM = Teuchos::null;
741  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
742 #endif
743  if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
744  {
745  //import Aprime into Bprime's row map so the local matrices have same # of rows
746  auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
747  // cbl do not set
748  // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
749  // this import _may_ take the form of a transfer. In practice it would be unlikely,
750  // but the general case is not so forgiving.
751  Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
752  }
753  bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
754  bool sorted = AGraphSorted && BGraphSorted;
755  RCP<const import_type> Cimport = Teuchos::null;
756  RCP<export_type> Cexport = Teuchos::null;
757  bool doFillComplete = true;
758  if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
759  {
760  doFillComplete = params->get<bool>("Call fillComplete");
761  }
762  auto Alocal = Aprime->getLocalMatrix();
763  auto Blocal = Bprime->getLocalMatrix();
764  LO numLocalRows = Alocal.numRows();
765  if(numLocalRows == 0)
766  {
767  //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
768  //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
769  //Handle this case now
770  //(without interfering with collective operations, since it's possible for
771  //some ranks to have 0 local rows and others not).
772  rowptrs = row_ptrs_array("C rowptrs", 0);
773  }
774  auto Acolmap = Aprime->getColMap();
775  auto Bcolmap = Bprime->getColMap();
776  if(!matchingColMaps)
777  {
778  using global_col_inds_array = typename AddKern::global_col_inds_array;
779 #ifdef HAVE_TPETRA_MMM_TIMINGS
780  MM = Teuchos::null;
781  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
782 #endif
783  //use kernel that converts col indices in both A and B to common domain map before adding
784  auto AlocalColmap = Acolmap->getLocalMap();
785  auto BlocalColmap = Bcolmap->getLocalMap();
786  global_col_inds_array globalColinds;
787  if (debug) {
788  std::ostringstream os;
789  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
790  << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
791  std::cerr << os.str ();
792  }
793  AddKern::convertToGlobalAndAdd(
794  Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
795  vals, rowptrs, globalColinds);
796  if (debug) {
797  std::ostringstream os;
798  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
799  << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
800  std::cerr << os.str ();
801  }
802 #ifdef HAVE_TPETRA_MMM_TIMINGS
803  MM = Teuchos::null;
804  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Constructing graph"))));
805 #endif
806  RCP<const map_type> CcolMap;
807  Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
808  (CcolMap, CDomainMap, globalColinds);
809  C.replaceColMap(CcolMap);
810  col_inds_array localColinds("C colinds", globalColinds.extent(0));
811  Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
812  ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
813  col_inds_array, global_col_inds_array,
814  typename map_type::local_map_type>
815  (localColinds, globalColinds, CcolMap->getLocalMap()));
816  KokkosKernels::Impl::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
817  C.setAllValues(rowptrs, localColinds, vals);
818  C.fillComplete(CDomainMap, CRangeMap, params);
819  if(!doFillComplete)
820  C.resumeFill();
821  }
822  else
823  {
824  //Aprime, Bprime and C all have the same column maps
825  auto Avals = Alocal.values;
826  auto Bvals = Blocal.values;
827  auto Arowptrs = Alocal.graph.row_map;
828  auto Browptrs = Blocal.graph.row_map;
829  auto Acolinds = Alocal.graph.entries;
830  auto Bcolinds = Blocal.graph.entries;
831  if(sorted)
832  {
833  //use sorted kernel
834 #ifdef HAVE_TPETRA_MMM_TIMINGS
835  MM = Teuchos::null;
836  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
837 #endif
838  if (debug) {
839  std::ostringstream os;
840  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
841  << "Call AddKern::addSorted(...)" << std::endl;
842  std::cerr << os.str ();
843  }
844  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
845  }
846  else
847  {
848  //use unsorted kernel
849 #ifdef HAVE_TPETRA_MMM_TIMINGS
850  MM = Teuchos::null;
851  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
852 #endif
853  if (debug) {
854  std::ostringstream os;
855  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
856  << "Call AddKern::addUnsorted(...)" << std::endl;
857  std::cerr << os.str ();
858  }
859  AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
860  }
861  //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
862  RCP<const map_type> Ccolmap = Bcolmap;
863  C.replaceColMap(Ccolmap);
864  C.setAllValues(rowptrs, colinds, vals);
865  if(doFillComplete)
866  {
867  #ifdef HAVE_TPETRA_MMM_TIMINGS
868  MM = Teuchos::null;
869  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
870  #endif
871  if(!CDomainMap->isSameAs(*Ccolmap))
872  {
873  if (debug) {
874  std::ostringstream os;
875  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
876  << "Create Cimport" << std::endl;
877  std::cerr << os.str ();
878  }
879  Cimport = rcp(new import_type(CDomainMap, Ccolmap));
880  }
881  if(!C.getRowMap()->isSameAs(*CRangeMap))
882  {
883  if (debug) {
884  std::ostringstream os;
885  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
886  << "Create Cexport" << std::endl;
887  std::cerr << os.str ();
888  }
889  Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
890  }
891 
892  if (debug) {
893  std::ostringstream os;
894  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
895  << "Call C->expertStaticFillComplete(...)" << std::endl;
896  std::cerr << os.str ();
897  }
898  C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
899  }
900  }
901 }
902 
903 template <class Scalar,
904  class LocalOrdinal,
905  class GlobalOrdinal,
906  class Node>
907 void Add(
909  bool transposeA,
910  Scalar scalarA,
912  bool transposeB,
913  Scalar scalarB,
915 {
916  using Teuchos::Array;
917  using Teuchos::ArrayRCP;
918  using Teuchos::ArrayView;
919  using Teuchos::RCP;
920  using Teuchos::rcp;
921  using Teuchos::rcp_dynamic_cast;
922  using Teuchos::rcpFromRef;
923  using Teuchos::tuple;
924  using std::endl;
925  // typedef typename ArrayView<const Scalar>::size_type size_type;
926  typedef Teuchos::ScalarTraits<Scalar> STS;
928  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
929  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
930  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
933 
934  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
935 
936  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
937  prefix << "The case C == null does not actually work. Fixing this will require an interface change.");
938 
939  TEUCHOS_TEST_FOR_EXCEPTION(
940  ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
941  prefix << "Both input matrices must be fill complete before calling this function.");
942 
943 #ifdef HAVE_TPETRA_DEBUG
944  {
945  const bool domainMapsSame =
946  (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
947  (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
948  (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
949  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
950  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
951 
952  const bool rangeMapsSame =
953  (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
954  (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
955  (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
956  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
957  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
958  }
959 #endif // HAVE_TPETRA_DEBUG
960 
961  using Teuchos::ParameterList;
962  RCP<ParameterList> transposeParams (new ParameterList);
963  transposeParams->set ("sort", false);
964 
965  // Form the explicit transpose of A if necessary.
966  RCP<const crs_matrix_type> Aprime;
967  if (transposeA) {
968  transposer_type theTransposer (rcpFromRef (A));
969  Aprime = theTransposer.createTranspose (transposeParams);
970  }
971  else {
972  Aprime = rcpFromRef (A);
973  }
974 
975 #ifdef HAVE_TPETRA_DEBUG
976  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
977  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
978 #endif // HAVE_TPETRA_DEBUG
979 
980  // Form the explicit transpose of B if necessary.
981  RCP<const crs_matrix_type> Bprime;
982  if (transposeB) {
983  transposer_type theTransposer (rcpFromRef (B));
984  Bprime = theTransposer.createTranspose (transposeParams);
985  }
986  else {
987  Bprime = rcpFromRef (B);
988  }
989 
990 #ifdef HAVE_TPETRA_DEBUG
991  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
992  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
993 #endif // HAVE_TPETRA_DEBUG
994 
995  // Allocate or zero the entries of the result matrix.
996  if (! C.is_null ()) {
997  C->setAllToScalar (STS::zero ());
998  } else {
999  // FIXME (mfh 08 May 2013) When I first looked at this method, I
1000  // noticed that C was being given the row Map of Aprime (the
1001  // possibly transposed version of A). Is this what we want?
1002  C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
1003  }
1004 
1005 #ifdef HAVE_TPETRA_DEBUG
1006  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1007  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1008  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1009  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1010  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1011  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1012 #endif // HAVE_TPETRA_DEBUG
1013 
1014  Array<RCP<const crs_matrix_type> > Mat =
1015  tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1016  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1017 
1018  // do a loop over each matrix to add: A reordering might be more efficient
1019  for (int k = 0; k < 2; ++k) {
1020  Array<GlobalOrdinal> Indices;
1021  Array<Scalar> Values;
1022 
1023  // Loop over each locally owned row of the current matrix (either
1024  // Aprime or Bprime), and sum its entries into the corresponding
1025  // row of C. This works regardless of whether Aprime or Bprime
1026  // has the same row Map as C, because both sumIntoGlobalValues and
1027  // insertGlobalValues allow summing resp. inserting into nonowned
1028  // rows of C.
1029 #ifdef HAVE_TPETRA_DEBUG
1030  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1031  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1032 #endif // HAVE_TPETRA_DEBUG
1033  RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1034 #ifdef HAVE_TPETRA_DEBUG
1035  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1036  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1037 #endif // HAVE_TPETRA_DEBUG
1038 
1039  const size_t localNumRows = Mat[k]->getNodeNumRows ();
1040  for (size_t i = 0; i < localNumRows; ++i) {
1041  const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1042  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1043  if (numEntries > 0) {
1044  Indices.resize (numEntries);
1045  Values.resize (numEntries);
1046  Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);
1047 
1048  if (scalar[k] != STS::one ()) {
1049  for (size_t j = 0; j < numEntries; ++j) {
1050  Values[j] *= scalar[k];
1051  }
1052  }
1053 
1054  if (C->isFillComplete ()) {
1055  C->sumIntoGlobalValues (globalRow, Indices, Values);
1056  } else {
1057  C->insertGlobalValues (globalRow, Indices, Values);
1058  }
1059  }
1060  }
1061  }
1062 }
1063 
1064 } //End namespace MatrixMatrix
1065 
1066 namespace MMdetails{
1067 
1068 /*********************************************************************************************************/
1069 // Prints MMM-style statistics on communication done with an Import or Export object
1070 template <class TransferType>
1071 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1072  if (Transfer.is_null())
1073  return;
1074 
1075  const Distributor & Distor = Transfer->getDistributor();
1076  Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1077 
1078  size_t rows_send = Transfer->getNumExportIDs();
1079  size_t rows_recv = Transfer->getNumRemoteIDs();
1080 
1081  size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1082  size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1083  size_t num_send_neighbors = Distor.getNumSends();
1084  size_t num_recv_neighbors = Distor.getNumReceives();
1085  size_t round2_send, round2_recv;
1086  Distor.getLastDoStatistics(round2_send,round2_recv);
1087 
1088  int myPID = Comm->getRank();
1089  int NumProcs = Comm->getSize();
1090 
1091  // Processor by processor statistics
1092  // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1093  // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1094 
1095  // Global statistics
1096  size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1097  size_t gstats_min[8], gstats_max[8];
1098 
1099  double lstats_avg[8], gstats_avg[8];
1100  for(int i=0; i<8; i++)
1101  lstats_avg[i] = ((double)lstats[i])/NumProcs;
1102 
1103  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1104  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1105  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1106 
1107  if(!myPID) {
1108  printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1109  (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1110  (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1111  printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1112  (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1113  (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1114  }
1115 }
1116 
1117 // Kernel method for computing the local portion of C = A*B
1118 template<class Scalar,
1119  class LocalOrdinal,
1120  class GlobalOrdinal,
1121  class Node>
1122 void mult_AT_B_newmatrix(
1123  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1124  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1125  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1126  const std::string & label,
1127  const Teuchos::RCP<Teuchos::ParameterList>& params)
1128 {
1129  using Teuchos::RCP;
1130  using Teuchos::rcp;
1131  typedef Scalar SC;
1132  typedef LocalOrdinal LO;
1133  typedef GlobalOrdinal GO;
1134  typedef Node NO;
1135  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1136  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1137 
1138 #ifdef HAVE_TPETRA_MMM_TIMINGS
1139  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1140  using Teuchos::TimeMonitor;
1141  RCP<TimeMonitor> MM = rcp (new TimeMonitor
1142  (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1143 #endif
1144 
1145  /*************************************************************/
1146  /* 1) Local Transpose of A */
1147  /*************************************************************/
1148  transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1149 
1150  using Teuchos::ParameterList;
1151  RCP<ParameterList> transposeParams (new ParameterList);
1152  transposeParams->set ("sort", false);
1153  if(! params.is_null ()) {
1154  transposeParams->set ("compute global constants",
1155  params->get ("compute global constants: temporaries",
1156  false));
1157  }
1158  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1159  transposer.createTransposeLocal (transposeParams);
1160 
1161  /*************************************************************/
1162  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1163  /*************************************************************/
1164 #ifdef HAVE_TPETRA_MMM_TIMINGS
1165  MM = Teuchos::null;
1166  MM = rcp (new TimeMonitor
1167  (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1168 #endif
1169 
1170  // Get views, asserting that no import is required to speed up computation
1171  crs_matrix_struct_type Aview;
1172  crs_matrix_struct_type Bview;
1173  RCP<const Import<LO, GO, NO> > dummyImporter;
1174 
1175  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1176  RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1177  if (! params.is_null ()) {
1178  importParams1->set ("compute global constants",
1179  params->get ("compute global constants: temporaries",
1180  false));
1181  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1182  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1183  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1184  int mm_optimization_core_count =
1186  mm_optimization_core_count =
1187  slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1188  int mm_optimization_core_count2 =
1189  params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1190  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1191  mm_optimization_core_count = mm_optimization_core_count2;
1192  }
1193  auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1194  sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1195  sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1196  sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1197  }
1198 
1199  MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1200  Aview, dummyImporter, true,
1201  label, importParams1);
1202 
1203  RCP<ParameterList> importParams2 (new ParameterList);
1204  if (! params.is_null ()) {
1205  importParams2->set ("compute global constants",
1206  params->get ("compute global constants: temporaries",
1207  false));
1208  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1209  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1210  bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1211  int mm_optimization_core_count =
1213  mm_optimization_core_count =
1214  slist.get ("MM_TAFC_OptimizationCoreCount",
1215  mm_optimization_core_count);
1216  int mm_optimization_core_count2 =
1217  params->get ("MM_TAFC_OptimizationCoreCount",
1218  mm_optimization_core_count);
1219  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1220  mm_optimization_core_count = mm_optimization_core_count2;
1221  }
1222  auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1223  sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1224  sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1225  sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1226  }
1227 
1228  if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1229  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1230  }
1231  else {
1232  MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1233  }
1234 
1235 #ifdef HAVE_TPETRA_MMM_TIMINGS
1236  MM = Teuchos::null;
1237  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1238 #endif
1239 
1240  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1241 
1242  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1243  bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1244  if (needs_final_export) {
1245  Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1246  }
1247  else {
1248  Ctemp = rcp (&C, false);
1249  }
1250 
1251  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1252 
1253  /*************************************************************/
1254  /* 4) exportAndFillComplete matrix */
1255  /*************************************************************/
1256 #ifdef HAVE_TPETRA_MMM_TIMINGS
1257  MM = Teuchos::null;
1258  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1259 #endif
1260 
1261  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1262 
1263  if (needs_final_export) {
1264  ParameterList labelList;
1265  labelList.set("Timer Label", label);
1266  if(!params.is_null()) {
1267  ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1268  ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1269  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1270  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1271  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1272  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1273  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1274  bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1275  bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1276 
1277  labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1278  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1279  labelList.set("compute global constants",params->get("compute global constants",true));
1280  labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1281  }
1282  Ctemp->exportAndFillComplete (Crcp,
1283  *Ctemp->getGraph ()->getExporter (),
1284  B.getDomainMap (),
1285  A.getDomainMap (),
1286  rcp (&labelList, false));
1287  }
1288 #ifdef HAVE_TPETRA_MMM_STATISTICS
1289  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1290 #endif
1291 }
1292 
1293 /*********************************************************************************************************/
1294 // Kernel method for computing the local portion of C = A*B
1295 template<class Scalar,
1296  class LocalOrdinal,
1297  class GlobalOrdinal,
1298  class Node>
1299 void mult_A_B(
1300  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1301  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1302  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1303  const std::string& /* label */,
1304  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1305 {
1306  using Teuchos::Array;
1307  using Teuchos::ArrayRCP;
1308  using Teuchos::ArrayView;
1309  using Teuchos::OrdinalTraits;
1310  using Teuchos::null;
1311 
1312  typedef Teuchos::ScalarTraits<Scalar> STS;
1313  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1314  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1315  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1316 
1317  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1318  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1319 
1320  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1321  ArrayView<const GlobalOrdinal> bcols_import = null;
1322  if (Bview.importColMap != null) {
1323  C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1324  C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1325 
1326  bcols_import = Bview.importColMap->getNodeElementList();
1327  }
1328 
1329  size_t C_numCols = C_lastCol - C_firstCol +
1330  OrdinalTraits<LocalOrdinal>::one();
1331  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1332  OrdinalTraits<LocalOrdinal>::one();
1333 
1334  if (C_numCols_import > C_numCols)
1335  C_numCols = C_numCols_import;
1336 
1337  Array<Scalar> dwork = Array<Scalar>(C_numCols);
1338  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1339  Array<size_t> iwork2 = Array<size_t>(C_numCols);
1340 
1341  Array<Scalar> C_row_i = dwork;
1342  Array<GlobalOrdinal> C_cols = iwork;
1343  Array<size_t> c_index = iwork2;
1344  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1345  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1346 
1347  size_t C_row_i_length, j, k, last_index;
1348 
1349  // Run through all the hash table lookups once and for all
1350  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1351  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1352  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1353  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1354  // Maps are the same: Use local IDs as the hash
1355  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1356  Aview.colMap->getMaxLocalIndex(); i++)
1357  Acol2Brow[i]=i;
1358  }
1359  else {
1360  // Maps are not the same: Use the map's hash
1361  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1362  Aview.colMap->getMaxLocalIndex(); i++) {
1363  GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1364  LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1365  if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1366  else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1367  }
1368  }
1369 
1370  // To form C = A*B we're going to execute this expression:
1371  //
1372  // C(i,j) = sum_k( A(i,k)*B(k,j) )
1373  //
1374  // Our goal, of course, is to navigate the data in A and B once, without
1375  // performing searches for column-indices, etc.
1376  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1377  ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1378  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1379  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1380  ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1381  ArrayView<const Scalar> Avals, Bvals, Ivals;
1382  Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1383  Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1384  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1385  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1386  if(!Bview.importMatrix.is_null()) {
1387  Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1388  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1389  }
1390 
1391  bool C_filled = C.isFillComplete();
1392 
1393  for (size_t i = 0; i < C_numCols; i++)
1394  c_index[i] = OrdinalTraits<size_t>::invalid();
1395 
1396  // Loop over the rows of A.
1397  size_t Arows = Aview.rowMap->getNodeNumElements();
1398  for(size_t i=0; i<Arows; ++i) {
1399 
1400  // Only navigate the local portion of Aview... which is, thankfully, all of
1401  // A since this routine doesn't do transpose modes
1402  GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1403 
1404  // Loop across the i-th row of A and for each corresponding row in B, loop
1405  // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1406  // quantities C_row_i. In other words, as we stride across B(k,:) we're
1407  // calculating updates for row i of the result matrix C.
1408  C_row_i_length = OrdinalTraits<size_t>::zero();
1409 
1410  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1411  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1412  const Scalar Aval = Avals[k];
1413  if (Aval == STS::zero())
1414  continue;
1415 
1416  if (Ak == LO_INVALID)
1417  continue;
1418 
1419  for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1420  LocalOrdinal col = Bcolind[j];
1421  //assert(col >= 0 && col < C_numCols);
1422 
1423  if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1424  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1425  // This has to be a += so insertGlobalValue goes out
1426  C_row_i[C_row_i_length] = Aval*Bvals[j];
1427  C_cols[C_row_i_length] = col;
1428  c_index[col] = C_row_i_length;
1429  C_row_i_length++;
1430 
1431  } else {
1432  C_row_i[c_index[col]] += Aval*Bvals[j];
1433  }
1434  }
1435  }
1436 
1437  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1438  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1439  C_cols[ii] = bcols[C_cols[ii]];
1440  combined_index[ii] = C_cols[ii];
1441  combined_values[ii] = C_row_i[ii];
1442  }
1443  last_index = C_row_i_length;
1444 
1445  //
1446  //Now put the C_row_i values into C.
1447  //
1448  // We might have to revamp this later.
1449  C_row_i_length = OrdinalTraits<size_t>::zero();
1450 
1451  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1452  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1453  const Scalar Aval = Avals[k];
1454  if (Aval == STS::zero())
1455  continue;
1456 
1457  if (Ak!=LO_INVALID) continue;
1458 
1459  Ak = Acol2Irow[Acolind[k]];
1460  for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1461  LocalOrdinal col = Icolind[j];
1462  //assert(col >= 0 && col < C_numCols);
1463 
1464  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1465  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1466  // This has to be a += so insertGlobalValue goes out
1467  C_row_i[C_row_i_length] = Aval*Ivals[j];
1468  C_cols[C_row_i_length] = col;
1469  c_index[col] = C_row_i_length;
1470  C_row_i_length++;
1471 
1472  } else {
1473  // This has to be a += so insertGlobalValue goes out
1474  C_row_i[c_index[col]] += Aval*Ivals[j];
1475  }
1476  }
1477  }
1478 
1479  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1480  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1481  C_cols[ii] = bcols_import[C_cols[ii]];
1482  combined_index[last_index] = C_cols[ii];
1483  combined_values[last_index] = C_row_i[ii];
1484  last_index++;
1485  }
1486 
1487  // Now put the C_row_i values into C.
1488  // We might have to revamp this later.
1489  C_filled ?
1490  C.sumIntoGlobalValues(
1491  global_row,
1492  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1493  combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1494  :
1495  C.insertGlobalValues(
1496  global_row,
1497  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1498  combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1499 
1500  }
1501 }
1502 
1503 /*********************************************************************************************************/
1504 template<class Scalar,
1505  class LocalOrdinal,
1506  class GlobalOrdinal,
1507  class Node>
1508 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1509  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1510  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1511 
1512  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1513  Mview.maxNumRowEntries = Mview.indices[0].size();
1514 
1515  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1516  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1517  Mview.maxNumRowEntries = Mview.indices[i].size();
1518  }
1519 }
1520 
1521 /*********************************************************************************************************/
1522 template<class CrsMatrixType>
1523 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1524  // Follows the NZ estimate in ML's ml_matmatmult.c
1525  size_t Aest = 100, Best=100;
1526  if (A.getNodeNumEntries() >= A.getNodeNumRows())
1527  Aest = (A.getNodeNumRows() > 0) ? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
1528  if (B.getNodeNumEntries() >= B.getNodeNumRows())
1529  Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
1530 
1531  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1532  nnzperrow *= nnzperrow;
1533 
1534  return (size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
1535 }
1536 
1537 
1538 /*********************************************************************************************************/
1539 // Kernel method for computing the local portion of C = A*B
1540 //
1541 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1542 // function, so this is probably the function we want to
1543 // thread-parallelize.
1544 template<class Scalar,
1545  class LocalOrdinal,
1546  class GlobalOrdinal,
1547  class Node>
1548 void mult_A_B_newmatrix(
1549  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1550  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1551  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1552  const std::string& label,
1553  const Teuchos::RCP<Teuchos::ParameterList>& params)
1554 {
1555  using Teuchos::Array;
1556  using Teuchos::ArrayRCP;
1557  using Teuchos::ArrayView;
1558  using Teuchos::RCP;
1559  using Teuchos::rcp;
1560 
1561  // Tpetra typedefs
1562  typedef LocalOrdinal LO;
1563  typedef GlobalOrdinal GO;
1564  typedef Node NO;
1565  typedef Import<LO,GO,NO> import_type;
1566  typedef Map<LO,GO,NO> map_type;
1567 
1568  // Kokkos typedefs
1569  typedef typename map_type::local_map_type local_map_type;
1571  typedef typename KCRS::StaticCrsGraphType graph_t;
1572  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1573  typedef typename NO::execution_space execution_space;
1574  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1575  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1576 
1577 #ifdef HAVE_TPETRA_MMM_TIMINGS
1578  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1579  using Teuchos::TimeMonitor;
1580  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1581 #endif
1582  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1583 
1584  // Build the final importer / column map, hash table lookups for C
1585  RCP<const import_type> Cimport;
1586  RCP<const map_type> Ccolmap;
1587  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1588  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1589  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1590  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1591  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1592  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1593  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1594  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1595 
1596  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1597  // indices of B, to local column indices of C. (B and C have the
1598  // same number of columns.) The kernel uses this, instead of
1599  // copying the entire input matrix B and converting its column
1600  // indices to those of C.
1601  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1602 
1603  if (Bview.importMatrix.is_null()) {
1604  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1605  Cimport = Bimport;
1606  Ccolmap = Bview.colMap;
1607  const LO colMapSize = static_cast<LO>(Bview.colMap->getNodeNumElements());
1608  // Bcol2Ccol is trivial
1609  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1610  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1611  KOKKOS_LAMBDA(const LO i) {
1612  Bcol2Ccol(i) = i;
1613  });
1614  }
1615  else {
1616  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1617  // column Map of C, as well as C's Import object (from its domain
1618  // Map to its column Map). C's column Map is the union of the
1619  // column Maps of (the local part of) B, and the "remote" part of
1620  // B. Ditto for the Import. We have optimized this "setUnion"
1621  // operation on Import objects and Maps.
1622 
1623  // Choose the right variant of setUnion
1624  if (!Bimport.is_null() && !Iimport.is_null()) {
1625  Cimport = Bimport->setUnion(*Iimport);
1626  }
1627  else if (!Bimport.is_null() && Iimport.is_null()) {
1628  Cimport = Bimport->setUnion();
1629  }
1630  else if (Bimport.is_null() && !Iimport.is_null()) {
1631  Cimport = Iimport->setUnion();
1632  }
1633  else {
1634  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1635  }
1636  Ccolmap = Cimport->getTargetMap();
1637 
1638  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1639  // in general. We should get rid of it in order to reduce
1640  // communication costs of sparse matrix-matrix multiply.
1641  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1642  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1643 
1644  // NOTE: This is not efficient and should be folded into setUnion
1645  //
1646  // mfh 27 Sep 2016: What the above comment means, is that the
1647  // setUnion operation on Import objects could also compute these
1648  // local index - to - local index look-up tables.
1649  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1650  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1651  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1652  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1653  });
1654  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1655  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1656  });
1657 
1658  }
1659 
1660  // Replace the column map
1661  //
1662  // mfh 27 Sep 2016: We do this because C was originally created
1663  // without a column Map. Now we have its column Map.
1664  C.replaceColMap(Ccolmap);
1665 
1666  // mfh 27 Sep 2016: Construct tables that map from local column
1667  // indices of A, to local row indices of either B_local (the locally
1668  // owned part of B), or B_remote (the "imported" remote part of B).
1669  //
1670  // For column index Aik in row i of A, if the corresponding row of B
1671  // exists in the local part of B ("orig") (which I'll call B_local),
1672  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1673  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1674  //
1675  // For column index Aik in row i of A, if the corresponding row of B
1676  // exists in the remote part of B ("Import") (which I'll call
1677  // B_remote), then targetMapToImportRow[Aik] is the local index of
1678  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1679  // (a flag value).
1680 
1681  // Run through all the hash table lookups once and for all
1682  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
1683  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
1684 
1685  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1686  GO aidx = Acolmap_local.getGlobalElement(i);
1687  LO B_LID = Browmap_local.getLocalElement(aidx);
1688  if (B_LID != LO_INVALID) {
1689  targetMapToOrigRow(i) = B_LID;
1690  targetMapToImportRow(i) = LO_INVALID;
1691  } else {
1692  LO I_LID = Irowmap_local.getLocalElement(aidx);
1693  targetMapToOrigRow(i) = LO_INVALID;
1694  targetMapToImportRow(i) = I_LID;
1695 
1696  }
1697  });
1698 
1699  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1700  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1701  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1702 
1703 }
1704 
1705 
1706 /*********************************************************************************************************/
1707 // AB NewMatrix Kernel wrappers (Default non-threaded version)
1708 template<class Scalar,
1709  class LocalOrdinal,
1710  class GlobalOrdinal,
1711  class Node,
1712  class LocalOrdinalViewType>
1713 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1714  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1715  const LocalOrdinalViewType & targetMapToOrigRow,
1716  const LocalOrdinalViewType & targetMapToImportRow,
1717  const LocalOrdinalViewType & Bcol2Ccol,
1718  const LocalOrdinalViewType & Icol2Ccol,
1719  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1720  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
1721  const std::string& label,
1722  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1723 #ifdef HAVE_TPETRA_MMM_TIMINGS
1724  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1725  using Teuchos::TimeMonitor;
1726  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
1727 #endif
1728 
1729  using Teuchos::Array;
1730  using Teuchos::ArrayRCP;
1731  using Teuchos::ArrayView;
1732  using Teuchos::RCP;
1733  using Teuchos::rcp;
1734 
1735  // Lots and lots of typedefs
1737  typedef typename KCRS::StaticCrsGraphType graph_t;
1738  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1739  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1740  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1741  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1742 
1743  typedef Scalar SC;
1744  typedef LocalOrdinal LO;
1745  typedef GlobalOrdinal GO;
1746  typedef Node NO;
1747  typedef Map<LO,GO,NO> map_type;
1748  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1749  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1750  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1751 
1752  // Sizes
1753  RCP<const map_type> Ccolmap = C.getColMap();
1754  size_t m = Aview.origMatrix->getNodeNumRows();
1755  size_t n = Ccolmap->getNodeNumElements();
1756  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
1757 
1758  // Grab the Kokkos::SparseCrsMatrices & inner stuff
1759  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1760  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
1761 
1762  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
1763  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
1764  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
1765 
1766  c_lno_view_t Irowptr;
1767  lno_nnz_view_t Icolind;
1768  scalar_view_t Ivals;
1769  if(!Bview.importMatrix.is_null()) {
1770  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
1771  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
1772  Ivals = Bview.importMatrix->getLocalMatrix().values;
1773  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
1774  }
1775 
1776 #ifdef HAVE_TPETRA_MMM_TIMINGS
1777  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
1778 #endif
1779 
1780  // Classic csr assembly (low memory edition)
1781  //
1782  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
1783  // The method loops over rows of A, and may resize after processing
1784  // each row. Chris Siefert says that this reflects experience in
1785  // ML; for the non-threaded case, ML found it faster to spend less
1786  // effort on estimation and risk an occasional reallocation.
1787  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1788  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
1789  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
1790  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
1791 
1792  // mfh 27 Sep 2016: The c_status array is an implementation detail
1793  // of the local sparse matrix-matrix multiply routine.
1794 
1795  // The status array will contain the index into colind where this entry was last deposited.
1796  // c_status[i] < CSR_ip - not in the row yet
1797  // c_status[i] >= CSR_ip - this is the entry where you can find the data
1798  // We start with this filled with INVALID's indicating that there are no entries yet.
1799  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1800  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1801  std::vector<size_t> c_status(n, ST_INVALID);
1802 
1803  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1804  // routine. The routine computes C := A * (B_local + B_remote).
1805  //
1806  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
1807  // you whether the corresponding row of B belongs to B_local
1808  // ("orig") or B_remote ("Import").
1809 
1810  // For each row of A/C
1811  size_t CSR_ip = 0, OLD_ip = 0;
1812  for (size_t i = 0; i < m; i++) {
1813  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
1814  // on the calling process.
1815  Crowptr[i] = CSR_ip;
1816 
1817  // mfh 27 Sep 2016: For each entry of A in the current row of A
1818  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1819  LO Aik = Acolind[k]; // local column index of current entry of A
1820  const SC Aval = Avals[k]; // value of current entry of A
1821  if (Aval == SC_ZERO)
1822  continue; // skip explicitly stored zero values in A
1823 
1824  if (targetMapToOrigRow[Aik] != LO_INVALID) {
1825  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1826  // corresponding to the current entry of A is populated, then
1827  // the corresponding row of B is in B_local (i.e., it lives on
1828  // the calling process).
1829 
1830  // Local matrix
1831  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
1832 
1833  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
1834  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
1835  LO Bkj = Bcolind[j];
1836  LO Cij = Bcol2Ccol[Bkj];
1837 
1838  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
1839  // New entry
1840  c_status[Cij] = CSR_ip;
1841  Ccolind[CSR_ip] = Cij;
1842  Cvals[CSR_ip] = Aval*Bvals[j];
1843  CSR_ip++;
1844 
1845  } else {
1846  Cvals[c_status[Cij]] += Aval*Bvals[j];
1847  }
1848  }
1849 
1850  } else {
1851  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1852  // corresponding to the current entry of A NOT populated (has
1853  // a flag "invalid" value), then the corresponding row of B is
1854  // in B_local (i.e., it lives on the calling process).
1855 
1856  // Remote matrix
1857  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
1858  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
1859  LO Ikj = Icolind[j];
1860  LO Cij = Icol2Ccol[Ikj];
1861 
1862  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
1863  // New entry
1864  c_status[Cij] = CSR_ip;
1865  Ccolind[CSR_ip] = Cij;
1866  Cvals[CSR_ip] = Aval*Ivals[j];
1867  CSR_ip++;
1868  } else {
1869  Cvals[c_status[Cij]] += Aval*Ivals[j];
1870  }
1871  }
1872  }
1873  }
1874 
1875  // Resize for next pass if needed
1876  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
1877  CSR_alloc *= 2;
1878  Kokkos::resize(Ccolind,CSR_alloc);
1879  Kokkos::resize(Cvals,CSR_alloc);
1880  }
1881  OLD_ip = CSR_ip;
1882  }
1883 
1884  Crowptr[m] = CSR_ip;
1885 
1886  // Downward resize
1887  Kokkos::resize(Ccolind,CSR_ip);
1888  Kokkos::resize(Cvals,CSR_ip);
1889 
1890 #ifdef HAVE_TPETRA_MMM_TIMINGS
1891  {
1892  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
1893 #endif
1894 
1895  // Final sort & set of CRS arrays
1896  if (params.is_null() || params->get("sort entries",true))
1897  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
1898  C.setAllValues(Crowptr,Ccolind, Cvals);
1899 
1900 
1901 #ifdef HAVE_TPETRA_MMM_TIMINGS
1902  }
1903  auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
1904  {
1905 #endif
1906 
1907  // Final FillComplete
1908  //
1909  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1910  // Import (from domain Map to column Map) construction (which costs
1911  // lots of communication) by taking the previously constructed
1912  // Import object. We should be able to do this without interfering
1913  // with the implementation of the local part of sparse matrix-matrix
1914  // multply above.
1915  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1916  labelList->set("Timer Label",label);
1917  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1918  RCP<const Export<LO,GO,NO> > dummyExport;
1919  C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
1920 #ifdef HAVE_TPETRA_MMM_TIMINGS
1921  }
1922  MM2 = Teuchos::null;
1923 #endif
1924 
1925 }
1926 
1927 
1928 
1929 
1930 /*********************************************************************************************************/
1931 // Kernel method for computing the local portion of C = A*B (reuse)
1932 template<class Scalar,
1933  class LocalOrdinal,
1934  class GlobalOrdinal,
1935  class Node>
1936 void mult_A_B_reuse(
1937  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1938  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1939  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1940  const std::string& label,
1941  const Teuchos::RCP<Teuchos::ParameterList>& params)
1942 {
1943  using Teuchos::Array;
1944  using Teuchos::ArrayRCP;
1945  using Teuchos::ArrayView;
1946  using Teuchos::RCP;
1947  using Teuchos::rcp;
1948 
1949  // Tpetra typedefs
1950  typedef LocalOrdinal LO;
1951  typedef GlobalOrdinal GO;
1952  typedef Node NO;
1953  typedef Import<LO,GO,NO> import_type;
1954  typedef Map<LO,GO,NO> map_type;
1955 
1956  // Kokkos typedefs
1957  typedef typename map_type::local_map_type local_map_type;
1959  typedef typename KCRS::StaticCrsGraphType graph_t;
1960  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1961  typedef typename NO::execution_space execution_space;
1962  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1963  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1964 
1965 #ifdef HAVE_TPETRA_MMM_TIMINGS
1966  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1967  using Teuchos::TimeMonitor;
1968  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
1969 #endif
1970  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1971 
1972  // Grab all the maps
1973  RCP<const import_type> Cimport = C.getGraph()->getImporter();
1974  RCP<const map_type> Ccolmap = C.getColMap();
1975  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1976  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1977  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1978  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1979  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1980  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1981 
1982  // Build the final importer / column map, hash table lookups for C
1983  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1984  {
1985  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
1986  // So, column map of C may be a strict subset of the column map of B
1987  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1988  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1989  });
1990 
1991  if (!Bview.importMatrix.is_null()) {
1992  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1993  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1994 
1995  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1996  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1997  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1998  });
1999  }
2000  }
2001 
2002  // Run through all the hash table lookups once and for all
2003  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2004  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2005  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2006  GO aidx = Acolmap_local.getGlobalElement(i);
2007  LO B_LID = Browmap_local.getLocalElement(aidx);
2008  if (B_LID != LO_INVALID) {
2009  targetMapToOrigRow(i) = B_LID;
2010  targetMapToImportRow(i) = LO_INVALID;
2011  } else {
2012  LO I_LID = Irowmap_local.getLocalElement(aidx);
2013  targetMapToOrigRow(i) = LO_INVALID;
2014  targetMapToImportRow(i) = I_LID;
2015 
2016  }
2017  });
2018 
2019  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2020  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2021  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2022 }
2023 
2024 /*********************************************************************************************************/
2025 template<class Scalar,
2026  class LocalOrdinal,
2027  class GlobalOrdinal,
2028  class Node,
2029  class LocalOrdinalViewType>
2030 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2031  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2032  const LocalOrdinalViewType & targetMapToOrigRow,
2033  const LocalOrdinalViewType & targetMapToImportRow,
2034  const LocalOrdinalViewType & Bcol2Ccol,
2035  const LocalOrdinalViewType & Icol2Ccol,
2036  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2037  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2038  const std::string& label,
2039  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2040 #ifdef HAVE_TPETRA_MMM_TIMINGS
2041  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2042  using Teuchos::TimeMonitor;
2043  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2044  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2045 #else
2046  (void)label;
2047 #endif
2048  using Teuchos::RCP;
2049  using Teuchos::rcp;
2050 
2051 
2052  // Lots and lots of typedefs
2054  typedef typename KCRS::StaticCrsGraphType graph_t;
2055  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2056  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2057  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2058 
2059  typedef Scalar SC;
2060  typedef LocalOrdinal LO;
2061  typedef GlobalOrdinal GO;
2062  typedef Node NO;
2063  typedef Map<LO,GO,NO> map_type;
2064  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2065  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2066  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2067 
2068  // Sizes
2069  RCP<const map_type> Ccolmap = C.getColMap();
2070  size_t m = Aview.origMatrix->getNodeNumRows();
2071  size_t n = Ccolmap->getNodeNumElements();
2072 
2073  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2074  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2075  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2076  const KCRS & Cmat = C.getLocalMatrix();
2077 
2078  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2079  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2080  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2081  scalar_view_t Cvals = Cmat.values;
2082 
2083  c_lno_view_t Irowptr;
2084  lno_nnz_view_t Icolind;
2085  scalar_view_t Ivals;
2086  if(!Bview.importMatrix.is_null()) {
2087  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2088  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2089  Ivals = Bview.importMatrix->getLocalMatrix().values;
2090  }
2091 
2092 #ifdef HAVE_TPETRA_MMM_TIMINGS
2093  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2094 #endif
2095 
2096  // Classic csr assembly (low memory edition)
2097  // mfh 27 Sep 2016: The c_status array is an implementation detail
2098  // of the local sparse matrix-matrix multiply routine.
2099 
2100  // The status array will contain the index into colind where this entry was last deposited.
2101  // c_status[i] < CSR_ip - not in the row yet
2102  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2103  // We start with this filled with INVALID's indicating that there are no entries yet.
2104  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2105  std::vector<size_t> c_status(n, ST_INVALID);
2106 
2107  // For each row of A/C
2108  size_t CSR_ip = 0, OLD_ip = 0;
2109  for (size_t i = 0; i < m; i++) {
2110  // First fill the c_status array w/ locations where we're allowed to
2111  // generate nonzeros for this row
2112  OLD_ip = Crowptr[i];
2113  CSR_ip = Crowptr[i+1];
2114  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2115  c_status[Ccolind[k]] = k;
2116 
2117  // Reset values in the row of C
2118  Cvals[k] = SC_ZERO;
2119  }
2120 
2121  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2122  LO Aik = Acolind[k];
2123  const SC Aval = Avals[k];
2124  if (Aval == SC_ZERO)
2125  continue;
2126 
2127  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2128  // Local matrix
2129  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2130 
2131  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2132  LO Bkj = Bcolind[j];
2133  LO Cij = Bcol2Ccol[Bkj];
2134 
2135  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2136  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2137  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2138 
2139  Cvals[c_status[Cij]] += Aval * Bvals[j];
2140  }
2141 
2142  } else {
2143  // Remote matrix
2144  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2145  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2146  LO Ikj = Icolind[j];
2147  LO Cij = Icol2Ccol[Ikj];
2148 
2149  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2150  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2151  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2152 
2153  Cvals[c_status[Cij]] += Aval * Ivals[j];
2154  }
2155  }
2156  }
2157  }
2158 
2159 #ifdef HAVE_TPETRA_MMM_TIMINGS
2160  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2161 #endif
2162 
2163  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2164 }
2165 
2166 
2167 /*********************************************************************************************************/
2168 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2169 template<class Scalar,
2170  class LocalOrdinal,
2171  class GlobalOrdinal,
2172  class Node>
2173 void jacobi_A_B_newmatrix(
2174  Scalar omega,
2175  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2176  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2177  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2178  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2179  const std::string& label,
2180  const Teuchos::RCP<Teuchos::ParameterList>& params)
2181 {
2182  using Teuchos::Array;
2183  using Teuchos::ArrayRCP;
2184  using Teuchos::ArrayView;
2185  using Teuchos::RCP;
2186  using Teuchos::rcp;
2187  // typedef Scalar SC;
2188  typedef LocalOrdinal LO;
2189  typedef GlobalOrdinal GO;
2190  typedef Node NO;
2191 
2192  typedef Import<LO,GO,NO> import_type;
2193  typedef Map<LO,GO,NO> map_type;
2194  typedef typename map_type::local_map_type local_map_type;
2195 
2196  // All of the Kokkos typedefs
2198  typedef typename KCRS::StaticCrsGraphType graph_t;
2199  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2200  typedef typename NO::execution_space execution_space;
2201  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2202  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2203 
2204 
2205 #ifdef HAVE_TPETRA_MMM_TIMINGS
2206  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2207  using Teuchos::TimeMonitor;
2208  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2209 #endif
2210  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2211 
2212  // Build the final importer / column map, hash table lookups for C
2213  RCP<const import_type> Cimport;
2214  RCP<const map_type> Ccolmap;
2215  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2216  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2217  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2218  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2219  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2220  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2221  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2222  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2223 
2224  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2225  // indices of B, to local column indices of C. (B and C have the
2226  // same number of columns.) The kernel uses this, instead of
2227  // copying the entire input matrix B and converting its column
2228  // indices to those of C.
2229  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2230 
2231  if (Bview.importMatrix.is_null()) {
2232  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2233  Cimport = Bimport;
2234  Ccolmap = Bview.colMap;
2235  // Bcol2Ccol is trivial
2236  // Bcol2Ccol is trivial
2237 
2238  Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getNodeNumElements ()));
2239  Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2240  Bcol2Ccol(i) = static_cast<LO> (i);
2241  });
2242  } else {
2243  // mfh 27 Sep 2016: B has "remotes," so we need to build the
2244  // column Map of C, as well as C's Import object (from its domain
2245  // Map to its column Map). C's column Map is the union of the
2246  // column Maps of (the local part of) B, and the "remote" part of
2247  // B. Ditto for the Import. We have optimized this "setUnion"
2248  // operation on Import objects and Maps.
2249 
2250  // Choose the right variant of setUnion
2251  if (!Bimport.is_null() && !Iimport.is_null()){
2252  Cimport = Bimport->setUnion(*Iimport);
2253  Ccolmap = Cimport->getTargetMap();
2254 
2255  } else if (!Bimport.is_null() && Iimport.is_null()) {
2256  Cimport = Bimport->setUnion();
2257 
2258  } else if(Bimport.is_null() && !Iimport.is_null()) {
2259  Cimport = Iimport->setUnion();
2260 
2261  } else
2262  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2263 
2264  Ccolmap = Cimport->getTargetMap();
2265 
2266  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2267  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2268 
2269  // NOTE: This is not efficient and should be folded into setUnion
2270  //
2271  // mfh 27 Sep 2016: What the above comment means, is that the
2272  // setUnion operation on Import objects could also compute these
2273  // local index - to - local index look-up tables.
2274  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2275  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2276  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2277  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2278  });
2279  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2280  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2281  });
2282 
2283  }
2284 
2285  // Replace the column map
2286  //
2287  // mfh 27 Sep 2016: We do this because C was originally created
2288  // without a column Map. Now we have its column Map.
2289  C.replaceColMap(Ccolmap);
2290 
2291  // mfh 27 Sep 2016: Construct tables that map from local column
2292  // indices of A, to local row indices of either B_local (the locally
2293  // owned part of B), or B_remote (the "imported" remote part of B).
2294  //
2295  // For column index Aik in row i of A, if the corresponding row of B
2296  // exists in the local part of B ("orig") (which I'll call B_local),
2297  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2298  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2299  //
2300  // For column index Aik in row i of A, if the corresponding row of B
2301  // exists in the remote part of B ("Import") (which I'll call
2302  // B_remote), then targetMapToImportRow[Aik] is the local index of
2303  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2304  // (a flag value).
2305 
2306  // Run through all the hash table lookups once and for all
2307  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2308  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2309  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2310  GO aidx = Acolmap_local.getGlobalElement(i);
2311  LO B_LID = Browmap_local.getLocalElement(aidx);
2312  if (B_LID != LO_INVALID) {
2313  targetMapToOrigRow(i) = B_LID;
2314  targetMapToImportRow(i) = LO_INVALID;
2315  } else {
2316  LO I_LID = Irowmap_local.getLocalElement(aidx);
2317  targetMapToOrigRow(i) = LO_INVALID;
2318  targetMapToImportRow(i) = I_LID;
2319 
2320  }
2321  });
2322 
2323  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2324  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2325  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2326 
2327 }
2328 
2329 
2330 /*********************************************************************************************************/
2331 // Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2332 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2333 
2334 template<class Scalar,
2335  class LocalOrdinal,
2336  class GlobalOrdinal,
2337  class Node,
2338  class LocalOrdinalViewType>
2339 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2340  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2341  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2342  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2343  const LocalOrdinalViewType & targetMapToOrigRow,
2344  const LocalOrdinalViewType & targetMapToImportRow,
2345  const LocalOrdinalViewType & Bcol2Ccol,
2346  const LocalOrdinalViewType & Icol2Ccol,
2347  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2348  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2349  const std::string& label,
2350  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2351 
2352 #ifdef HAVE_TPETRA_MMM_TIMINGS
2353  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2354  using Teuchos::TimeMonitor;
2355  auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
2356 #endif
2357 
2358  using Teuchos::Array;
2359  using Teuchos::ArrayRCP;
2360  using Teuchos::ArrayView;
2361  using Teuchos::RCP;
2362  using Teuchos::rcp;
2363 
2364  // Lots and lots of typedefs
2366  typedef typename KCRS::StaticCrsGraphType graph_t;
2367  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2368  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2369  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2370  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2371 
2372  // Jacobi-specific
2373  typedef typename scalar_view_t::memory_space scalar_memory_space;
2374 
2375  typedef Scalar SC;
2376  typedef LocalOrdinal LO;
2377  typedef GlobalOrdinal GO;
2378  typedef Node NO;
2379 
2380  typedef Map<LO,GO,NO> map_type;
2381  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2382  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2383 
2384  // Sizes
2385  RCP<const map_type> Ccolmap = C.getColMap();
2386  size_t m = Aview.origMatrix->getNodeNumRows();
2387  size_t n = Ccolmap->getNodeNumElements();
2388  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2389 
2390  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2391  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2392  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2393 
2394  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2395  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2396  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2397 
2398  c_lno_view_t Irowptr;
2399  lno_nnz_view_t Icolind;
2400  scalar_view_t Ivals;
2401  if(!Bview.importMatrix.is_null()) {
2402  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2403  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2404  Ivals = Bview.importMatrix->getLocalMatrix().values;
2405  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2406  }
2407 
2408  // Jacobi-specific inner stuff
2409  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
2410 
2411  // Teuchos::ArrayView::operator[].
2412  // The status array will contain the index into colind where this entry was last deposited.
2413  // c_status[i] < CSR_ip - not in the row yet.
2414  // c_status[i] >= CSR_ip, this is the entry where you can find the data
2415  // We start with this filled with INVALID's indicating that there are no entries yet.
2416  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2417  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2418  Array<size_t> c_status(n, ST_INVALID);
2419 
2420  // Classic csr assembly (low memory edition)
2421  //
2422  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2423  // The method loops over rows of A, and may resize after processing
2424  // each row. Chris Siefert says that this reflects experience in
2425  // ML; for the non-threaded case, ML found it faster to spend less
2426  // effort on estimation and risk an occasional reallocation.
2427  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2428  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2429  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2430  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2431  size_t CSR_ip = 0, OLD_ip = 0;
2432 
2433  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2434 
2435  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2436  // routine. The routine computes
2437  //
2438  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2439  //
2440  // This corresponds to one sweep of (weighted) Jacobi.
2441  //
2442  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2443  // you whether the corresponding row of B belongs to B_local
2444  // ("orig") or B_remote ("Import").
2445 
2446  // For each row of A/C
2447  for (size_t i = 0; i < m; i++) {
2448  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2449  // on the calling process.
2450  Crowptr[i] = CSR_ip;
2451  SC minusOmegaDval = -omega*Dvals(i,0);
2452 
2453  // Entries of B
2454  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2455  Scalar Bval = Bvals[j];
2456  if (Bval == SC_ZERO)
2457  continue;
2458  LO Bij = Bcolind[j];
2459  LO Cij = Bcol2Ccol[Bij];
2460 
2461  // Assume no repeated entries in B
2462  c_status[Cij] = CSR_ip;
2463  Ccolind[CSR_ip] = Cij;
2464  Cvals[CSR_ip] = Bvals[j];
2465  CSR_ip++;
2466  }
2467 
2468  // Entries of -omega * Dinv * A * B
2469  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2470  LO Aik = Acolind[k];
2471  const SC Aval = Avals[k];
2472  if (Aval == SC_ZERO)
2473  continue;
2474 
2475  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2476  // Local matrix
2477  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2478 
2479  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2480  LO Bkj = Bcolind[j];
2481  LO Cij = Bcol2Ccol[Bkj];
2482 
2483  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2484  // New entry
2485  c_status[Cij] = CSR_ip;
2486  Ccolind[CSR_ip] = Cij;
2487  Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2488  CSR_ip++;
2489 
2490  } else {
2491  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2492  }
2493  }
2494 
2495  } else {
2496  // Remote matrix
2497  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2498  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2499  LO Ikj = Icolind[j];
2500  LO Cij = Icol2Ccol[Ikj];
2501 
2502  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2503  // New entry
2504  c_status[Cij] = CSR_ip;
2505  Ccolind[CSR_ip] = Cij;
2506  Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2507  CSR_ip++;
2508  } else {
2509  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2510  }
2511  }
2512  }
2513  }
2514 
2515  // Resize for next pass if needed
2516  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2517  CSR_alloc *= 2;
2518  Kokkos::resize(Ccolind,CSR_alloc);
2519  Kokkos::resize(Cvals,CSR_alloc);
2520  }
2521  OLD_ip = CSR_ip;
2522  }
2523  Crowptr[m] = CSR_ip;
2524 
2525  // Downward resize
2526  Kokkos::resize(Ccolind,CSR_ip);
2527  Kokkos::resize(Cvals,CSR_ip);
2528 
2529  {
2530 #ifdef HAVE_TPETRA_MMM_TIMINGS
2531  auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2532 #endif
2533 
2534  // Replace the column map
2535  //
2536  // mfh 27 Sep 2016: We do this because C was originally created
2537  // without a column Map. Now we have its column Map.
2538  C.replaceColMap(Ccolmap);
2539 
2540  // Final sort & set of CRS arrays
2541  //
2542  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2543  // matrix-matrix multiply routine sort the entries for us?
2544  // Final sort & set of CRS arrays
2545  if (params.is_null() || params->get("sort entries",true))
2546  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2547  C.setAllValues(Crowptr,Ccolind, Cvals);
2548  }
2549  {
2550 #ifdef HAVE_TPETRA_MMM_TIMINGS
2551  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2552 #endif
2553 
2554  // Final FillComplete
2555  //
2556  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2557  // Import (from domain Map to column Map) construction (which costs
2558  // lots of communication) by taking the previously constructed
2559  // Import object. We should be able to do this without interfering
2560  // with the implementation of the local part of sparse matrix-matrix
2561  // multply above
2562  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2563  labelList->set("Timer Label",label);
2564  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2565  RCP<const Export<LO,GO,NO> > dummyExport;
2566  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2567 
2568  }
2569 }
2570 
2571 
2572 /*********************************************************************************************************/
2573 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2574 template<class Scalar,
2575  class LocalOrdinal,
2576  class GlobalOrdinal,
2577  class Node>
2578 void jacobi_A_B_reuse(
2579  Scalar omega,
2580  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2581  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2582  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2583  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2584  const std::string& label,
2585  const Teuchos::RCP<Teuchos::ParameterList>& params)
2586 {
2587  using Teuchos::Array;
2588  using Teuchos::ArrayRCP;
2589  using Teuchos::ArrayView;
2590  using Teuchos::RCP;
2591  using Teuchos::rcp;
2592 
2593  typedef LocalOrdinal LO;
2594  typedef GlobalOrdinal GO;
2595  typedef Node NO;
2596 
2597  typedef Import<LO,GO,NO> import_type;
2598  typedef Map<LO,GO,NO> map_type;
2599 
2600  // Kokkos typedefs
2601  typedef typename map_type::local_map_type local_map_type;
2603  typedef typename KCRS::StaticCrsGraphType graph_t;
2604  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2605  typedef typename NO::execution_space execution_space;
2606  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2607  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2608 
2609 #ifdef HAVE_TPETRA_MMM_TIMINGS
2610  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2611  using Teuchos::TimeMonitor;
2612  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2613 #endif
2614  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2615 
2616  // Grab all the maps
2617  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2618  RCP<const map_type> Ccolmap = C.getColMap();
2619  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2620  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2621  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2622  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2623  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2624  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2625 
2626  // Build the final importer / column map, hash table lookups for C
2627  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2628  {
2629  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2630  // So, column map of C may be a strict subset of the column map of B
2631  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2632  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2633  });
2634 
2635  if (!Bview.importMatrix.is_null()) {
2636  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2637  std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2638 
2639  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2640  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2641  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2642  });
2643  }
2644  }
2645 
2646  // Run through all the hash table lookups once and for all
2647  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2648  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2649  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2650  GO aidx = Acolmap_local.getGlobalElement(i);
2651  LO B_LID = Browmap_local.getLocalElement(aidx);
2652  if (B_LID != LO_INVALID) {
2653  targetMapToOrigRow(i) = B_LID;
2654  targetMapToImportRow(i) = LO_INVALID;
2655  } else {
2656  LO I_LID = Irowmap_local.getLocalElement(aidx);
2657  targetMapToOrigRow(i) = LO_INVALID;
2658  targetMapToImportRow(i) = I_LID;
2659 
2660  }
2661  });
2662 
2663 #ifdef HAVE_TPETRA_MMM_TIMINGS
2664  MM = Teuchos::null;
2665 #endif
2666 
2667  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2668  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2669  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2670 }
2671 
2672 
2673 
2674 /*********************************************************************************************************/
2675 template<class Scalar,
2676  class LocalOrdinal,
2677  class GlobalOrdinal,
2678  class Node,
2679  class LocalOrdinalViewType>
2680 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2681  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2682  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2683  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2684  const LocalOrdinalViewType & targetMapToOrigRow,
2685  const LocalOrdinalViewType & targetMapToImportRow,
2686  const LocalOrdinalViewType & Bcol2Ccol,
2687  const LocalOrdinalViewType & Icol2Ccol,
2688  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2689  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2690  const std::string& label,
2691  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2692 #ifdef HAVE_TPETRA_MMM_TIMINGS
2693  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2694  using Teuchos::TimeMonitor;
2695  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
2696  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2697 #else
2698  (void)label;
2699 #endif
2700  using Teuchos::RCP;
2701  using Teuchos::rcp;
2702 
2703  // Lots and lots of typedefs
2705  typedef typename KCRS::StaticCrsGraphType graph_t;
2706  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2707  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2708  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2709  typedef typename scalar_view_t::memory_space scalar_memory_space;
2710 
2711  typedef Scalar SC;
2712  typedef LocalOrdinal LO;
2713  typedef GlobalOrdinal GO;
2714  typedef Node NO;
2715  typedef Map<LO,GO,NO> map_type;
2716  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2717  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2718  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2719 
2720  // Sizes
2721  RCP<const map_type> Ccolmap = C.getColMap();
2722  size_t m = Aview.origMatrix->getNodeNumRows();
2723  size_t n = Ccolmap->getNodeNumElements();
2724 
2725  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2726  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2727  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2728  const KCRS & Cmat = C.getLocalMatrix();
2729 
2730  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2731  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2732  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2733  scalar_view_t Cvals = Cmat.values;
2734 
2735  c_lno_view_t Irowptr;
2736  lno_nnz_view_t Icolind;
2737  scalar_view_t Ivals;
2738  if(!Bview.importMatrix.is_null()) {
2739  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2740  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2741  Ivals = Bview.importMatrix->getLocalMatrix().values;
2742  }
2743 
2744  // Jacobi-specific inner stuff
2745  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
2746 
2747 #ifdef HAVE_TPETRA_MMM_TIMINGS
2748  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
2749 #endif
2750 
2751  // The status array will contain the index into colind where this entry was last deposited.
2752  // c_status[i] < CSR_ip - not in the row yet
2753  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2754  // We start with this filled with INVALID's indicating that there are no entries yet.
2755  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2756  std::vector<size_t> c_status(n, ST_INVALID);
2757 
2758  // For each row of A/C
2759  size_t CSR_ip = 0, OLD_ip = 0;
2760  for (size_t i = 0; i < m; i++) {
2761 
2762  // First fill the c_status array w/ locations where we're allowed to
2763  // generate nonzeros for this row
2764  OLD_ip = Crowptr[i];
2765  CSR_ip = Crowptr[i+1];
2766  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2767  c_status[Ccolind[k]] = k;
2768 
2769  // Reset values in the row of C
2770  Cvals[k] = SC_ZERO;
2771  }
2772 
2773  SC minusOmegaDval = -omega*Dvals(i,0);
2774 
2775  // Entries of B
2776  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2777  Scalar Bval = Bvals[j];
2778  if (Bval == SC_ZERO)
2779  continue;
2780  LO Bij = Bcolind[j];
2781  LO Cij = Bcol2Ccol[Bij];
2782 
2783  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2784  std::runtime_error, "Trying to insert a new entry into a static graph");
2785 
2786  Cvals[c_status[Cij]] = Bvals[j];
2787  }
2788 
2789  // Entries of -omega * Dinv * A * B
2790  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2791  LO Aik = Acolind[k];
2792  const SC Aval = Avals[k];
2793  if (Aval == SC_ZERO)
2794  continue;
2795 
2796  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2797  // Local matrix
2798  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2799 
2800  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2801  LO Bkj = Bcolind[j];
2802  LO Cij = Bcol2Ccol[Bkj];
2803 
2804  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2805  std::runtime_error, "Trying to insert a new entry into a static graph");
2806 
2807  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
2808  }
2809 
2810  } else {
2811  // Remote matrix
2812  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2813  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2814  LO Ikj = Icolind[j];
2815  LO Cij = Icol2Ccol[Ikj];
2816 
2817  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2818  std::runtime_error, "Trying to insert a new entry into a static graph");
2819 
2820  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
2821  }
2822  }
2823  }
2824  }
2825 
2826 #ifdef HAVE_TPETRA_MMM_TIMINGS
2827  MM2 = Teuchos::null;
2828  MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
2829 #endif
2830 
2831  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2832 #ifdef HAVE_TPETRA_MMM_TIMINGS
2833  MM2 = Teuchos::null;
2834  MM = Teuchos::null;
2835 #endif
2836 
2837 }
2838 
2839 
2840 
2841 /*********************************************************************************************************/
2842 template<class Scalar,
2843  class LocalOrdinal,
2844  class GlobalOrdinal,
2845  class Node>
2846 void import_and_extract_views(
2847  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
2848  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
2849  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2850  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
2851  bool userAssertsThereAreNoRemotes,
2852  const std::string& label,
2853  const Teuchos::RCP<Teuchos::ParameterList>& params)
2854 {
2855  using Teuchos::Array;
2856  using Teuchos::ArrayView;
2857  using Teuchos::RCP;
2858  using Teuchos::rcp;
2859  using Teuchos::null;
2860 
2861  typedef Scalar SC;
2862  typedef LocalOrdinal LO;
2863  typedef GlobalOrdinal GO;
2864  typedef Node NO;
2865 
2866  typedef Map<LO,GO,NO> map_type;
2867  typedef Import<LO,GO,NO> import_type;
2868  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
2869 
2870 #ifdef HAVE_TPETRA_MMM_TIMINGS
2871  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2872  using Teuchos::TimeMonitor;
2873  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
2874 #endif
2875  // The goal of this method is to populate the 'Aview' struct with views of the
2876  // rows of A, including all rows that correspond to elements in 'targetMap'.
2877  //
2878  // If targetMap includes local elements that correspond to remotely-owned rows
2879  // of A, then those remotely-owned rows will be imported into
2880  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
2881  Aview.deleteContents();
2882 
2883  Aview.origMatrix = rcp(&A, false);
2884  Aview.origRowMap = A.getRowMap();
2885  Aview.rowMap = targetMap;
2886  Aview.colMap = A.getColMap();
2887  Aview.domainMap = A.getDomainMap();
2888  Aview.importColMap = null;
2889  RCP<const map_type> rowMap = A.getRowMap();
2890  const int numProcs = rowMap->getComm()->getSize();
2891 
2892  // Short circuit if the user swears there are no remotes (or if we're in serial)
2893  if (userAssertsThereAreNoRemotes || numProcs < 2)
2894  return;
2895 
2896  RCP<const import_type> importer;
2897  if (params != null && params->isParameter("importer")) {
2898  importer = params->get<RCP<const import_type> >("importer");
2899 
2900  } else {
2901 #ifdef HAVE_TPETRA_MMM_TIMINGS
2902  MM = Teuchos::null;
2903  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
2904 #endif
2905 
2906  // Mark each row in targetMap as local or remote, and go ahead and get a view
2907  // for the local rows
2908  RCP<const map_type> remoteRowMap;
2909  size_t numRemote = 0;
2910  int mode = 0;
2911  if (!prototypeImporter.is_null() &&
2912  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
2913  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
2914  // We have a valid prototype importer --- ask it for the remotes
2915 #ifdef HAVE_TPETRA_MMM_TIMINGS
2916  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode1"));
2917 #endif
2918  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
2919  numRemote = prototypeImporter->getNumRemoteIDs();
2920 
2921  Array<GO> remoteRows(numRemote);
2922  for (size_t i = 0; i < numRemote; i++)
2923  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
2924 
2925  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2926  rowMap->getIndexBase(), rowMap->getComm()));
2927  mode = 1;
2928 
2929  } else if (prototypeImporter.is_null()) {
2930  // No prototype importer --- count the remotes the hard way
2931 #ifdef HAVE_TPETRA_MMM_TIMINGS
2932  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode2"));
2933 #endif
2934  ArrayView<const GO> rows = targetMap->getNodeElementList();
2935  size_t numRows = targetMap->getNodeNumElements();
2936 
2937  Array<GO> remoteRows(numRows);
2938  for(size_t i = 0; i < numRows; ++i) {
2939  const LO mlid = rowMap->getLocalElement(rows[i]);
2940 
2941  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
2942  remoteRows[numRemote++] = rows[i];
2943  }
2944  remoteRows.resize(numRemote);
2945  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2946  rowMap->getIndexBase(), rowMap->getComm()));
2947  mode = 2;
2948 
2949  } else {
2950  // PrototypeImporter is bad. But if we're in serial that's OK.
2951  mode = 3;
2952  }
2953 
2954  if (numProcs < 2) {
2955  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
2956  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
2957  // If only one processor we don't need to import any remote rows, so return.
2958  return;
2959  }
2960 
2961  //
2962  // Now we will import the needed remote rows of A, if the global maximum
2963  // value of numRemote is greater than 0.
2964  //
2965 #ifdef HAVE_TPETRA_MMM_TIMINGS
2966  MM = Teuchos::null;
2967  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
2968 #endif
2969 
2970  global_size_t globalMaxNumRemote = 0;
2971  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
2972 
2973  if (globalMaxNumRemote > 0) {
2974 #ifdef HAVE_TPETRA_MMM_TIMINGS
2975  MM = Teuchos::null;
2976  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
2977 #endif
2978  // Create an importer with target-map remoteRowMap and source-map rowMap.
2979  if (mode == 1)
2980  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
2981  else if (mode == 2)
2982  importer = rcp(new import_type(rowMap, remoteRowMap));
2983  else
2984  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
2985  }
2986 
2987  if (params != null)
2988  params->set("importer", importer);
2989  }
2990 
2991  if (importer != null) {
2992 #ifdef HAVE_TPETRA_MMM_TIMINGS
2993  MM = Teuchos::null;
2994  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
2995 #endif
2996 
2997  // Now create a new matrix into which we can import the remote rows of A that we need.
2998  Teuchos::ParameterList labelList;
2999  labelList.set("Timer Label", label);
3000  auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3001 
3002  bool isMM = true;
3003  bool overrideAllreduce = false;
3004  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3005  // Minor speedup tweak - avoid computing the global constants
3006  Teuchos::ParameterList params_sublist;
3007  if(!params.is_null()) {
3008  labelList.set("compute global constants", params->get("compute global constants",false));
3009  params_sublist = params->sublist("matrixmatrix: kernel params",false);
3010  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3011  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3012  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3013  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3014  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3015  }
3016  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3017  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3018  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3019 
3020  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3021  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3022 
3023 #if 0
3024  // Disabled code for dumping input matrices
3025  static int count=0;
3026  char str[80];
3027  sprintf(str,"import_matrix.%d.dat",count);
3029  count++;
3030 #endif
3031 
3032 #ifdef HAVE_TPETRA_MMM_STATISTICS
3033  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3034 #endif
3035 
3036 
3037 #ifdef HAVE_TPETRA_MMM_TIMINGS
3038  MM = Teuchos::null;
3039  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3040 #endif
3041 
3042  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3043  Aview.importColMap = Aview.importMatrix->getColMap();
3044 #ifdef HAVE_TPETRA_MMM_TIMINGS
3045  MM = Teuchos::null;
3046 #endif
3047  }
3048 }
3049 
3050 
3051 
3052 
3053 
3054 /*********************************************************************************************************/
3055  // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3056 template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3058 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3059  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3060  const LocalOrdinalViewType & Acol2Brow,
3061  const LocalOrdinalViewType & Acol2Irow,
3062  const LocalOrdinalViewType & Bcol2Ccol,
3063  const LocalOrdinalViewType & Icol2Ccol,
3064  const size_t mergedNodeNumCols) {
3065 
3066  using Teuchos::RCP;
3068  typedef typename KCRS::StaticCrsGraphType graph_t;
3069  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3070  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3071  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3072  // Grab the Kokkos::SparseCrsMatrices
3073  const KCRS & Ak = Aview.origMatrix->getLocalMatrix();
3074  const KCRS & Bk = Bview.origMatrix->getLocalMatrix();
3075 
3076  // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3077  if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3078  // We do have a Bimport
3079  // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3080  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3081  RCP<const KCRS> Ik_;
3082  if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrix());
3083  const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3084  KCRS Iks;
3085  if(Ik!=0) Iks = *Ik;
3086  size_t merge_numrows = Ak.numCols();
3087  // The last entry of this at least, need to be initialized
3088  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3089 
3090  const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3091 
3092  // Use a Kokkos::parallel_scan to build the rowptr
3093  typedef typename Node::execution_space execution_space;
3094  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3095  Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3096  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3097  if(final) Mrowptr(i) = update;
3098  // Get the row count
3099  size_t ct=0;
3100  if(Acol2Brow(i)!=LO_INVALID)
3101  ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3102  else
3103  ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3104  update+=ct;
3105 
3106  if(final && i+1==merge_numrows)
3107  Mrowptr(i+1)=update;
3108  });
3109 
3110  // Allocate nnz
3111  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3112  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3113  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);
3114 
3115  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3116  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3117  Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3118  if(Acol2Brow(i)!=LO_INVALID) {
3119  size_t row = Acol2Brow(i);
3120  size_t start = Bk.graph.row_map(row);
3121  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3122  Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3123  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3124  }
3125  }
3126  else {
3127  size_t row = Acol2Irow(i);
3128  size_t start = Iks.graph.row_map(row);
3129  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3130  Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3131  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3132  }
3133  }
3134  });
3135 
3136  KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3137  return newmat;
3138  }
3139  else {
3140  // We don't have a Bimport (the easy case)
3141  return Bk;
3142  }
3143 }//end merge_matrices
3144 
3145 template<typename SC, typename LO, typename GO, typename NO>
3146 void AddKernels<SC, LO, GO, NO>::
3147 addSorted(
3148  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3149  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3150  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3151  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3152  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3153  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3154  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3155  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3156  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3157  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3158  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3159 {
3160  using Teuchos::TimeMonitor;
3161  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3162  auto nrows = Arowptrs.extent(0) - 1;
3163  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3164  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, LO, impl_scalar_type,
3165  execution_space, memory_space, memory_space> KKH;
3166  KKH handle;
3167  handle.create_spadd_handle(true);
3168  auto addHandle = handle.get_spadd_handle();
3169 #ifdef HAVE_TPETRA_MMM_TIMINGS
3170  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3171 #endif
3172  KokkosSparse::Experimental::spadd_symbolic
3173  <KKH,
3174  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3175  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3176  row_ptrs_array, col_inds_array>
3177  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3178  //KokkosKernels requires values to be zeroed
3179  Cvals = values_array("C values", addHandle->get_max_result_nnz());
3180  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_max_result_nnz());
3181 #ifdef HAVE_TPETRA_MMM_TIMINGS
3182  MM = Teuchos::null;
3183  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
3184 #endif
3185  KokkosSparse::Experimental::spadd_numeric(&handle,
3186  Arowptrs, Acolinds, Avals, scalarA,
3187  Browptrs, Bcolinds, Bvals, scalarB,
3188  Crowptrs, Ccolinds, Cvals);
3189 }
3190 
3191 template<typename SC, typename LO, typename GO, typename NO>
3192 void AddKernels<SC, LO, GO, NO>::
3193 addUnsorted(
3194  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3195  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3196  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3197  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3198  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3199  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3200  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3201  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3202  GO /* numGlobalCols */,
3203  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3204  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3205  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3206 {
3207  using Teuchos::TimeMonitor;
3208  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3209  auto nrows = Arowptrs.extent(0) - 1;
3210  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3211  typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3212  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, LO, AddKern::impl_scalar_type,
3213  AddKern::execution_space, AddKern::memory_space, AddKern::memory_space> KKH;
3214  KKH handle;
3215  handle.create_spadd_handle(false);
3216  auto addHandle = handle.get_spadd_handle();
3217 #ifdef HAVE_TPETRA_MMM_TIMINGS
3218  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3219 #endif
3220  KokkosSparse::Experimental::spadd_symbolic
3221  <KKH,
3222  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3223  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3224  row_ptrs_array, col_inds_array>
3225  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3226  //Cvals must be zeroed out
3227  Cvals = values_array("C values", addHandle->get_max_result_nnz());
3228  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_max_result_nnz());
3229 #ifdef HAVE_TPETRA_MMM_TIMINGS
3230  MM = Teuchos::null;
3231  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted kernel: sorted numeric")));
3232 #endif
3233  KokkosSparse::Experimental::spadd_numeric(&handle,
3234  Arowptrs, Acolinds, Avals, scalarA,
3235  Browptrs, Bcolinds, Bvals, scalarB,
3236  Crowptrs, Ccolinds, Cvals);
3237 }
3238 
3239 template<typename GO,
3240  typename LocalIndicesType,
3241  typename GlobalIndicesType,
3242  typename ColMapType>
3243 struct ConvertLocalToGlobalFunctor
3244 {
3245  ConvertLocalToGlobalFunctor(
3246  const LocalIndicesType& colindsOrig_,
3247  const GlobalIndicesType& colindsConverted_,
3248  const ColMapType& colmap_) :
3249  colindsOrig (colindsOrig_),
3250  colindsConverted (colindsConverted_),
3251  colmap (colmap_)
3252  {}
3253  KOKKOS_INLINE_FUNCTION void
3254  operator() (const GO i) const
3255  {
3256  colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3257  }
3258  LocalIndicesType colindsOrig;
3259  GlobalIndicesType colindsConverted;
3260  ColMapType colmap;
3261 };
3262 
3263 template<typename SC, typename LO, typename GO, typename NO>
3264 void MMdetails::AddKernels<SC, LO, GO, NO>::
3265 convertToGlobalAndAdd(
3266  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3267  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3268  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3269  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3270  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3271  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3272  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3273  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3274  typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3275 {
3276  using Teuchos::TimeMonitor;
3277 
3278  const values_array Avals = A.values;
3279  const values_array Bvals = B.values;
3280  const col_inds_array Acolinds = A.graph.entries;
3281  const col_inds_array Bcolinds = B.graph.entries;
3282  auto Arowptrs = A.graph.row_map;
3283  auto Browptrs = B.graph.row_map;
3284  global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
3285  global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
3286 #ifdef HAVE_TPETRA_MMM_TIMINGS
3287  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
3288 #endif
3289  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3290  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3291  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3292  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3293  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, GO, impl_scalar_type,
3294  execution_space, memory_space, memory_space> KKH;
3295  KKH handle;
3296  handle.create_spadd_handle(false);
3297  auto addHandle = handle.get_spadd_handle();
3298 #ifdef HAVE_TPETRA_MMM_TIMINGS
3299  MM = Teuchos::null;
3300  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3301 #endif
3302  auto nrows = Arowptrs.extent(0) - 1;
3303  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3304  KokkosSparse::Experimental::spadd_symbolic
3305  <KKH, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
3306  (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3307  Cvals = values_array("C values", addHandle->get_max_result_nnz());
3308  Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_max_result_nnz());
3309 #ifdef HAVE_TPETRA_MMM_TIMINGS
3310  MM = Teuchos::null;
3311  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3312 #endif
3313  KokkosSparse::Experimental::spadd_numeric(&handle,
3314  Arowptrs, AcolindsConverted, Avals, scalarA,
3315  Browptrs, BcolindsConverted, Bvals, scalarB,
3316  Crowptrs, Ccolinds, Cvals);
3317 }
3318 
3319 
3320 } //End namepsace MMdetails
3321 
3322 } //End namespace Tpetra
3323 
3324 /*********************************************************************************************************/
3325 //
3326 // Explicit instantiation macro
3327 //
3328 // Must be expanded from within the Tpetra namespace!
3329 //
3330 namespace Tpetra {
3331 
3332 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3333  template \
3334  void MatrixMatrix::Multiply( \
3335  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3336  bool transposeA, \
3337  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3338  bool transposeB, \
3339  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3340  bool call_FillComplete_on_result, \
3341  const std::string & label, \
3342  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3343 \
3344 template \
3345  void MatrixMatrix::Jacobi( \
3346  SCALAR omega, \
3347  const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3348  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3349  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3350  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3351  bool call_FillComplete_on_result, \
3352  const std::string & label, \
3353  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3354 \
3355  template \
3356  void MatrixMatrix::Add( \
3357  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3358  bool transposeA, \
3359  SCALAR scalarA, \
3360  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3361  bool transposeB, \
3362  SCALAR scalarB, \
3363  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
3364 \
3365  template \
3366  void MatrixMatrix::Add( \
3367  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3368  bool transposeA, \
3369  SCALAR scalarA, \
3370  CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3371  SCALAR scalarB ); \
3372 \
3373  template \
3374  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3375  MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3376  (const SCALAR & alpha, \
3377  const bool transposeA, \
3378  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3379  const SCALAR & beta, \
3380  const bool transposeB, \
3381  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3382  const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3383  const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3384  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3385 \
3386  template \
3387  void \
3388  MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3389  (const SCALAR & alpha, \
3390  const bool transposeA, \
3391  const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3392  const SCALAR& beta, \
3393  const bool transposeB, \
3394  const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3395  CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3396  const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3397  const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3398  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3399 \
3400  template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>;
3401 
3402 } //End namespace Tpetra
3403 
3404 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix&#39;s column Map with the given Map.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
bool isFillActive() const
Whether the matrix is not fill complete.
void scale(const Scalar &alpha)
Scale the matrix&#39;s values: this := alpha*this.
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
size_t global_size_t
Global size_t object.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void setAllValues(const typename local_matrix_type::row_map_type &ptr, const typename local_graph_type::entries_type::non_const_type &ind, const typename local_matrix_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
Matrix Market file readers and writers for Tpetra objects.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.