41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP 42 #define TPETRA_MATRIXMATRIX_DEF_HPP 43 #include "Tpetra_ConfigDefs.hpp" 45 #include "Teuchos_VerboseObject.hpp" 46 #include "Teuchos_Array.hpp" 48 #include "Tpetra_CrsMatrix.hpp" 50 #include "Tpetra_RowMatrixTransposer.hpp" 53 #include "Tpetra_Details_makeColMap.hpp" 54 #include "Tpetra_ConfigDefs.hpp" 55 #include "Tpetra_Map.hpp" 56 #include "Tpetra_Export.hpp" 60 #include <type_traits> 61 #include "Teuchos_FancyOStream.hpp" 63 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp" 66 #include "KokkosSparse_spgemm.hpp" 67 #include "KokkosSparse_spadd.hpp" 68 #include "Kokkos_Bitset.hpp" 82 #include "TpetraExt_MatrixMatrix_OpenMP.hpp" 83 #include "TpetraExt_MatrixMatrix_Cuda.hpp" 88 namespace MatrixMatrix{
96 template <
class Scalar,
106 bool call_FillComplete_on_result,
107 const std::string& label,
108 const Teuchos::RCP<Teuchos::ParameterList>& params)
114 typedef LocalOrdinal LO;
115 typedef GlobalOrdinal GO;
123 #ifdef HAVE_TPETRA_MMM_TIMINGS 124 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
125 using Teuchos::TimeMonitor;
128 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Setup"))));
131 const std::string prefix =
"TpetraExt::MatrixMatrix::Multiply(): ";
136 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
137 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
142 RCP<const crs_matrix_type> Aprime = null;
146 RCP<const crs_matrix_type> Bprime = null;
156 const bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
158 bool use_optimized_ATB =
false;
159 if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
160 use_optimized_ATB =
true;
162 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later. 163 use_optimized_ATB =
false;
166 using Teuchos::ParameterList;
167 RCP<ParameterList> transposeParams (
new ParameterList);
168 transposeParams->set (
"sort",
false);
170 if (!use_optimized_ATB && transposeA) {
171 transposer_type transposer (rcpFromRef (A));
172 Aprime = transposer.createTranspose (transposeParams);
175 Aprime = rcpFromRef(A);
179 transposer_type transposer (rcpFromRef (B));
180 Bprime = transposer.createTranspose (transposeParams);
183 Bprime = rcpFromRef(B);
193 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
194 prefix <<
"ERROR, inner dimensions of op(A) and op(B) " 195 "must match for matrix-matrix product. op(A) is " 196 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
202 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
203 prefix <<
"ERROR, dimensions of result C must " 205 <<
" rows, should have at least " << Aouter << std::endl);
217 int numProcs = A.
getComm()->getSize();
221 crs_matrix_struct_type Aview;
222 crs_matrix_struct_type Bview;
224 RCP<const map_type> targetMap_A = Aprime->getRowMap();
225 RCP<const map_type> targetMap_B = Bprime->getRowMap();
227 #ifdef HAVE_TPETRA_MMM_TIMINGS 229 TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All I&X")));
235 if (!use_optimized_ATB) {
236 RCP<const import_type> dummyImporter;
237 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label, params);
243 targetMap_B = Aprime->getColMap();
246 if (!use_optimized_ATB)
247 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(),
false, label, params);
249 #ifdef HAVE_TPETRA_MMM_TIMINGS 252 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Multiply"))));
256 if (use_optimized_ATB) {
257 MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
259 }
else if (call_FillComplete_on_result && newFlag) {
260 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
262 }
else if (call_FillComplete_on_result) {
263 MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
269 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
271 MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
274 #ifdef HAVE_TPETRA_MMM_TIMINGS 275 TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All FillComplete")));
284 C.
fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
289 template <
class Scalar,
298 bool call_FillComplete_on_result,
299 const std::string& label,
300 const Teuchos::RCP<Teuchos::ParameterList>& params)
304 typedef LocalOrdinal LO;
305 typedef GlobalOrdinal GO;
312 #ifdef HAVE_TPETRA_MMM_TIMINGS 313 std::string prefix_mmm = std::string(
"TpetraExt ")+ label + std::string(
": ");
314 using Teuchos::TimeMonitor;
315 TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string(
"Jacobi All Setup")));
318 const std::string prefix =
"TpetraExt::MatrixMatrix::Jacobi(): ";
323 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
324 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
326 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
327 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
336 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
337 prefix <<
"ERROR, inner dimensions of op(A) and op(B) " 338 "must match for matrix-matrix product. op(A) is " 339 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
345 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
346 prefix <<
"ERROR, dimensions of result C must " 348 <<
" rows, should have at least "<< Aouter << std::endl);
360 int numProcs = A.
getComm()->getSize();
364 crs_matrix_struct_type Aview;
365 crs_matrix_struct_type Bview;
367 RCP<const map_type> targetMap_A = Aprime->getRowMap();
368 RCP<const map_type> targetMap_B = Bprime->getRowMap();
370 #ifdef HAVE_TPETRA_MMM_TIMINGS 371 TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All I&X")));
377 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(
new Teuchos::ParameterList);
378 if(!params.is_null()) {
379 importParams1->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
380 int mm_optimization_core_count=0;
381 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
383 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
384 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
385 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
386 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
387 auto & ip1slist = importParams1->sublist(
"matrixmatrix: kernel params",
false);
388 ip1slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
389 ip1slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
390 ip1slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
394 RCP<const import_type> dummyImporter;
395 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label,importParams1);
400 targetMap_B = Aprime->getColMap();
405 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(
new Teuchos::ParameterList);
406 if(!params.is_null()) {
407 importParams2->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
409 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
411 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
412 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
413 auto & ip2slist = importParams2->sublist(
"matrixmatrix: kernel params",
false);
414 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
415 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
416 ip2slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
417 ip2slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
418 ip2slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
421 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(),
false, label,importParams2);
423 #ifdef HAVE_TPETRA_MMM_TIMINGS 425 TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All Multiply")));
429 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
432 bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
434 if (call_FillComplete_on_result && newFlag) {
435 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
437 }
else if (call_FillComplete_on_result) {
438 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
441 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::runtime_error,
"jacobi_A_B_general not implemented");
464 template <
class Scalar,
475 using Teuchos::Array;
479 typedef LocalOrdinal LO;
480 typedef GlobalOrdinal GO;
485 const std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
487 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error,
488 prefix <<
"ERROR, input matrix A.isFillComplete() is false; it is required to be true. " 489 "(Result matrix B is not required to be isFillComplete()).");
490 TEUCHOS_TEST_FOR_EXCEPTION(B.
isFillComplete() , std::runtime_error,
491 prefix <<
"ERROR, input matrix B must not be fill complete!");
492 TEUCHOS_TEST_FOR_EXCEPTION(B.
isStaticGraph() , std::runtime_error,
493 prefix <<
"ERROR, input matrix B must not have static graph!");
495 prefix <<
"ERROR, input matrix B must not be locally indexed!");
497 using Teuchos::ParameterList;
498 RCP<ParameterList> transposeParams (
new ParameterList);
499 transposeParams->set (
"sort",
false);
501 RCP<const crs_matrix_type> Aprime = null;
503 transposer_type transposer (rcpFromRef (A));
504 Aprime = transposer.createTranspose (transposeParams);
507 Aprime = rcpFromRef(A);
515 if (scalarB != Teuchos::ScalarTraits<SC>::one())
520 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
521 for (LO i = 0; (size_t)i < numMyRows; ++i) {
522 row = B.
getRowMap()->getGlobalElement(i);
523 Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
525 if (scalarA != Teuchos::ScalarTraits<SC>::one())
526 for (
size_t j = 0; j < a_numEntries; ++j)
527 a_vals[j] *= scalarA;
537 template <
class Scalar,
541 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
543 const bool transposeA,
546 const bool transposeB,
550 const Teuchos::RCP<Teuchos::ParameterList>& params)
553 using Teuchos::rcpFromRef;
555 using Teuchos::ParameterList;
557 if(!params.is_null())
559 TEUCHOS_TEST_FOR_EXCEPTION(
560 params->isParameter(
"Call fillComplete") && !params->get<
bool>(
"Call fillComplete"),
561 std::invalid_argument,
562 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n" 563 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
564 params->set(
"Call fillComplete",
true);
568 RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
575 TEUCHOS_TEST_FOR_EXCEPTION
576 (! A.
isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
577 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
578 RCP<crs_matrix_type> C = rcp(
new crs_matrix_type(Brcp->getRowMap(), 0));
580 add(alpha, transposeA, A, beta,
false, *Brcp, *C, domainMap, rangeMap, params);
588 template<
class LO,
class GO,
class LOView,
class GOView,
class LocalMap>
589 struct ConvertGlobalToLocalFunctor
591 ConvertGlobalToLocalFunctor(LOView& lids_,
const GOView& gids_,
const LocalMap localColMap_)
592 : lids(lids_), gids(gids_), localColMap(localColMap_)
595 KOKKOS_FUNCTION
void operator() (
const GO i)
const 597 lids(i) = localColMap.getLocalElement(gids(i));
602 const LocalMap localColMap;
605 template <
class Scalar,
611 const bool transposeA,
614 const bool transposeB,
619 const Teuchos::RCP<Teuchos::ParameterList>& params)
623 using Teuchos::rcpFromRef;
624 using Teuchos::rcp_implicit_cast;
625 using Teuchos::rcp_dynamic_cast;
626 using Teuchos::TimeMonitor;
628 using LO = LocalOrdinal;
629 using GO = GlobalOrdinal;
637 using exec_space =
typename crs_graph_type::execution_space;
638 using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
639 const char* prefix_mmm =
"TpetraExt::MatrixMatrix::add: ";
640 constexpr
bool debug =
false;
642 #ifdef HAVE_TPETRA_MMM_TIMINGS 643 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"transpose"))));
647 std::ostringstream os;
648 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": " 649 <<
"TpetraExt::MatrixMatrix::add" << std::endl;
650 std::cerr << os.str ();
653 TEUCHOS_TEST_FOR_EXCEPTION
655 prefix_mmm <<
"C must be a 'new' matrix (neither locally nor globally indexed).");
656 TEUCHOS_TEST_FOR_EXCEPTION
658 prefix_mmm <<
"A and B must both be fill complete.");
659 #ifdef HAVE_TPETRA_DEBUG 662 const bool domainMapsSame =
663 (! transposeA && ! transposeB &&
665 (! transposeA && transposeB &&
667 ( transposeA && ! transposeB &&
669 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
670 prefix_mmm <<
"The domain Maps of Op(A) and Op(B) are not the same.");
672 const bool rangeMapsSame =
673 (! transposeA && ! transposeB &&
675 (! transposeA && transposeB &&
677 ( transposeA && ! transposeB &&
679 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
680 prefix_mmm <<
"The range Maps of Op(A) and Op(B) are not the same.");
682 #endif // HAVE_TPETRA_DEBUG 684 using Teuchos::ParameterList;
686 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
688 transposer_type transposer (Aprime);
689 Aprime = transposer.createTranspose ();
692 #ifdef HAVE_TPETRA_DEBUG 693 TEUCHOS_TEST_FOR_EXCEPTION
694 (Aprime.is_null (), std::logic_error,
695 prefix_mmm <<
"Failed to compute Op(A). " 696 "Please report this bug to the Tpetra developers.");
697 #endif // HAVE_TPETRA_DEBUG 700 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
703 std::ostringstream os;
704 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": " 705 <<
"Form explicit xpose of B" << std::endl;
706 std::cerr << os.str ();
708 transposer_type transposer (Bprime);
709 Bprime = transposer.createTranspose ();
711 #ifdef HAVE_TPETRA_DEBUG 712 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
713 prefix_mmm <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
714 TEUCHOS_TEST_FOR_EXCEPTION(
715 !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
716 prefix_mmm <<
"Aprime and Bprime must both be fill complete. " 717 "Please report this bug to the Tpetra developers.");
718 #endif // HAVE_TPETRA_DEBUG 719 RCP<const map_type> CDomainMap = domainMap;
720 RCP<const map_type> CRangeMap = rangeMap;
721 if(CDomainMap.is_null())
723 CDomainMap = Bprime->getDomainMap();
725 if(CRangeMap.is_null())
727 CRangeMap = Bprime->getRangeMap();
729 assert(!(CDomainMap.is_null()));
730 assert(!(CRangeMap.is_null()));
731 typedef typename AddKern::values_array values_array;
732 typedef typename AddKern::row_ptrs_array row_ptrs_array;
733 typedef typename AddKern::col_inds_array col_inds_array;
734 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
735 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
737 row_ptrs_array rowptrs;
738 col_inds_array colinds;
739 #ifdef HAVE_TPETRA_MMM_TIMINGS 741 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"rowmap check/import"))));
743 if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
746 auto import = rcp(
new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
751 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *
import, Bprime->getDomainMap(), Bprime->getRangeMap());
753 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
754 bool sorted = AGraphSorted && BGraphSorted;
755 RCP<const import_type> Cimport = Teuchos::null;
756 RCP<export_type> Cexport = Teuchos::null;
757 bool doFillComplete =
true;
758 if(Teuchos::nonnull(params) && params->isParameter(
"Call fillComplete"))
760 doFillComplete = params->get<
bool>(
"Call fillComplete");
762 auto Alocal = Aprime->getLocalMatrix();
763 auto Blocal = Bprime->getLocalMatrix();
764 LO numLocalRows = Alocal.numRows();
765 if(numLocalRows == 0)
772 rowptrs = row_ptrs_array(
"C rowptrs", 0);
774 auto Acolmap = Aprime->getColMap();
775 auto Bcolmap = Bprime->getColMap();
778 using global_col_inds_array =
typename AddKern::global_col_inds_array;
779 #ifdef HAVE_TPETRA_MMM_TIMINGS 781 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mismatched col map full kernel"))));
784 auto AlocalColmap = Acolmap->getLocalMap();
785 auto BlocalColmap = Bcolmap->getLocalMap();
786 global_col_inds_array globalColinds;
788 std::ostringstream os;
789 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": " 790 <<
"Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
791 std::cerr << os.str ();
793 AddKern::convertToGlobalAndAdd(
794 Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
795 vals, rowptrs, globalColinds);
797 std::ostringstream os;
798 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": " 799 <<
"Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
800 std::cerr << os.str ();
802 #ifdef HAVE_TPETRA_MMM_TIMINGS 804 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Constructing graph"))));
806 RCP<const map_type> CcolMap;
807 Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
808 (CcolMap, CDomainMap, globalColinds);
810 col_inds_array localColinds(
"C colinds", globalColinds.extent(0));
811 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
812 ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
813 col_inds_array, global_col_inds_array,
814 typename map_type::local_map_type>
815 (localColinds, globalColinds, CcolMap->getLocalMap()));
816 KokkosKernels::Impl::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
825 auto Avals = Alocal.values;
826 auto Bvals = Blocal.values;
827 auto Arowptrs = Alocal.graph.row_map;
828 auto Browptrs = Blocal.graph.row_map;
829 auto Acolinds = Alocal.graph.entries;
830 auto Bcolinds = Blocal.graph.entries;
834 #ifdef HAVE_TPETRA_MMM_TIMINGS 836 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"sorted entries full kernel"))));
839 std::ostringstream os;
840 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": " 841 <<
"Call AddKern::addSorted(...)" << std::endl;
842 std::cerr << os.str ();
844 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
849 #ifdef HAVE_TPETRA_MMM_TIMINGS 851 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mm add unsorted entries full kernel"))));
854 std::ostringstream os;
855 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": " 856 <<
"Call AddKern::addUnsorted(...)" << std::endl;
857 std::cerr << os.str ();
859 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
862 RCP<const map_type> Ccolmap = Bcolmap;
867 #ifdef HAVE_TPETRA_MMM_TIMINGS 869 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Tpetra::Crs expertStaticFillComplete"))));
871 if(!CDomainMap->isSameAs(*Ccolmap))
874 std::ostringstream os;
875 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": " 876 <<
"Create Cimport" << std::endl;
877 std::cerr << os.str ();
879 Cimport = rcp(
new import_type(CDomainMap, Ccolmap));
884 std::ostringstream os;
885 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": " 886 <<
"Create Cexport" << std::endl;
887 std::cerr << os.str ();
889 Cexport = rcp(
new export_type(C.
getRowMap(), CRangeMap));
893 std::ostringstream os;
894 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": " 895 <<
"Call C->expertStaticFillComplete(...)" << std::endl;
896 std::cerr << os.str ();
903 template <
class Scalar,
916 using Teuchos::Array;
917 using Teuchos::ArrayRCP;
918 using Teuchos::ArrayView;
921 using Teuchos::rcp_dynamic_cast;
922 using Teuchos::rcpFromRef;
923 using Teuchos::tuple;
926 typedef Teuchos::ScalarTraits<Scalar> STS;
934 std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
936 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
937 prefix <<
"The case C == null does not actually work. Fixing this will require an interface change.");
939 TEUCHOS_TEST_FOR_EXCEPTION(
941 prefix <<
"Both input matrices must be fill complete before calling this function.");
943 #ifdef HAVE_TPETRA_DEBUG 945 const bool domainMapsSame =
949 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
950 prefix <<
"The domain Maps of Op(A) and Op(B) are not the same.");
952 const bool rangeMapsSame =
956 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
957 prefix <<
"The range Maps of Op(A) and Op(B) are not the same.");
959 #endif // HAVE_TPETRA_DEBUG 961 using Teuchos::ParameterList;
962 RCP<ParameterList> transposeParams (
new ParameterList);
963 transposeParams->set (
"sort",
false);
966 RCP<const crs_matrix_type> Aprime;
968 transposer_type theTransposer (rcpFromRef (A));
969 Aprime = theTransposer.createTranspose (transposeParams);
972 Aprime = rcpFromRef (A);
975 #ifdef HAVE_TPETRA_DEBUG 976 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
977 prefix <<
"Failed to compute Op(A). Please report this bug to the Tpetra developers.");
978 #endif // HAVE_TPETRA_DEBUG 981 RCP<const crs_matrix_type> Bprime;
983 transposer_type theTransposer (rcpFromRef (B));
984 Bprime = theTransposer.createTranspose (transposeParams);
987 Bprime = rcpFromRef (B);
990 #ifdef HAVE_TPETRA_DEBUG 991 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
992 prefix <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
993 #endif // HAVE_TPETRA_DEBUG 996 if (! C.is_null ()) {
997 C->setAllToScalar (STS::zero ());
1002 C = rcp (
new crs_matrix_type (Aprime->getRowMap (), 0));
1005 #ifdef HAVE_TPETRA_DEBUG 1006 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1007 prefix <<
"At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1008 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1009 prefix <<
"At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1010 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1011 prefix <<
"At this point, C is null. Please report this bug to the Tpetra developers.");
1012 #endif // HAVE_TPETRA_DEBUG 1014 Array<RCP<const crs_matrix_type> > Mat =
1015 tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1016 Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1019 for (
int k = 0; k < 2; ++k) {
1020 Array<GlobalOrdinal> Indices;
1021 Array<Scalar> Values;
1029 #ifdef HAVE_TPETRA_DEBUG 1030 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1031 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1032 #endif // HAVE_TPETRA_DEBUG 1033 RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1034 #ifdef HAVE_TPETRA_DEBUG 1035 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1036 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1037 #endif // HAVE_TPETRA_DEBUG 1039 const size_t localNumRows = Mat[k]->getNodeNumRows ();
1040 for (
size_t i = 0; i < localNumRows; ++i) {
1041 const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1042 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1043 if (numEntries > 0) {
1044 Indices.resize (numEntries);
1045 Values.resize (numEntries);
1046 Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);
1048 if (scalar[k] != STS::one ()) {
1049 for (
size_t j = 0; j < numEntries; ++j) {
1050 Values[j] *= scalar[k];
1054 if (C->isFillComplete ()) {
1055 C->sumIntoGlobalValues (globalRow, Indices, Values);
1057 C->insertGlobalValues (globalRow, Indices, Values);
1066 namespace MMdetails{
1070 template <
class TransferType>
1071 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer,
const std::string &label) {
1072 if (Transfer.is_null())
1075 const Distributor & Distor = Transfer->getDistributor();
1076 Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1078 size_t rows_send = Transfer->getNumExportIDs();
1079 size_t rows_recv = Transfer->getNumRemoteIDs();
1081 size_t round1_send = Transfer->getNumExportIDs() *
sizeof(size_t);
1082 size_t round1_recv = Transfer->getNumRemoteIDs() *
sizeof(size_t);
1083 size_t num_send_neighbors = Distor.getNumSends();
1084 size_t num_recv_neighbors = Distor.getNumReceives();
1085 size_t round2_send, round2_recv;
1086 Distor.getLastDoStatistics(round2_send,round2_recv);
1088 int myPID = Comm->getRank();
1089 int NumProcs = Comm->getSize();
1096 size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1097 size_t gstats_min[8], gstats_max[8];
1099 double lstats_avg[8], gstats_avg[8];
1100 for(
int i=0; i<8; i++)
1101 lstats_avg[i] = ((
double)lstats[i])/NumProcs;
1103 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1104 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1105 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1108 printf(
"%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1109 (int)gstats_min[0],gstats_avg[0],(
int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(
int)gstats_max[2],
1110 (int)gstats_min[4],gstats_avg[4],(
int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(
int)gstats_max[6]);
1111 printf(
"%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1112 (int)gstats_min[1],gstats_avg[1],(
int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(
int)gstats_max[3],
1113 (int)gstats_min[5],gstats_avg[5],(
int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(
int)gstats_max[7]);
1118 template<
class Scalar,
1120 class GlobalOrdinal,
1122 void mult_AT_B_newmatrix(
1123 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1124 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1125 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1126 const std::string & label,
1127 const Teuchos::RCP<Teuchos::ParameterList>& params)
1132 typedef LocalOrdinal LO;
1133 typedef GlobalOrdinal GO;
1135 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1136 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1138 #ifdef HAVE_TPETRA_MMM_TIMINGS 1139 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1140 using Teuchos::TimeMonitor;
1141 RCP<TimeMonitor> MM = rcp (
new TimeMonitor
1142 (*TimeMonitor::getNewTimer (prefix_mmm +
"MMM-T Transpose")));
1148 transposer_type transposer (rcpFromRef (A), label + std::string(
"XP: "));
1150 using Teuchos::ParameterList;
1151 RCP<ParameterList> transposeParams (
new ParameterList);
1152 transposeParams->set (
"sort",
false);
1153 if(! params.is_null ()) {
1154 transposeParams->set (
"compute global constants",
1155 params->get (
"compute global constants: temporaries",
1158 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1159 transposer.createTransposeLocal (transposeParams);
1164 #ifdef HAVE_TPETRA_MMM_TIMINGS 1166 MM = rcp (
new TimeMonitor
1167 (*TimeMonitor::getNewTimer (prefix_mmm + std::string (
"MMM-T I&X"))));
1171 crs_matrix_struct_type Aview;
1172 crs_matrix_struct_type Bview;
1173 RCP<const Import<LO, GO, NO> > dummyImporter;
1176 RCP<Teuchos::ParameterList> importParams1 (
new ParameterList);
1177 if (! params.is_null ()) {
1178 importParams1->set (
"compute global constants",
1179 params->get (
"compute global constants: temporaries",
1181 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1182 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1183 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1184 int mm_optimization_core_count =
1186 mm_optimization_core_count =
1187 slist.get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1188 int mm_optimization_core_count2 =
1189 params->get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1190 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1191 mm_optimization_core_count = mm_optimization_core_count2;
1193 auto & sip1 = importParams1->sublist (
"matrixmatrix: kernel params",
false);
1194 sip1.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1195 sip1.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1196 sip1.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1199 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1200 Aview, dummyImporter,
true,
1201 label, importParams1);
1203 RCP<ParameterList> importParams2 (
new ParameterList);
1204 if (! params.is_null ()) {
1205 importParams2->set (
"compute global constants",
1206 params->get (
"compute global constants: temporaries",
1208 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1209 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1210 bool overrideAllreduce = slist.get (
"MM_TAFC_OverrideAllreduceCheck",
false);
1211 int mm_optimization_core_count =
1213 mm_optimization_core_count =
1214 slist.get (
"MM_TAFC_OptimizationCoreCount",
1215 mm_optimization_core_count);
1216 int mm_optimization_core_count2 =
1217 params->get (
"MM_TAFC_OptimizationCoreCount",
1218 mm_optimization_core_count);
1219 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1220 mm_optimization_core_count = mm_optimization_core_count2;
1222 auto & sip2 = importParams2->sublist (
"matrixmatrix: kernel params",
false);
1223 sip2.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1224 sip2.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1225 sip2.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1228 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1229 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,
true, label,importParams2);
1232 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,
false, label,importParams2);
1235 #ifdef HAVE_TPETRA_MMM_TIMINGS 1237 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T AB-core"))));
1240 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1243 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1244 if (needs_final_export) {
1248 Ctemp = rcp (&C,
false);
1251 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1256 #ifdef HAVE_TPETRA_MMM_TIMINGS 1258 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T exportAndFillComplete"))));
1261 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C,
false);
1263 if (needs_final_export) {
1264 ParameterList labelList;
1265 labelList.set(
"Timer Label", label);
1266 if(!params.is_null()) {
1267 ParameterList& params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
1268 ParameterList& labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
1270 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1271 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1272 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1273 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,
"Core Count above which the optimized neighbor discovery is used");
1274 bool isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
1275 bool overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1277 labelList_subList.set (
"isMatrixMatrix_TransferAndFillComplete", isMM,
1278 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1279 labelList.set(
"compute global constants",params->get(
"compute global constants",
true));
1280 labelList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1282 Ctemp->exportAndFillComplete (Crcp,
1283 *Ctemp->getGraph ()->getExporter (),
1286 rcp (&labelList,
false));
1288 #ifdef HAVE_TPETRA_MMM_STATISTICS 1289 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(
" AT_B MMM"));
1295 template<
class Scalar,
1297 class GlobalOrdinal,
1300 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1301 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1302 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1303 const std::string& ,
1304 const Teuchos::RCP<Teuchos::ParameterList>& )
1306 using Teuchos::Array;
1307 using Teuchos::ArrayRCP;
1308 using Teuchos::ArrayView;
1309 using Teuchos::OrdinalTraits;
1310 using Teuchos::null;
1312 typedef Teuchos::ScalarTraits<Scalar> STS;
1314 LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1315 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1317 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1318 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1320 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1321 ArrayView<const GlobalOrdinal> bcols_import = null;
1322 if (Bview.importColMap != null) {
1323 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1324 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1326 bcols_import = Bview.importColMap->getNodeElementList();
1329 size_t C_numCols = C_lastCol - C_firstCol +
1330 OrdinalTraits<LocalOrdinal>::one();
1331 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1332 OrdinalTraits<LocalOrdinal>::one();
1334 if (C_numCols_import > C_numCols)
1335 C_numCols = C_numCols_import;
1337 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1338 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1339 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1341 Array<Scalar> C_row_i = dwork;
1342 Array<GlobalOrdinal> C_cols = iwork;
1343 Array<size_t> c_index = iwork2;
1344 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1345 Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1347 size_t C_row_i_length, j, k, last_index;
1350 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1351 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1352 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1353 if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1355 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1356 Aview.colMap->getMaxLocalIndex(); i++)
1361 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1362 Aview.colMap->getMaxLocalIndex(); i++) {
1363 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1364 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1365 if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1366 else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1376 ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1377 ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1378 ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1379 ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1380 ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1381 ArrayView<const Scalar> Avals, Bvals, Ivals;
1382 Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1383 Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1384 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1385 Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1386 if(!Bview.importMatrix.is_null()) {
1387 Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1388 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1391 bool C_filled = C.isFillComplete();
1393 for (
size_t i = 0; i < C_numCols; i++)
1394 c_index[i] = OrdinalTraits<size_t>::invalid();
1397 size_t Arows = Aview.rowMap->getNodeNumElements();
1398 for(
size_t i=0; i<Arows; ++i) {
1402 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1408 C_row_i_length = OrdinalTraits<size_t>::zero();
1410 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1411 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1412 const Scalar Aval = Avals[k];
1413 if (Aval == STS::zero())
1416 if (Ak == LO_INVALID)
1419 for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1420 LocalOrdinal col = Bcolind[j];
1423 if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1426 C_row_i[C_row_i_length] = Aval*Bvals[j];
1427 C_cols[C_row_i_length] = col;
1428 c_index[col] = C_row_i_length;
1432 C_row_i[c_index[col]] += Aval*Bvals[j];
1437 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1438 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1439 C_cols[ii] = bcols[C_cols[ii]];
1440 combined_index[ii] = C_cols[ii];
1441 combined_values[ii] = C_row_i[ii];
1443 last_index = C_row_i_length;
1449 C_row_i_length = OrdinalTraits<size_t>::zero();
1451 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1452 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1453 const Scalar Aval = Avals[k];
1454 if (Aval == STS::zero())
1457 if (Ak!=LO_INVALID)
continue;
1459 Ak = Acol2Irow[Acolind[k]];
1460 for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1461 LocalOrdinal col = Icolind[j];
1464 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1467 C_row_i[C_row_i_length] = Aval*Ivals[j];
1468 C_cols[C_row_i_length] = col;
1469 c_index[col] = C_row_i_length;
1474 C_row_i[c_index[col]] += Aval*Ivals[j];
1479 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1480 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1481 C_cols[ii] = bcols_import[C_cols[ii]];
1482 combined_index[last_index] = C_cols[ii];
1483 combined_values[last_index] = C_row_i[ii];
1490 C.sumIntoGlobalValues(
1492 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1493 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1495 C.insertGlobalValues(
1497 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1498 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1504 template<
class Scalar,
1506 class GlobalOrdinal,
1508 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1509 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1510 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1512 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1513 Mview.maxNumRowEntries = Mview.indices[0].size();
1515 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1516 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1517 Mview.maxNumRowEntries = Mview.indices[i].size();
1522 template<
class CrsMatrixType>
1523 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1525 size_t Aest = 100, Best=100;
1526 if (A.getNodeNumEntries() >= A.getNodeNumRows())
1527 Aest = (A.getNodeNumRows() > 0) ? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
1528 if (B.getNodeNumEntries() >= B.getNodeNumRows())
1529 Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
1531 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
1532 nnzperrow *= nnzperrow;
1534 return (
size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
1544 template<
class Scalar,
1546 class GlobalOrdinal,
1548 void mult_A_B_newmatrix(
1549 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1550 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1551 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1552 const std::string& label,
1553 const Teuchos::RCP<Teuchos::ParameterList>& params)
1555 using Teuchos::Array;
1556 using Teuchos::ArrayRCP;
1557 using Teuchos::ArrayView;
1562 typedef LocalOrdinal LO;
1563 typedef GlobalOrdinal GO;
1565 typedef Import<LO,GO,NO> import_type;
1566 typedef Map<LO,GO,NO> map_type;
1569 typedef typename map_type::local_map_type local_map_type;
1571 typedef typename KCRS::StaticCrsGraphType graph_t;
1572 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1573 typedef typename NO::execution_space execution_space;
1574 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1575 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1577 #ifdef HAVE_TPETRA_MMM_TIMINGS 1578 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1579 using Teuchos::TimeMonitor;
1580 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM M5 Cmap")))));
1582 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1585 RCP<const import_type> Cimport;
1586 RCP<const map_type> Ccolmap;
1587 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1588 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1589 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1590 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1591 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1592 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1593 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1594 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1601 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1603 if (Bview.importMatrix.is_null()) {
1606 Ccolmap = Bview.colMap;
1607 const LO colMapSize =
static_cast<LO
>(Bview.colMap->getNodeNumElements());
1609 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1610 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1611 KOKKOS_LAMBDA(
const LO i) {
1624 if (!Bimport.is_null() && !Iimport.is_null()) {
1625 Cimport = Bimport->setUnion(*Iimport);
1627 else if (!Bimport.is_null() && Iimport.is_null()) {
1628 Cimport = Bimport->setUnion();
1630 else if (Bimport.is_null() && !Iimport.is_null()) {
1631 Cimport = Iimport->setUnion();
1634 throw std::runtime_error(
"TpetraExt::MMM status of matrix importers is nonsensical");
1636 Ccolmap = Cimport->getTargetMap();
1641 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1642 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1649 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1650 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1651 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1652 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1654 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1655 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1664 C.replaceColMap(Ccolmap);
1682 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
1683 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
1685 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
1686 GO aidx = Acolmap_local.getGlobalElement(i);
1687 LO B_LID = Browmap_local.getLocalElement(aidx);
1688 if (B_LID != LO_INVALID) {
1689 targetMapToOrigRow(i) = B_LID;
1690 targetMapToImportRow(i) = LO_INVALID;
1692 LO I_LID = Irowmap_local.getLocalElement(aidx);
1693 targetMapToOrigRow(i) = LO_INVALID;
1694 targetMapToImportRow(i) = I_LID;
1701 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1708 template<
class Scalar,
1710 class GlobalOrdinal,
1712 class LocalOrdinalViewType>
1713 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1714 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1715 const LocalOrdinalViewType & targetMapToOrigRow,
1716 const LocalOrdinalViewType & targetMapToImportRow,
1717 const LocalOrdinalViewType & Bcol2Ccol,
1718 const LocalOrdinalViewType & Icol2Ccol,
1719 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1720 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
1721 const std::string& label,
1722 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1723 #ifdef HAVE_TPETRA_MMM_TIMINGS 1724 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1725 using Teuchos::TimeMonitor;
1726 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore"))));
1729 using Teuchos::Array;
1730 using Teuchos::ArrayRCP;
1731 using Teuchos::ArrayView;
1737 typedef typename KCRS::StaticCrsGraphType graph_t;
1738 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1739 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1740 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1741 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1744 typedef LocalOrdinal LO;
1745 typedef GlobalOrdinal GO;
1747 typedef Map<LO,GO,NO> map_type;
1748 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1749 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1750 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1753 RCP<const map_type> Ccolmap = C.getColMap();
1754 size_t m = Aview.origMatrix->getNodeNumRows();
1755 size_t n = Ccolmap->getNodeNumElements();
1756 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
1759 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1760 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
1762 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
1763 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
1764 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
1766 c_lno_view_t Irowptr;
1767 lno_nnz_view_t Icolind;
1768 scalar_view_t Ivals;
1769 if(!Bview.importMatrix.is_null()) {
1770 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
1771 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
1772 Ivals = Bview.importMatrix->getLocalMatrix().values;
1773 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
1776 #ifdef HAVE_TPETRA_MMM_TIMINGS 1777 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
1787 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1788 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
1789 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
1790 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
1800 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1801 std::vector<size_t> c_status(n, ST_INVALID);
1811 size_t CSR_ip = 0, OLD_ip = 0;
1812 for (
size_t i = 0; i < m; i++) {
1815 Crowptr[i] = CSR_ip;
1818 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1819 LO Aik = Acolind[k];
1820 const SC Aval = Avals[k];
1821 if (Aval == SC_ZERO)
1824 if (targetMapToOrigRow[Aik] != LO_INVALID) {
1831 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
1834 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
1835 LO Bkj = Bcolind[j];
1836 LO Cij = Bcol2Ccol[Bkj];
1838 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
1840 c_status[Cij] = CSR_ip;
1841 Ccolind[CSR_ip] = Cij;
1842 Cvals[CSR_ip] = Aval*Bvals[j];
1846 Cvals[c_status[Cij]] += Aval*Bvals[j];
1857 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
1858 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
1859 LO Ikj = Icolind[j];
1860 LO Cij = Icol2Ccol[Ikj];
1862 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
1864 c_status[Cij] = CSR_ip;
1865 Ccolind[CSR_ip] = Cij;
1866 Cvals[CSR_ip] = Aval*Ivals[j];
1869 Cvals[c_status[Cij]] += Aval*Ivals[j];
1876 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
1878 Kokkos::resize(Ccolind,CSR_alloc);
1879 Kokkos::resize(Cvals,CSR_alloc);
1884 Crowptr[m] = CSR_ip;
1887 Kokkos::resize(Ccolind,CSR_ip);
1888 Kokkos::resize(Cvals,CSR_ip);
1890 #ifdef HAVE_TPETRA_MMM_TIMINGS 1892 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix Final Sort")));
1896 if (params.is_null() || params->get(
"sort entries",
true))
1897 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
1898 C.setAllValues(Crowptr,Ccolind, Cvals);
1901 #ifdef HAVE_TPETRA_MMM_TIMINGS 1903 auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix ESFC")));
1915 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1916 labelList->set(
"Timer Label",label);
1917 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1918 RCP<const Export<LO,GO,NO> > dummyExport;
1919 C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
1920 #ifdef HAVE_TPETRA_MMM_TIMINGS 1922 MM2 = Teuchos::null;
1932 template<
class Scalar,
1934 class GlobalOrdinal,
1936 void mult_A_B_reuse(
1937 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1938 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1939 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1940 const std::string& label,
1941 const Teuchos::RCP<Teuchos::ParameterList>& params)
1943 using Teuchos::Array;
1944 using Teuchos::ArrayRCP;
1945 using Teuchos::ArrayView;
1950 typedef LocalOrdinal LO;
1951 typedef GlobalOrdinal GO;
1953 typedef Import<LO,GO,NO> import_type;
1954 typedef Map<LO,GO,NO> map_type;
1957 typedef typename map_type::local_map_type local_map_type;
1959 typedef typename KCRS::StaticCrsGraphType graph_t;
1960 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1961 typedef typename NO::execution_space execution_space;
1962 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1963 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1965 #ifdef HAVE_TPETRA_MMM_TIMINGS 1966 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1967 using Teuchos::TimeMonitor;
1968 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse Cmap"))));
1970 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1973 RCP<const import_type> Cimport = C.getGraph()->getImporter();
1974 RCP<const map_type> Ccolmap = C.getColMap();
1975 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1976 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1977 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1978 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1979 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1980 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1983 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1987 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1988 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1991 if (!Bview.importMatrix.is_null()) {
1992 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1993 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1995 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1996 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1997 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2003 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2004 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2005 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2006 GO aidx = Acolmap_local.getGlobalElement(i);
2007 LO B_LID = Browmap_local.getLocalElement(aidx);
2008 if (B_LID != LO_INVALID) {
2009 targetMapToOrigRow(i) = B_LID;
2010 targetMapToImportRow(i) = LO_INVALID;
2012 LO I_LID = Irowmap_local.getLocalElement(aidx);
2013 targetMapToOrigRow(i) = LO_INVALID;
2014 targetMapToImportRow(i) = I_LID;
2021 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2025 template<
class Scalar,
2027 class GlobalOrdinal,
2029 class LocalOrdinalViewType>
2030 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2031 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2032 const LocalOrdinalViewType & targetMapToOrigRow,
2033 const LocalOrdinalViewType & targetMapToImportRow,
2034 const LocalOrdinalViewType & Bcol2Ccol,
2035 const LocalOrdinalViewType & Icol2Ccol,
2036 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2037 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2038 const std::string& label,
2039 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2040 #ifdef HAVE_TPETRA_MMM_TIMINGS 2041 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2042 using Teuchos::TimeMonitor;
2043 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
2044 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2054 typedef typename KCRS::StaticCrsGraphType graph_t;
2055 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2056 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2057 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2060 typedef LocalOrdinal LO;
2061 typedef GlobalOrdinal GO;
2063 typedef Map<LO,GO,NO> map_type;
2064 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2065 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2066 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2069 RCP<const map_type> Ccolmap = C.getColMap();
2070 size_t m = Aview.origMatrix->getNodeNumRows();
2071 size_t n = Ccolmap->getNodeNumElements();
2074 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2075 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2076 const KCRS & Cmat = C.getLocalMatrix();
2078 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2079 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2080 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2081 scalar_view_t Cvals = Cmat.values;
2083 c_lno_view_t Irowptr;
2084 lno_nnz_view_t Icolind;
2085 scalar_view_t Ivals;
2086 if(!Bview.importMatrix.is_null()) {
2087 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2088 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2089 Ivals = Bview.importMatrix->getLocalMatrix().values;
2092 #ifdef HAVE_TPETRA_MMM_TIMINGS 2093 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
2105 std::vector<size_t> c_status(n, ST_INVALID);
2108 size_t CSR_ip = 0, OLD_ip = 0;
2109 for (
size_t i = 0; i < m; i++) {
2112 OLD_ip = Crowptr[i];
2113 CSR_ip = Crowptr[i+1];
2114 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
2115 c_status[Ccolind[k]] = k;
2121 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2122 LO Aik = Acolind[k];
2123 const SC Aval = Avals[k];
2124 if (Aval == SC_ZERO)
2127 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2129 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2131 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2132 LO Bkj = Bcolind[j];
2133 LO Cij = Bcol2Ccol[Bkj];
2135 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2136 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2137 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2139 Cvals[c_status[Cij]] += Aval * Bvals[j];
2144 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2145 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2146 LO Ikj = Icolind[j];
2147 LO Cij = Icol2Ccol[Ikj];
2149 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2150 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2151 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2153 Cvals[c_status[Cij]] += Aval * Ivals[j];
2159 #ifdef HAVE_TPETRA_MMM_TIMINGS 2160 auto MM3 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse ESFC"))));
2163 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2169 template<
class Scalar,
2171 class GlobalOrdinal,
2173 void jacobi_A_B_newmatrix(
2175 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2176 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2177 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2178 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2179 const std::string& label,
2180 const Teuchos::RCP<Teuchos::ParameterList>& params)
2182 using Teuchos::Array;
2183 using Teuchos::ArrayRCP;
2184 using Teuchos::ArrayView;
2188 typedef LocalOrdinal LO;
2189 typedef GlobalOrdinal GO;
2192 typedef Import<LO,GO,NO> import_type;
2193 typedef Map<LO,GO,NO> map_type;
2194 typedef typename map_type::local_map_type local_map_type;
2198 typedef typename KCRS::StaticCrsGraphType graph_t;
2199 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2200 typedef typename NO::execution_space execution_space;
2201 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2202 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2205 #ifdef HAVE_TPETRA_MMM_TIMINGS 2206 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2207 using Teuchos::TimeMonitor;
2208 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi M5 Cmap"))));
2210 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2213 RCP<const import_type> Cimport;
2214 RCP<const map_type> Ccolmap;
2215 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2216 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2217 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2218 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2219 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2220 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2221 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2222 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2229 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2231 if (Bview.importMatrix.is_null()) {
2234 Ccolmap = Bview.colMap;
2238 Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getNodeNumElements ()));
2239 Kokkos::parallel_for (range, KOKKOS_LAMBDA (
const size_t i) {
2240 Bcol2Ccol(i) =
static_cast<LO
> (i);
2251 if (!Bimport.is_null() && !Iimport.is_null()){
2252 Cimport = Bimport->setUnion(*Iimport);
2253 Ccolmap = Cimport->getTargetMap();
2255 }
else if (!Bimport.is_null() && Iimport.is_null()) {
2256 Cimport = Bimport->setUnion();
2258 }
else if(Bimport.is_null() && !Iimport.is_null()) {
2259 Cimport = Iimport->setUnion();
2262 throw std::runtime_error(
"TpetraExt::Jacobi status of matrix importers is nonsensical");
2264 Ccolmap = Cimport->getTargetMap();
2266 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2267 std::runtime_error,
"Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2274 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2275 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2276 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2277 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2279 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2280 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2289 C.replaceColMap(Ccolmap);
2307 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2308 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2309 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2310 GO aidx = Acolmap_local.getGlobalElement(i);
2311 LO B_LID = Browmap_local.getLocalElement(aidx);
2312 if (B_LID != LO_INVALID) {
2313 targetMapToOrigRow(i) = B_LID;
2314 targetMapToImportRow(i) = LO_INVALID;
2316 LO I_LID = Irowmap_local.getLocalElement(aidx);
2317 targetMapToOrigRow(i) = LO_INVALID;
2318 targetMapToImportRow(i) = I_LID;
2325 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2334 template<
class Scalar,
2336 class GlobalOrdinal,
2338 class LocalOrdinalViewType>
2339 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2340 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2341 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2342 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2343 const LocalOrdinalViewType & targetMapToOrigRow,
2344 const LocalOrdinalViewType & targetMapToImportRow,
2345 const LocalOrdinalViewType & Bcol2Ccol,
2346 const LocalOrdinalViewType & Icol2Ccol,
2347 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2348 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2349 const std::string& label,
2350 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2352 #ifdef HAVE_TPETRA_MMM_TIMINGS 2353 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2354 using Teuchos::TimeMonitor;
2355 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Nemwmatrix SerialCore")));
2358 using Teuchos::Array;
2359 using Teuchos::ArrayRCP;
2360 using Teuchos::ArrayView;
2366 typedef typename KCRS::StaticCrsGraphType graph_t;
2367 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2368 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2369 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2370 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2373 typedef typename scalar_view_t::memory_space scalar_memory_space;
2376 typedef LocalOrdinal LO;
2377 typedef GlobalOrdinal GO;
2380 typedef Map<LO,GO,NO> map_type;
2381 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2382 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2385 RCP<const map_type> Ccolmap = C.getColMap();
2386 size_t m = Aview.origMatrix->getNodeNumRows();
2387 size_t n = Ccolmap->getNodeNumElements();
2388 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2391 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2392 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2394 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2395 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2396 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2398 c_lno_view_t Irowptr;
2399 lno_nnz_view_t Icolind;
2400 scalar_view_t Ivals;
2401 if(!Bview.importMatrix.is_null()) {
2402 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2403 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2404 Ivals = Bview.importMatrix->getLocalMatrix().values;
2405 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2409 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
2417 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2418 Array<size_t> c_status(n, ST_INVALID);
2427 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2428 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
2429 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
2430 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
2431 size_t CSR_ip = 0, OLD_ip = 0;
2433 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2447 for (
size_t i = 0; i < m; i++) {
2450 Crowptr[i] = CSR_ip;
2451 SC minusOmegaDval = -omega*Dvals(i,0);
2454 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2455 Scalar Bval = Bvals[j];
2456 if (Bval == SC_ZERO)
2458 LO Bij = Bcolind[j];
2459 LO Cij = Bcol2Ccol[Bij];
2462 c_status[Cij] = CSR_ip;
2463 Ccolind[CSR_ip] = Cij;
2464 Cvals[CSR_ip] = Bvals[j];
2469 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2470 LO Aik = Acolind[k];
2471 const SC Aval = Avals[k];
2472 if (Aval == SC_ZERO)
2475 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2477 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2479 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2480 LO Bkj = Bcolind[j];
2481 LO Cij = Bcol2Ccol[Bkj];
2483 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2485 c_status[Cij] = CSR_ip;
2486 Ccolind[CSR_ip] = Cij;
2487 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2491 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2497 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2498 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2499 LO Ikj = Icolind[j];
2500 LO Cij = Icol2Ccol[Ikj];
2502 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2504 c_status[Cij] = CSR_ip;
2505 Ccolind[CSR_ip] = Cij;
2506 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2509 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2516 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2518 Kokkos::resize(Ccolind,CSR_alloc);
2519 Kokkos::resize(Cvals,CSR_alloc);
2523 Crowptr[m] = CSR_ip;
2526 Kokkos::resize(Ccolind,CSR_ip);
2527 Kokkos::resize(Cvals,CSR_ip);
2530 #ifdef HAVE_TPETRA_MMM_TIMINGS 2531 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix Final Sort")));
2538 C.replaceColMap(Ccolmap);
2545 if (params.is_null() || params->get(
"sort entries",
true))
2546 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2547 C.setAllValues(Crowptr,Ccolind, Cvals);
2550 #ifdef HAVE_TPETRA_MMM_TIMINGS 2551 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix ESFC")));
2562 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2563 labelList->set(
"Timer Label",label);
2564 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2565 RCP<const Export<LO,GO,NO> > dummyExport;
2566 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2574 template<
class Scalar,
2576 class GlobalOrdinal,
2578 void jacobi_A_B_reuse(
2580 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2581 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2582 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2583 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2584 const std::string& label,
2585 const Teuchos::RCP<Teuchos::ParameterList>& params)
2587 using Teuchos::Array;
2588 using Teuchos::ArrayRCP;
2589 using Teuchos::ArrayView;
2593 typedef LocalOrdinal LO;
2594 typedef GlobalOrdinal GO;
2597 typedef Import<LO,GO,NO> import_type;
2598 typedef Map<LO,GO,NO> map_type;
2601 typedef typename map_type::local_map_type local_map_type;
2603 typedef typename KCRS::StaticCrsGraphType graph_t;
2604 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2605 typedef typename NO::execution_space execution_space;
2606 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2607 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2609 #ifdef HAVE_TPETRA_MMM_TIMINGS 2610 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2611 using Teuchos::TimeMonitor;
2612 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse Cmap"))));
2614 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2617 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2618 RCP<const map_type> Ccolmap = C.getColMap();
2619 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2620 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2621 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2622 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2623 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2624 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2627 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2631 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2632 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2635 if (!Bview.importMatrix.is_null()) {
2636 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2637 std::runtime_error,
"Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2639 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2640 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2641 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2647 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2648 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2649 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2650 GO aidx = Acolmap_local.getGlobalElement(i);
2651 LO B_LID = Browmap_local.getLocalElement(aidx);
2652 if (B_LID != LO_INVALID) {
2653 targetMapToOrigRow(i) = B_LID;
2654 targetMapToImportRow(i) = LO_INVALID;
2656 LO I_LID = Irowmap_local.getLocalElement(aidx);
2657 targetMapToOrigRow(i) = LO_INVALID;
2658 targetMapToImportRow(i) = I_LID;
2663 #ifdef HAVE_TPETRA_MMM_TIMINGS 2669 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2675 template<
class Scalar,
2677 class GlobalOrdinal,
2679 class LocalOrdinalViewType>
2680 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2681 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2682 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2683 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2684 const LocalOrdinalViewType & targetMapToOrigRow,
2685 const LocalOrdinalViewType & targetMapToImportRow,
2686 const LocalOrdinalViewType & Bcol2Ccol,
2687 const LocalOrdinalViewType & Icol2Ccol,
2688 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2689 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2690 const std::string& label,
2691 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2692 #ifdef HAVE_TPETRA_MMM_TIMINGS 2693 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2694 using Teuchos::TimeMonitor;
2695 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore"))));
2696 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2705 typedef typename KCRS::StaticCrsGraphType graph_t;
2706 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2707 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2708 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2709 typedef typename scalar_view_t::memory_space scalar_memory_space;
2712 typedef LocalOrdinal LO;
2713 typedef GlobalOrdinal GO;
2715 typedef Map<LO,GO,NO> map_type;
2716 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2717 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2718 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2721 RCP<const map_type> Ccolmap = C.getColMap();
2722 size_t m = Aview.origMatrix->getNodeNumRows();
2723 size_t n = Ccolmap->getNodeNumElements();
2726 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2727 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2728 const KCRS & Cmat = C.getLocalMatrix();
2730 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2731 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2732 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2733 scalar_view_t Cvals = Cmat.values;
2735 c_lno_view_t Irowptr;
2736 lno_nnz_view_t Icolind;
2737 scalar_view_t Ivals;
2738 if(!Bview.importMatrix.is_null()) {
2739 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2740 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2741 Ivals = Bview.importMatrix->getLocalMatrix().values;
2745 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
2747 #ifdef HAVE_TPETRA_MMM_TIMINGS 2748 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore - Compare"))));
2756 std::vector<size_t> c_status(n, ST_INVALID);
2759 size_t CSR_ip = 0, OLD_ip = 0;
2760 for (
size_t i = 0; i < m; i++) {
2764 OLD_ip = Crowptr[i];
2765 CSR_ip = Crowptr[i+1];
2766 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
2767 c_status[Ccolind[k]] = k;
2773 SC minusOmegaDval = -omega*Dvals(i,0);
2776 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2777 Scalar Bval = Bvals[j];
2778 if (Bval == SC_ZERO)
2780 LO Bij = Bcolind[j];
2781 LO Cij = Bcol2Ccol[Bij];
2783 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2784 std::runtime_error,
"Trying to insert a new entry into a static graph");
2786 Cvals[c_status[Cij]] = Bvals[j];
2790 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2791 LO Aik = Acolind[k];
2792 const SC Aval = Avals[k];
2793 if (Aval == SC_ZERO)
2796 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2798 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2800 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2801 LO Bkj = Bcolind[j];
2802 LO Cij = Bcol2Ccol[Bkj];
2804 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2805 std::runtime_error,
"Trying to insert a new entry into a static graph");
2807 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
2812 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2813 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2814 LO Ikj = Icolind[j];
2815 LO Cij = Icol2Ccol[Ikj];
2817 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2818 std::runtime_error,
"Trying to insert a new entry into a static graph");
2820 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
2826 #ifdef HAVE_TPETRA_MMM_TIMINGS 2827 MM2 = Teuchos::null;
2828 MM2 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
2831 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2832 #ifdef HAVE_TPETRA_MMM_TIMINGS 2833 MM2 = Teuchos::null;
2842 template<
class Scalar,
2844 class GlobalOrdinal,
2846 void import_and_extract_views(
2847 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
2848 Teuchos::RCP<
const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
2849 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2850 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
2851 bool userAssertsThereAreNoRemotes,
2852 const std::string& label,
2853 const Teuchos::RCP<Teuchos::ParameterList>& params)
2855 using Teuchos::Array;
2856 using Teuchos::ArrayView;
2859 using Teuchos::null;
2862 typedef LocalOrdinal LO;
2863 typedef GlobalOrdinal GO;
2866 typedef Map<LO,GO,NO> map_type;
2867 typedef Import<LO,GO,NO> import_type;
2868 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
2870 #ifdef HAVE_TPETRA_MMM_TIMINGS 2871 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2872 using Teuchos::TimeMonitor;
2873 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Alloc"))));
2881 Aview.deleteContents();
2883 Aview.origMatrix = rcp(&A,
false);
2884 Aview.origRowMap = A.getRowMap();
2885 Aview.rowMap = targetMap;
2886 Aview.colMap = A.getColMap();
2887 Aview.domainMap = A.getDomainMap();
2888 Aview.importColMap = null;
2889 RCP<const map_type> rowMap = A.getRowMap();
2890 const int numProcs = rowMap->getComm()->getSize();
2893 if (userAssertsThereAreNoRemotes || numProcs < 2)
2896 RCP<const import_type> importer;
2897 if (params != null && params->isParameter(
"importer")) {
2898 importer = params->get<RCP<const import_type> >(
"importer");
2901 #ifdef HAVE_TPETRA_MMM_TIMINGS 2903 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap"))));
2908 RCP<const map_type> remoteRowMap;
2909 size_t numRemote = 0;
2911 if (!prototypeImporter.is_null() &&
2912 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
2913 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
2915 #ifdef HAVE_TPETRA_MMM_TIMINGS 2916 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode1"));
2918 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
2919 numRemote = prototypeImporter->getNumRemoteIDs();
2921 Array<GO> remoteRows(numRemote);
2922 for (
size_t i = 0; i < numRemote; i++)
2923 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
2925 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2926 rowMap->getIndexBase(), rowMap->getComm()));
2929 }
else if (prototypeImporter.is_null()) {
2931 #ifdef HAVE_TPETRA_MMM_TIMINGS 2932 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode2"));
2934 ArrayView<const GO> rows = targetMap->getNodeElementList();
2935 size_t numRows = targetMap->getNodeNumElements();
2937 Array<GO> remoteRows(numRows);
2938 for(
size_t i = 0; i < numRows; ++i) {
2939 const LO mlid = rowMap->getLocalElement(rows[i]);
2941 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
2942 remoteRows[numRemote++] = rows[i];
2944 remoteRows.resize(numRemote);
2945 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2946 rowMap->getIndexBase(), rowMap->getComm()));
2955 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
2956 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
2965 #ifdef HAVE_TPETRA_MMM_TIMINGS 2967 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Collective-0"))));
2971 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (
global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
2973 if (globalMaxNumRemote > 0) {
2974 #ifdef HAVE_TPETRA_MMM_TIMINGS 2976 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-2"))));
2980 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
2982 importer = rcp(
new import_type(rowMap, remoteRowMap));
2984 throw std::runtime_error(
"prototypeImporter->SourceMap() does not match A.getRowMap()!");
2988 params->set(
"importer", importer);
2991 if (importer != null) {
2992 #ifdef HAVE_TPETRA_MMM_TIMINGS 2994 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-3"))));
2998 Teuchos::ParameterList labelList;
2999 labelList.set(
"Timer Label", label);
3000 auto & labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
3003 bool overrideAllreduce =
false;
3006 Teuchos::ParameterList params_sublist;
3007 if(!params.is_null()) {
3008 labelList.set(
"compute global constants", params->get(
"compute global constants",
false));
3009 params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
3010 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3011 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3012 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3013 isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
3014 overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
3016 labelList_subList.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
3017 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3018 labelList_subList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3020 Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3021 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3027 sprintf(str,
"import_matrix.%d.dat",count);
3032 #ifdef HAVE_TPETRA_MMM_STATISTICS 3033 printMultiplicationStatistics(importer, label + std::string(
" I&X MMM"));
3037 #ifdef HAVE_TPETRA_MMM_TIMINGS 3039 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-4"))));
3043 Aview.importColMap = Aview.importMatrix->getColMap();
3044 #ifdef HAVE_TPETRA_MMM_TIMINGS 3056 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
3058 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3059 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3060 const LocalOrdinalViewType & Acol2Brow,
3061 const LocalOrdinalViewType & Acol2Irow,
3062 const LocalOrdinalViewType & Bcol2Ccol,
3063 const LocalOrdinalViewType & Icol2Ccol,
3064 const size_t mergedNodeNumCols) {
3068 typedef typename KCRS::StaticCrsGraphType graph_t;
3069 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3070 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3071 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3073 const KCRS & Ak = Aview.origMatrix->getLocalMatrix();
3074 const KCRS & Bk = Bview.origMatrix->getLocalMatrix();
3077 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3081 RCP<const KCRS> Ik_;
3082 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrix());
3083 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3085 if(Ik!=0) Iks = *Ik;
3086 size_t merge_numrows = Ak.numCols();
3088 lno_view_t Mrowptr(
"Mrowptr", merge_numrows + 1);
3090 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3093 typedef typename Node::execution_space execution_space;
3094 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3095 Kokkos::parallel_scan (
"Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3096 KOKKOS_LAMBDA(
const size_t i,
size_t& update,
const bool final) {
3097 if(
final) Mrowptr(i) = update;
3100 if(Acol2Brow(i)!=LO_INVALID)
3101 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3103 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3106 if(
final && i+1==merge_numrows)
3107 Mrowptr(i+1)=update;
3111 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3112 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing(
"Mcolind"),merge_nnz);
3113 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing(
"Mvals"),merge_nnz);
3116 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3117 Kokkos::parallel_for (
"Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(
const size_t i) {
3118 if(Acol2Brow(i)!=LO_INVALID) {
3119 size_t row = Acol2Brow(i);
3120 size_t start = Bk.graph.row_map(row);
3121 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3122 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3123 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3127 size_t row = Acol2Irow(i);
3128 size_t start = Iks.graph.row_map(row);
3129 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3130 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3131 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3136 KCRS newmat(
"CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3145 template<
typename SC,
typename LO,
typename GO,
typename NO>
3146 void AddKernels<SC, LO, GO, NO>::
3148 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3149 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3150 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3151 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3152 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3153 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3154 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3155 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3156 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3157 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3158 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3160 using Teuchos::TimeMonitor;
3161 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3162 auto nrows = Arowptrs.extent(0) - 1;
3163 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3164 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
typename col_inds_array::size_type, LO, impl_scalar_type,
3165 execution_space, memory_space, memory_space> KKH;
3167 handle.create_spadd_handle(
true);
3168 auto addHandle = handle.get_spadd_handle();
3169 #ifdef HAVE_TPETRA_MMM_TIMINGS 3170 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted symbolic")));
3172 KokkosSparse::Experimental::spadd_symbolic
3174 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3175 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3176 row_ptrs_array, col_inds_array>
3177 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3179 Cvals = values_array(
"C values", addHandle->get_max_result_nnz());
3180 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_max_result_nnz());
3181 #ifdef HAVE_TPETRA_MMM_TIMINGS 3183 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted numeric")));
3185 KokkosSparse::Experimental::spadd_numeric(&handle,
3186 Arowptrs, Acolinds, Avals, scalarA,
3187 Browptrs, Bcolinds, Bvals, scalarB,
3188 Crowptrs, Ccolinds, Cvals);
3191 template<
typename SC,
typename LO,
typename GO,
typename NO>
3192 void AddKernels<SC, LO, GO, NO>::
3194 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3195 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3196 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3197 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3198 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3199 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3200 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3201 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3203 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3204 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3205 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3207 using Teuchos::TimeMonitor;
3208 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3209 auto nrows = Arowptrs.extent(0) - 1;
3210 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3211 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3212 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
typename col_inds_array::size_type, LO, AddKern::impl_scalar_type,
3213 AddKern::execution_space, AddKern::memory_space, AddKern::memory_space> KKH;
3215 handle.create_spadd_handle(
false);
3216 auto addHandle = handle.get_spadd_handle();
3217 #ifdef HAVE_TPETRA_MMM_TIMINGS 3218 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted symbolic")));
3220 KokkosSparse::Experimental::spadd_symbolic
3222 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3223 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3224 row_ptrs_array, col_inds_array>
3225 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3227 Cvals = values_array(
"C values", addHandle->get_max_result_nnz());
3228 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_max_result_nnz());
3229 #ifdef HAVE_TPETRA_MMM_TIMINGS 3231 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted kernel: sorted numeric")));
3233 KokkosSparse::Experimental::spadd_numeric(&handle,
3234 Arowptrs, Acolinds, Avals, scalarA,
3235 Browptrs, Bcolinds, Bvals, scalarB,
3236 Crowptrs, Ccolinds, Cvals);
3239 template<
typename GO,
3240 typename LocalIndicesType,
3241 typename GlobalIndicesType,
3242 typename ColMapType>
3243 struct ConvertLocalToGlobalFunctor
3245 ConvertLocalToGlobalFunctor(
3246 const LocalIndicesType& colindsOrig_,
3247 const GlobalIndicesType& colindsConverted_,
3248 const ColMapType& colmap_) :
3249 colindsOrig (colindsOrig_),
3250 colindsConverted (colindsConverted_),
3253 KOKKOS_INLINE_FUNCTION
void 3254 operator() (
const GO i)
const 3256 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3258 LocalIndicesType colindsOrig;
3259 GlobalIndicesType colindsConverted;
3263 template<
typename SC,
typename LO,
typename GO,
typename NO>
3264 void MMdetails::AddKernels<SC, LO, GO, NO>::
3265 convertToGlobalAndAdd(
3266 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3267 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3268 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3269 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3270 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3271 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3272 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3273 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3274 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3276 using Teuchos::TimeMonitor;
3278 const values_array Avals = A.values;
3279 const values_array Bvals = B.values;
3280 const col_inds_array Acolinds = A.graph.entries;
3281 const col_inds_array Bcolinds = B.graph.entries;
3282 auto Arowptrs = A.graph.row_map;
3283 auto Browptrs = B.graph.row_map;
3284 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"A colinds (converted)"), Acolinds.extent(0));
3285 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"B colinds (converted)"), Bcolinds.extent(0));
3286 #ifdef HAVE_TPETRA_MMM_TIMINGS 3287 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string(
"column map conversion"))));
3289 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3290 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3291 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3292 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3293 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
typename col_inds_array::size_type, GO, impl_scalar_type,
3294 execution_space, memory_space, memory_space> KKH;
3296 handle.create_spadd_handle(
false);
3297 auto addHandle = handle.get_spadd_handle();
3298 #ifdef HAVE_TPETRA_MMM_TIMINGS 3300 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3302 auto nrows = Arowptrs.extent(0) - 1;
3303 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3304 KokkosSparse::Experimental::spadd_symbolic
3305 <KKH,
typename row_ptrs_array::const_type,
typename global_col_inds_array::const_type,
typename row_ptrs_array::const_type,
typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
3306 (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3307 Cvals = values_array(
"C values", addHandle->get_max_result_nnz());
3308 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_max_result_nnz());
3309 #ifdef HAVE_TPETRA_MMM_TIMINGS 3311 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3313 KokkosSparse::Experimental::spadd_numeric(&handle,
3314 Arowptrs, AcolindsConverted, Avals, scalarA,
3315 Browptrs, BcolindsConverted, Bvals, scalarB,
3316 Crowptrs, Ccolinds, Cvals);
3332 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \ 3334 void MatrixMatrix::Multiply( \ 3335 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \ 3337 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \ 3339 CrsMatrix< SCALAR , LO , GO , NODE >& C, \ 3340 bool call_FillComplete_on_result, \ 3341 const std::string & label, \ 3342 const Teuchos::RCP<Teuchos::ParameterList>& params); \ 3345 void MatrixMatrix::Jacobi( \ 3347 const Vector< SCALAR, LO, GO, NODE > & Dinv, \ 3348 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \ 3349 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \ 3350 CrsMatrix< SCALAR , LO , GO , NODE >& C, \ 3351 bool call_FillComplete_on_result, \ 3352 const std::string & label, \ 3353 const Teuchos::RCP<Teuchos::ParameterList>& params); \ 3356 void MatrixMatrix::Add( \ 3357 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \ 3360 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \ 3363 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \ 3366 void MatrixMatrix::Add( \ 3367 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \ 3370 CrsMatrix<SCALAR, LO, GO, NODE>& B, \ 3374 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \ 3375 MatrixMatrix::add<SCALAR, LO, GO, NODE> \ 3376 (const SCALAR & alpha, \ 3377 const bool transposeA, \ 3378 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \ 3379 const SCALAR & beta, \ 3380 const bool transposeB, \ 3381 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \ 3382 const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \ 3383 const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \ 3384 const Teuchos::RCP<Teuchos::ParameterList>& params); \ 3388 MatrixMatrix::add< SCALAR , LO, GO , NODE > \ 3389 (const SCALAR & alpha, \ 3390 const bool transposeA, \ 3391 const CrsMatrix< SCALAR , LO, GO , NODE >& A, \ 3392 const SCALAR& beta, \ 3393 const bool transposeB, \ 3394 const CrsMatrix< SCALAR , LO, GO , NODE >& B, \ 3395 CrsMatrix< SCALAR , LO, GO , NODE >& C, \ 3396 const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \ 3397 const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \ 3398 const Teuchos::RCP<Teuchos::ParameterList>& params); \ 3400 template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>; 3404 #endif // TPETRA_MATRIXMATRIX_DEF_HPP Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
bool isFillActive() const
Whether the matrix is not fill complete.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
size_t global_size_t
Global size_t object.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Sparse matrix-matrix multiply.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void setAllValues(const typename local_matrix_type::row_map_type &ptr, const typename local_graph_type::entries_type::non_const_type &ind, const typename local_matrix_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
Matrix Market file readers and writers for Tpetra objects.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.