46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 49 #include "Kokkos_UnorderedMap.hpp" 50 #include "Xpetra_CrsGraphFactory.hpp" 54 #include "MueLu_Aggregates_kokkos.hpp" 55 #include "MueLu_AmalgamationFactory_kokkos.hpp" 56 #include "MueLu_AmalgamationInfo_kokkos.hpp" 57 #include "MueLu_CoarseMapFactory_kokkos.hpp" 60 #include "MueLu_NullspaceFactory_kokkos.hpp" 61 #include "MueLu_PerfUtils.hpp" 63 #include "MueLu_Utilities_kokkos.hpp" 65 #include "Xpetra_IO.hpp" 71 template<
class LocalOrdinal,
class View>
72 class ReduceMaxFunctor{
74 ReduceMaxFunctor(View view) :
view_(view) { }
76 KOKKOS_INLINE_FUNCTION
82 KOKKOS_INLINE_FUNCTION
89 KOKKOS_INLINE_FUNCTION
98 template<
class LOType,
class GOType,
class SCType,
class DeviceType,
class NspType,
class aggRowsType,
class maxAggDofSizeType,
class agg2RowMapLOType,
class statusType,
class rowsType,
class rowsAuxType,
class colsAuxType,
class valsAuxType>
99 class LocalQRDecompFunctor {
105 typedef typename DeviceType::execution_space execution_space;
106 typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
107 typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
108 typedef typename impl_ATS::magnitudeType Magnitude;
110 typedef Kokkos::View<impl_SC**,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
111 typedef Kokkos::View<impl_SC* ,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
127 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_,
bool doQRStep_) :
141 KOKKOS_INLINE_FUNCTION
142 void operator() (
const typename Kokkos::TeamPolicy<execution_space>::member_type & thread,
size_t& nnz)
const {
143 auto agg = thread.league_rank();
148 const impl_SC one = impl_ATS::one();
149 const impl_SC two = one + one;
150 const impl_SC zero = impl_ATS::zero();
151 const auto zeroM = impl_ATS::magnitude(zero);
157 Xpetra::global_size_t offset = agg * n;
162 shared_matrix r(thread.team_shmem(), m, n);
163 for (
int j = 0; j < n; j++)
164 for (
int k = 0; k < m; k++)
168 for (
int i = 0; i < m; i++) {
169 for (
int j = 0; j < n; j++)
170 printf(
" %5.3lf ", r(i,j));
176 shared_matrix q(thread.team_shmem(), m, m);
178 bool isSingular =
false;
182 for (
int i = 0; i < m; i++) {
183 for (
int j = 0; j < m; j++)
188 for (
int k = 0; k < n; k++) {
190 Magnitude s = zeroM, norm, norm_x;
191 for (
int i = k+1; i < m; i++)
192 s += pow(impl_ATS::magnitude(r(i,k)), 2);
193 norm = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
202 norm_x = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
203 if (norm_x == zeroM) {
211 for (
int i = k; i < m; i++)
215 for (
int j = k+1; j < n; j++) {
218 for (
int i = k; i < m; i++)
219 si += r(i,k) * r(i,j);
220 for (
int i = k; i < m; i++)
221 r(i,j) -= two*si * r(i,k);
225 for (
int j = k; j < m; j++) {
228 for (
int i = k; i < m; i++)
229 si += r(i,k) * qt(i,j);
230 for (
int i = k; i < m; i++)
231 qt(i,j) -= two*si * r(i,k);
236 for (
int i = k+1; i < m; i++)
242 for (
int i = 0; i < m; i++)
243 for (
int j = 0; j < i; j++) {
244 impl_SC tmp = qt(i,j);
251 for (
int j = 0; j < n; j++)
252 for (
int k = 0; k <= j; k++)
291 for (
int j = 0; j < n; j++)
292 for (
int k = 0; k < n; k++)
296 coarseNS(offset+k,j) = (k == j ? one : zero);
299 for (
int i = 0; i < m; i++)
300 for (
int j = 0; j < n; j++)
301 q(i,j) = (j == i ? one : zero);
305 for (
int j = 0; j < m; j++) {
307 size_t rowStart =
rowsAux(localRow);
309 for (
int k = 0; k < n; k++) {
311 if (q(j,k) != zero) {
312 colsAux(rowStart+lnnz) = offset + k;
313 valsAux(rowStart+lnnz) = q(j,k);
317 rows(localRow+1) = lnnz;
323 for (
int i = 0; i < m; i++) {
324 for (
int j = 0; j < n; j++)
330 for (
int i = 0; i < aggSize; i++) {
331 for (
int j = 0; j < aggSize; j++)
332 printf(
" %5.3lf ", q(i,j));
346 for (
int j = 0; j < m; j++) {
348 size_t rowStart =
rowsAux(localRow);
350 for (
int k = 0; k < n; k++) {
351 const impl_SC qr_jk =
fineNS(localRow,k);
354 colsAux(rowStart+lnnz) = offset + k;
355 valsAux(rowStart+lnnz) = qr_jk;
359 rows(localRow+1) = lnnz;
363 for (
int j = 0; j < n; j++)
371 size_t team_shmem_size(
int )
const {
375 return shared_matrix::shmem_size(m, n) +
376 shared_matrix::shmem_size(m, m);
384 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
386 RCP<ParameterList> validParamList = rcp(
new ParameterList());
388 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name)) 391 #undef SET_VALID_ENTRY 393 validParamList->set< RCP<const FactoryBase> >(
"A", Teuchos::null,
"Generating factory of the matrix A");
394 validParamList->set< RCP<const FactoryBase> >(
"Aggregates", Teuchos::null,
"Generating factory of the aggregates");
395 validParamList->set< RCP<const FactoryBase> >(
"Nullspace", Teuchos::null,
"Generating factory of the nullspace");
396 validParamList->set< RCP<const FactoryBase> >(
"Scaled Nullspace", Teuchos::null,
"Generating factory of the scaled nullspace");
397 validParamList->set< RCP<const FactoryBase> >(
"UnAmalgamationInfo", Teuchos::null,
"Generating factory of UnAmalgamationInfo");
398 validParamList->set< RCP<const FactoryBase> >(
"CoarseMap", Teuchos::null,
"Generating factory of the coarse map");
399 validParamList->set< RCP<const FactoryBase> >(
"Coordinates", Teuchos::null,
"Generating factory of the coordinates");
402 ParameterList norecurse;
403 norecurse.disableRecursiveValidation();
404 validParamList->set<ParameterList> (
"matrixmatrix: kernel params", norecurse,
"MatrixMatrix kernel parameters");
406 return validParamList;
409 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
412 const ParameterList& pL = GetParameterList();
414 std::string nspName =
"Nullspace";
415 if(pL.isParameter(
"Nullspace name")) nspName = pL.get<std::string>(
"Nullspace name");
417 Input(fineLevel,
"A");
418 Input(fineLevel,
"Aggregates");
419 Input(fineLevel, nspName);
420 Input(fineLevel,
"UnAmalgamationInfo");
421 Input(fineLevel,
"CoarseMap");
424 pL.get<
bool>(
"tentative: build coarse coordinates") ) {
425 bTransferCoordinates_ =
true;
426 Input(fineLevel,
"Coordinates");
427 }
else if (bTransferCoordinates_) {
428 Input(fineLevel,
"Coordinates");
432 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
434 return BuildP(fineLevel, coarseLevel);
437 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
441 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
442 typedef Xpetra::MultiVectorFactory<coordinate_type,LO,GO,NO> RealValuedMultiVectorFactory;
443 const ParameterList& pL = GetParameterList();
444 std::string nspName =
"Nullspace";
445 if(pL.isParameter(
"Nullspace name")) nspName = pL.get<std::string>(
"Nullspace name");
447 auto A = Get< RCP<Matrix> > (fineLevel,
"A");
448 auto aggregates = Get< RCP<Aggregates_kokkos> > (fineLevel,
"Aggregates");
449 auto amalgInfo = Get< RCP<AmalgamationInfo_kokkos> > (fineLevel,
"UnAmalgamationInfo");
450 auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, nspName);
451 auto coarseMap = Get< RCP<const Map> > (fineLevel,
"CoarseMap");
452 RCP<RealValuedMultiVector> fineCoords;
453 if(bTransferCoordinates_) {
454 fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel,
"Coordinates");
457 RCP<Matrix> Ptentative;
460 if ( aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
461 Ptentative = Teuchos::null;
462 Set(coarseLevel,
"P", Ptentative);
465 RCP<MultiVector> coarseNullspace;
466 RCP<RealValuedMultiVector> coarseCoords;
468 if(bTransferCoordinates_) {
469 ArrayView<const GO> elementAList = coarseMap->getLocalElementList();
470 GO indexBase = coarseMap->getIndexBase();
473 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
474 blkSize = rcp_dynamic_cast<
const StridedMap>(coarseMap)->getFixedBlockSize();
476 Array<GO> elementList;
477 ArrayView<const GO> elementListView;
481 elementListView = elementAList;
484 auto numElements = elementAList.size() / blkSize;
486 elementList.resize(numElements);
489 for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
490 elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
492 elementListView = elementList;
495 auto uniqueMap = fineCoords->getMap();
496 auto coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
497 elementListView, indexBase, coarseMap->getComm());
498 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
501 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
502 if (aggregates->AggregatesCrossProcessors()) {
503 auto nonUniqueMap = aggregates->GetMap();
504 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
506 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
507 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
512 auto aggGraph = aggregates->GetGraph();
513 auto numAggs = aggGraph.numRows();
515 auto fineCoordsView = fineCoords ->getDeviceLocalView(Xpetra::Access::ReadOnly);
516 auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
522 const auto dim = fineCoords->getNumVectors();
525 for (
size_t j = 0; j < dim; j++) {
526 Kokkos::parallel_for(
"MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
527 KOKKOS_LAMBDA(
const LO i) {
531 auto aggregate = aggGraph.rowConst(i);
533 coordinate_type sum = 0.0;
534 for (
size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
535 sum += fineCoordsRandomView(aggregate(colID),j);
537 coarseCoordsView(i,j) = sum / aggregate.length;
543 if (!aggregates->AggregatesCrossProcessors()) {
544 if(Xpetra::Helpers<SC,LO,GO,NO>::isTpetraBlockCrs(A)) {
545 BuildPuncoupledBlockCrs(coarseLevel,A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
549 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.
GetLevelID());
553 BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
563 if (A->IsView(
"stridedMaps") ==
true)
564 Ptentative->CreateView(
"stridedMaps", A->getRowMap(
"stridedMaps"), coarseMap);
566 Ptentative->CreateView(
"stridedMaps", Ptentative->getRangeMap(), coarseMap);
568 if(bTransferCoordinates_) {
569 Set(coarseLevel,
"Coordinates", coarseCoords);
571 Set(coarseLevel,
"Nullspace", coarseNullspace);
572 Set(coarseLevel,
"P", Ptentative);
575 RCP<ParameterList> params = rcp(
new ParameterList());
576 params->set(
"printLoadBalancingInfo",
true);
581 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
583 BuildPuncoupled(
Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
584 RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
585 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
586 RCP<MultiVector>& coarseNullspace,
const int levelID)
const {
587 auto rowMap = A->getRowMap();
588 auto colMap = A->getColMap();
590 const size_t numRows = rowMap->getLocalNumElements();
591 const size_t NSDim = fineNullspace->getNumVectors();
593 typedef Kokkos::ArithTraits<SC> ATS;
594 using impl_SC =
typename ATS::val_type;
595 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
596 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
598 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
600 typename Aggregates_kokkos::local_graph_type aggGraph;
603 aggGraph = aggregates->GetGraph();
605 auto aggRows = aggGraph.row_map;
606 auto aggCols = aggGraph.entries;
614 goodMap = isGoodMap(*rowMap, *colMap);
618 "MueLu: TentativePFactory_kokkos: for now works only with good maps " 619 "(i.e. \"matching\" row and column maps)");
628 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
630 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
631 GO globalOffset = amalgInfo->GlobalOffset();
634 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
635 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
636 const size_t numAggregates = aggregates->GetNumAggregates();
638 int myPID = aggregates->GetMap()->getComm()->getRank();
643 typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
644 AggSizeType aggDofSizes;
646 if (stridedBlockSize == 1) {
650 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates+1);
652 auto sizesConst = aggregates->ComputeAggregateSizes();
653 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), sizesConst);
659 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates + 1);
661 auto nodeMap = aggregates->GetMap()->getLocalMap();
662 auto dofMap = colMap->getLocalMap();
664 Kokkos::parallel_for(
"MueLu:TentativePF:Build:compute_agg_sizes",
range_type(0,numAggregates),
665 KOKKOS_LAMBDA(
const LO agg) {
666 auto aggRowView = aggGraph.rowConst(agg);
669 for (LO colID = 0; colID < aggRowView.length; colID++) {
670 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
672 for (LO k = 0; k < stridedBlockSize; k++) {
673 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
675 if (dofMap.getLocalElement(dofGID) != INVALID)
679 aggDofSizes(agg+1) = size;
686 ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
687 Kokkos::parallel_reduce(
"MueLu:TentativePF:Build:max_agg_size",
range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
691 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:aggregate_sizes:stage1_scan",
range_type(0,numAggregates+1),
692 KOKKOS_LAMBDA(
const LO i, LO& update,
const bool& final_pass) {
693 update += aggDofSizes(i);
695 aggDofSizes(i) = update;
700 Kokkos::View<LO*, DeviceType>
agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing(
"agg2row_map_LO"), numRows);
704 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing(
"aggOffsets"), numAggregates);
705 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
707 Kokkos::parallel_for(
"MueLu:TentativePF:Build:createAgg2RowMap",
range_type(0, vertex2AggId.extent(0)),
708 KOKKOS_LAMBDA(
const LO lnode) {
709 if (procWinner(lnode, 0) == myPID) {
711 auto aggID = vertex2AggId(lnode,0);
713 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
717 for (LO k = 0; k < stridedBlockSize; k++)
725 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
728 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
729 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
733 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
734 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
735 typedef typename local_matrix_type::index_type::non_const_type cols_type;
736 typedef typename local_matrix_type::values_type::non_const_type vals_type;
740 typedef Kokkos::View<int[10], DeviceType> status_type;
741 status_type status(
"status");
746 const ParameterList& pL = GetParameterList();
747 const bool&
doQRStep = pL.get<
bool>(
"tentative: calculate qr");
749 GetOStream(
Runtime1) <<
"TentativePFactory : bypassing local QR phase" << std::endl;
751 GetOStream(
Warnings0) <<
"TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
754 size_t nnzEstimate = numRows * NSDim;
755 rows_type
rowsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_rows"), numRows+1);
756 cols_type
colsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_cols"), nnzEstimate);
757 vals_type
valsAux(
"Ptent_aux_vals", nnzEstimate);
758 rows_type
rows(
"Ptent_rows", numRows+1);
765 Kokkos::parallel_for(
"MueLu:TentativePF:BuildPuncoupled:for1",
range_type(0, numRows+1),
766 KOKKOS_LAMBDA(
const LO row) {
769 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:for2",
range_type(0, nnzEstimate),
770 KOKKOS_LAMBDA(
const LO j) {
787 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
790 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:main_loop", policy,
791 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
792 auto agg = thread.league_rank();
800 auto norm = impl_ATS::magnitude(zero);
805 for (decltype(aggSize) k = 0; k < aggSize; k++) {
821 for (decltype(aggSize) k = 0; k < aggSize; k++) {
825 rows(localRow+1) = 1;
832 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
833 Kokkos::deep_copy(statusHost, status);
834 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
836 std::ostringstream oss;
837 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
839 case 0: oss <<
"!goodMap is not implemented";
break;
840 case 1: oss <<
"fine level NS part has a zero column";
break;
846 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
847 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
848 auto agg = thread.league_rank();
857 for (decltype(aggSize) k = 0; k < aggSize; k++) {
861 rows(localRow+1) = 1;
869 Kokkos::parallel_reduce(
"MueLu:TentativeP:CountNNZ",
range_type(0, numRows+1),
870 KOKKOS_LAMBDA(
const LO i,
size_t &nnz_count) {
871 nnz_count +=
rows(i);
888 const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1);
890 decltype(aggDofSizes ), decltype(maxAggSize), decltype(
agg2RowMapLO),
895 Kokkos::parallel_reduce(
"MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
898 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
899 Kokkos::deep_copy(statusHost, status);
900 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
902 std::ostringstream oss;
903 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
905 case 0: oss <<
"!goodMap is not implemented";
break;
906 case 1: oss <<
"fine level NS part has a zero column";
break;
919 if (nnz != nnzEstimate) {
924 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:compress_rows",
range_type(0,numRows+1),
925 KOKKOS_LAMBDA(
const LO i, LO& upd,
const bool&
final) {
935 cols = cols_type(
"Ptent_cols", nnz);
936 vals = vals_type(
"Ptent_vals", nnz);
941 Kokkos::parallel_for(
"MueLu:TentativePF:Build:compress_cols_vals",
range_type(0,numRows),
942 KOKKOS_LAMBDA(
const LO i) {
943 LO rowStart =
rows(i);
948 cols(rowStart+lnnz) =
colsAux(j);
949 vals(rowStart+lnnz) =
valsAux(j);
961 GetOStream(
Runtime1) <<
"TentativePFactory : aggregates do not cross process boundaries" << std::endl;
967 local_matrix_type lclMatrix = local_matrix_type(
"A", numRows, coarseMap->getLocalNumElements(), nnz, vals,
rows, cols);
970 RCP<ParameterList> FCparams;
971 if (pL.isSublist(
"matrixmatrix: kernel params"))
972 FCparams = rcp(
new ParameterList(pL.sublist(
"matrixmatrix: kernel params")));
974 FCparams = rcp(
new ParameterList);
977 FCparams->set(
"compute global constants", FCparams->get(
"compute global constants",
false));
978 FCparams->set(
"Timer Label", std::string(
"MueLu::TentativeP-") +
toString(levelID));
980 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
981 Ptentative = rcp(
new CrsMatrixWrap(PtentCrs));
986 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
988 BuildPuncoupledBlockCrs(
Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
989 RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
990 RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
991 RCP<MultiVector>& coarseNullspace,
const int levelID)
const {
992 #ifdef HAVE_MUELU_TPETRA 1002 RCP<const Map> rowMap = A->getRowMap();
1003 RCP<const Map> rangeMap = A->getRangeMap();
1004 RCP<const Map> colMap = A->getColMap();
1006 const size_t numFineBlockRows = rowMap->getLocalNumElements();
1010 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1012 typedef Kokkos::ArithTraits<SC> ATS;
1013 using impl_SC =
typename ATS::val_type;
1014 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
1015 const impl_SC one = impl_ATS::one();
1018 const size_t NSDim = fineNullspace->getNumVectors();
1019 auto aggSizes = aggregates->ComputeAggregateSizes();
1022 typename Aggregates_kokkos::local_graph_type aggGraph;
1025 aggGraph = aggregates->GetGraph();
1027 auto aggRows = aggGraph.row_map;
1028 auto aggCols = aggGraph.entries;
1035 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1036 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1037 Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1039 coarsePointMap->getIndexBase(),
1040 coarsePointMap->getComm());
1042 const ParameterList& pL = GetParameterList();
1053 "MueLu: TentativePFactory_kokkos: for now works only with good maps " 1054 "(i.e. \"matching\" row and column maps)");
1063 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1065 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1069 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
1070 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1071 const size_t numAggregates = aggregates->GetNumAggregates();
1073 int myPID = aggregates->GetMap()->getComm()->getRank();
1078 typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
1079 AggSizeType aggDofSizes;
1085 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates+1);
1087 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), aggSizes);
1093 ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
1094 Kokkos::parallel_reduce(
"MueLu:TentativePF:Build:max_agg_size",
range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1098 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:aggregate_sizes:stage1_scan",
range_type(0,numAggregates+1),
1099 KOKKOS_LAMBDA(
const LO i, LO& update,
const bool& final_pass) {
1100 update += aggDofSizes(i);
1102 aggDofSizes(i) = update;
1107 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing(
"aggtorow_map_LO"), numFineBlockRows);
1111 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing(
"aggOffsets"), numAggregates);
1112 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1114 Kokkos::parallel_for(
"MueLu:TentativePF:Build:createAgg2RowMap",
range_type(0, vertex2AggId.extent(0)),
1115 KOKKOS_LAMBDA(
const LO lnode) {
1116 if (procWinner(lnode, 0) == myPID) {
1118 auto aggID = vertex2AggId(lnode,0);
1120 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
1124 for (LO k = 0; k < stridedBlockSize; k++)
1125 aggToRowMapLO(offset + k) = lnode*stridedBlockSize + k;
1132 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1135 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
1136 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1138 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
1139 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1140 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1145 typedef Kokkos::View<int[10], DeviceType> status_type;
1146 status_type status(
"status");
1152 GetOStream(
Runtime1) <<
"TentativePFactory : bypassing local QR phase" << std::endl;
1158 rows_type ia(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_rowptr"), numFineBlockRows+1);
1159 cols_type ja(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_colind"), numFineBlockRows);
1161 Kokkos::parallel_for(
"MueLu:TentativePF:BlockCrs:graph_init",
range_type(0, numFineBlockRows),
1162 KOKKOS_LAMBDA(
const LO j) {
1166 if(j==(LO)numFineBlockRows-1)
1167 ia[numFineBlockRows] = numFineBlockRows;
1171 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1172 Kokkos::parallel_for(
"MueLu:TentativePF:BlockCrs:fillGraph", policy,
1173 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1174 auto agg = thread.league_rank();
1175 Xpetra::global_size_t offset = agg;
1180 for (LO j = 0; j < aggSize; j++) {
1182 const LO localRow = aggToRowMapLO[aggDofSizes[agg]+j];
1183 const size_t rowStart = ia[localRow];
1184 ja[rowStart] = offset;
1194 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_rowptr"), numFineBlockRows+1);
1196 Kokkos::parallel_scan(
"MueLu:TentativePF:BlockCrs:compress_rows",
range_type(0,numFineBlockRows),
1197 KOKKOS_LAMBDA(
const LO i, LO& upd,
const bool&
final) {
1200 for (
auto j = ia[i]; j < ia[i+1]; j++)
1201 if (ja[j] != INVALID)
1203 if(
final && i == (LO) numFineBlockRows-1)
1204 i_temp[numFineBlockRows] = upd;
1207 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_colind"), nnz);
1210 Kokkos::parallel_for(
"MueLu:TentativePF:BlockCrs:compress_cols",
range_type(0,numFineBlockRows),
1211 KOKKOS_LAMBDA(
const LO i) {
1212 size_t rowStart = i_temp[i];
1214 for (
auto j = ia[i]; j < ia[i+1]; j++)
1215 if (ja[j] != INVALID) {
1216 j_temp[rowStart+lnnz] = ja[j];
1225 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap,coarseBlockMap,ia,ja);
1230 RCP<ParameterList> FCparams;
1231 if(pL.isSublist(
"matrixmatrix: kernel params"))
1232 FCparams=rcp(
new ParameterList(pL.sublist(
"matrixmatrix: kernel params")));
1234 FCparams= rcp(
new ParameterList);
1236 FCparams->set(
"compute global constants",FCparams->get(
"compute global constants",
false));
1237 std::string levelIDs =
toString(levelID);
1238 FCparams->set(
"Timer Label",std::string(
"MueLu::TentativeP-")+levelIDs);
1239 RCP<const Export> dummy_e;
1240 RCP<const Import> dummy_i;
1241 BlockGraph->expertStaticFillComplete(coarseBlockMap,rowMap,dummy_i,dummy_e,FCparams);
1252 RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > P_xpetra = Xpetra::CrsMatrixFactory<SC,LO,GO,NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap,NSDim);
1253 RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> >(P_xpetra);
1254 if(P_tpetra.is_null())
throw std::runtime_error(
"BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1255 RCP<CrsMatrixWrap> P_wrap = rcp(
new CrsMatrixWrap(P_xpetra));
1257 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1258 const LO stride = NSDim*NSDim;
1260 Kokkos::parallel_for(
"MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1261 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1262 auto agg = thread.league_rank();
1266 Xpetra::global_size_t offset = agg*NSDim;
1269 for (LO j = 0; j < aggSize; j++) {
1270 LO localBlockRow = aggToRowMapLO(
aggRows(agg)+j);
1271 LO rowStart = localBlockRow * stride;
1272 for (LO r = 0; r < (LO)NSDim; r++) {
1273 LO localPointRow = localBlockRow*NSDim + r;
1274 for (LO c = 0; c < (LO)NSDim; c++) {
1275 values[rowStart + r*NSDim + c] = fineNSRandom(localPointRow,c);
1281 for(LO j=0; j<(LO)NSDim; j++)
1285 Ptentative = P_wrap;
1288 throw std::runtime_error(
"TentativePFactory::BuildPuncoupledBlockCrs: Requires Tpetra");
1292 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
1294 BuildPcoupled(RCP<Matrix> , RCP<Aggregates_kokkos> ,
1295 RCP<AmalgamationInfo_kokkos> , RCP<MultiVector> ,
1296 RCP<const Map> , RCP<Matrix>& ,
1297 RCP<MultiVector>& )
const {
1301 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
1303 isGoodMap(
const Map& rowMap,
const Map& colMap)
const {
1304 auto rowLocalMap = rowMap.getLocalMap();
1305 auto colLocalMap = colMap.getLocalMap();
1307 const size_t numRows = rowLocalMap.getLocalNumElements();
1308 const size_t numCols = colLocalMap.getLocalNumElements();
1310 if (numCols < numRows)
1314 Kokkos::parallel_reduce(
"MueLu:TentativePF:isGoodMap",
range_type(0, numRows),
1315 KOKKOS_LAMBDA(
const LO i,
size_t &diff) {
1316 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1319 return (numDiff == 0);
1324 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT 1325 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP Important warning messages (one line)
MueLu::DefaultLocalOrdinal LocalOrdinal
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
Timer to be used in factories. Similar to Monitor but with additional timers.
Namespace for MueLu classes and methods.
static const NoFactory * get()
Print even more statistics.
int GetLevelID() const
Return level number.
#define SET_VALID_ENTRY(name)
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Class that holds all level-specific information.
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need's value has been saved.
Exception throws to report errors in the internal logical of the program.
Description of what is happening (more verbose)
maxAggDofSizeType maxAggDofSize
agg2RowMapLOType agg2RowMapLO