MueLu  Version of the Day
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
48 
49 #include "Kokkos_UnorderedMap.hpp"
50 #include "Xpetra_CrsGraphFactory.hpp"
51 
53 
54 #include "MueLu_Aggregates_kokkos.hpp"
55 #include "MueLu_AmalgamationFactory_kokkos.hpp"
56 #include "MueLu_AmalgamationInfo_kokkos.hpp"
57 #include "MueLu_CoarseMapFactory_kokkos.hpp"
58 
59 #include "MueLu_MasterList.hpp"
60 #include "MueLu_NullspaceFactory_kokkos.hpp"
61 #include "MueLu_PerfUtils.hpp"
62 #include "MueLu_Monitor.hpp"
63 #include "MueLu_Utilities_kokkos.hpp"
64 
65 #include "Xpetra_IO.hpp"
66 
67 namespace MueLu {
68 
69  namespace { // anonymous
70 
71  template<class LocalOrdinal, class View>
72  class ReduceMaxFunctor{
73  public:
74  ReduceMaxFunctor(View view) : view_(view) { }
75 
76  KOKKOS_INLINE_FUNCTION
77  void operator()(const LocalOrdinal &i, LocalOrdinal& vmax) const {
78  if (vmax < view_(i))
79  vmax = view_(i);
80  }
81 
82  KOKKOS_INLINE_FUNCTION
83  void join (LocalOrdinal& dst, const LocalOrdinal& src) const {
84  if (dst < src) {
85  dst = src;
86  }
87  }
88 
89  KOKKOS_INLINE_FUNCTION
90  void init (LocalOrdinal& dst) const {
91  dst = 0;
92  }
93  private:
95  };
96 
97  // local QR decomposition
98  template<class LOType, class GOType, class SCType,class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
99  class LocalQRDecompFunctor {
100  private:
101  typedef LOType LO;
102  typedef GOType GO;
103  typedef SCType SC;
104 
105  typedef typename DeviceType::execution_space execution_space;
106  typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
107  typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
108  typedef typename impl_ATS::magnitudeType Magnitude;
109 
110  typedef Kokkos::View<impl_SC**,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
111  typedef Kokkos::View<impl_SC* ,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
112 
113  private:
114 
115  NspType fineNS;
116  NspType coarseNS;
117  aggRowsType aggRows;
118  maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
119  agg2RowMapLOType agg2RowMapLO;
120  statusType statusAtomic;
121  rowsType rows;
122  rowsAuxType rowsAux;
123  colsAuxType colsAux;
124  valsAuxType valsAux;
125  bool doQRStep;
126  public:
127  LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_) :
128  fineNS(fineNS_),
129  coarseNS(coarseNS_),
130  aggRows(aggRows_),
131  maxAggDofSize(maxAggDofSize_),
132  agg2RowMapLO(agg2RowMapLO_),
133  statusAtomic(statusAtomic_),
134  rows(rows_),
135  rowsAux(rowsAux_),
136  colsAux(colsAux_),
137  valsAux(valsAux_),
138  doQRStep(doQRStep_)
139  { }
140 
141  KOKKOS_INLINE_FUNCTION
142  void operator() ( const typename Kokkos::TeamPolicy<execution_space>::member_type & thread, size_t& nnz) const {
143  auto agg = thread.league_rank();
144 
145  // size of aggregate: number of DOFs in aggregate
146  auto aggSize = aggRows(agg+1) - aggRows(agg);
147 
148  const impl_SC one = impl_ATS::one();
149  const impl_SC two = one + one;
150  const impl_SC zero = impl_ATS::zero();
151  const auto zeroM = impl_ATS::magnitude(zero);
152 
153  int m = aggSize;
154  int n = fineNS.extent(1);
155 
156  // calculate row offset for coarse nullspace
157  Xpetra::global_size_t offset = agg * n;
158 
159  if (doQRStep) {
160 
161  // Extract the piece of the nullspace corresponding to the aggregate
162  shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
163  for (int j = 0; j < n; j++)
164  for (int k = 0; k < m; k++)
165  r(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
166 #if 0
167  printf("A\n");
168  for (int i = 0; i < m; i++) {
169  for (int j = 0; j < n; j++)
170  printf(" %5.3lf ", r(i,j));
171  printf("\n");
172  }
173 #endif
174 
175  // Calculate QR decomposition (standard)
176  shared_matrix q(thread.team_shmem(), m, m); // Q
177  if (m >= n) {
178  bool isSingular = false;
179 
180  // Initialize Q^T
181  auto qt = q;
182  for (int i = 0; i < m; i++) {
183  for (int j = 0; j < m; j++)
184  qt(i,j) = zero;
185  qt(i,i) = one;
186  }
187 
188  for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
189  // FIXME_KOKKOS: use team
190  Magnitude s = zeroM, norm, norm_x;
191  for (int i = k+1; i < m; i++)
192  s += pow(impl_ATS::magnitude(r(i,k)), 2);
193  norm = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
194 
195  if (norm == zero) {
196  isSingular = true;
197  break;
198  }
199 
200  r(k,k) -= norm*one;
201 
202  norm_x = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
203  if (norm_x == zeroM) {
204  // We have a single diagonal element in the column.
205  // No reflections required. Just need to restor r(k,k).
206  r(k,k) = norm*one;
207  continue;
208  }
209 
210  // FIXME_KOKKOS: use team
211  for (int i = k; i < m; i++)
212  r(i,k) /= norm_x;
213 
214  // Update R(k:m,k+1:n)
215  for (int j = k+1; j < n; j++) {
216  // FIXME_KOKKOS: use team in the loops
217  impl_SC si = zero;
218  for (int i = k; i < m; i++)
219  si += r(i,k) * r(i,j);
220  for (int i = k; i < m; i++)
221  r(i,j) -= two*si * r(i,k);
222  }
223 
224  // Update Q^T (k:m,k:m)
225  for (int j = k; j < m; j++) {
226  // FIXME_KOKKOS: use team in the loops
227  impl_SC si = zero;
228  for (int i = k; i < m; i++)
229  si += r(i,k) * qt(i,j);
230  for (int i = k; i < m; i++)
231  qt(i,j) -= two*si * r(i,k);
232  }
233 
234  // Fix R(k:m,k)
235  r(k,k) = norm*one;
236  for (int i = k+1; i < m; i++)
237  r(i,k) = zero;
238  }
239 
240 #if 0
241  // Q = (Q^T)^T
242  for (int i = 0; i < m; i++)
243  for (int j = 0; j < i; j++) {
244  impl_SC tmp = qt(i,j);
245  qt(i,j) = qt(j,i);
246  qt(j,i) = tmp;
247  }
248 #endif
249 
250  // Build coarse nullspace using the upper triangular part of R
251  for (int j = 0; j < n; j++)
252  for (int k = 0; k <= j; k++)
253  coarseNS(offset+k,j) = r(k,j);
254 
255  if (isSingular) {
256  statusAtomic(1) = true;
257  return;
258  }
259 
260  } else {
261  // Special handling for m < n (i.e. single node aggregates in structural mechanics)
262 
263  // The local QR decomposition is not possible in the "overconstrained"
264  // case (i.e. number of columns in qr > number of rowsAux), which
265  // corresponds to #DOFs in Aggregate < n. For usual problems this
266  // is only possible for single node aggregates in structural mechanics.
267  // (Similar problems may arise in discontinuous Galerkin problems...)
268  // We bypass the QR decomposition and use an identity block in the
269  // tentative prolongator for the single node aggregate and transfer the
270  // corresponding fine level null space information 1-to-1 to the coarse
271  // level null space part.
272 
273  // NOTE: The resulting tentative prolongation operator has
274  // (m*DofsPerNode-n) zero columns leading to a singular
275  // coarse level operator A. To deal with that one has the following
276  // options:
277  // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
278  // false) to add some identity block to the diagonal of the zero rowsAux
279  // in the coarse level operator A, such that standard level smoothers
280  // can be used again.
281  // - Use special (projection-based) level smoothers, which can deal
282  // with singular matrices (very application specific)
283  // - Adapt the code below to avoid zero columns. However, we do not
284  // support a variable number of DOFs per node in MueLu/Xpetra which
285  // makes the implementation really hard.
286  //
287  // FIXME: do we need to check for singularity here somehow? Zero
288  // columns would be easy but linear dependency would require proper QR.
289 
290  // R = extended (by adding identity rowsAux) qr
291  for (int j = 0; j < n; j++)
292  for (int k = 0; k < n; k++)
293  if (k < m)
294  coarseNS(offset+k,j) = r(k,j);
295  else
296  coarseNS(offset+k,j) = (k == j ? one : zero);
297 
298  // Q = I (rectangular)
299  for (int i = 0; i < m; i++)
300  for (int j = 0; j < n; j++)
301  q(i,j) = (j == i ? one : zero);
302  }
303 
304  // Process each row in the local Q factor and fill helper arrays to assemble P
305  for (int j = 0; j < m; j++) {
306  LO localRow = agg2RowMapLO(aggRows(agg)+j);
307  size_t rowStart = rowsAux(localRow);
308  size_t lnnz = 0;
309  for (int k = 0; k < n; k++) {
310  // skip zeros
311  if (q(j,k) != zero) {
312  colsAux(rowStart+lnnz) = offset + k;
313  valsAux(rowStart+lnnz) = q(j,k);
314  lnnz++;
315  }
316  }
317  rows(localRow+1) = lnnz;
318  nnz += lnnz;
319  }
320 
321 #if 0
322  printf("R\n");
323  for (int i = 0; i < m; i++) {
324  for (int j = 0; j < n; j++)
325  printf(" %5.3lf ", coarseNS(i,j));
326  printf("\n");
327  }
328 
329  printf("Q\n");
330  for (int i = 0; i < aggSize; i++) {
331  for (int j = 0; j < aggSize; j++)
332  printf(" %5.3lf ", q(i,j));
333  printf("\n");
334  }
335 #endif
336  } else {
338  // "no-QR" option //
340  // Local Q factor is just the fine nullspace support over the current aggregate.
341  // Local R factor is the identity.
342  // TODO I have not implemented any special handling for aggregates that are too
343  // TODO small to locally support the nullspace, as is done in the standard QR
344  // TODO case above.
345 
346  for (int j = 0; j < m; j++) {
347  LO localRow = agg2RowMapLO(aggRows(agg)+j);
348  size_t rowStart = rowsAux(localRow);
349  size_t lnnz = 0;
350  for (int k = 0; k < n; k++) {
351  const impl_SC qr_jk = fineNS(localRow,k);
352  // skip zeros
353  if (qr_jk != zero) {
354  colsAux(rowStart+lnnz) = offset + k;
355  valsAux(rowStart+lnnz) = qr_jk;
356  lnnz++;
357  }
358  }
359  rows(localRow+1) = lnnz;
360  nnz += lnnz;
361  }
362 
363  for (int j = 0; j < n; j++)
364  coarseNS(offset+j,j) = one;
365 
366  }
367 
368  }
369 
370  // amount of shared memory
371  size_t team_shmem_size( int /* team_size */ ) const {
372  if (doQRStep) {
373  int m = maxAggDofSize;
374  int n = fineNS.extent(1);
375  return shared_matrix::shmem_size(m, n) + // r
376  shared_matrix::shmem_size(m, m); // q
377  } else
378  return 0;
379  }
380  };
381 
382  } // namespace anonymous
383 
384  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
386  RCP<ParameterList> validParamList = rcp(new ParameterList());
387 
388 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
389  SET_VALID_ENTRY("tentative: calculate qr");
390  SET_VALID_ENTRY("tentative: build coarse coordinates");
391 #undef SET_VALID_ENTRY
392 
393  validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
394  validParamList->set< RCP<const FactoryBase> >("Aggregates", Teuchos::null, "Generating factory of the aggregates");
395  validParamList->set< RCP<const FactoryBase> >("Nullspace", Teuchos::null, "Generating factory of the nullspace");
396  validParamList->set< RCP<const FactoryBase> >("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
397  validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
398  validParamList->set< RCP<const FactoryBase> >("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
399  validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory of the coordinates");
400 
401  // Make sure we don't recursively validate options for the matrixmatrix kernels
402  ParameterList norecurse;
403  norecurse.disableRecursiveValidation();
404  validParamList->set<ParameterList> ("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
405 
406  return validParamList;
407  }
408 
409  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
411 
412  const ParameterList& pL = GetParameterList();
413  // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
414  std::string nspName = "Nullspace";
415  if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
416 
417  Input(fineLevel, "A");
418  Input(fineLevel, "Aggregates");
419  Input(fineLevel, nspName);
420  Input(fineLevel, "UnAmalgamationInfo");
421  Input(fineLevel, "CoarseMap");
422  if( fineLevel.GetLevelID() == 0 &&
423  fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
424  pL.get<bool>("tentative: build coarse coordinates") ) { // and we want coordinates on other levels
425  bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
426  Input(fineLevel, "Coordinates");
427  } else if (bTransferCoordinates_) {
428  Input(fineLevel, "Coordinates");
429  }
430  }
431 
432  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
434  return BuildP(fineLevel, coarseLevel);
435  }
436 
437  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
439  FactoryMonitor m(*this, "Build", coarseLevel);
440 
441  typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
442  typedef Xpetra::MultiVectorFactory<coordinate_type,LO,GO,NO> RealValuedMultiVectorFactory;
443  const ParameterList& pL = GetParameterList();
444  std::string nspName = "Nullspace";
445  if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
446 
447  auto A = Get< RCP<Matrix> > (fineLevel, "A");
448  auto aggregates = Get< RCP<Aggregates_kokkos> > (fineLevel, "Aggregates");
449  auto amalgInfo = Get< RCP<AmalgamationInfo_kokkos> > (fineLevel, "UnAmalgamationInfo");
450  auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, nspName);
451  auto coarseMap = Get< RCP<const Map> > (fineLevel, "CoarseMap");
452  RCP<RealValuedMultiVector> fineCoords;
453  if(bTransferCoordinates_) {
454  fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel, "Coordinates");
455  }
456 
457  RCP<Matrix> Ptentative;
458  // No coarse DoFs so we need to bail by setting Ptentattive to null and returning
459  // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
460  if ( aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
461  Ptentative = Teuchos::null;
462  Set(coarseLevel, "P", Ptentative);
463  return;
464  }
465  RCP<MultiVector> coarseNullspace;
466  RCP<RealValuedMultiVector> coarseCoords;
467 
468  if(bTransferCoordinates_) {
469  ArrayView<const GO> elementAList = coarseMap->getLocalElementList();
470  GO indexBase = coarseMap->getIndexBase();
471 
472  LO blkSize = 1;
473  if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
474  blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
475 
476  Array<GO> elementList;
477  ArrayView<const GO> elementListView;
478  if (blkSize == 1) {
479  // Scalar system
480  // No amalgamation required
481  elementListView = elementAList;
482 
483  } else {
484  auto numElements = elementAList.size() / blkSize;
485 
486  elementList.resize(numElements);
487 
488  // Amalgamate the map
489  for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
490  elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
491 
492  elementListView = elementList;
493  }
494 
495  auto uniqueMap = fineCoords->getMap();
496  auto coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
497  elementListView, indexBase, coarseMap->getComm());
498  coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
499 
500  // Create overlapped fine coordinates to reduce global communication
501  RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
502  if (aggregates->AggregatesCrossProcessors()) {
503  auto nonUniqueMap = aggregates->GetMap();
504  auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
505 
506  ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
507  ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
508  }
509 
510  // The good new is that his graph has already been constructed for the
511  // TentativePFactory and was cached in Aggregates. So this is a no-op.
512  auto aggGraph = aggregates->GetGraph();
513  auto numAggs = aggGraph.numRows();
514 
515  auto fineCoordsView = fineCoords ->getDeviceLocalView(Xpetra::Access::ReadOnly);
516  auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
517 
518  // Fill in coarse coordinates
519  {
520  SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
521 
522  const auto dim = fineCoords->getNumVectors();
523 
524  typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
525  for (size_t j = 0; j < dim; j++) {
526  Kokkos::parallel_for("MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
527  KOKKOS_LAMBDA(const LO i) {
528  // A row in this graph represents all node ids in the aggregate
529  // Therefore, averaging is very easy
530 
531  auto aggregate = aggGraph.rowConst(i);
532 
533  coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
534  for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
535  sum += fineCoordsRandomView(aggregate(colID),j);
536 
537  coarseCoordsView(i,j) = sum / aggregate.length;
538  });
539  }
540  }
541  }
542 
543  if (!aggregates->AggregatesCrossProcessors()) {
544  if(Xpetra::Helpers<SC,LO,GO,NO>::isTpetraBlockCrs(A)) {
545  BuildPuncoupledBlockCrs(coarseLevel,A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
546  coarseLevel.GetLevelID());
547  }
548  else {
549  BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
550  }
551  }
552  else
553  BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
554 
555  // If available, use striding information of fine level matrix A for range
556  // map and coarseMap as domain map; otherwise use plain range map of
557  // Ptent = plain range map of A for range map and coarseMap as domain map.
558  // NOTE:
559  // The latter is not really safe, since there is no striding information
560  // for the range map. This is not really a problem, since striding
561  // information is always available on the intermedium levels and the
562  // coarsest levels.
563  if (A->IsView("stridedMaps") == true)
564  Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
565  else
566  Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
567 
568  if(bTransferCoordinates_) {
569  Set(coarseLevel, "Coordinates", coarseCoords);
570  }
571  Set(coarseLevel, "Nullspace", coarseNullspace);
572  Set(coarseLevel, "P", Ptentative);
573 
574  if (IsPrint(Statistics2)) {
575  RCP<ParameterList> params = rcp(new ParameterList());
576  params->set("printLoadBalancingInfo", true);
577  GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
578  }
579  }
580 
581  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
583  BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
584  RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
585  RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
586  RCP<MultiVector>& coarseNullspace, const int levelID) const {
587  auto rowMap = A->getRowMap();
588  auto colMap = A->getColMap();
589 
590  const size_t numRows = rowMap->getLocalNumElements();
591  const size_t NSDim = fineNullspace->getNumVectors();
592 
593  typedef Kokkos::ArithTraits<SC> ATS;
594  using impl_SC = typename ATS::val_type;
595  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
596  const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
597 
598  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
599 
600  typename Aggregates_kokkos::local_graph_type aggGraph;
601  {
602  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
603  aggGraph = aggregates->GetGraph();
604  }
605  auto aggRows = aggGraph.row_map;
606  auto aggCols = aggGraph.entries;
607 
608  // Aggregates map is based on the amalgamated column map
609  // We can skip global-to-local conversion if LIDs in row map are
610  // same as LIDs in column map
611  bool goodMap;
612  {
613  SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
614  goodMap = isGoodMap(*rowMap, *colMap);
615  }
616  // FIXME_KOKKOS: need to proofread later code for bad maps
617  TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
618  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
619  "(i.e. \"matching\" row and column maps)");
620 
621  // STEP 1: do unamalgamation
622  // The non-kokkos version uses member functions from the AmalgamationInfo
623  // container class to unamalgamate the data. In contrast, the kokkos
624  // version of TentativePFactory does the unamalgamation here and only uses
625  // the data of the AmalgamationInfo container class
626 
627  // Extract information for unamalgamation
628  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
629  GO indexBase;
630  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
631  GO globalOffset = amalgInfo->GlobalOffset();
632 
633  // Extract aggregation info (already in Kokkos host views)
634  auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
635  auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
636  const size_t numAggregates = aggregates->GetNumAggregates();
637 
638  int myPID = aggregates->GetMap()->getComm()->getRank();
639 
640  // Create Kokkos::View (on the device) to store the aggreate dof sizes
641  // Later used to get aggregate dof offsets
642  // NOTE: This zeros itself on construction
643  typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
644  AggSizeType aggDofSizes;
645 
646  if (stridedBlockSize == 1) {
647  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
648 
649  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
650  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
651 
652  auto sizesConst = aggregates->ComputeAggregateSizes();
653  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), sizesConst);
654 
655  } else {
656  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
657 
658  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
659  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
660 
661  auto nodeMap = aggregates->GetMap()->getLocalMap();
662  auto dofMap = colMap->getLocalMap();
663 
664  Kokkos::parallel_for("MueLu:TentativePF:Build:compute_agg_sizes", range_type(0,numAggregates),
665  KOKKOS_LAMBDA(const LO agg) {
666  auto aggRowView = aggGraph.rowConst(agg);
667 
668  size_t size = 0;
669  for (LO colID = 0; colID < aggRowView.length; colID++) {
670  GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
671 
672  for (LO k = 0; k < stridedBlockSize; k++) {
673  GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
674 
675  if (dofMap.getLocalElement(dofGID) != INVALID)
676  size++;
677  }
678  }
679  aggDofSizes(agg+1) = size;
680  });
681  }
682 
683  // Find maximum dof size for aggregates
684  // Later used to reserve enough scratch space for local QR decompositions
685  LO maxAggSize = 0;
686  ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
687  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
688 
689  // parallel_scan (exclusive)
690  // The aggDofSizes View then contains the aggregate dof offsets
691  Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
692  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
693  update += aggDofSizes(i);
694  if (final_pass)
695  aggDofSizes(i) = update;
696  });
697 
698  // Create Kokkos::View on the device to store mapping
699  // between (local) aggregate id and row map ids (LIDs)
700  Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
701  {
702  SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
703 
704  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
705  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
706 
707  Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
708  KOKKOS_LAMBDA(const LO lnode) {
709  if (procWinner(lnode, 0) == myPID) {
710  // No need for atomics, it's one-to-one
711  auto aggID = vertex2AggId(lnode,0);
712 
713  auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
714  // FIXME: I think this may be wrong
715  // We unconditionally add the whole block here. When we calculated
716  // aggDofSizes, we did the isLocalElement check. Something's fishy.
717  for (LO k = 0; k < stridedBlockSize; k++)
718  agg2RowMapLO(offset + k) = lnode*stridedBlockSize + k;
719  }
720  });
721  }
722 
723  // STEP 2: prepare local QR decomposition
724  // Reserve memory for tentative prolongation operator
725  coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
726 
727  // Pull out the nullspace vectors so that we can have random access (on the device)
728  auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
729  auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
730 
731  size_t nnz = 0; // actual number of nnz
732 
733  typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
734  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
735  typedef typename local_matrix_type::index_type::non_const_type cols_type;
736  typedef typename local_matrix_type::values_type::non_const_type vals_type;
737 
738 
739  // Device View for status (error messages...)
740  typedef Kokkos::View<int[10], DeviceType> status_type;
741  status_type status("status");
742 
745 
746  const ParameterList& pL = GetParameterList();
747  const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
748  if (!doQRStep) {
749  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
750  if (NSDim>1)
751  GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
752  }
753 
754  size_t nnzEstimate = numRows * NSDim;
755  rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows+1);
756  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
757  vals_type valsAux("Ptent_aux_vals", nnzEstimate);
758  rows_type rows("Ptent_rows", numRows+1);
759  {
760  // Stage 0: fill in views.
761  SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
762 
763  // The main thing to notice is initialization of vals with INVALID. These
764  // values will later be used to compress the arrays
765  Kokkos::parallel_for("MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows+1),
766  KOKKOS_LAMBDA(const LO row) {
767  rowsAux(row) = row*NSDim;
768  });
769  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
770  KOKKOS_LAMBDA(const LO j) {
771  colsAux(j) = INVALID;
772  });
773  }
774 
775  if (NSDim == 1) {
776  // 1D is special, as it is the easiest. We don't even need to the QR,
777  // just normalize an array. Plus, no worries abot small aggregates. In
778  // addition, we do not worry about compression. It is unlikely that
779  // nullspace will have zeros. If it does, a prolongator row would be
780  // zero and we'll get singularity anyway.
781  SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
782 
783  // Set up team policy with numAggregates teams and one thread per team.
784  // Each team handles a slice of the data associated with one aggregate
785  // and performs a local QR decomposition (in this case real QR is
786  // unnecessary).
787  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
788 
789  if (doQRStep) {
790  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop", policy,
791  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
792  auto agg = thread.league_rank();
793 
794  // size of the aggregate (number of DOFs in aggregate)
795  LO aggSize = aggRows(agg+1) - aggRows(agg);
796 
797  // Extract the piece of the nullspace corresponding to the aggregate, and
798  // put it in the flat array, "localQR" (in column major format) for the
799  // QR routine. Trivial in 1D.
800  auto norm = impl_ATS::magnitude(zero);
801 
802  // Calculate QR by hand
803  // FIXME: shouldn't there be stridedblock here?
804  // FIXME_KOKKOS: shouldn't there be stridedblock here?
805  for (decltype(aggSize) k = 0; k < aggSize; k++) {
806  auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
807  norm += dnorm*dnorm;
808  }
809  norm = sqrt(norm);
810 
811  if (norm == zero) {
812  // zero column; terminate the execution
813  statusAtomic(1) = true;
814  return;
815  }
816 
817  // R = norm
818  coarseNS(agg, 0) = norm;
819 
820  // Q = localQR(:,0)/norm
821  for (decltype(aggSize) k = 0; k < aggSize; k++) {
822  LO localRow = agg2RowMapLO(aggRows(agg)+k);
823  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
824 
825  rows(localRow+1) = 1;
826  colsAux(localRow) = agg;
827  valsAux(localRow) = localVal;
828 
829  }
830  });
831 
832  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
833  Kokkos::deep_copy(statusHost, status);
834  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
835  if (statusHost(i)) {
836  std::ostringstream oss;
837  oss << "MueLu::TentativePFactory::MakeTentative: ";
838  switch (i) {
839  case 0: oss << "!goodMap is not implemented"; break;
840  case 1: oss << "fine level NS part has a zero column"; break;
841  }
842  throw Exceptions::RuntimeError(oss.str());
843  }
844 
845  } else {
846  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
847  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
848  auto agg = thread.league_rank();
849 
850  // size of the aggregate (number of DOFs in aggregate)
851  LO aggSize = aggRows(agg+1) - aggRows(agg);
852 
853  // R = norm
854  coarseNS(agg, 0) = one;
855 
856  // Q = localQR(:,0)/norm
857  for (decltype(aggSize) k = 0; k < aggSize; k++) {
858  LO localRow = agg2RowMapLO(aggRows(agg)+k);
859  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0);
860 
861  rows(localRow+1) = 1;
862  colsAux(localRow) = agg;
863  valsAux(localRow) = localVal;
864 
865  }
866  });
867  }
868 
869  Kokkos::parallel_reduce("MueLu:TentativeP:CountNNZ", range_type(0, numRows+1),
870  KOKKOS_LAMBDA(const LO i, size_t &nnz_count) {
871  nnz_count += rows(i);
872  }, nnz);
873 
874  } else { // NSdim > 1
875  // FIXME_KOKKOS: This code branch is completely unoptimized.
876  // Work to do:
877  // - Optimize QR decomposition
878  // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
879  // packing new values in the beginning of each row
880  // We do use auxilary view in this case, so keep a second rows view for
881  // counting nonzeros in rows
882 
883  {
884  SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
885  // Set up team policy with numAggregates teams and one thread per team.
886  // Each team handles a slice of the data associated with one aggregate
887  // and performs a local QR decomposition
888  const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1); // numAggregates teams a 1 thread
889  LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
890  decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
891  decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
892  decltype(valsAux)>
893  localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
895  Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
896  }
897 
898  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
899  Kokkos::deep_copy(statusHost, status);
900  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
901  if (statusHost(i)) {
902  std::ostringstream oss;
903  oss << "MueLu::TentativePFactory::MakeTentative: ";
904  switch(i) {
905  case 0: oss << "!goodMap is not implemented"; break;
906  case 1: oss << "fine level NS part has a zero column"; break;
907  }
908  throw Exceptions::RuntimeError(oss.str());
909  }
910  }
911 
912  // Compress the cols and vals by ignoring INVALID column entries that correspond
913  // to 0 in QR.
914 
915  // The real cols and vals are constructed using calculated (not estimated) nnz
916  cols_type cols;
917  vals_type vals;
918 
919  if (nnz != nnzEstimate) {
920  {
921  // Stage 2: compress the arrays
922  SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
923 
924  Kokkos::parallel_scan("MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1),
925  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
926  upd += rows(i);
927  if (final)
928  rows(i) = upd;
929  });
930  }
931 
932  {
933  SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
934 
935  cols = cols_type("Ptent_cols", nnz);
936  vals = vals_type("Ptent_vals", nnz);
937 
938  // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
939  // to the beginning of rows. See CoalesceDropFactory_kokkos for
940  // example.
941  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols_vals", range_type(0,numRows),
942  KOKKOS_LAMBDA(const LO i) {
943  LO rowStart = rows(i);
944 
945  size_t lnnz = 0;
946  for (auto j = rowsAux(i); j < rowsAux(i+1); j++)
947  if (colsAux(j) != INVALID) {
948  cols(rowStart+lnnz) = colsAux(j);
949  vals(rowStart+lnnz) = valsAux(j);
950  lnnz++;
951  }
952  });
953  }
954 
955  } else {
956  rows = rowsAux;
957  cols = colsAux;
958  vals = valsAux;
959  }
960 
961  GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
962 
963  {
964  // Stage 3: construct Xpetra::Matrix
965  SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
966 
967  local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
968 
969  // Managing labels & constants for ESFC
970  RCP<ParameterList> FCparams;
971  if (pL.isSublist("matrixmatrix: kernel params"))
972  FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
973  else
974  FCparams = rcp(new ParameterList);
975 
976  // By default, we don't need global constants for TentativeP
977  FCparams->set("compute global constants", FCparams->get("compute global constants", false));
978  FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
979 
980  auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
981  Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
982  }
983  }
984 
985 
986  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
988  BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
989  RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
990  RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
991  RCP<MultiVector>& coarseNullspace, const int levelID) const {
992 #ifdef HAVE_MUELU_TPETRA
993  /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
994  be generalized later, if we ever need to do so:
995  1) Null space dimension === block size of matrix: So no elasticity right now
996  2) QR is not supported: Under assumption #1, this shouldn't cause problems.
997  3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
998 
999  These assumptions keep our code way simpler and still support the use cases we actually care about.
1000  */
1001 
1002  RCP<const Map> rowMap = A->getRowMap();
1003  RCP<const Map> rangeMap = A->getRangeMap();
1004  RCP<const Map> colMap = A->getColMap();
1005  // const size_t numFinePointRows = rangeMap->getLocalNumElements();
1006  const size_t numFineBlockRows = rowMap->getLocalNumElements();
1007 
1008  // typedef Teuchos::ScalarTraits<SC> STS;
1009  // typedef typename STS::magnitudeType Magnitude;
1010  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1011 
1012  typedef Kokkos::ArithTraits<SC> ATS;
1013  using impl_SC = typename ATS::val_type;
1014  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
1015  const impl_SC one = impl_ATS::one();
1016 
1017  // const GO numAggs = aggregates->GetNumAggregates();
1018  const size_t NSDim = fineNullspace->getNumVectors();
1019  auto aggSizes = aggregates->ComputeAggregateSizes();
1020 
1021 
1022  typename Aggregates_kokkos::local_graph_type aggGraph;
1023  {
1024  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
1025  aggGraph = aggregates->GetGraph();
1026  }
1027  auto aggRows = aggGraph.row_map;
1028  auto aggCols = aggGraph.entries;
1029 
1030 
1031  // Need to generate the coarse block map
1032  // NOTE: We assume NSDim == block size here
1033  // NOTE: We also assume that coarseMap has contiguous GIDs
1034  //const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
1035  const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1036  RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1037  Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1038  numCoarseBlockRows,
1039  coarsePointMap->getIndexBase(),
1040  coarsePointMap->getComm());
1041  // Sanity checking
1042  const ParameterList& pL = GetParameterList();
1043  // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1044 
1045 
1046  // The aggregates use the amalgamated column map, which in this case is what we want
1047 
1048  // Aggregates map is based on the amalgamated column map
1049  // We can skip global-to-local conversion if LIDs in row map are
1050  // same as LIDs in column map
1051  bool goodMap = MueLu::Utilities<SC,LO,GO,NO>::MapsAreNested(*rowMap, *colMap);
1052  TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
1053  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1054  "(i.e. \"matching\" row and column maps)");
1055 
1056  // STEP 1: do unamalgamation
1057  // The non-kokkos version uses member functions from the AmalgamationInfo
1058  // container class to unamalgamate the data. In contrast, the kokkos
1059  // version of TentativePFactory does the unamalgamation here and only uses
1060  // the data of the AmalgamationInfo container class
1061 
1062  // Extract information for unamalgamation
1063  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1064  GO indexBase;
1065  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1066  //GO globalOffset = amalgInfo->GlobalOffset();
1067 
1068  // Extract aggregation info (already in Kokkos host views)
1069  auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
1070  auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1071  const size_t numAggregates = aggregates->GetNumAggregates();
1072 
1073  int myPID = aggregates->GetMap()->getComm()->getRank();
1074 
1075  // Create Kokkos::View (on the device) to store the aggreate dof sizes
1076  // Later used to get aggregate dof offsets
1077  // NOTE: This zeros itself on construction
1078  typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
1079  AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1080 
1081  {
1082  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1083 
1084  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1085  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
1086 
1087  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), aggSizes);
1088  }
1089 
1090  // Find maximum dof size for aggregates
1091  // Later used to reserve enough scratch space for local QR decompositions
1092  LO maxAggSize = 0;
1093  ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
1094  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1095 
1096  // parallel_scan (exclusive)
1097  // The aggDofSizes View then contains the aggregate dof offsets
1098  Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
1099  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1100  update += aggDofSizes(i);
1101  if (final_pass)
1102  aggDofSizes(i) = update;
1103  });
1104 
1105  // Create Kokkos::View on the device to store mapping
1106  // between (local) aggregate id and row map ids (LIDs)
1107  Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1108  {
1109  SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1110 
1111  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1112  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1113 
1114  Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1115  KOKKOS_LAMBDA(const LO lnode) {
1116  if (procWinner(lnode, 0) == myPID) {
1117  // No need for atomics, it's one-to-one
1118  auto aggID = vertex2AggId(lnode,0);
1119 
1120  auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
1121  // FIXME: I think this may be wrong
1122  // We unconditionally add the whole block here. When we calculated
1123  // aggDofSizes, we did the isLocalElement check. Something's fishy.
1124  for (LO k = 0; k < stridedBlockSize; k++)
1125  aggToRowMapLO(offset + k) = lnode*stridedBlockSize + k;
1126  }
1127  });
1128  }
1129 
1130  // STEP 2: prepare local QR decomposition
1131  // Reserve memory for tentative prolongation operator
1132  coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1133 
1134  // Pull out the nullspace vectors so that we can have random access (on the device)
1135  auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
1136  auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1137 
1138  typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
1139  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1140  typedef typename local_matrix_type::index_type::non_const_type cols_type;
1141  // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1142 
1143 
1144  // Device View for status (error messages...)
1145  typedef Kokkos::View<int[10], DeviceType> status_type;
1146  status_type status("status");
1147 
1149  typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
1150 
1151  // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1152  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1153 
1154  // BlockCrs requires that we build the (block) graph first, so let's do that...
1155 
1156  // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1157  // block non-zero per row in the matrix;
1158  rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1);
1159  cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1160 
1161  Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1162  KOKKOS_LAMBDA(const LO j) {
1163  ia[j] = j;
1164  ja[j] = INVALID;
1165 
1166  if(j==(LO)numFineBlockRows-1)
1167  ia[numFineBlockRows] = numFineBlockRows;
1168  });
1169 
1170  // Fill Graph
1171  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1172  Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:fillGraph", policy,
1173  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1174  auto agg = thread.league_rank();
1175  Xpetra::global_size_t offset = agg;
1176 
1177  // size of the aggregate (number of DOFs in aggregate)
1178  LO aggSize = aggRows(agg+1) - aggRows(agg);
1179 
1180  for (LO j = 0; j < aggSize; j++) {
1181  // FIXME: Allow for bad maps
1182  const LO localRow = aggToRowMapLO[aggDofSizes[agg]+j];
1183  const size_t rowStart = ia[localRow];
1184  ja[rowStart] = offset;
1185  }
1186  });
1187 
1188  // Compress storage (remove all INVALID, which happen when we skip zeros)
1189  // We do that in-place
1190  {
1191  // Stage 2: compress the arrays
1192  SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1193  // Fill i_temp with the correct row starts
1194  rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1);
1195  LO nnz=0;
1196  Kokkos::parallel_scan("MueLu:TentativePF:BlockCrs:compress_rows", range_type(0,numFineBlockRows),
1197  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1198  if(final)
1199  i_temp[i] = upd;
1200  for (auto j = ia[i]; j < ia[i+1]; j++)
1201  if (ja[j] != INVALID)
1202  upd++;
1203  if(final && i == (LO) numFineBlockRows-1)
1204  i_temp[numFineBlockRows] = upd;
1205  },nnz);
1206 
1207  cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1208 
1209 
1210  Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:compress_cols", range_type(0,numFineBlockRows),
1211  KOKKOS_LAMBDA(const LO i) {
1212  size_t rowStart = i_temp[i];
1213  size_t lnnz = 0;
1214  for (auto j = ia[i]; j < ia[i+1]; j++)
1215  if (ja[j] != INVALID) {
1216  j_temp[rowStart+lnnz] = ja[j];
1217  lnnz++;
1218  }
1219  });
1220 
1221  ia = i_temp;
1222  ja = j_temp;
1223  }
1224 
1225  RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap,coarseBlockMap,ia,ja);
1226 
1227 
1228  // Managing labels & constants for ESFC
1229  {
1230  RCP<ParameterList> FCparams;
1231  if(pL.isSublist("matrixmatrix: kernel params"))
1232  FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1233  else
1234  FCparams= rcp(new ParameterList);
1235  // By default, we don't need global constants for TentativeP
1236  FCparams->set("compute global constants",FCparams->get("compute global constants",false));
1237  std::string levelIDs = toString(levelID);
1238  FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs);
1239  RCP<const Export> dummy_e;
1240  RCP<const Import> dummy_i;
1241  BlockGraph->expertStaticFillComplete(coarseBlockMap,rowMap,dummy_i,dummy_e,FCparams);
1242  }
1243 
1244  // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1245  // we clear them here
1246  ia = rows_type();
1247  ja = cols_type();
1248 
1249 
1250  // Now let's make a BlockCrs Matrix
1251  // NOTE: Assumes block size== NSDim
1252  RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > P_xpetra = Xpetra::CrsMatrixFactory<SC,LO,GO,NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap,NSDim);
1253  RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> >(P_xpetra);
1254  if(P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1255  RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1256 
1257  auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1258  const LO stride = NSDim*NSDim;
1259 
1260  Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1261  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1262  auto agg = thread.league_rank();
1263 
1264  // size of the aggregate (number of DOFs in aggregate)
1265  LO aggSize = aggRows(agg+1) - aggRows(agg);
1266  Xpetra::global_size_t offset = agg*NSDim;
1267 
1268  // Q = localQR(:,0)/norm
1269  for (LO j = 0; j < aggSize; j++) {
1270  LO localBlockRow = aggToRowMapLO(aggRows(agg)+j);
1271  LO rowStart = localBlockRow * stride;
1272  for (LO r = 0; r < (LO)NSDim; r++) {
1273  LO localPointRow = localBlockRow*NSDim + r;
1274  for (LO c = 0; c < (LO)NSDim; c++) {
1275  values[rowStart + r*NSDim + c] = fineNSRandom(localPointRow,c);
1276  }
1277  }
1278  }
1279 
1280  // R = norm
1281  for(LO j=0; j<(LO)NSDim; j++)
1282  coarseNS(offset+j,j) = one;
1283  });
1284 
1285  Ptentative = P_wrap;
1286 
1287 #else
1288  throw std::runtime_error("TentativePFactory::BuildPuncoupledBlockCrs: Requires Tpetra");
1289 #endif
1290  }
1291 
1292  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
1294  BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates_kokkos> /* aggregates */,
1295  RCP<AmalgamationInfo_kokkos> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1296  RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1297  RCP<MultiVector>& /* coarseNullspace */) const {
1298  throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1299  }
1300 
1301  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
1303  isGoodMap(const Map& rowMap, const Map& colMap) const {
1304  auto rowLocalMap = rowMap.getLocalMap();
1305  auto colLocalMap = colMap.getLocalMap();
1306 
1307  const size_t numRows = rowLocalMap.getLocalNumElements();
1308  const size_t numCols = colLocalMap.getLocalNumElements();
1309 
1310  if (numCols < numRows)
1311  return false;
1312 
1313  size_t numDiff = 0;
1314  Kokkos::parallel_reduce("MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1315  KOKKOS_LAMBDA(const LO i, size_t &diff) {
1316  diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1317  }, numDiff);
1318 
1319  return (numDiff == 0);
1320  }
1321 
1322 } //namespace MueLu
1323 
1324 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1325 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
View
MueLu::DefaultLocalOrdinal LocalOrdinal
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
Timer to be used in factories. Similar to Monitor but with additional timers.
Namespace for MueLu classes and methods.
static const NoFactory * get()
Print even more statistics.
int GetLevelID() const
Return level number.
Definition: MueLu_Level.cpp:76
#define SET_VALID_ENTRY(name)
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:99
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need&#39;s value has been saved.
Exception throws to report errors in the internal logical of the program.
Description of what is happening (more verbose)
maxAggDofSizeType maxAggDofSize
agg2RowMapLOType agg2RowMapLO