MueLu  Version of the Day
MueLu_SemiCoarsenPFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_SEMICOARSENPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_SEMICOARSENPFACTORY_KOKKOS_DEF_HPP
48 
49 #include <stdlib.h>
50 
51 #include <Kokkos_Core.hpp>
52 
53 #include <KokkosBatched_LU_Decl.hpp>
54 #include <KokkosBatched_LU_Serial_Impl.hpp>
55 #include <KokkosBatched_Trsm_Decl.hpp>
56 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
57 #include <KokkosBatched_Util.hpp>
58 #include <KokkosSparse_CrsMatrix.hpp>
59 
60 #include <Xpetra_CrsMatrixWrap.hpp>
61 #include <Xpetra_ImportFactory.hpp>
62 #include <Xpetra_Matrix.hpp>
63 #include <Xpetra_MultiVectorFactory.hpp>
64 #include <Xpetra_VectorFactory.hpp>
65 
67 #include "MueLu_MasterList.hpp"
68 #include "MueLu_Monitor.hpp"
70 
71 namespace MueLu {
72 
73 template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
74  class DeviceType>
75 RCP<const ParameterList>
76 SemiCoarsenPFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal,
77  Kokkos::Compat::KokkosDeviceWrapperNode<
78  DeviceType>>::GetValidParameterList() const {
79  RCP<ParameterList> validParamList = rcp(new ParameterList());
80 
81  std::string name = "semicoarsen: coarsen rate";
82  validParamList->setEntry(name, MasterList::getEntry(name));
83  validParamList->set<RCP<const FactoryBase>>(
84  "A", Teuchos::null, "Generating factory of the matrix A");
85  validParamList->set<RCP<const FactoryBase>>(
86  "Nullspace", Teuchos::null, "Generating factory of the nullspace");
87  validParamList->set<RCP<const FactoryBase>>(
88  "Coordinates", Teuchos::null, "Generating factory for coordinates");
89 
90  validParamList->set<RCP<const FactoryBase>>(
91  "LineDetection_VertLineIds", Teuchos::null,
92  "Generating factory for LineDetection vertical line ids");
93  validParamList->set<RCP<const FactoryBase>>(
94  "LineDetection_Layers", Teuchos::null,
95  "Generating factory for LineDetection layer ids");
96  validParamList->set<RCP<const FactoryBase>>(
97  "CoarseNumZLayers", Teuchos::null,
98  "Generating factory for number of coarse z-layers");
99 
100  return validParamList;
101 }
102 
103 template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
104  class DeviceType>
107  Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
108  DeclareInput(Level &fineLevel, Level & /* coarseLevel */) const {
109  Input(fineLevel, "A");
110  Input(fineLevel, "Nullspace");
111 
112  Input(fineLevel, "LineDetection_VertLineIds");
113  Input(fineLevel, "LineDetection_Layers");
114  Input(fineLevel, "CoarseNumZLayers");
115 
116  // check whether fine level coordinate information is available.
117  // If yes, request the fine level coordinates and generate coarse coordinates
118  // during the Build call
119  if (fineLevel.GetLevelID() == 0) {
120  if (fineLevel.IsAvailable("Coordinates", NoFactory::get())) {
121  fineLevel.DeclareInput("Coordinates", NoFactory::get(), this);
122  bTransferCoordinates_ = true;
123  }
124  } else if (bTransferCoordinates_ == true) {
125  // on coarser levels we check the default factory providing "Coordinates"
126  // or the factory declared to provide "Coordinates"
127  // first, check which factory is providing coordinate information
128  RCP<const FactoryBase> myCoordsFact = GetFactory("Coordinates");
129  if (myCoordsFact == Teuchos::null) {
130  myCoordsFact = fineLevel.GetFactoryManager()->GetFactory("Coordinates");
131  }
132  if (fineLevel.IsAvailable("Coordinates", myCoordsFact.get())) {
133  fineLevel.DeclareInput("Coordinates", myCoordsFact.get(), this);
134  }
135  }
136 }
137 
138 template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
139  class DeviceType>
141  Kokkos::Compat::KokkosDeviceWrapperNode<
142  DeviceType>>::Build(Level &fineLevel,
143  Level &coarseLevel)
144  const {
145  return BuildP(fineLevel, coarseLevel);
146 }
147 
148 template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
149  class DeviceType>
151  Kokkos::Compat::KokkosDeviceWrapperNode<
152  DeviceType>>::BuildP(Level &fineLevel,
153  Level &coarseLevel)
154  const {
155  FactoryMonitor m(*this, "Build", coarseLevel);
156 
157  // obtain general variables
158  RCP<Matrix> A = Get<RCP<Matrix>>(fineLevel, "A");
159  RCP<MultiVector> fineNullspace =
160  Get<RCP<MultiVector>>(fineLevel, "Nullspace");
161 
162  // get user-provided coarsening rate parameter (constant over all levels)
163  const ParameterList &pL = GetParameterList();
164  LO CoarsenRate = as<LO>(pL.get<int>("semicoarsen: coarsen rate"));
165  TEUCHOS_TEST_FOR_EXCEPTION(
166  CoarsenRate < 2, Exceptions::RuntimeError,
167  "semicoarsen: coarsen rate must be greater than 1");
168 
169  // collect general input data
170  LO BlkSize = A->GetFixedBlockSize();
171  RCP<const Map> rowMap = A->getRowMap();
172  LO Ndofs = rowMap->getLocalNumElements();
173  LO Nnodes = Ndofs / BlkSize;
174 
175  // collect line detection information generated by the LineDetectionFactory
176  // instance
177  LO FineNumZLayers = Get<LO>(fineLevel, "CoarseNumZLayers");
178  Teuchos::ArrayRCP<LO> TVertLineId =
179  Get<Teuchos::ArrayRCP<LO>>(fineLevel, "LineDetection_VertLineIds");
180  Teuchos::ArrayRCP<LO> TLayerId =
181  Get<Teuchos::ArrayRCP<LO>>(fineLevel, "LineDetection_Layers");
182 
183  // compute number of coarse layers
184  TEUCHOS_TEST_FOR_EXCEPTION(FineNumZLayers < 2, Exceptions::RuntimeError,
185  "Cannot coarsen further");
186  using coordT = typename Teuchos::ScalarTraits<Scalar>::coordinateType;
187  LO CoarseNumZLayers =
188  (LO)floor(((coordT)(FineNumZLayers + 1)) / ((coordT)CoarsenRate) - 1.0);
189  if (CoarseNumZLayers < 1)
190  CoarseNumZLayers = 1;
191 
192  // generate transfer operator with semicoarsening
193  RCP<Matrix> P;
194  RCP<MultiVector> coarseNullspace;
195  BuildSemiCoarsenP(coarseLevel, Ndofs, Nnodes, BlkSize, FineNumZLayers,
196  CoarseNumZLayers, TLayerId, TVertLineId, A, fineNullspace, P,
197  coarseNullspace);
198 
199  // Store number of coarse z-layers on the coarse level container
200  // This information is used by the LineDetectionAlgorithm
201  // TODO get rid of the NoFactory
202  coarseLevel.Set("NumZLayers", Teuchos::as<LO>(CoarseNumZLayers),
204 
205  // store semicoarsening transfer on coarse level
206  Set(coarseLevel, "P", P);
207  Set(coarseLevel, "Nullspace", coarseNullspace);
208 
209  // transfer coordinates
210  if (bTransferCoordinates_) {
211  SubFactoryMonitor m2(*this, "TransferCoordinates", coarseLevel);
212  typedef Xpetra::MultiVector<
213  typename Teuchos::ScalarTraits<Scalar>::coordinateType, LO, GO, NO>
214  xdMV;
215  RCP<xdMV> fineCoords = Teuchos::null;
216  if (fineLevel.GetLevelID() == 0 &&
217  fineLevel.IsAvailable("Coordinates", NoFactory::get())) {
218  fineCoords = fineLevel.Get<RCP<xdMV>>("Coordinates", NoFactory::get());
219  } else {
220  RCP<const FactoryBase> myCoordsFact = GetFactory("Coordinates");
221  if (myCoordsFact == Teuchos::null) {
222  myCoordsFact = fineLevel.GetFactoryManager()->GetFactory("Coordinates");
223  }
224  if (fineLevel.IsAvailable("Coordinates", myCoordsFact.get())) {
225  fineCoords =
226  fineLevel.Get<RCP<xdMV>>("Coordinates", myCoordsFact.get());
227  }
228  }
229 
230  TEUCHOS_TEST_FOR_EXCEPTION(fineCoords == Teuchos::null,
232  "No Coordinates found provided by the user.");
233 
234  TEUCHOS_TEST_FOR_EXCEPTION(fineCoords->getNumVectors() != 3,
236  "Three coordinates arrays must be supplied if "
237  "line detection orientation not given.");
238  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> x =
239  fineCoords->getDataNonConst(0);
240  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> y =
241  fineCoords->getDataNonConst(1);
242  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> z =
243  fineCoords->getDataNonConst(2);
244 
245  // determine the maximum and minimum z coordinate value on the current
246  // processor.
247  typename Teuchos::ScalarTraits<Scalar>::coordinateType zval_max =
248  -Teuchos::ScalarTraits<
249  typename Teuchos::ScalarTraits<Scalar>::coordinateType>::one() /
250  Teuchos::ScalarTraits<
251  typename Teuchos::ScalarTraits<Scalar>::coordinateType>::sfmin();
252  typename Teuchos::ScalarTraits<Scalar>::coordinateType zval_min =
253  Teuchos::ScalarTraits<
254  typename Teuchos::ScalarTraits<Scalar>::coordinateType>::one() /
255  Teuchos::ScalarTraits<
256  typename Teuchos::ScalarTraits<Scalar>::coordinateType>::sfmin();
257  for (auto it = z.begin(); it != z.end(); ++it) {
258  if (*it > zval_max)
259  zval_max = *it;
260  if (*it < zval_min)
261  zval_min = *it;
262  }
263 
264  LO myCoarseZLayers = Teuchos::as<LO>(CoarseNumZLayers);
265 
266  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType>
267  myZLayerCoords = Teuchos::arcp<
268  typename Teuchos::ScalarTraits<Scalar>::coordinateType>(
269  myCoarseZLayers);
270  if (myCoarseZLayers == 1) {
271  myZLayerCoords[0] = zval_min;
272  } else {
273  typename Teuchos::ScalarTraits<Scalar>::coordinateType dz =
274  (zval_max - zval_min) / (myCoarseZLayers - 1);
275  for (LO k = 0; k < myCoarseZLayers; ++k) {
276  myZLayerCoords[k] = k * dz;
277  }
278  }
279 
280  // Note, that the coarse level node coordinates have to be in vertical
281  // ordering according to the numbering of the vertical lines
282 
283  // number of vertical lines on current node:
284  LO numVertLines = Nnodes / FineNumZLayers;
285  LO numLocalCoarseNodes = numVertLines * myCoarseZLayers;
286 
287  RCP<const Map> coarseCoordMap = MapFactory::Build(
288  fineCoords->getMap()->lib(),
289  Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
290  Teuchos::as<size_t>(numLocalCoarseNodes),
291  fineCoords->getMap()->getIndexBase(), fineCoords->getMap()->getComm());
292  RCP<xdMV> coarseCoords = Xpetra::MultiVectorFactory<
293  typename Teuchos::ScalarTraits<Scalar>::coordinateType, LO, GO,
294  NO>::Build(coarseCoordMap, fineCoords->getNumVectors());
295  coarseCoords->putScalar(-1.0);
296  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> cx =
297  coarseCoords->getDataNonConst(0);
298  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> cy =
299  coarseCoords->getDataNonConst(1);
300  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> cz =
301  coarseCoords->getDataNonConst(2);
302 
303  // loop over all vert line indices (stop as soon as possible)
304  LO cntCoarseNodes = 0;
305  for (LO vt = 0; vt < TVertLineId.size(); ++vt) {
306  // vertical line id in *vt
307  LO curVertLineId = TVertLineId[vt];
308 
309  if (cx[curVertLineId * myCoarseZLayers] == -1.0 &&
310  cy[curVertLineId * myCoarseZLayers] == -1.0) {
311  // loop over all local myCoarseZLayers
312  for (LO n = 0; n < myCoarseZLayers; ++n) {
313  cx[curVertLineId * myCoarseZLayers + n] = x[vt];
314  cy[curVertLineId * myCoarseZLayers + n] = y[vt];
315  cz[curVertLineId * myCoarseZLayers + n] = myZLayerCoords[n];
316  }
317  cntCoarseNodes += myCoarseZLayers;
318  }
319 
320  TEUCHOS_TEST_FOR_EXCEPTION(cntCoarseNodes > numLocalCoarseNodes,
322  "number of coarse nodes is inconsistent.");
323  if (cntCoarseNodes == numLocalCoarseNodes)
324  break;
325  }
326 
327  // set coarse level coordinates
328  Set(coarseLevel, "Coordinates", coarseCoords);
329  } /* end bool bTransferCoordinates */
330 }
331 
332 template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
333  class DeviceType>
336  Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
337  BuildSemiCoarsenP(Level &coarseLevel, const LO NFRows, const LO NFNodes,
338  const LO DofsPerNode, const LO NFLayers,
339  const LO NCLayers, const ArrayRCP<LO> LayerId,
340  const ArrayRCP<LO> VertLineId, const RCP<Matrix> &Amat,
341  const RCP<MultiVector> fineNullspace, RCP<Matrix> &P,
342  RCP<MultiVector> &coarseNullspace) const {
343  SubFactoryMonitor m2(*this, "BuildSemiCoarsenP", coarseLevel);
344  using impl_SC = typename Kokkos::ArithTraits<SC>::val_type;
345  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
346  using LOView1D = Kokkos::View<LO *, DeviceType>;
347  using LOView2D = Kokkos::View<LO **, DeviceType>;
348 
349  // Construct a map from fine level column to layer ids (including ghost nodes)
350  // Note: this is needed to sum all couplings within a layer
351  const auto FCol2LayerVector =
352  Xpetra::VectorFactory<LO, LO, GO, NO>::Build(Amat->getColMap());
353  const auto localTemp =
354  Xpetra::VectorFactory<LO, LO, GO, NO>::Build(Amat->getDomainMap());
355  RCP<const Import> importer = Amat->getCrsGraph()->getImporter();
356  if (importer == Teuchos::null)
357  importer = ImportFactory::Build(Amat->getDomainMap(), Amat->getColMap());
358  {
359  // Fill local temp with layer ids and fill ghost nodes
360  const auto localTempHost = localTemp->getHostLocalView(Xpetra::Access::ReadWrite);
361  for (int row = 0; row < NFRows; row++)
362  localTempHost(row, 0) = LayerId[row / DofsPerNode];
363  const auto localTempView = localTemp->getDeviceLocalView(Xpetra::Access::ReadWrite);
364  Kokkos::deep_copy(localTempView, localTempHost);
365  FCol2LayerVector->doImport(*localTemp, *(importer), Xpetra::INSERT);
366  }
367  const auto FCol2LayerView = FCol2LayerVector->getDeviceLocalView(Xpetra::Access::ReadOnly);
368  const auto FCol2Layer = Kokkos::subview(FCol2LayerView, Kokkos::ALL(), 0);
369 
370  // Construct a map from fine level column to local dof per node id (including
371  // ghost nodes) Note: this is needed to sum all couplings within a layer
372  const auto FCol2DofVector =
373  Xpetra::VectorFactory<LO, LO, GO, NO>::Build(Amat->getColMap());
374  {
375  // Fill local temp with local dof per node ids and fill ghost nodes
376  const auto localTempHost = localTemp->getHostLocalView(Xpetra::Access::ReadWrite);
377  for (int row = 0; row < NFRows; row++)
378  localTempHost(row, 0) = row % DofsPerNode;
379  const auto localTempView = localTemp->getDeviceLocalView(Xpetra::Access::ReadWrite);
380  Kokkos::deep_copy(localTempView, localTempHost);
381  FCol2DofVector->doImport(*localTemp, *(importer), Xpetra::INSERT);
382  }
383  const auto FCol2DofView = FCol2DofVector->getDeviceLocalView(Xpetra::Access::ReadOnly);
384  const auto FCol2Dof = Kokkos::subview(FCol2DofView, Kokkos::ALL(), 0);
385 
386  // Compute NVertLines
387  // TODO: Read this from line detection factory
388  int NVertLines = -1;
389  if (NFNodes != 0)
390  NVertLines = VertLineId[0];
391  for (int node = 1; node < NFNodes; ++node)
392  if (VertLineId[node] > NVertLines)
393  NVertLines = VertLineId[node];
394  NVertLines++;
395 
396  // Construct a map from Line, Layer ids to fine level node
397  LOView2D LineLayer2Node("LineLayer2Node", NVertLines, NFLayers);
398  typename LOView2D::HostMirror LineLayer2NodeHost =
399  Kokkos::create_mirror_view(LineLayer2Node);
400  for (int node = 0; node < NFNodes; ++node)
401  LineLayer2NodeHost(VertLineId[node], LayerId[node]) = node;
402  Kokkos::deep_copy(LineLayer2Node, LineLayer2NodeHost);
403 
404  // Construct a map from coarse layer id to fine layer id
405  LOView1D CLayer2FLayer("CLayer2FLayer", NCLayers);
406  typename LOView1D::HostMirror CLayer2FLayerHost =
407  Kokkos::create_mirror_view(CLayer2FLayer);
408  using coordT = typename Teuchos::ScalarTraits<Scalar>::coordinateType;
409  const LO FirstStride =
410  (LO)ceil(((coordT)(NFLayers + 1)) / ((coordT)(NCLayers + 1)));
411  const coordT RestStride =
412  ((coordT)(NFLayers - FirstStride + 1)) / ((coordT)NCLayers);
413  const LO NCpts =
414  (LO)floor((((coordT)(NFLayers - FirstStride + 1)) / RestStride) + .00001);
415  TEUCHOS_TEST_FOR_EXCEPTION(NCLayers != NCpts, Exceptions::RuntimeError,
416  "sizes do not match.");
417  coordT stride = (coordT)FirstStride;
418  for (int clayer = 0; clayer < NCLayers; ++clayer) {
419  CLayer2FLayerHost(clayer) = (LO)floor(stride) - 1;
420  stride += RestStride;
421  }
422  Kokkos::deep_copy(CLayer2FLayer, CLayer2FLayerHost);
423 
424  // Compute start layer and stencil sizes for layer interpolation at each
425  // coarse layer
426  int MaxStencilSize = 1;
427  LOView1D CLayer2StartLayer("CLayer2StartLayer", NCLayers);
428  LOView1D CLayer2StencilSize("CLayer2StencilSize", NCLayers);
429  typename LOView1D::HostMirror CLayer2StartLayerHost =
430  Kokkos::create_mirror_view(CLayer2StartLayer);
431  typename LOView1D::HostMirror CLayer2StencilSizeHost =
432  Kokkos::create_mirror_view(CLayer2StencilSize);
433  for (int clayer = 0; clayer < NCLayers; ++clayer) {
434  const int startLayer = (clayer > 0) ? CLayer2FLayerHost(clayer - 1) + 1 : 0;
435  const int stencilSize = (clayer < NCLayers - 1)
436  ? CLayer2FLayerHost(clayer + 1) - startLayer
437  : NFLayers - startLayer;
438 
439  if (MaxStencilSize < stencilSize)
440  MaxStencilSize = stencilSize;
441  CLayer2StartLayerHost(clayer) = startLayer;
442  CLayer2StencilSizeHost(clayer) = stencilSize;
443  }
444  Kokkos::deep_copy(CLayer2StartLayer, CLayer2StartLayerHost);
445  Kokkos::deep_copy(CLayer2StencilSize, CLayer2StencilSizeHost);
446 
447  // Allocate storage for the coarse layer interpolation matrices on all
448  // vertical lines Note: Contributions to each matrix are collapsed to vertical
449  // lines. Thus, each vertical line gives rise to a block tridiagonal matrix.
450  // Here we store the full matrix to be compatible with kokkos kernels batch LU
451  // and tringular solve.
452  int Nmax = MaxStencilSize * DofsPerNode;
453  Kokkos::View<impl_SC ***, DeviceType> BandMat(
454  "BandMat", NVertLines, Nmax, Nmax);
455  Kokkos::View<impl_SC ***, DeviceType> BandSol(
456  "BandSol", NVertLines, Nmax, DofsPerNode);
457 
458  // Precompute number of nonzeros in prolongation matrix and allocate P views
459  // Note: Each coarse dof (NVertLines*NCLayers*DofsPerNode) contributes an
460  // interpolation stencil (StencilSize*DofsPerNode)
461  int NnzP = 0;
462  for (int clayer = 0; clayer < NCLayers; ++clayer)
463  NnzP += CLayer2StencilSizeHost(clayer);
464  NnzP *= NVertLines * DofsPerNode * DofsPerNode;
465  Kokkos::View<impl_SC *, DeviceType> Pvals("Pvals", NnzP);
466  Kokkos::View<LO *, DeviceType> Pcols("Pcols", NnzP);
467 
468  // Precompute Pptr
469  // Note: Each coarse layer stencil dof contributes DofsPerNode to the
470  // corresponding row in P
471  Kokkos::View<size_t *, DeviceType> Pptr("Pptr", NFRows + 1);
472  typename Kokkos::View<size_t *, DeviceType>::HostMirror PptrHost =
473  Kokkos::create_mirror_view(Pptr);
474  Kokkos::deep_copy(PptrHost, 0);
475  for (int line = 0; line < NVertLines; ++line) {
476  for (int clayer = 0; clayer < NCLayers; ++clayer) {
477  const int stencilSize = CLayer2StencilSizeHost(clayer);
478  const int startLayer = CLayer2StartLayerHost(clayer);
479  for (int snode = 0; snode < stencilSize; ++snode) {
480  for (int dofi = 0; dofi < DofsPerNode; ++dofi) {
481  const int layer = startLayer + snode;
482  const int AmatBlkRow = LineLayer2NodeHost(line, layer);
483  const int AmatRow = AmatBlkRow * DofsPerNode + dofi;
484  PptrHost(AmatRow + 1) += DofsPerNode;
485  }
486  }
487  }
488  }
489  for (int i = 2; i < NFRows + 1; ++i)
490  PptrHost(i) += PptrHost(i - 1);
491  TEUCHOS_TEST_FOR_EXCEPTION(NnzP != (int)PptrHost(NFRows),
493  "Number of nonzeros in P does not match");
494  Kokkos::deep_copy(Pptr, PptrHost);
495 
496  // Precompute Pptr offsets
497  // Note: These are used to determine the nonzero index in Pvals and Pcols
498  Kokkos::View<LO *, Kokkos::DefaultHostExecutionSpace> layerBuckets(
499  "layerBuckets", NFLayers);
500  Kokkos::deep_copy(layerBuckets, 0);
501  LOView2D CLayerSNode2PptrOffset("CLayerSNode2PptrOffset", NCLayers,
502  MaxStencilSize);
503  typename LOView2D::HostMirror CLayerSNode2PptrOffsetHost =
504  Kokkos::create_mirror_view(CLayerSNode2PptrOffset);
505  for (int clayer = 0; clayer < NCLayers; ++clayer) {
506  const int stencilSize = CLayer2StencilSizeHost(clayer);
507  const int startLayer = CLayer2StartLayerHost(clayer);
508  for (int snode = 0; snode < stencilSize; ++snode) {
509  const int layer = startLayer + snode;
510  CLayerSNode2PptrOffsetHost(clayer, snode) = layerBuckets(layer);
511  layerBuckets(layer)++;
512  }
513  }
514  Kokkos::deep_copy(CLayerSNode2PptrOffset, CLayerSNode2PptrOffsetHost);
515 
516  { // Fill P - fill and solve each block tridiagonal system and fill P views
517  SubFactoryMonitor m3(*this, "Fill P", coarseLevel);
518 
519  const auto localAmat = Amat->getLocalMatrixDevice();
520  const auto zero = impl_ATS::zero();
521  const auto one = impl_ATS::one();
522 
523  using range_policy = Kokkos::RangePolicy<execution_space>;
524  Kokkos::parallel_for(
525  "MueLu::SemiCoarsenPFactory_kokkos::BuildSemiCoarsenP Fill P",
526  range_policy(0, NVertLines), KOKKOS_LAMBDA(const int line) {
527  for (int clayer = 0; clayer < NCLayers; ++clayer) {
528 
529  // Initialize BandSol
530  auto bandSol =
531  Kokkos::subview(BandSol, line, Kokkos::ALL(), Kokkos::ALL());
532  for (int row = 0; row < Nmax; ++row)
533  for (int dof = 0; dof < DofsPerNode; ++dof)
534  bandSol(row, dof) = zero;
535 
536  // Initialize BandMat (set unused row diagonal to 1.0)
537  const int stencilSize = CLayer2StencilSize(clayer);
538  const int N = stencilSize * DofsPerNode;
539  auto bandMat =
540  Kokkos::subview(BandMat, line, Kokkos::ALL(), Kokkos::ALL());
541  for (int row = 0; row < Nmax; ++row)
542  for (int col = 0; col < Nmax; ++col)
543  bandMat(row, col) =
544  (row == col && row >= N) ? one : zero;
545 
546  // Loop over layers in stencil and fill banded matrix and rhs
547  const int flayer = CLayer2FLayer(clayer);
548  const int startLayer = CLayer2StartLayer(clayer);
549  for (int snode = 0; snode < stencilSize; ++snode) {
550 
551  const int layer = startLayer + snode;
552  if (layer == flayer) { // If layer in stencil is a coarse layer
553  for (int dof = 0; dof < DofsPerNode; ++dof) {
554  const int row = snode * DofsPerNode + dof;
555  bandMat(row, row) = one;
556  bandSol(row, dof) = one;
557  }
558  } else { // Not a coarse layer
559  const int AmatBlkRow = LineLayer2Node(line, layer);
560  for (int dofi = 0; dofi < DofsPerNode; ++dofi) {
561 
562  // Get Amat row info
563  const int AmatRow = AmatBlkRow * DofsPerNode + dofi;
564  const auto localAmatRow = localAmat.rowConst(AmatRow);
565  const int AmatRowLeng = localAmatRow.length;
566 
567  const int row = snode * DofsPerNode + dofi;
568  for (int dofj = 0; dofj < DofsPerNode; ++dofj) {
569  const int col = snode * DofsPerNode + dofj;
570 
571  // Sum values along row which correspond to stencil
572  // layer/dof and fill bandMat
573  auto val = zero;
574  for (int i = 0; i < AmatRowLeng; ++i) {
575  const int colidx = localAmatRow.colidx(i);
576  if (FCol2Layer(colidx) == layer &&
577  FCol2Dof(colidx) == dofj)
578  val += localAmatRow.value(i);
579  }
580  bandMat(row, col) = val;
581 
582  if (snode > 0) {
583  // Sum values along row which correspond to stencil
584  // layer/dof below and fill bandMat
585  val = zero;
586  for (int i = 0; i < AmatRowLeng; ++i) {
587  const int colidx = localAmatRow.colidx(i);
588  if (FCol2Layer(colidx) == layer - 1 &&
589  FCol2Dof(colidx) == dofj)
590  val += localAmatRow.value(i);
591  }
592  bandMat(row, col - DofsPerNode) = val;
593  }
594 
595  if (snode < stencilSize - 1) {
596  // Sum values along row which correspond to stencil
597  // layer/dof above and fill bandMat
598  val = zero;
599  for (int i = 0; i < AmatRowLeng; ++i) {
600  const int colidx = localAmatRow.colidx(i);
601  if (FCol2Layer(colidx) == layer + 1 &&
602  FCol2Dof(colidx) == dofj)
603  val += localAmatRow.value(i);
604  }
605  bandMat(row, col + DofsPerNode) = val;
606  }
607  }
608  }
609  }
610  }
611 
612  // Batch LU and triangular solves
613  namespace KB = KokkosBatched;
614  using lu_type = typename KB::SerialLU<KB::Algo::LU::Unblocked>;
615  lu_type::invoke(bandMat);
616  using trsv_l_type =
617  typename KB::SerialTrsm<KB::Side::Left, KB::Uplo::Lower,
618  KB::Trans::NoTranspose, KB::Diag::Unit,
619  KB::Algo::Trsm::Unblocked>;
620  trsv_l_type::invoke(one, bandMat, bandSol);
621  using trsv_u_type = typename KB::SerialTrsm<
622  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose,
623  KB::Diag::NonUnit, KB::Algo::Trsm::Unblocked>;
624  trsv_u_type::invoke(one, bandMat, bandSol);
625 
626  // Fill prolongation views with solution
627  for (int snode = 0; snode < stencilSize; ++snode) {
628  for (int dofj = 0; dofj < DofsPerNode; ++dofj) {
629  for (int dofi = 0; dofi < DofsPerNode; ++dofi) {
630  const int layer = startLayer + snode;
631  const int AmatBlkRow = LineLayer2Node(line, layer);
632  const int AmatRow = AmatBlkRow * DofsPerNode + dofi;
633 
634  const int pptrOffset = CLayerSNode2PptrOffset(clayer, snode);
635  const int pptr =
636  Pptr(AmatRow) + pptrOffset * DofsPerNode + dofj;
637 
638  const int col =
639  line * NCLayers + clayer; // coarse node (block row) index
640  Pcols(pptr) = col * DofsPerNode + dofj;
641  Pvals(pptr) = bandSol(snode * DofsPerNode + dofi, dofj);
642  }
643  }
644  }
645  }
646  });
647  } // Fill P
648 
649  // Build P
650  RCP<const Map> rowMap = Amat->getRowMap();
651  Xpetra::global_size_t GNdofs = rowMap->getGlobalNumElements();
652  Xpetra::global_size_t itemp = GNdofs / NFLayers;
653  std::vector<size_t> stridingInfo_{(size_t)DofsPerNode};
654  RCP<const Map> coarseMap = StridedMapFactory::Build(
655  rowMap->lib(), NCLayers * itemp, NCLayers * NVertLines * DofsPerNode, 0,
656  stridingInfo_, rowMap->getComm(), -1, 0);
657  P = rcp(new CrsMatrixWrap(rowMap, coarseMap, 0));
658  RCP<CrsMatrix> PCrs = rcp_dynamic_cast<CrsMatrixWrap>(P)->getCrsMatrix();
659  PCrs->setAllValues(Pptr, Pcols, Pvals);
660  PCrs->expertStaticFillComplete(coarseMap, Amat->getDomainMap());
661 
662  // set StridingInformation of P
663  if (Amat->IsView("stridedMaps") == true)
664  P->CreateView("stridedMaps", Amat->getRowMap("stridedMaps"), coarseMap);
665  else
666  P->CreateView("stridedMaps", P->getRangeMap(), coarseMap);
667 
668  // Construct coarse nullspace and inject fine nullspace
669  coarseNullspace =
670  MultiVectorFactory::Build(coarseMap, fineNullspace->getNumVectors());
671  const int numVectors = fineNullspace->getNumVectors();
672  const auto fineNullspaceView = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadOnly);
673  const auto coarseNullspaceView = coarseNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
674  using range_policy = Kokkos::RangePolicy<execution_space>;
675  Kokkos::parallel_for(
676  "MueLu::SemiCoarsenPFactory_kokkos::BuildSemiCoarsenP Inject Nullspace",
677  range_policy(0, NVertLines), KOKKOS_LAMBDA(const int line) {
678  for (int clayer = 0; clayer < NCLayers; ++clayer) {
679  const int layer = CLayer2FLayer(clayer);
680  const int AmatBlkRow =
681  LineLayer2Node(line, layer); // fine node (block row) index
682  const int col =
683  line * NCLayers + clayer; // coarse node (block row) index
684  for (int k = 0; k < numVectors; ++k) {
685  for (int dofi = 0; dofi < DofsPerNode; ++dofi) {
686  const int fRow = AmatBlkRow * DofsPerNode + dofi;
687  const int cRow = col * DofsPerNode + dofi;
688  coarseNullspaceView(cRow, k) = fineNullspaceView(fRow, k);
689  }
690  }
691  }
692  });
693 }
694 
695 } // namespace MueLu
696 
697 #define MUELU_SEMICOARSENPFACTORY_KOKKOS_SHORT
698 #endif // MUELU_SEMICOARSENPFACTORY_KOKKOS_DEF_HPP
MueLu::DefaultLocalOrdinal LocalOrdinal
Timer to be used in factories. Similar to Monitor but with additional timers.
Namespace for MueLu classes and methods.
static const NoFactory * get()
int GetLevelID() const
Return level number.
Definition: MueLu_Level.cpp:76
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:99
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
void Set(const std::string &ename, const T &entry, const FactoryBase *factory=NoFactory::get())
static const Teuchos::ParameterEntry & getEntry(const std::string &name)
Returns default entry from the "master" list corresponding to the specified name. ...
const RCP< const FactoryManagerBase > GetFactoryManager()
returns the current factory manager
Definition: MueLu_Level.cpp:96
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need&#39;s value has been saved.
Exception throws to report errors in the internal logical of the program.
Prolongator factory performing semi-coarsening.
void DeclareInput(const std::string &ename, const FactoryBase *factory, const FactoryBase *requestedBy=NoFactory::get())
Callback from FactoryBase::CallDeclareInput() and FactoryBase::DeclareInput()