MueLu  Version of the Day
MueLu_CoalesceDropFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
48 
49 #include <Kokkos_Core.hpp>
50 #include <KokkosSparse_CrsMatrix.hpp>
51 
52 #include "Xpetra_Matrix.hpp"
53 
55 
56 #include "MueLu_AmalgamationInfo_kokkos.hpp"
57 #include "MueLu_Exceptions.hpp"
58 #include "MueLu_Level.hpp"
59 #include "MueLu_LWGraph_kokkos.hpp"
60 #include "MueLu_MasterList.hpp"
61 #include "MueLu_Monitor.hpp"
62 #include "MueLu_Utilities_kokkos.hpp"
63 
64 namespace MueLu {
65 
66 
67  namespace CoalesceDrop_Kokkos_Details { // anonymous
68 
69  template<class LO, class RowType>
70  class ScanFunctor {
71  public:
72  ScanFunctor(RowType rows_) : rows(rows_) { }
73 
74  KOKKOS_INLINE_FUNCTION
75  void operator()(const LO i, LO& upd, const bool& final) const {
76  upd += rows(i);
77  if (final)
78  rows(i) = upd;
79  }
80 
81  private:
82  RowType rows;
83  };
84 
85  template<class LO, class GhostedViewType>
87  private:
88  typedef typename GhostedViewType::value_type SC;
89  typedef Kokkos::ArithTraits<SC> ATS;
90  typedef typename ATS::magnitudeType magnitudeType;
91 
92  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
94 
95  public:
96  ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold) :
97  diag(ghostedDiag),
98  eps(threshold)
99  { }
100 
101  // Return true if we drop, false if not
102  KOKKOS_FORCEINLINE_FUNCTION
103  bool operator()(LO row, LO col, SC val) const {
104  // We avoid square root by using squared values
105  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
106  auto aij2 = ATS::magnitude(val) * ATS::magnitude(val); // |a_ij|^2
107 
108  return (aij2 <= eps*eps * aiiajj);
109  }
110  };
111 
112  template<class LO, class CoordsType>
114  private:
115  typedef typename CoordsType::value_type SC;
116  typedef Kokkos::ArithTraits<SC> ATS;
117  typedef typename ATS::magnitudeType magnitudeType;
118 
119  public:
120  typedef SC value_type;
121 
122  public:
123  DistanceFunctor(CoordsType coords_) : coords(coords_) { }
124 
125  KOKKOS_INLINE_FUNCTION
126  magnitudeType distance2(LO row, LO col) const {
127  SC d = ATS::zero(), s;
128  for (size_t j = 0; j < coords.extent(1); j++) {
129  s = coords(row,j) - coords(col,j);
130  d += s*s;
131  }
132  return ATS::magnitude(d);
133  }
134  private:
135  CoordsType coords;
136  };
137 
138  template<class LO, class GhostedViewType, class DistanceFunctor>
140  private:
141  typedef typename GhostedViewType::value_type SC;
142  typedef Kokkos::ArithTraits<SC> ATS;
143  typedef typename ATS::magnitudeType magnitudeType;
144 
145  public:
146  DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold) :
147  diag(ghostedLaplDiag),
148  distFunctor(distFunctor_),
149  eps(threshold)
150  { }
151 
152  // Return true if we drop, false if not
153  KOKKOS_INLINE_FUNCTION
154  bool operator()(LO row, LO col, SC /* val */) const {
155  // We avoid square root by using squared values
156 
157  // We ignore incoming value of val as we operate on an auxiliary
158  // distance Laplacian matrix
159  typedef typename DistanceFunctor::value_type dSC;
160  typedef Kokkos::ArithTraits<dSC> dATS;
161  auto fval = dATS::one() / distFunctor.distance2(row, col);
162 
163  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
164  auto aij2 = ATS::magnitude(fval) * ATS::magnitude(fval); // |a_ij|^2
165 
166  return (aij2 <= eps*eps * aiiajj);
167  }
168 
169  private:
170  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
173  };
174 
175  template<class SC, class LO, class MatrixType, class BndViewType, class DropFunctorType>
177  private:
178  typedef typename MatrixType::StaticCrsGraphType graph_type;
179  typedef typename graph_type::row_map_type rows_type;
180  typedef typename graph_type::entries_type cols_type;
181  typedef typename MatrixType::values_type vals_type;
182  typedef Kokkos::ArithTraits<SC> ATS;
183  typedef typename ATS::val_type impl_Scalar;
184  typedef Kokkos::ArithTraits<impl_Scalar> impl_ATS;
185  typedef typename ATS::magnitudeType magnitudeType;
186 
187  public:
188  ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_,
189  typename rows_type::non_const_type rows_,
190  typename cols_type::non_const_type colsAux_,
191  typename vals_type::non_const_type valsAux_,
192  bool reuseGraph_, bool lumping_, SC /* threshold_ */,
193  bool aggregationMayCreateDirichlet_ ) :
194  A(A_),
195  bndNodes(bndNodes_),
196  dropFunctor(dropFunctor_),
197  rows(rows_),
198  colsAux(colsAux_),
199  valsAux(valsAux_),
200  reuseGraph(reuseGraph_),
201  lumping(lumping_),
202  aggregationMayCreateDirichlet(aggregationMayCreateDirichlet_)
203  {
204  rowsA = A.graph.row_map;
205  zero = impl_ATS::zero();
206  }
207 
208  KOKKOS_INLINE_FUNCTION
209  void operator()(const LO row, LO& nnz) const {
210  auto rowView = A.rowConst(row);
211  auto length = rowView.length;
212  auto offset = rowsA(row);
213 
214  impl_Scalar diag = zero;
215  LO rownnz = 0;
216  LO diagID = -1;
217  for (decltype(length) colID = 0; colID < length; colID++) {
218  LO col = rowView.colidx(colID);
219  impl_Scalar val = rowView.value (colID);
220 
221  if (!dropFunctor(row, col, rowView.value(colID)) || row == col) {
222  colsAux(offset+rownnz) = col;
223 
224  LO valID = (reuseGraph ? colID : rownnz);
225  valsAux(offset+valID) = val;
226  if (row == col)
227  diagID = valID;
228 
229  rownnz++;
230 
231  } else {
232  // Rewrite with zeros (needed for reuseGraph)
233  valsAux(offset+colID) = zero;
234  diag += val;
235  }
236  }
237  // How to assert on the device?
238  // assert(diagIndex != -1);
239  rows(row+1) = rownnz;
240  // if (lumping && diagID != -1) {
241  if (lumping) {
242  // Add diag to the diagonal
243 
244  // NOTE_KOKKOS: valsAux was allocated with
245  // ViewAllocateWithoutInitializing. This is not a problem here
246  // because we explicitly set this value above.
247  valsAux(offset+diagID) += diag;
248  }
249 
250  // If the only element remaining after filtering is diagonal, mark node as boundary
251  // FIXME: this should really be replaced by the following
252  // if (indices.size() == 1 && indices[0] == row)
253  // boundaryNodes[row] = true;
254  // We do not do it this way now because there is no framework for distinguishing isolated
255  // and boundary nodes in the aggregation algorithms
256  bndNodes(row) = (rownnz == 1 && aggregationMayCreateDirichlet);
257 
258  nnz += rownnz;
259  }
260 
261  private:
262  MatrixType A;
263  BndViewType bndNodes;
264  DropFunctorType dropFunctor;
265 
267 
268  typename rows_type::non_const_type rows;
269  typename cols_type::non_const_type colsAux;
270  typename vals_type::non_const_type valsAux;
271 
273  bool lumping;
276  };
277 
278  // collect number nonzeros of blkSize rows in nnz_(row+1)
279  template<class MatrixType, class NnzType, class blkSizeType>
281  private:
282  typedef typename MatrixType::ordinal_type LO;
283 
284  public:
285  Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_) :
286  kokkosMatrix(kokkosMatrix_),
287  nnz(nnz_),
288  blkSize(blkSize_) { }
289 
290  KOKKOS_INLINE_FUNCTION
291  void operator()(const LO row, LO& totalnnz) const {
292 
293  // the following code is more or less what MergeRows is doing
294  // count nonzero entries in all dof rows associated with node row
295  LO nodeRowMaxNonZeros = 0;
296  for (LO j = 0; j < blkSize; j++) {
297  auto rowView = kokkosMatrix.row(row * blkSize + j);
298  nodeRowMaxNonZeros += rowView.length;
299  }
300  nnz(row + 1) = nodeRowMaxNonZeros;
301  totalnnz += nodeRowMaxNonZeros;
302  }
303 
304 
305  private:
306  MatrixType kokkosMatrix; //< local matrix part
307  NnzType nnz; //< View containing number of nonzeros for current row
308  blkSizeType blkSize; //< block size (or partial block size in strided maps)
309  };
310 
311 
312  // build the dof-based column map containing the local dof ids belonging to blkSize rows in matrix
313  // sort column ids
314  // translate them into (unique) node ids
315  // count the node column ids per node row
316  template<class MatrixType, class NnzType, class blkSizeType, class ColDofType, class Dof2NodeTranslationType, class BdryNodeTypeConst, class BdryNodeType, class boolType>
318  private:
319  typedef typename MatrixType::ordinal_type LO;
320 
321  private:
322  MatrixType kokkosMatrix; //< local matrix part
323  NnzType coldofnnz; //< view containing start and stop indices for subviews
324  blkSizeType blkSize; //< block size (or partial block size in strided maps)
325  ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
326  Dof2NodeTranslationType dof2node; //< view containing the local node id associated with the local dof id
327  NnzType colnodennz; //< view containing number of column nodes for each node row
328  BdryNodeTypeConst dirichletdof; //< view containing with num dofs booleans. True if dof (not necessarily entire node) is dirichlet boundardy dof.
329  BdryNodeType bdrynode; //< view containing with numNodes booleans. True if node is (full) dirichlet boundardy node.
330  boolType usegreedydirichlet; //< boolean for use of greedy Dirichlet (if any dof is Dirichlet, entire node is dirichlet) default false (need all dofs in node to be Dirichlet for node to be Dirichlet)
331 
332  public:
333  Stage1bcVectorFunctor(MatrixType kokkosMatrix_,
334  NnzType coldofnnz_,
335  blkSizeType blkSize_,
336  ColDofType coldofs_,
337  Dof2NodeTranslationType dof2node_,
338  NnzType colnodennz_,
339  BdryNodeTypeConst dirichletdof_,
340  BdryNodeType bdrynode_,
341  boolType usegreedydirichlet_) :
342  kokkosMatrix(kokkosMatrix_),
343  coldofnnz(coldofnnz_),
344  blkSize(blkSize_),
345  coldofs(coldofs_),
346  dof2node(dof2node_),
347  colnodennz(colnodennz_),
348  dirichletdof(dirichletdof_),
349  bdrynode(bdrynode_),
350  usegreedydirichlet(usegreedydirichlet_) {
351  }
352 
353  KOKKOS_INLINE_FUNCTION
354  void operator()(const LO rowNode, LO& nnz) const {
355 
356  LO pos = coldofnnz(rowNode);
357  if( usegreedydirichlet ){
358  bdrynode(rowNode) = false;
359  for (LO j = 0; j < blkSize; j++) {
360  auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
361  auto numIndices = rowView.length;
362 
363  // if any dof in the node is Dirichlet
364  if( dirichletdof(rowNode * blkSize + j) )
365  bdrynode(rowNode) = true;
366 
367  for (decltype(numIndices) k = 0; k < numIndices; k++) {
368  auto dofID = rowView.colidx(k);
369  coldofs(pos) = dofID;
370  pos ++;
371  }
372  }
373  }else{
374  bdrynode(rowNode) = true;
375  for (LO j = 0; j < blkSize; j++) {
376  auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
377  auto numIndices = rowView.length;
378 
379  // if any dof in the node is not Dirichlet
380  if( dirichletdof(rowNode * blkSize + j) == false )
381  bdrynode(rowNode) = false;
382 
383  for (decltype(numIndices) k = 0; k < numIndices; k++) {
384  auto dofID = rowView.colidx(k);
385  coldofs(pos) = dofID;
386  pos ++;
387  }
388  }
389  }
390 
391  // sort coldofs
392  LO begin = coldofnnz(rowNode);
393  LO end = coldofnnz(rowNode+1);
394  LO n = end - begin;
395  for (LO i = 0; i < (n-1); i++) {
396  for (LO j = 0; j < (n-i-1); j++) {
397  if (coldofs(j+begin) > coldofs(j+begin+1)) {
398  LO temp = coldofs(j+begin);
399  coldofs(j+begin) = coldofs(j+begin+1);
400  coldofs(j+begin+1) = temp;
401  }
402  }
403  }
404  size_t cnt = 0;
405  LO lastNodeID = -1;
406  for (LO i = 0; i < n; i++) {
407  LO dofID = coldofs(begin + i);
408  LO nodeID = dof2node(dofID);
409  if(nodeID != lastNodeID) {
410  lastNodeID = nodeID;
411  coldofs(begin+cnt) = nodeID;
412  cnt++;
413  }
414  }
415  colnodennz(rowNode+1) = cnt;
416  nnz += cnt;
417  }
418 
419  };
420 
421  // fill column node id view
422  template<class MatrixType, class ColDofNnzType, class ColDofType, class ColNodeNnzType, class ColNodeType>
424  private:
425  typedef typename MatrixType::ordinal_type LO;
426  typedef typename MatrixType::value_type SC;
427 
428  private:
429  ColDofType coldofs; //< view containing mixed node and dof indices (only input)
430  ColDofNnzType coldofnnz; //< view containing the start and stop indices for subviews (dofs)
431  ColNodeType colnodes; //< view containing the local node ids associated with columns
432  ColNodeNnzType colnodennz; //< view containing start and stop indices for subviews
433 
434  public:
435  Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_) :
436  coldofs(coldofs_),
437  coldofnnz(coldofnnz_),
438  colnodes(colnodes_),
439  colnodennz(colnodennz_) {
440  }
441 
442  KOKKOS_INLINE_FUNCTION
443  void operator()(const LO rowNode) const {
444  auto dofbegin = coldofnnz(rowNode);
445  auto nodebegin = colnodennz(rowNode);
446  auto nodeend = colnodennz(rowNode+1);
447  auto n = nodeend - nodebegin;
448 
449  for (decltype(nodebegin) i = 0; i < n; i++) {
450  colnodes(nodebegin + i) = coldofs(dofbegin + i);
451  }
452  }
453  };
454 
455 
456  } // namespace
457 
458  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
460  RCP<ParameterList> validParamList = rcp(new ParameterList());
461 
462 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
463  SET_VALID_ENTRY("aggregation: drop tol");
464  SET_VALID_ENTRY("aggregation: Dirichlet threshold");
465  SET_VALID_ENTRY("aggregation: drop scheme");
466  SET_VALID_ENTRY("aggregation: dropping may create Dirichlet");
467  SET_VALID_ENTRY("aggregation: greedy Dirichlet");
468  SET_VALID_ENTRY("filtered matrix: use lumping");
469  SET_VALID_ENTRY("filtered matrix: reuse graph");
470  SET_VALID_ENTRY("filtered matrix: reuse eigenvalue");
471  {
472  typedef Teuchos::StringToIntegralParameterEntryValidator<int> validatorType;
473  validParamList->getEntry("aggregation: drop scheme").setValidator(
474  rcp(new validatorType(Teuchos::tuple<std::string>("classical", "distance laplacian"), "aggregation: drop scheme")));
475  }
476 #undef SET_VALID_ENTRY
477  validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
478  validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo");
479  validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory for Coordinates");
480 
481  return validParamList;
482  }
483 
484  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
486  Input(currentLevel, "A");
487  Input(currentLevel, "UnAmalgamationInfo");
488 
489  const ParameterList& pL = GetParameterList();
490  if (pL.get<std::string>("aggregation: drop scheme") == "distance laplacian")
491  Input(currentLevel, "Coordinates");
492  }
493 
494  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
496  Build(Level& currentLevel) const {
497  FactoryMonitor m(*this, "Build", currentLevel);
498 
499  typedef Teuchos::ScalarTraits<SC> STS;
500  typedef typename STS::magnitudeType MT;
501  const MT zero = Teuchos::ScalarTraits<MT>::zero();
502 
503  auto A = Get< RCP<Matrix> >(currentLevel, "A");
504 
505 
506  /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
507  blkSize is the number of storage blocks that must kept together during the amalgamation process.
508 
509  Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
510 
511  numPDEs = blkSize * storageblocksize.
512 
513  If numPDEs==1
514  Matrix is point storage (classical CRS storage). storageblocksize=1 and blkSize=1
515  No other values makes sense.
516 
517  If numPDEs>1
518  If matrix uses point storage, then storageblocksize=1 and blkSize=numPDEs.
519  If matrix uses block storage, with block size of n, then storageblocksize=n, and blkSize=numPDEs/n.
520  Thus far, only storageblocksize=numPDEs and blkSize=1 has been tested.
521  */
522 
523  TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0,Exceptions::RuntimeError,"A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
524  LO blkSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
525 
526  auto amalInfo = Get< RCP<AmalgamationInfo_kokkos> >(currentLevel, "UnAmalgamationInfo");
527 
528  const ParameterList& pL = GetParameterList();
529 
530  std::string algo = pL.get<std::string>("aggregation: drop scheme");
531 
532  double threshold = pL.get<double>("aggregation: drop tol");
533  GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold
534  << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
535 
536  const typename STS::magnitudeType dirichletThreshold =
537  STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
538 
539  GO numDropped = 0, numTotal = 0;
540 
541  RCP<LWGraph_kokkos> graph;
542  LO dofsPerNode = -1;
543 
544  typedef typename LWGraph_kokkos::boundary_nodes_type boundary_nodes_type;
545  boundary_nodes_type boundaryNodes;
546 
547  RCP<Matrix> filteredA;
548  if (blkSize == 1 && threshold == zero) {
549  // Scalar problem without dropping
550 
551  // Detect and record rows that correspond to Dirichlet boundary conditions
552  boundaryNodes = Utilities_kokkos::DetectDirichletRows(*A, dirichletThreshold);
553 
554  // Trivial LWGraph construction
555  graph = rcp(new LWGraph_kokkos(A->getCrsGraph()->getLocalGraphDevice(), A->getRowMap(), A->getColMap(), "graph of A"));
556  graph->getLocalLWGraph().SetBoundaryNodeMap(boundaryNodes);
557 
558  numTotal = A->getLocalNumEntries();
559  dofsPerNode = 1;
560 
561  filteredA = A;
562 
563  } else if (blkSize == 1 && threshold != zero) {
564  // Scalar problem with dropping
565 
566  typedef typename Matrix::local_matrix_type local_matrix_type;
567  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
568  typedef typename kokkos_graph_type::row_map_type::non_const_type rows_type;
569  typedef typename kokkos_graph_type::entries_type::non_const_type cols_type;
570  typedef typename local_matrix_type::values_type::non_const_type vals_type;
571 
572  LO numRows = A->getLocalNumRows();
573  local_matrix_type kokkosMatrix = A->getLocalMatrixDevice();
574  auto nnzA = kokkosMatrix.nnz();
575  auto rowsA = kokkosMatrix.graph.row_map;
576 
577 
578  typedef Kokkos::ArithTraits<SC> ATS;
579  typedef typename ATS::val_type impl_Scalar;
580  typedef Kokkos::ArithTraits<impl_Scalar> impl_ATS;
581 
582  bool reuseGraph = pL.get<bool>("filtered matrix: reuse graph");
583  bool lumping = pL.get<bool>("filtered matrix: use lumping");
584  if (lumping)
585  GetOStream(Runtime0) << "Lumping dropped entries" << std::endl;
586 
587  const bool aggregationMayCreateDirichlet = pL.get<bool>("aggregation: dropping may create Dirichlet");
588 
589  // FIXME_KOKKOS: replace by ViewAllocateWithoutInitializing + setting a single value
590  rows_type rows ("FA_rows", numRows+1);
591  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("FA_aux_cols"), nnzA);
592  vals_type valsAux;
593  if (reuseGraph) {
594  SubFactoryMonitor m2(*this, "CopyMatrix", currentLevel);
595 
596  // Share graph with the original matrix
597  filteredA = MatrixFactory::Build(A->getCrsGraph());
598 
599  // Do a no-op fill-complete
600  RCP<ParameterList> fillCompleteParams(new ParameterList);
601  fillCompleteParams->set("No Nonlocal Changes", true);
602  filteredA->fillComplete(fillCompleteParams);
603 
604  // No need to reuseFill, just modify in place
605  valsAux = filteredA->getLocalMatrixDevice().values;
606 
607  } else {
608  // Need an extra array to compress
609  valsAux = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_aux_vals"), nnzA);
610  }
611 
612  typename boundary_nodes_type::non_const_type bndNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numRows);
613 
614  LO nnzFA = 0;
615  {
616  if (algo == "classical") {
617  // Construct overlapped matrix diagonal
618  RCP<Vector> ghostedDiag;
619  {
620  kokkosMatrix = local_matrix_type();
621  SubFactoryMonitor m2(*this, "Ghosted diag construction", currentLevel);
623  kokkosMatrix=A->getLocalMatrixDevice();
624  }
625 
626  // Filter out entries
627  {
628  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
629 
630  auto ghostedDiagView = ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
631 
634  scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold, aggregationMayCreateDirichlet);
635 
636  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
637  scalarFunctor, nnzFA);
638  }
639 
640  } else if (algo == "distance laplacian") {
641  typedef Xpetra::MultiVector<typename Teuchos::ScalarTraits<Scalar>::magnitudeType,LO,GO,NO> doubleMultiVector;
642  auto coords = Get<RCP<doubleMultiVector> >(currentLevel, "Coordinates");
643 
644  auto uniqueMap = A->getRowMap();
645  auto nonUniqueMap = A->getColMap();
646 
647  // Construct ghosted coordinates
648  RCP<const Import> importer;
649  {
650  SubFactoryMonitor m2(*this, "Coords Import construction", currentLevel);
651  importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
652  }
653  RCP<doubleMultiVector> ghostedCoords;
654  {
655  SubFactoryMonitor m2(*this, "Ghosted coords construction", currentLevel);
656  ghostedCoords = Xpetra::MultiVectorFactory<typename Teuchos::ScalarTraits<Scalar>::magnitudeType,LO,GO,NO>::Build(nonUniqueMap, coords->getNumVectors());
657  ghostedCoords->doImport(*coords, *importer, Xpetra::INSERT);
658  }
659 
660  auto ghostedCoordsView = ghostedCoords->getDeviceLocalView(Xpetra::Access::ReadWrite);
662 
663  // Construct Laplacian diagonal
664  RCP<Vector> localLaplDiag;
665  {
666  SubFactoryMonitor m2(*this, "Local Laplacian diag construction", currentLevel);
667 
668  localLaplDiag = VectorFactory::Build(uniqueMap);
669 
670  auto localLaplDiagView = localLaplDiag->getDeviceLocalView(Xpetra::Access::OverwriteAll);
671  auto kokkosGraph = kokkosMatrix.graph;
672 
673  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:laplacian_diag", range_type(0,numRows),
674  KOKKOS_LAMBDA(const LO row) {
675  auto rowView = kokkosGraph.rowConst(row);
676  auto length = rowView.length;
677 
678  impl_Scalar d = impl_ATS::zero();
679  for (decltype(length) colID = 0; colID < length; colID++) {
680  auto col = rowView(colID);
681  if (row != col)
682  d += impl_ATS::one()/distFunctor.distance2(row, col);
683  }
684  localLaplDiagView(row,0) = d;
685  });
686  }
687 
688  // Construct ghosted Laplacian diagonal
689  RCP<Vector> ghostedLaplDiag;
690  {
691  SubFactoryMonitor m2(*this, "Ghosted Laplacian diag construction", currentLevel);
692  ghostedLaplDiag = VectorFactory::Build(nonUniqueMap);
693  ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT);
694  }
695 
696  // Filter out entries
697  {
698  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
699 
700  auto ghostedLaplDiagView = ghostedLaplDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
701 
703  dropFunctor(ghostedLaplDiagView, distFunctor, threshold);
705  scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold, true);
706 
707  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
708  scalarFunctor, nnzFA);
709  }
710  }
711 
712  }
713  numDropped = nnzA - nnzFA;
714 
715  boundaryNodes = bndNodes;
716 
717  {
718  SubFactoryMonitor m2(*this, "CompressRows", currentLevel);
719 
720  // parallel_scan (exclusive)
721  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:compress_rows", range_type(0,numRows+1),
722  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
723  update += rows(i);
724  if (final_pass)
725  rows(i) = update;
726  });
727  }
728 
729  // Compress cols (and optionally vals)
730  // We use a trick here: we moved all remaining elements to the beginning
731  // of the original row in the main loop, so we don't need to check for
732  // INVALID here, and just stop when achieving the new number of elements
733  // per row.
734  cols_type cols(Kokkos::ViewAllocateWithoutInitializing("FA_cols"), nnzFA);
735  vals_type vals;
736  if (reuseGraph) {
737  GetOStream(Runtime1) << "reuse matrix graph for filtering (compress matrix columns only)" << std::endl;
738  // Only compress cols
739  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
740 
741  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
742  KOKKOS_LAMBDA(const LO i) {
743  // Is there Kokkos memcpy?
744  LO rowStart = rows(i);
745  LO rowAStart = rowsA(i);
746  size_t rownnz = rows(i+1) - rows(i);
747  for (size_t j = 0; j < rownnz; j++)
748  cols(rowStart+j) = colsAux(rowAStart+j);
749  });
750  } else {
751  // Compress cols and vals
752  GetOStream(Runtime1) << "new matrix graph for filtering (compress matrix columns and values)" << std::endl;
753  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
754 
755  vals = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_vals"), nnzFA);
756 
757  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
758  KOKKOS_LAMBDA(const LO i) {
759  LO rowStart = rows(i);
760  LO rowAStart = rowsA(i);
761  size_t rownnz = rows(i+1) - rows(i);
762  for (size_t j = 0; j < rownnz; j++) {
763  cols(rowStart+j) = colsAux(rowAStart+j);
764  vals(rowStart+j) = valsAux(rowAStart+j);
765  }
766  });
767  }
768 
769  kokkos_graph_type kokkosGraph(cols, rows);
770 
771  {
772  SubFactoryMonitor m2(*this, "LWGraph construction", currentLevel);
773 
774  graph = rcp(new LWGraph_kokkos(kokkosGraph, A->getRowMap(), A->getColMap(), "filtered graph of A"));
775  graph->getLocalLWGraph().SetBoundaryNodeMap(boundaryNodes);
776  }
777 
778  numTotal = A->getLocalNumEntries();
779 
780  dofsPerNode = 1;
781 
782  if (!reuseGraph) {
783  SubFactoryMonitor m2(*this, "LocalMatrix+FillComplete", currentLevel);
784 
785  local_matrix_type localFA = local_matrix_type("A", numRows, A->getLocalMatrixDevice().numCols(), nnzFA, vals, rows, cols);
786  auto filteredACrs = CrsMatrixFactory::Build(localFA, A->getRowMap(), A->getColMap(), A->getDomainMap(), A->getRangeMap(),
787  A->getCrsGraph()->getImporter(), A->getCrsGraph()->getExporter());
788  filteredA = rcp(new CrsMatrixWrap(filteredACrs));
789  }
790 
791  filteredA->SetFixedBlockSize(A->GetFixedBlockSize());
792 
793  if (pL.get<bool>("filtered matrix: reuse eigenvalue")) {
794  // Reuse max eigenvalue from A
795  // It is unclear what eigenvalue is the best for the smoothing, but we already may have
796  // the D^{-1}A estimate in A, may as well use it.
797  // NOTE: ML does that too
798  filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());
799  } else {
800  filteredA->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
801  }
802 
803  } else if (blkSize > 1 && threshold == zero) {
804  // Case 3: block problem without filtering
805  //
806  // FIXME_KOKKOS: this code is completely unoptimized. It really should do
807  // a very simple thing: merge rows and produce nodal graph. But the code
808  // seems very complicated. Can we do better?
809 
810  TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() % blkSize != 0, MueLu::Exceptions::RuntimeError, "MueLu::CoalesceDropFactory: Number of local elements is " << A->getRowMap()->getLocalNumElements() << " but should be a multiply of " << blkSize);
811 
812  const RCP<const Map> rowMap = A->getRowMap();
813  const RCP<const Map> colMap = A->getColMap();
814 
815  // build a node row map (uniqueMap = non-overlapping) and a node column map
816  // (nonUniqueMap = overlapping). The arrays rowTranslation and colTranslation
817  // stored in the AmalgamationInfo class container contain the local node id
818  // given a local dof id. The data is calculated in the AmalgamationFactory and
819  // stored in the variable "UnAmalgamationInfo" (which is of type AmalagamationInfo)
820  const RCP<const Map> uniqueMap = amalInfo->getNodeRowMap();
821  const RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
822  Array<LO> rowTranslationArray = *(amalInfo->getRowTranslation()); // TAW should be transform that into a View?
823  Array<LO> colTranslationArray = *(amalInfo->getColTranslation());
824 
825  Kokkos::View<LO*, Kokkos::MemoryUnmanaged>
826  rowTranslationView(rowTranslationArray.getRawPtr(),rowTranslationArray.size() );
827  Kokkos::View<LO*, Kokkos::MemoryUnmanaged>
828  colTranslationView(colTranslationArray.getRawPtr(),colTranslationArray.size() );
829 
830  // get number of local nodes
831  LO numNodes = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
832  typedef typename Kokkos::View<LocalOrdinal*, DeviceType> id_translation_type;
833  id_translation_type rowTranslation("dofId2nodeId",rowTranslationArray.size());
834  id_translation_type colTranslation("ov_dofId2nodeId",colTranslationArray.size());
835  Kokkos::deep_copy(rowTranslation, rowTranslationView);
836  Kokkos::deep_copy(colTranslation, colTranslationView);
837 
838  // extract striding information
839  blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map)
840  LocalOrdinal blkId = -1; //< the block id within a strided map or -1 if it is a full block map
841  LocalOrdinal blkPartSize = A->GetFixedBlockSize(); //< stores block size of part blkId (or the full block size)
842  if(A->IsView("stridedMaps") == true) {
843  const RCP<const Map> myMap = A->getRowMap("stridedMaps");
844  const RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
845  TEUCHOS_TEST_FOR_EXCEPTION(strMap.is_null() == true, Exceptions::RuntimeError, "Map is not of type stridedMap");
846  blkSize = Teuchos::as<const LocalOrdinal>(strMap->getFixedBlockSize());
847  blkId = strMap->getStridedBlockId();
848  if (blkId > -1)
849  blkPartSize = Teuchos::as<LocalOrdinal>(strMap->getStridingData()[blkId]);
850  }
851  auto kokkosMatrix = A->getLocalMatrixDevice(); // access underlying kokkos data
852 
853  //
854  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
855  typedef typename kokkos_graph_type::row_map_type row_map_type;
856  //typedef typename row_map_type::HostMirror row_map_type_h;
857  typedef typename kokkos_graph_type::entries_type entries_type;
858 
859  // Stage 1c: get number of dof-nonzeros per blkSize node rows
860  typename row_map_type::non_const_type dofNnz("nnz_map", numNodes + 1);
861  LO numDofCols = 0;
863  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1a", range_type(0,numNodes), stage1aFunctor, numDofCols);
864  // parallel_scan (exclusive)
866  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanFunctor);
867 
868  // Detect and record dof rows that correspond to Dirichlet boundary conditions
869  boundary_nodes_type singleEntryRows = Utilities_kokkos::DetectDirichletRows(*A, dirichletThreshold);
870 
871  typename entries_type::non_const_type dofcols("dofcols", numDofCols/*dofNnz(numNodes)*/); // why does dofNnz(numNodes) work? should be a parallel reduce, i guess
872 
873  // we have dofcols and dofids from Stage1dVectorFunctor
874  LO numNodeCols = 0;
875  typename row_map_type::non_const_type rows("nnz_nodemap", numNodes + 1);
876  typename boundary_nodes_type::non_const_type bndNodes("boundaryNodes", numNodes);
877 
878  CoalesceDrop_Kokkos_Details::Stage1bcVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize), decltype(dofcols), decltype(colTranslation), decltype(singleEntryRows), decltype(bndNodes), bool> stage1bcFunctor(kokkosMatrix, dofNnz, blkPartSize, dofcols, colTranslation, rows, singleEntryRows, bndNodes, pL.get<bool>("aggregation: greedy Dirichlet"));
879  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1bcFunctor,numNodeCols);
880 
881  // parallel_scan (exclusive)
883  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanNodeFunctor);
884 
885  // create column node view
886  typename entries_type::non_const_type cols("nodecols", numNodeCols);
887 
888 
890  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1dFunctor);
891  kokkos_graph_type kokkosGraph(cols, rows);
892 
893  // create LW graph
894  graph = rcp(new LWGraph_kokkos(kokkosGraph, uniqueMap, nonUniqueMap, "amalgamated graph of A"));
895 
896  boundaryNodes = bndNodes;
897  graph->getLocalLWGraph().SetBoundaryNodeMap(boundaryNodes);
898  numTotal = A->getLocalNumEntries();
899 
900  dofsPerNode = blkSize;
901 
902  filteredA = A;
903 
904  } else {
905  TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "MueLu: CoalesceDropFactory_kokkos: Block filtering is not implemented");
906  }
907 
908  if (GetVerbLevel() & Statistics1) {
909  GO numLocalBoundaryNodes = 0;
910  GO numGlobalBoundaryNodes = 0;
911 
912  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:bnd", range_type(0, boundaryNodes.extent(0)),
913  KOKKOS_LAMBDA(const LO i, GO& n) {
914  if (boundaryNodes(i))
915  n++;
916  }, numLocalBoundaryNodes);
917 
918  auto comm = A->getRowMap()->getComm();
919  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
920  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
921  }
922 
923  if ((GetVerbLevel() & Statistics1) && threshold != zero) {
924  auto comm = A->getRowMap()->getComm();
925 
926  GO numGlobalTotal, numGlobalDropped;
927  MueLu_sumAll(comm, numTotal, numGlobalTotal);
928  MueLu_sumAll(comm, numDropped, numGlobalDropped);
929 
930  if (numGlobalTotal != 0) {
931  GetOStream(Statistics1) << "Number of dropped entries: "
932  << numGlobalDropped << "/" << numGlobalTotal
933  << " (" << 100*Teuchos::as<double>(numGlobalDropped)/Teuchos::as<double>(numGlobalTotal) << "%)" << std::endl;
934  }
935  }
936 
937  Set(currentLevel, "DofsPerNode", dofsPerNode);
938  Set(currentLevel, "Graph", graph);
939  Set(currentLevel, "A", filteredA);
940  }
941 }
942 #endif // MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
KOKKOS_INLINE_FUNCTION void operator()(const LO row, LO &nnz) const
#define MueLu_sumAll(rcpComm, in, out)
MueLu::DefaultLocalOrdinal LocalOrdinal
Lightweight MueLu representation of a compressed row storage graph.
Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_)
ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_, typename rows_type::non_const_type rows_, typename cols_type::non_const_type colsAux_, typename vals_type::non_const_type valsAux_, bool reuseGraph_, bool lumping_, SC, bool aggregationMayCreateDirichlet_)
Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_)
KOKKOS_INLINE_FUNCTION void operator()(const LO row, LO &totalnnz) const
Timer to be used in factories. Similar to Monitor but with additional timers.
ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold)
Print more statistics.
One-liner description of what is happening.
Namespace for MueLu classes and methods.
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:99
KOKKOS_INLINE_FUNCTION magnitudeType distance2(LO row, LO col) const
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold)
KOKKOS_INLINE_FUNCTION void operator()(const LO i, LO &upd, const bool &final) const
static RCP< Vector > GetMatrixOverlappedDiagonal(const Matrix &A)
Extract Overlapped Matrix Diagonal.
Stage1bcVectorFunctor(MatrixType kokkosMatrix_, NnzType coldofnnz_, blkSizeType blkSize_, ColDofType coldofs_, Dof2NodeTranslationType dof2node_, NnzType colnodennz_, BdryNodeTypeConst dirichletdof_, BdryNodeType bdrynode_, boolType usegreedydirichlet_)
KOKKOS_INLINE_FUNCTION void operator()(const LO rowNode, LO &nnz) const
static Kokkos::View< bool *, typename NO::device_type > DetectDirichletRows(const Matrix &A, const Magnitude &tol=Teuchos::ScalarTraits< typename Teuchos::ScalarTraits< SC >::magnitudeType >::zero(), const bool count_twos_as_dirichlet=false)
Detect Dirichlet rows.
KOKKOS_FORCEINLINE_FUNCTION bool operator()(LO row, LO col, SC val) const
#define SET_VALID_ENTRY(name)
Factory for creating a graph based on a given matrix.
Exception throws to report errors in the internal logical of the program.
Description of what is happening (more verbose)