Ifpack2 Templated Preconditioning Package  Version 1.0
Ifpack2_BlockTriDiContainer_impl.hpp
1 /*@HEADER
2 // ***********************************************************************
3 //
4 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ***********************************************************************
40 //@HEADER
41 */
42 
43 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
44 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
45 
46 #include <Teuchos_Details_MpiTypeTraits.hpp>
47 
48 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
49 #include <Tpetra_Distributor.hpp>
50 #include <Tpetra_BlockMultiVector.hpp>
51 
52 #include <Kokkos_ArithTraits.hpp>
53 #include <KokkosBatched_Util.hpp>
54 #include <KokkosBatched_Vector.hpp>
55 #include <KokkosBatched_Copy_Decl.hpp>
56 #include <KokkosBatched_Copy_Impl.hpp>
57 #include <KokkosBatched_AddRadial_Decl.hpp>
58 #include <KokkosBatched_AddRadial_Impl.hpp>
59 #include <KokkosBatched_SetIdentity_Decl.hpp>
60 #include <KokkosBatched_SetIdentity_Impl.hpp>
61 #include <KokkosBatched_Gemm_Decl.hpp>
62 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
63 #include <KokkosBatched_Gemm_Team_Impl.hpp>
64 #include <KokkosBatched_Gemv_Decl.hpp>
65 #include <KokkosBatched_Gemv_Team_Impl.hpp>
66 #include <KokkosBatched_Trsm_Decl.hpp>
67 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
68 #include <KokkosBatched_Trsm_Team_Impl.hpp>
69 #include <KokkosBatched_Trsv_Decl.hpp>
70 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
71 #include <KokkosBatched_Trsv_Team_Impl.hpp>
72 #include <KokkosBatched_LU_Decl.hpp>
73 #include <KokkosBatched_LU_Serial_Impl.hpp>
74 #include <KokkosBatched_LU_Team_Impl.hpp>
75 
76 #include <KokkosBlas1_nrm1.hpp>
77 #include <KokkosBlas1_nrm2.hpp>
78 
79 #include <memory>
80 
81 // need to interface this into cmake variable (or only use this flag when it is necessary)
82 //#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
83 //#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
84 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
85 #include "cuda_profiler_api.h"
86 #endif
87 
88 // I am not 100% sure about the mpi 3 on cuda
89 #if MPI_VERSION >= 3
90 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
91 #endif
92 
93 // ::: Experiments :::
94 // define either pinned memory or cudamemory for mpi
95 // if both macros are disabled, it will use tpetra memory space which is uvm space for cuda
96 // if defined, this use pinned memory instead of device pointer
97 // by default, we enable pinned memory
98 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
99 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
100 
101 // if defined, all views are allocated on cuda space intead of cuda uvm space
102 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
103 
104 // if defined, btdm_scalar_type is used (if impl_scala_type is double, btdm_scalar_type is float)
105 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
106 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
107 #endif
108 
109 // if defined, it uses multiple execution spaces
110 #define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
111 
112 namespace Ifpack2 {
113 
114  namespace BlockTriDiContainerDetails {
115 
116  namespace KB = KokkosBatched;
117 
121  using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
122 
123  template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
124  using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
125  MemoryTraitsType::is_random_access |
126  flag>;
127 
128  template <typename ViewType>
129  using Unmanaged = Kokkos::View<typename ViewType::data_type,
130  typename ViewType::array_layout,
131  typename ViewType::device_type,
132  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
133  template <typename ViewType>
134  using Atomic = Kokkos::View<typename ViewType::data_type,
135  typename ViewType::array_layout,
136  typename ViewType::device_type,
137  MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
138  template <typename ViewType>
139  using Const = Kokkos::View<typename ViewType::const_data_type,
140  typename ViewType::array_layout,
141  typename ViewType::device_type,
142  typename ViewType::memory_traits>;
143  template <typename ViewType>
144  using ConstUnmanaged = Const<Unmanaged<ViewType> >;
145 
146  template <typename ViewType>
147  using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
148 
149  template <typename ViewType>
150  using Unmanaged = Kokkos::View<typename ViewType::data_type,
151  typename ViewType::array_layout,
152  typename ViewType::device_type,
153  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
154 
155 
156  template <typename ViewType>
157  using Scratch = Kokkos::View<typename ViewType::data_type,
158  typename ViewType::array_layout,
159  typename ViewType::execution_space::scratch_memory_space,
160  MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
161 
165  template<typename LayoutType> struct TpetraLittleBlock;
166  template<> struct TpetraLittleBlock<Kokkos::LayoutLeft> {
167  template<typename T> KOKKOS_INLINE_FUNCTION
168  static T getFlatIndex(const T i, const T j, const T blksize) { return i+j*blksize; }
169  };
170  template<> struct TpetraLittleBlock<Kokkos::LayoutRight> {
171  template<typename T> KOKKOS_INLINE_FUNCTION
172  static T getFlatIndex(const T i, const T j, const T blksize) { return i*blksize+j; }
173  };
174 
178  template<typename T> struct BlockTridiagScalarType { typedef T type; };
179 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
180  template<> struct BlockTridiagScalarType<double> { typedef float type; };
181  //template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
182 #endif
183 
187  template<typename T> struct is_cuda { enum : bool { value = false }; };
188 #if defined(KOKKOS_ENABLE_CUDA)
189  template<> struct is_cuda<Kokkos::Cuda> { enum : bool { value = true }; };
190 #endif
191 
195  template<typename T> struct is_hip { enum : bool { value = false }; };
196 #if defined(KOKKOS_ENABLE_HIP)
197  template<> struct is_hip<Kokkos::Experimental::HIP> { enum : bool { value = true }; };
198 #endif
199 
203  template<typename T>
205  static void createInstance(T &exec_instance) {
206  exec_instance = T();
207  }
208 #if defined(KOKKOS_ENABLE_CUDA)
209  static void createInstance(const cudaStream_t &s, T &exec_instance) {
210  exec_instance = T();
211  }
212 #endif
213  };
214 
215 #if defined(KOKKOS_ENABLE_CUDA)
216  template<>
217  struct ExecutionSpaceFactory<Kokkos::Cuda> {
218  static void createInstance(Kokkos::Cuda &exec_instance) {
219  exec_instance = Kokkos::Cuda();
220  }
221  static void createInstance(const cudaStream_t &s, Kokkos::Cuda &exec_instance) {
222  exec_instance = Kokkos::Cuda(s);
223  }
224  };
225 #endif
226 
227 #if defined(KOKKOS_ENABLE_HIP)
228  template<>
229  struct ExecutionSpaceFactory<Kokkos::Experimental::HIP> {
230  static void createInstance(Kokkos::Experimental::HIP &exec_instance) {
231  exec_instance = Kokkos::Experimental::HIP();
232  }
233  };
234 #endif
235 
239  template<typename CommPtrType>
240  std::string get_msg_prefix (const CommPtrType &comm) {
241  const auto rank = comm->getRank();
242  const auto nranks = comm->getSize();
243  std::stringstream ss;
244  ss << "Rank " << rank << " of " << nranks << ": ";
245  return ss.str();
246  }
247 
251  template<typename T, int N>
252  struct ArrayValueType {
253  T v[N];
254  KOKKOS_INLINE_FUNCTION
255  ArrayValueType() {
256  for (int i=0;i<N;++i)
257  this->v[i] = 0;
258  }
259  KOKKOS_INLINE_FUNCTION
260  ArrayValueType(const ArrayValueType &b) {
261  for (int i=0;i<N;++i)
262  this->v[i] = b.v[i];
263  }
264  };
265  template<typename T, int N>
266  static
267  KOKKOS_INLINE_FUNCTION
268  void
269  operator+=(ArrayValueType<T,N> &a,
270  const ArrayValueType<T,N> &b) {
271  for (int i=0;i<N;++i)
272  a.v[i] += b.v[i];
273  }
274 
278  template<typename T, int N, typename ExecSpace>
279  struct SumReducer {
280  typedef SumReducer reducer;
282  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
283  value_type *value;
284 
285  KOKKOS_INLINE_FUNCTION
286  SumReducer(value_type &val) : value(&val) {}
287 
288  KOKKOS_INLINE_FUNCTION
289  void join(value_type &dst, value_type const &src) const {
290  for (int i=0;i<N;++i)
291  dst.v[i] += src.v[i];
292  }
293  KOKKOS_INLINE_FUNCTION
294  void init(value_type &val) const {
295  for (int i=0;i<N;++i)
296  val.v[i] = Kokkos::reduction_identity<T>::sum();
297  }
298  KOKKOS_INLINE_FUNCTION
299  value_type& reference() {
300  return *value;
301  }
302  KOKKOS_INLINE_FUNCTION
303  result_view_type view() const {
304  return result_view_type(value);
305  }
306  };
307 
308 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
309 #define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
310 #else
311 #define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
312 #endif
313 
314 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
315 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
316  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
317 
318 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
319  { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
320 #else
321 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
323 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
324 #endif
325 
329  template <typename MatrixType>
330  struct ImplType {
334  typedef size_t size_type;
335  typedef typename MatrixType::scalar_type scalar_type;
336  typedef typename MatrixType::local_ordinal_type local_ordinal_type;
337  typedef typename MatrixType::global_ordinal_type global_ordinal_type;
338  typedef typename MatrixType::node_type node_type;
339 
343  typedef typename Kokkos::Details::ArithTraits<scalar_type>::val_type impl_scalar_type;
344  typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;
345 
346  typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
347  typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;
348 
352  typedef Kokkos::DefaultHostExecutionSpace host_execution_space;
353 
357  typedef typename node_type::device_type node_device_type;
358  typedef typename node_device_type::execution_space node_execution_space;
359  typedef typename node_device_type::memory_space node_memory_space;
360 
361 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE)
362  typedef node_execution_space execution_space;
364  typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
365  Kokkos::CudaSpace,
366  node_memory_space>::type memory_space;
367  typedef Kokkos::Device<execution_space,memory_space> device_type;
368 #else
369  typedef node_device_type device_type;
370  typedef node_execution_space execution_space;
371  typedef node_memory_space memory_space;
372 #endif
373 
374  typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
375  typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
376  typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
377  typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
378  typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
379  typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
380  typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
381  typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type;
382 
386  template<typename T, int l> using Vector = KB::Vector<T,l>;
387  template<typename T> using SIMD = KB::SIMD<T>;
388  template<typename T, typename M> using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
389  template<typename T, typename M> using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;
390 
391  static constexpr int vector_length = DefaultVectorLength<btdm_scalar_type,memory_space>::value;
392  static constexpr int internal_vector_length = DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
393  typedef Vector<SIMD<btdm_scalar_type>,vector_length> vector_type;
394  typedef Vector<SIMD<btdm_scalar_type>,internal_vector_length> internal_vector_type;
395 
399  typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
400  typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
401  // tpetra block crs values
402  typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
403  typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
404 
405  // tpetra multivector values (layout left): may need to change the typename more explicitly
406  typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
407  typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
408 
409  // packed data always use layout right
410  typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
411  typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
412  typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
413  typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
414  typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
415  typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
416  };
417 
421  template<typename MatrixType>
422  typename Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
423  createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
424  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
425  using impl_type = ImplType<MatrixType>;
426  using tpetra_map_type = typename impl_type::tpetra_map_type;
427  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
428  using tpetra_import_type = typename impl_type::tpetra_import_type;
429 
430  const auto g = A->getCrsGraph(); // tpetra crs graph object
431  const auto blocksize = A->getBlockSize();
432  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
433  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
434 
435  return Teuchos::rcp(new tpetra_import_type(src, tgt));
436  }
437 
438  // Partial replacement for forward-mode MultiVector::doImport.
439  // Permits overlapped communication and computation, but also supports sync'ed.
440  // I'm finding that overlapped comm/comp can give quite poor performance on some
441  // platforms, so we can't just use it straightforwardly always.
442 
443  template<typename MatrixType>
444  struct AsyncableImport {
445  public:
446  using impl_type = ImplType<MatrixType>;
447 
448  private:
452 #if !defined(HAVE_IFPACK2_MPI)
453  typedef int MPI_Request;
454  typedef int MPI_Comm;
455 #endif
456  using scalar_type = typename impl_type::scalar_type;
459 
460  static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
461 #ifdef HAVE_IFPACK2_MPI
462  MPI_Request ureq;
463  int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
464  if (ireq == NULL) MPI_Request_free(&ureq);
465  return ret;
466 #else
467  return 0;
468 #endif
469  }
470 
471  static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
472 #ifdef HAVE_IFPACK2_MPI
473  MPI_Request ureq;
474  int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
475  if (ireq == NULL) MPI_Request_free(&ureq);
476  return ret;
477 #else
478  return 0;
479 #endif
480  }
481 
482  static int waitany(int count, MPI_Request* reqs, int* index) {
483 #ifdef HAVE_IFPACK2_MPI
484  return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
485 #else
486  return 0;
487 #endif
488  }
489 
490  static int waitall(int count, MPI_Request* reqs) {
491 #ifdef HAVE_IFPACK2_MPI
492  return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
493 #else
494  return 0;
495 #endif
496  }
497 
498  public:
499  using tpetra_map_type = typename impl_type::tpetra_map_type;
500  using tpetra_import_type = typename impl_type::tpetra_import_type;
501 
502  using local_ordinal_type = typename impl_type::local_ordinal_type;
503  using global_ordinal_type = typename impl_type::global_ordinal_type;
504  using size_type = typename impl_type::size_type;
505  using impl_scalar_type = typename impl_type::impl_scalar_type;
506 
507  using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
508  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
509 
510  using execution_space = typename impl_type::execution_space;
511  using memory_space = typename impl_type::memory_space;
512  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
513  using size_type_1d_view = typename impl_type::size_type_1d_view;
514  using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
515 
516 #if defined(KOKKOS_ENABLE_CUDA)
517  using impl_scalar_type_1d_view =
518  typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
519 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
520  Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
521 # elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
522  Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
523 # else // no experimental macros are defined
524  typename impl_type::impl_scalar_type_1d_view,
525 # endif
526  typename impl_type::impl_scalar_type_1d_view>::type;
527 #else
528  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
529 #endif
530  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
531  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
532 
533 #ifdef HAVE_IFPACK2_MPI
534  MPI_Comm comm;
535 #endif
536 
537  impl_scalar_type_2d_view_tpetra remote_multivector;
538  local_ordinal_type blocksize;
539 
540  template<typename T>
541  struct SendRecvPair {
542  T send, recv;
543  };
544 
545  // (s)end and (r)eceive data:
546  SendRecvPair<int_1d_view_host> pids; // mpi ranks
547  SendRecvPair<std::vector<MPI_Request> > reqs; // MPI_Request is pointer, cannot use kokkos view
548  SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
549  SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
550  SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
551  SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
552 
553  local_ordinal_type_1d_view dm2cm; // permutation
554 
555 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
556  using exec_instance_1d_std_vector = std::vector<execution_space>;
557  exec_instance_1d_std_vector exec_instances;
558 #endif
559 
560  // for cuda
561  public:
562  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
563  const size_type_1d_view &offs) {
564  // wrap lens to kokkos view and deep copy to device
565  Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
566  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
567 
568  // exclusive scan
569  const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
570  const local_ordinal_type lens_size = lens_device.extent(0);
571  Kokkos::parallel_scan
572  ("AsyncableImport::RangePolicy::setOffsetValues",
573  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
574  if (final)
575  offs(i) = update;
576  update += (i < lens_size ? lens_device[i] : 0);
577  });
578  }
579 
580  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
581  const size_type_1d_view_host &offs) {
582  // wrap lens to kokkos view and deep copy to device
583  Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
584  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
585 
586  // exclusive scan
587  offs(0) = 0;
588  for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
589  offs(i) = offs(i-1) + lens[i-1];
590  }
591  }
592 
593  private:
594  void createMpiRequests(const tpetra_import_type &import) {
595  Tpetra::Distributor &distributor = import.getDistributor();
596 
597  // copy pids from distributor
598  const auto pids_from = distributor.getProcsFrom();
599  pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
600  memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
601 
602  const auto pids_to = distributor.getProcsTo();
603  pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
604  memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));
605 
606  // mpi requests
607  reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
608  reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
609 
610  // construct offsets
611 #if 0
612  const auto lengths_to = distributor.getLengthsTo();
613  offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
614 
615  const auto lengths_from = distributor.getLengthsFrom();
616  offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
617 
618  setOffsetValues(lengths_to, offset.send);
619  offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
620 
621  setOffsetValues(lengths_from, offset.recv);
622  offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
623 #else
624  const auto lengths_to = distributor.getLengthsTo();
625  offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
626 
627  const auto lengths_from = distributor.getLengthsFrom();
628  offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
629 
630  setOffsetValuesHost(lengths_to, offset_host.send);
631  //offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
632 
633  setOffsetValuesHost(lengths_from, offset_host.recv);
634  //offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
635 #endif
636  }
637 
638  void createSendRecvIDs(const tpetra_import_type &import) {
639  // For each remote PID, the list of LIDs to receive.
640  const auto remote_lids = import.getRemoteLIDs();
641  const local_ordinal_type_1d_view_host
642  remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
643  lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
644  Kokkos::deep_copy(lids.recv, remote_lids_view_host);
645 
646  // For each export PID, the list of LIDs to send.
647  auto epids = import.getExportPIDs();
648  auto elids = import.getExportLIDs();
649  TEUCHOS_ASSERT(epids.size() == elids.size());
650  lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
651  auto lids_send_host = Kokkos::create_mirror_view(lids.send);
652 
653  // naive search (not sure if pids or epids are sorted)
654  for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
655  const auto pid_send_value = pids.send[i];
656  for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
657  if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
658 #if !defined(__HIP_DEVICE_COMPILE__) && !defined(__CUDA_ARCH__)
659  TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
660 #endif
661  }
662  Kokkos::deep_copy(lids.send, lids_send_host);
663  }
664 
665  void createExecutionSpaceInstances() {
666 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
667  //The following line creates 8 streams:
668  exec_instances =
669  Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
670 #endif
671  }
672 
673  public:
674  // for cuda, all tag types are public
675  struct ToBuffer {};
676  struct ToMultiVector {};
677 
678  AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
679  const Teuchos::RCP<const tpetra_map_type>& tgt_map,
680  const local_ordinal_type blocksize_,
681  const local_ordinal_type_1d_view dm2cm_) {
682  blocksize = blocksize_;
683  dm2cm = dm2cm_;
684 
685 #ifdef HAVE_IFPACK2_MPI
686  comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
687 #endif
688  const tpetra_import_type import(src_map, tgt_map);
689 
690  createMpiRequests(import);
691  createSendRecvIDs(import);
692  createExecutionSpaceInstances();
693  }
694 
695  void createDataBuffer(const local_ordinal_type &num_vectors) {
696  const size_type extent_0 = lids.recv.extent(0)*blocksize;
697  const size_type extent_1 = num_vectors;
698  if (remote_multivector.extent(0) == extent_0 &&
699  remote_multivector.extent(1) == extent_1) {
700  // skip
701  } else {
702  remote_multivector =
703  impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
704 
705  const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
706  const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
707 
708  buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
709  buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
710  }
711  }
712 
713  void cancel () {
714 #ifdef HAVE_IFPACK2_MPI
715  waitall(reqs.recv.size(), reqs.recv.data());
716  waitall(reqs.send.size(), reqs.send.data());
717 #endif
718  }
719 
720  // ======================================================================
721  // Async version using cuda stream
722  // - cuda only with kokkos develop branch
723  // ======================================================================
724 
725 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
726  template<typename PackTag>
727  static
728  void copy(const local_ordinal_type_1d_view &lids_,
729  const impl_scalar_type_1d_view &buffer_,
730  const local_ordinal_type ibeg_,
731  const local_ordinal_type iend_,
732  const impl_scalar_type_2d_view_tpetra &multivector_,
733  const local_ordinal_type blocksize_,
734  const execution_space &exec_instance_) {
735  const local_ordinal_type num_vectors = multivector_.extent(1);
736  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
737  const local_ordinal_type idiff = iend_ - ibeg_;
738  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
739 
740  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
741  local_ordinal_type vector_size(0);
742  if (blocksize_ <= 4) vector_size = 4;
743  else if (blocksize_ <= 8) vector_size = 8;
744  else if (blocksize_ <= 16) vector_size = 16;
745  else vector_size = 32;
746 
747  const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
748  const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
749  Kokkos::parallel_for
750  (//"AsyncableImport::TeamPolicy::copyViaCudaStream",
751  Kokkos::Experimental::require(policy, work_item_property),
752  KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
753  const local_ordinal_type i = member.league_rank();
754  Kokkos::parallel_for
755  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
756  auto aptr = abase + blocksize_*(i + idiff*j);
757  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
758  if (std::is_same<PackTag,ToBuffer>::value)
759  Kokkos::parallel_for
760  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
761  aptr[k] = bptr[k];
762  });
763  else
764  Kokkos::parallel_for
765  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
766  bptr[k] = aptr[k];
767  });
768  });
769  });
770  }
771 
772  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
773  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
774 
775 #ifdef HAVE_IFPACK2_MPI
776  // constants and reallocate data buffers if necessary
777  const local_ordinal_type num_vectors = mv.extent(1);
778  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
779 
780  // 0. post receive async
781  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
782  irecv(comm,
783  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
784  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
785  pids.recv[i],
786  42,
787  &reqs.recv[i]);
788  }
789 
791  execution_space().fence();
792 
793  // 1. async memcpy
794  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
795  // 1.0. enqueue pack buffer
796  if (i<8) exec_instances[i%8].fence();
797  copy<ToBuffer>(lids.send, buffer.send,
798  offset_host.send(i), offset_host.send(i+1),
799  mv, blocksize,
800  //execution_space());
801  exec_instances[i%8]);
802 
803  }
805  //execution_space().fence();
806  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
807  // 1.1. sync the stream and isend
808  if (i<8) exec_instances[i%8].fence();
809  isend(comm,
810  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
811  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
812  pids.send[i],
813  42,
814  &reqs.send[i]);
815  }
816 
817  // 2. poke communication
818  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
819  int flag;
820  MPI_Status stat;
821  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
822  }
823 #endif // HAVE_IFPACK2_MPI
824  }
825 
826  void syncRecvVar1() {
827  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
828 #ifdef HAVE_IFPACK2_MPI
829  // 0. wait for receive async.
830  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
831  local_ordinal_type idx = i;
832 
833  // 0.0. wait any
834  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
835 
836  // 0.1. unpack data after data is moved into a device
837  copy<ToMultiVector>(lids.recv, buffer.recv,
838  offset_host.recv(idx), offset_host.recv(idx+1),
839  remote_multivector, blocksize,
840  exec_instances[idx%8]);
841  }
842 
843  // 1. fire up all cuda events
844  Kokkos::fence();
845 
846  // 2. cleanup all open comm
847  waitall(reqs.send.size(), reqs.send.data());
848 #endif // HAVE_IFPACK2_MPI
849  }
850 #endif //defined(KOKKOS_ENABLE_CUDA)
851 
852  // ======================================================================
853  // Generic version without using cuda stream
854  // - only difference between cuda and host architecture is on using team
855  // or range policies.
856  // ======================================================================
857  template<typename PackTag>
858  static
859  void copy(const local_ordinal_type_1d_view &lids_,
860  const impl_scalar_type_1d_view &buffer_,
861  const local_ordinal_type &ibeg_,
862  const local_ordinal_type &iend_,
863  const impl_scalar_type_2d_view_tpetra &multivector_,
864  const local_ordinal_type blocksize_) {
865  const local_ordinal_type num_vectors = multivector_.extent(1);
866  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
867  const local_ordinal_type idiff = iend_ - ibeg_;
868  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
869  if (is_cuda<execution_space>::value || is_hip<execution_space>::value) {
870 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
871  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
872  local_ordinal_type vector_size(0);
873  if (blocksize_ <= 4) vector_size = 4;
874  else if (blocksize_ <= 8) vector_size = 8;
875  else if (blocksize_ <= 16) vector_size = 16;
876  else vector_size = 32;
877  const team_policy_type policy(idiff, 1, vector_size);
878  Kokkos::parallel_for
879  ("AsyncableImport::TeamPolicy::copy",
880  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
881  const local_ordinal_type i = member.league_rank();
882  Kokkos::parallel_for
883  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
884  auto aptr = abase + blocksize_*(i + idiff*j);
885  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
886  if (std::is_same<PackTag,ToBuffer>::value)
887  Kokkos::parallel_for
888  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
889  aptr[k] = bptr[k];
890  });
891  else
892  Kokkos::parallel_for
893  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
894  bptr[k] = aptr[k];
895  });
896  });
897  });
898 #endif
899  } else {
900 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
901  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
902 #else
903  {
904  const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
905  Kokkos::parallel_for
906  ("AsyncableImport::RangePolicy::copy",
907  policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
908  const local_ordinal_type i = ij%idiff;
909  const local_ordinal_type j = ij/idiff;
910  auto aptr = abase + blocksize_*(i + idiff*j);
911  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
912  auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
913  auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
914  memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
915  });
916  }
917 #endif
918  }
919  }
920 
921 
925  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
926  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
927 
928 #ifdef HAVE_IFPACK2_MPI
929  // constants and reallocate data buffers if necessary
930  const local_ordinal_type num_vectors = mv.extent(1);
931  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
932 
933  // receive async
934  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
935  irecv(comm,
936  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
937  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
938  pids.recv[i],
939  42,
940  &reqs.recv[i]);
941  }
942 
943  // send async
944  for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
945  copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
946  mv, blocksize);
947  Kokkos::fence();
948  isend(comm,
949  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
950  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
951  pids.send[i],
952  42,
953  &reqs.send[i]);
954  }
955 
956  // I find that issuing an Iprobe seems to nudge some MPIs into action,
957  // which helps with overlapped comm/comp performance.
958  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
959  int flag;
960  MPI_Status stat;
961  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
962  }
963 #endif
964  }
965 
966  void syncRecvVar0() {
967  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
968 #ifdef HAVE_IFPACK2_MPI
969  // receive async.
970  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
971  local_ordinal_type idx = i;
972  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
973  copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
974  remote_multivector, blocksize);
975  }
976  // wait on the sends to match all Isends with a cleanup operation.
977  waitall(reqs.send.size(), reqs.send.data());
978 #endif
979  }
980 
984  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
985 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
986 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
987  asyncSendRecvVar1(mv);
988 #else
989  asyncSendRecvVar0(mv);
990 #endif
991 #else
992  asyncSendRecvVar0(mv);
993 #endif
994  }
995  void syncRecv() {
996 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
997 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
998  syncRecvVar1();
999 #else
1000  syncRecvVar0();
1001 #endif
1002 #else
1003  syncRecvVar0();
1004 #endif
1005  }
1006 
1007  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
1008  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
1009  asyncSendRecv(mv);
1010  syncRecv();
1011  }
1012 
1013  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
1014  };
1015 
1019  template<typename MatrixType>
1020  Teuchos::RCP<AsyncableImport<MatrixType> >
1021  createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
1022  using impl_type = ImplType<MatrixType>;
1023  using tpetra_map_type = typename impl_type::tpetra_map_type;
1024  using local_ordinal_type = typename impl_type::local_ordinal_type;
1025  using global_ordinal_type = typename impl_type::global_ordinal_type;
1026  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1027 
1028  const auto g = A->getCrsGraph(); // tpetra crs graph object
1029  const auto blocksize = A->getBlockSize();
1030  const auto domain_map = g.getDomainMap();
1031  const auto column_map = g.getColMap();
1032 
1033  std::vector<global_ordinal_type> gids;
1034  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
1035  for (size_t i=0;i<column_map->getLocalNumElements();++i) {
1036  const global_ordinal_type gid = column_map->getGlobalElement(i);
1037  if (!domain_map->isNodeGlobalElement(gid)) {
1038  found_first = true;
1039  gids.push_back(gid);
1040  } else if (found_first) {
1041  separate_remotes = false;
1042  break;
1043  }
1044  if (!need_owned_permutation &&
1045  domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
1046  // The owned part of the domain and column maps are different
1047  // orderings. We *could* do a super efficient impl of this case in the
1048  // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
1049  // really, if a caller cares about speed, they wouldn't make different
1050  // local permutations like this. So we punt on the best impl and go for
1051  // a pretty good one: the permutation is done in place in
1052  // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
1053  // is the presumably worse memory access pattern of the input vector.
1054  need_owned_permutation = true;
1055  }
1056  }
1057 
1058  if (separate_remotes) {
1059  const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
1060  const auto parsimonious_col_map
1061  = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
1062  if (parsimonious_col_map->getGlobalNumElements() > 0) {
1063  // make the importer only if needed.
1064  local_ordinal_type_1d_view dm2cm;
1065  if (need_owned_permutation) {
1066  dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
1067  const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
1068  for (size_t i=0;i<domain_map->getLocalNumElements();++i)
1069  dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
1070  Kokkos::deep_copy(dm2cm, dm2cm_host);
1071  }
1072  return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
1073  }
1074  }
1075  return Teuchos::null;
1076  }
1077 
1078  template<typename MatrixType>
1079  struct PartInterface {
1080  using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
1081  using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;
1082 
1083  PartInterface() = default;
1084  PartInterface(const PartInterface &b) = default;
1085 
1086  // Some terms:
1087  // The matrix A is split as A = D + R, where D is the matrix of tridiag
1088  // blocks and R is the remainder.
1089  // A part is roughly a synonym for a tridiag. The distinction is that a part
1090  // is the set of rows belonging to one tridiag and, equivalently, the off-diag
1091  // rows in R associated with that tridiag. In contrast, the term tridiag is
1092  // used to refer specifically to tridiag data, such as the pointer into the
1093  // tridiag data array.
1094  // Local (lcl) row arge the LIDs. lclrow lists the LIDs belonging to each
1095  // tridiag, and partptr points to the beginning of each tridiag. This is the
1096  // LID space.
1097  // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
1098  // by this ordinal. This is the 'index' space.
1099  // A flat index is the mathematical index into an array. A pack index
1100  // accounts for SIMD packing.
1101 
1102  // Local row LIDs. Permutation from caller's index space to tridiag index
1103  // space.
1104  local_ordinal_type_1d_view lclrow;
1105  // partptr_ is the pointer array into lclrow_.
1106  local_ordinal_type_1d_view partptr; // np+1
1107  // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
1108  // is the start of the i'th pack.
1109  local_ordinal_type_1d_view packptr; // npack+1
1110  // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
1111  // an alias of partptr_ in the case of no overlap.
1112  local_ordinal_type_1d_view part2rowidx0; // np+1
1113  // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
1114  // it's the same as part2rowidx0_; if it's > 1, then the value is combined
1115  // with i % vector_length to get the location in the packed data.
1116  local_ordinal_type_1d_view part2packrowidx0; // np+1
1117  local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
1118  // rowidx2part_ maps the row index to the part index.
1119  local_ordinal_type_1d_view rowidx2part; // nr
1120  // True if lcl{row|col} is at most a constant away from row{idx|col}. In
1121  // practice, this knowledge is not particularly useful, as packing for batched
1122  // processing is done at the same time as the permutation from LID to index
1123  // space. But it's easy to detect, so it's recorded in case an optimization
1124  // can be made based on it.
1125  bool row_contiguous;
1126 
1127  local_ordinal_type max_partsz;
1128  };
1129 
1133  template<typename MatrixType>
1134  PartInterface<MatrixType>
1135  createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1136  const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
1137  using impl_type = ImplType<MatrixType>;
1138  using local_ordinal_type = typename impl_type::local_ordinal_type;
1139  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1140 
1141  constexpr int vector_length = impl_type::vector_length;
1142 
1143  const auto comm = A->getRowMap()->getComm();
1144 
1145  PartInterface<MatrixType> interf;
1146 
1147  const bool jacobi = partitions.size() == 0;
1148  const local_ordinal_type A_n_lclrows = A->getLocalNumRows();
1149  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1150 
1151 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1152  local_ordinal_type nrows = 0;
1153  if (jacobi)
1154  nrows = nparts;
1155  else
1156  for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1157 
1158  TEUCHOS_TEST_FOR_EXCEPT_MSG
1159  (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1160  << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
1161 #endif
1162 
1163  // permutation vector
1164  std::vector<local_ordinal_type> p;
1165  if (jacobi) {
1166  interf.max_partsz = 1;
1167  } else {
1168  // reorder parts to maximize simd packing efficiency
1169  p.resize(nparts);
1170 
1171  typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1172  std::vector<size_idx_pair_type> partsz(nparts);
1173  for (local_ordinal_type i=0;i<nparts;++i)
1174  partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1175  std::sort(partsz.begin(), partsz.end(),
1176  [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
1177  return x.first > y.first;
1178  });
1179  for (local_ordinal_type i=0;i<nparts;++i)
1180  p[i] = partsz[i].second;
1181 
1182  interf.max_partsz = partsz[0].first;
1183  }
1184 
1185  // allocate parts
1186  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1187  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1188  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1189  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1190  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1191 
1192  // mirror to host and compute on host execution space
1193  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1194  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1195  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1196  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1197  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1198 
1199  // Determine parts.
1200  interf.row_contiguous = true;
1201  partptr(0) = 0;
1202  part2rowidx0(0) = 0;
1203  part2packrowidx0(0) = 0;
1204  local_ordinal_type pack_nrows = 0;
1205  if (jacobi) {
1206  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1207  const local_ordinal_type ipnrows = 1;
1208  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1209  get_msg_prefix(comm)
1210  << "partition " << p[ip]
1211  << " is empty, which is not allowed.");
1212  //assume No overlap.
1213  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1214  // Since parts are ordered in nonincreasing size, the size of the first
1215  // part in a pack is the size for all parts in the pack.
1216  if (ip % vector_length == 0) pack_nrows = ipnrows;
1217  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1218  const local_ordinal_type os = partptr(ip);
1219  for (local_ordinal_type i=0;i<ipnrows;++i) {
1220  const auto lcl_row = ip;
1221  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1222  get_msg_prefix(comm)
1223  << "partitions[" << p[ip] << "]["
1224  << i << "] = " << lcl_row
1225  << " but input matrix implies limits of [0, " << A_n_lclrows-1
1226  << "].");
1227  lclrow(os+i) = lcl_row;
1228  rowidx2part(os+i) = ip;
1229  if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
1230  interf.row_contiguous = false;
1231  }
1232  partptr(ip+1) = os + ipnrows;
1233  }
1234  } else {
1235  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1236  const auto* part = &partitions[p[ip]];
1237  const local_ordinal_type ipnrows = part->size();
1238  TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1239  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1240  get_msg_prefix(comm)
1241  << "partition " << p[ip]
1242  << " is empty, which is not allowed.");
1243  //assume No overlap.
1244  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1245  // Since parts are ordered in nonincreasing size, the size of the first
1246  // part in a pack is the size for all parts in the pack.
1247  if (ip % vector_length == 0) pack_nrows = ipnrows;
1248  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1249  const local_ordinal_type os = partptr(ip);
1250  for (local_ordinal_type i=0;i<ipnrows;++i) {
1251  const auto lcl_row = (*part)[i];
1252  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1253  get_msg_prefix(comm)
1254  << "partitions[" << p[ip] << "]["
1255  << i << "] = " << lcl_row
1256  << " but input matrix implies limits of [0, " << A_n_lclrows-1
1257  << "].");
1258  lclrow(os+i) = lcl_row;
1259  rowidx2part(os+i) = ip;
1260  if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
1261  interf.row_contiguous = false;
1262  }
1263  partptr(ip+1) = os + ipnrows;
1264  }
1265  }
1266 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1267  TEUCHOS_ASSERT(partptr(nparts) == nrows);
1268 #endif
1269  if (lclrow(0) != 0) interf.row_contiguous = false;
1270 
1271  Kokkos::deep_copy(interf.partptr, partptr);
1272  Kokkos::deep_copy(interf.lclrow, lclrow);
1273 
1274  //assume No overlap. Thus:
1275  interf.part2rowidx0 = interf.partptr;
1276  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1277 
1278  interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
1279  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1280 
1281  { // Fill packptr.
1282  local_ordinal_type npacks = 0;
1283  for (local_ordinal_type ip=1;ip<=nparts;++ip)
1284  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1285  ++npacks;
1286  interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1287  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1288  packptr(0) = 0;
1289  for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1290  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1291  packptr(k++) = ip;
1292  Kokkos::deep_copy(interf.packptr, packptr);
1293  }
1294 
1295  return interf;
1296  }
1297 
1301  template <typename MatrixType>
1302  struct BlockTridiags {
1304  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1305  using size_type_1d_view = typename impl_type::size_type_1d_view;
1306  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1307 
1308  // flat_td_ptr(i) is the index into flat-array values of the start of the
1309  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1310  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1311  // vector_length is the position in the pack.
1312  size_type_1d_view flat_td_ptr, pack_td_ptr;
1313  // List of local column indices into A from which to grab
1314  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1315  local_ordinal_type_1d_view A_colindsub;
1316  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1317  // tridiag's pack, and i % vector_length gives the position in the pack.
1318  vector_type_3d_view values;
1319 
1320  bool is_diagonal_only;
1321 
1322  BlockTridiags() = default;
1323  BlockTridiags(const BlockTridiags &b) = default;
1324 
1325  // Index into row-major block of a tridiag.
1326  template <typename idx_type>
1327  static KOKKOS_FORCEINLINE_FUNCTION
1328  idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
1329  // Given a row of a row-major tridiag, return the index of the first block
1330  // in that row.
1331  template <typename idx_type>
1332  static KOKKOS_FORCEINLINE_FUNCTION
1333  idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
1334  // Number of blocks in a tridiag having a given number of rows.
1335  template <typename idx_type>
1336  static KOKKOS_FORCEINLINE_FUNCTION
1337  idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
1338  };
1339 
1340 
1344  template<typename MatrixType>
1346  createBlockTridiags(const PartInterface<MatrixType> &interf) {
1347  using impl_type = ImplType<MatrixType>;
1348  using execution_space = typename impl_type::execution_space;
1349  using local_ordinal_type = typename impl_type::local_ordinal_type;
1350  using size_type = typename impl_type::size_type;
1351  using size_type_1d_view = typename impl_type::size_type_1d_view;
1352 
1353  constexpr int vector_length = impl_type::vector_length;
1354 
1356 
1357  const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;
1358 
1359  { // construct the flat index pointers into the tridiag values array.
1360  btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1);
1361  const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
1362  Kokkos::parallel_scan
1363  ("createBlockTridiags::RangePolicy::flat_td_ptr",
1364  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1365  if (final)
1366  btdm.flat_td_ptr(i) = update;
1367  if (i < ntridiags) {
1368  const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
1369  update += btdm.NumBlocks(nrows);
1370  }
1371  });
1372 
1373  const auto nblocks = Kokkos::create_mirror_view_and_copy
1374  (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
1375  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1376  }
1377 
1378  // And the packed index pointers.
1379  if (vector_length == 1) {
1380  btdm.pack_td_ptr = btdm.flat_td_ptr;
1381  } else {
1382  const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
1383  btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1);
1384  const Kokkos::RangePolicy<execution_space> policy(0,npacks);
1385  Kokkos::parallel_scan
1386  ("createBlockTridiags::RangePolicy::pack_td_ptr",
1387  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1388  const local_ordinal_type parti = interf.packptr(i);
1389  const local_ordinal_type parti_next = interf.packptr(i+1);
1390  if (final) {
1391  const size_type nblks = update;
1392  for (local_ordinal_type pti=parti;pti<parti_next;++pti)
1393  btdm.pack_td_ptr(pti) = nblks;
1394  const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1395  // last one
1396  if (i == npacks-1)
1397  btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
1398  }
1399  {
1400  const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1401  update += btdm.NumBlocks(nrows);
1402  }
1403  });
1404  }
1405 
1406  // values and A_colindsub are created in the symbolic phase
1407 
1408  return btdm;
1409  }
1410 
1411  // Set the tridiags to be I to the full pack block size. That way, if a
1412  // tridiag within a pack is shorter than the longest one, the extra blocks are
1413  // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1414  // in the packed multvector are 0, and the tridiag LU reflects the extra I
1415  // blocks, then the solve proceeds as though the extra blocks aren't
1416  // present. Since this extra work is part of the SIMD calls, it's not actually
1417  // extra work. Instead, it means we don't have to put checks or masks in, or
1418  // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1419  // since the numeric phase fills in only the used entries, leaving these I
1420  // blocks intact.
1421  template<typename MatrixType>
1422  void
1423  setTridiagsToIdentity
1424  (const BlockTridiags<MatrixType>& btdm,
1425  const typename ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1426  {
1427  using impl_type = ImplType<MatrixType>;
1428  using execution_space = typename impl_type::execution_space;
1429  using local_ordinal_type = typename impl_type::local_ordinal_type;
1430  using size_type_1d_view = typename impl_type::size_type_1d_view;
1431 
1432  const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
1433  const local_ordinal_type blocksize = btdm.values.extent(1);
1434 
1435  {
1436  const int vector_length = impl_type::vector_length;
1437  const int internal_vector_length = impl_type::internal_vector_length;
1438 
1439  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1440  using internal_vector_type = typename impl_type::internal_vector_type;
1441  using internal_vector_type_4d_view =
1442  typename impl_type::internal_vector_type_4d_view;
1443 
1444  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1445  const internal_vector_type_4d_view values
1446  (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1447  btdm.values.extent(0),
1448  btdm.values.extent(1),
1449  btdm.values.extent(2),
1450  vector_length/internal_vector_length);
1451  const local_ordinal_type vector_loop_size = values.extent(3);
1452 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1453  local_ordinal_type total_team_size(0);
1454  if (blocksize <= 5) total_team_size = 32;
1455  else if (blocksize <= 9) total_team_size = 64;
1456  else if (blocksize <= 12) total_team_size = 96;
1457  else if (blocksize <= 16) total_team_size = 128;
1458  else if (blocksize <= 20) total_team_size = 160;
1459  else total_team_size = 160;
1460  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1461  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1462 #elif defined(KOKKOS_ENABLE_HIP)
1463  // FIXME: HIP
1464  // These settings might be completely wrong
1465  // will have to do some experiments to decide
1466  // what makes sense on AMD GPUs
1467  local_ordinal_type total_team_size(0);
1468  if (blocksize <= 5) total_team_size = 32;
1469  else if (blocksize <= 9) total_team_size = 64;
1470  else if (blocksize <= 12) total_team_size = 96;
1471  else if (blocksize <= 16) total_team_size = 128;
1472  else if (blocksize <= 20) total_team_size = 160;
1473  else total_team_size = 160;
1474  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1475  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1476 #else // Host architecture: team size is always one
1477  const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1478 #endif
1479  Kokkos::parallel_for
1480  ("setTridiagsToIdentity::TeamPolicy",
1481  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1482  const local_ordinal_type k = member.league_rank();
1483  const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
1484  const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
1485  const local_ordinal_type diff = iend - ibeg;
1486  const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1487  const btdm_scalar_type one(1);
1488  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
1489  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1490  const local_ordinal_type i = ibeg + ii*3;
1491  for (local_ordinal_type j=0;j<blocksize;++j)
1492  values(i,j,j,v) = one;
1493  });
1494  });
1495  });
1496  }
1497  }
1498 
1502  template <typename MatrixType>
1503  struct AmD {
1505  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1506  using size_type_1d_view = typename impl_type::size_type_1d_view;
1507  using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
1508  // rowptr points to the start of each row of A_colindsub.
1509  size_type_1d_view rowptr, rowptr_remote;
1510  // Indices into A's rows giving the blocks to extract. rowptr(i) points to
1511  // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
1512  // where g is A's graph, are the columns AmD uses. If seq_method_, then
1513  // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
1514  // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
1515  // contains the remote ones.
1516  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
1517 
1518  // Currently always true.
1519  bool is_tpetra_block_crs;
1520 
1521  // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
1522  impl_scalar_type_1d_view_tpetra tpetra_values;
1523 
1524  AmD() = default;
1525  AmD(const AmD &b) = default;
1526  };
1527 
1531  template<typename MatrixType>
1532  void
1533  performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1534  const PartInterface<MatrixType> &interf,
1536  AmD<MatrixType> &amd,
1537  const bool overlap_communication_and_computation) {
1538  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase");
1539 
1540  using impl_type = ImplType<MatrixType>;
1541  // using node_memory_space = typename impl_type::node_memory_space;
1542  using host_execution_space = typename impl_type::host_execution_space;
1543 
1544  using local_ordinal_type = typename impl_type::local_ordinal_type;
1545  using global_ordinal_type = typename impl_type::global_ordinal_type;
1546  using size_type = typename impl_type::size_type;
1547  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1548  using size_type_1d_view = typename impl_type::size_type_1d_view;
1549  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1550  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1551 
1552  constexpr int vector_length = impl_type::vector_length;
1553 
1554  const auto comm = A->getRowMap()->getComm();
1555  const auto& g = A->getCrsGraph();
1556  const auto blocksize = A->getBlockSize();
1557 
1558  // mirroring to host
1559  const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1560  const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1561  const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1562  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1563  const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1564 
1565  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1566 
1567  // find column to row map on host
1568  Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getLocalNumCols());
1569  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1570  {
1571  const auto rowmap = g.getRowMap();
1572  const auto colmap = g.getColMap();
1573  const auto dommap = g.getDomainMap();
1574  TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1575 
1576 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
1577  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1578  Kokkos::parallel_for
1579  ("performSymbolicPhase::RangePolicy::col2row",
1580  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1581  const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1582  TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
1583  if (dommap->isNodeGlobalElement(gid)) {
1584  const local_ordinal_type lc = colmap->getLocalElement(gid);
1585 # if defined(BLOCKTRIDICONTAINER_DEBUG)
1586  TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1587  get_msg_prefix(comm) << "GID " << gid
1588  << " gives an invalid local column.");
1589 # endif
1590  col2row(lc) = lr;
1591  }
1592  });
1593 #endif
1594  }
1595 
1596  // construct the D and R graphs in A = D + R.
1597  {
1598  const auto local_graph = g.getLocalGraphHost();
1599  const auto local_graph_rowptr = local_graph.row_map;
1600  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1601  const auto local_graph_colidx = local_graph.entries;
1602 
1603  //assume no overlap.
1604 
1605  Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1606  {
1607  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1608  Kokkos::parallel_for
1609  ("performSymbolicPhase::RangePolicy::lclrow2idx",
1610  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1611  lclrow2idx[lclrow(i)] = i;
1612  });
1613  }
1614 
1615  // count (block) nnzs in D and R.
1616  typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
1617  typename sum_reducer_type::value_type sum_reducer_value;
1618  {
1619  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1620  Kokkos::parallel_reduce
1621  // profiling interface does not work
1622  (//"performSymbolicPhase::RangePolicy::count_nnz",
1623  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1624  // LID -> index.
1625  const local_ordinal_type ri0 = lclrow2idx[lr];
1626  const local_ordinal_type pi0 = rowidx2part(ri0);
1627  for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1628  const local_ordinal_type lc = local_graph_colidx(j);
1629  const local_ordinal_type lc2r = col2row[lc];
1630  bool incr_R = false;
1631  do { // breakable
1632  if (lc2r == (local_ordinal_type) -1) {
1633  incr_R = true;
1634  break;
1635  }
1636  const local_ordinal_type ri = lclrow2idx[lc2r];
1637  const local_ordinal_type pi = rowidx2part(ri);
1638  if (pi != pi0) {
1639  incr_R = true;
1640  break;
1641  }
1642  // Test for being in the tridiag. This is done in index space. In
1643  // LID space, tridiag LIDs in a row are not necessarily related by
1644  // {-1, 0, 1}.
1645  if (ri0 + 1 >= ri && ri0 <= ri + 1)
1646  ++update.v[0]; // D_nnz
1647  else
1648  incr_R = true;
1649  } while (0);
1650  if (incr_R) {
1651  if (lc < nrows) ++update.v[1]; // R_nnz_owned
1652  else ++update.v[2]; // R_nnz_remote
1653  }
1654  }
1655  }, sum_reducer_type(sum_reducer_value));
1656  }
1657  size_type D_nnz = sum_reducer_value.v[0];
1658  size_type R_nnz_owned = sum_reducer_value.v[1];
1659  size_type R_nnz_remote = sum_reducer_value.v[2];
1660 
1661  if (!overlap_communication_and_computation) {
1662  R_nnz_owned += R_nnz_remote;
1663  R_nnz_remote = 0;
1664  }
1665 
1666  // construct the D graph.
1667  {
1668  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1669 
1670  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
1671  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
1672 
1673 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1674  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1675 #endif
1676 
1677  const local_ordinal_type nparts = partptr.extent(0) - 1;
1678  {
1679  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
1680  Kokkos::parallel_for
1681  ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
1682  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
1683  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
1684  local_ordinal_type offset = 0;
1685  for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
1686  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
1687  offset = 1;
1688  const local_ordinal_type lr0 = lclrow(ri0);
1689  const size_type j0 = local_graph_rowptr(lr0);
1690  for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
1691  const local_ordinal_type lc = local_graph_colidx(j);
1692  const local_ordinal_type lc2r = col2row[lc];
1693  if (lc2r == (local_ordinal_type) -1) continue;
1694  const local_ordinal_type ri = lclrow2idx[lc2r];
1695  const local_ordinal_type pi = rowidx2part(ri);
1696  if (pi != pi0) continue;
1697  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
1698  const local_ordinal_type row_entry = j - j0;
1699  D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
1700  }
1701  }
1702  });
1703  }
1704 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1705  for (size_t i=0;i<D_A_colindsub.extent(0);++i)
1706  TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1707 #endif
1708  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
1709 
1710  // Allocate values.
1711  {
1712  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
1713  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
1714  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
1715  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
1716  }
1717  }
1718 
1719  // Construct the R graph.
1720  {
1721  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
1722  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
1723 
1724  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
1725  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
1726 
1727  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
1728  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
1729 
1730  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
1731  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
1732 
1733  {
1734  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1735  Kokkos::parallel_for
1736  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
1737  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1738  const local_ordinal_type ri0 = lclrow2idx[lr];
1739  const local_ordinal_type pi0 = rowidx2part(ri0);
1740  const size_type j0 = local_graph_rowptr(lr);
1741  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1742  const local_ordinal_type lc = local_graph_colidx(j);
1743  const local_ordinal_type lc2r = col2row[lc];
1744  if (lc2r != (local_ordinal_type) -1) {
1745  const local_ordinal_type ri = lclrow2idx[lc2r];
1746  const local_ordinal_type pi = rowidx2part(ri);
1747  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
1748  continue;
1749  }
1750  }
1751  // exclusive scan will be performed later
1752  if (!overlap_communication_and_computation || lc < nrows) {
1753  ++R_rowptr(lr);
1754  } else {
1755  ++R_rowptr_remote(lr);
1756  }
1757  }
1758  });
1759  }
1760 
1761  // exclusive scan
1762  typedef ArrayValueType<size_type,2> update_type;
1763  {
1764  Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
1765  Kokkos::parallel_scan
1766  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
1767  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
1768  update_type &update,
1769  const bool &final) {
1770  update_type val;
1771  val.v[0] = R_rowptr(lr);
1772  if (overlap_communication_and_computation)
1773  val.v[1] = R_rowptr_remote(lr);
1774 
1775  if (final) {
1776  R_rowptr(lr) = update.v[0];
1777  if (overlap_communication_and_computation)
1778  R_rowptr_remote(lr) = update.v[1];
1779 
1780  if (lr < nrows) {
1781  const local_ordinal_type ri0 = lclrow2idx[lr];
1782  const local_ordinal_type pi0 = rowidx2part(ri0);
1783 
1784  size_type cnt_rowptr = R_rowptr(lr);
1785  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
1786 
1787  const size_type j0 = local_graph_rowptr(lr);
1788  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1789  const local_ordinal_type lc = local_graph_colidx(j);
1790  const local_ordinal_type lc2r = col2row[lc];
1791  if (lc2r != (local_ordinal_type) -1) {
1792  const local_ordinal_type ri = lclrow2idx[lc2r];
1793  const local_ordinal_type pi = rowidx2part(ri);
1794  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
1795  continue;
1796  }
1797  const local_ordinal_type row_entry = j - j0;
1798  if (!overlap_communication_and_computation || lc < nrows)
1799  R_A_colindsub(cnt_rowptr++) = row_entry;
1800  else
1801  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
1802  }
1803  }
1804  }
1805  update += val;
1806  });
1807  }
1808  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
1809  Kokkos::deep_copy(amd.rowptr, R_rowptr);
1810  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
1811  if (overlap_communication_and_computation) {
1812  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
1813  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
1814  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
1815  }
1816 
1817  // Allocate or view values.
1818  amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->getValuesDeviceNonConst());
1819 
1820  }
1821  }
1822  }
1823 
1824 
1828  template<typename ArgActiveExecutionMemorySpace>
1830 
1831  template<>
1832  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
1833  typedef KB::Mode::Serial mode_type;
1834 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
1835  typedef KB::Algo::Level3::CompactMKL algo_type;
1836 #else
1837  typedef KB::Algo::Level3::Blocked algo_type;
1838 #endif
1839  static int recommended_team_size(const int /* blksize */,
1840  const int /* vector_length */,
1841  const int /* internal_vector_length */) {
1842  return 1;
1843  }
1844 
1845  };
1846 
1847 #if defined(KOKKOS_ENABLE_CUDA)
1848  static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
1849  const int vector_length,
1850  const int internal_vector_length) {
1851  const int vector_size = vector_length/internal_vector_length;
1852  int total_team_size(0);
1853  if (blksize <= 5) total_team_size = 32;
1854  else if (blksize <= 9) total_team_size = 32; // 64
1855  else if (blksize <= 12) total_team_size = 96;
1856  else if (blksize <= 16) total_team_size = 128;
1857  else if (blksize <= 20) total_team_size = 160;
1858  else total_team_size = 160;
1859  return 2*total_team_size/vector_size;
1860  }
1861  template<>
1862  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
1863  typedef KB::Mode::Team mode_type;
1864  typedef KB::Algo::Level3::Unblocked algo_type;
1865  static int recommended_team_size(const int blksize,
1866  const int vector_length,
1867  const int internal_vector_length) {
1868  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1869  }
1870  };
1871  template<>
1872  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
1873  typedef KB::Mode::Team mode_type;
1874  typedef KB::Algo::Level3::Unblocked algo_type;
1875  static int recommended_team_size(const int blksize,
1876  const int vector_length,
1877  const int internal_vector_length) {
1878  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1879  }
1880  };
1881 #endif
1882 
1883 #if defined(KOKKOS_ENABLE_HIP)
1884  static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
1885  const int vector_length,
1886  const int internal_vector_length) {
1887  const int vector_size = vector_length/internal_vector_length;
1888  int total_team_size(0);
1889  if (blksize <= 5) total_team_size = 32;
1890  else if (blksize <= 9) total_team_size = 32; // 64
1891  else if (blksize <= 12) total_team_size = 96;
1892  else if (blksize <= 16) total_team_size = 128;
1893  else if (blksize <= 20) total_team_size = 160;
1894  else total_team_size = 160;
1895  return 2*total_team_size/vector_size;
1896  }
1897  template<>
1898  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPSpace> {
1899  typedef KB::Mode::Team mode_type;
1900  typedef KB::Algo::Level3::Unblocked algo_type;
1901  static int recommended_team_size(const int blksize,
1902  const int vector_length,
1903  const int internal_vector_length) {
1904  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
1905  }
1906  };
1907  template<>
1908  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPHostPinnedSpace> {
1909  typedef KB::Mode::Team mode_type;
1910  typedef KB::Algo::Level3::Unblocked algo_type;
1911  static int recommended_team_size(const int blksize,
1912  const int vector_length,
1913  const int internal_vector_length) {
1914  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
1915  }
1916  };
1917 #endif
1918 
1919  template<typename MatrixType>
1920  struct ExtractAndFactorizeTridiags {
1921  public:
1922  using impl_type = ImplType<MatrixType>;
1923  // a functor cannot have both device_type and execution_space; specialization error in kokkos
1924  using execution_space = typename impl_type::execution_space;
1925  using memory_space = typename impl_type::memory_space;
1927  using local_ordinal_type = typename impl_type::local_ordinal_type;
1928  using size_type = typename impl_type::size_type;
1929  using impl_scalar_type = typename impl_type::impl_scalar_type;
1930  using magnitude_type = typename impl_type::magnitude_type;
1932  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1934  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1935  using size_type_1d_view = typename impl_type::size_type_1d_view;
1936  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
1938  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1939  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
1940  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1941  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
1942  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
1943  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
1944  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
1945 
1946  using internal_vector_type = typename impl_type::internal_vector_type;
1947  static constexpr int vector_length = impl_type::vector_length;
1948  static constexpr int internal_vector_length = impl_type::internal_vector_length;
1949 
1951  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1952  using member_type = typename team_policy_type::member_type;
1953 
1954  private:
1955  // part interface
1956  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
1957  const local_ordinal_type max_partsz;
1958  // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
1959  using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
1960  const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
1961  const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
1962  // block tridiags
1963  const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
1964  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
1965  const Unmanaged<internal_vector_type_4d_view> internal_vector_values;
1966  const Unmanaged<btdm_scalar_type_4d_view> scalar_values;
1967  // shared information
1968  const local_ordinal_type blocksize, blocksize_square;
1969  // diagonal safety
1970  const magnitude_type tiny;
1971  const local_ordinal_type vector_loop_size;
1972  const local_ordinal_type vector_length_value;
1973 
1974  public:
1975  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
1976  const PartInterface<MatrixType> &interf_,
1977  const Teuchos::RCP<const block_crs_matrix_type> &A_,
1978  const magnitude_type& tiny_) :
1979  // interface
1980  partptr(interf_.partptr),
1981  lclrow(interf_.lclrow),
1982  packptr(interf_.packptr),
1983  max_partsz(interf_.max_partsz),
1984  // block crs matrix
1985  A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map),
1986  A_values(const_cast<block_crs_matrix_type*>(A_.get())->getValuesDeviceNonConst()),
1987  // block tridiags
1988  pack_td_ptr(btdm_.pack_td_ptr),
1989  flat_td_ptr(btdm_.flat_td_ptr),
1990  A_colindsub(btdm_.A_colindsub),
1991  internal_vector_values((internal_vector_type*)btdm_.values.data(),
1992  btdm_.values.extent(0),
1993  btdm_.values.extent(1),
1994  btdm_.values.extent(2),
1995  vector_length/internal_vector_length),
1996  scalar_values((btdm_scalar_type*)btdm_.values.data(),
1997  btdm_.values.extent(0),
1998  btdm_.values.extent(1),
1999  btdm_.values.extent(2),
2000  vector_length),
2001  blocksize(btdm_.values.extent(1)),
2002  blocksize_square(blocksize*blocksize),
2003  // diagonal weight to avoid zero pivots
2004  tiny(tiny_),
2005  vector_loop_size(vector_length/internal_vector_length),
2006  vector_length_value(vector_length) {}
2007 
2008  private:
2009 
2010  KOKKOS_INLINE_FUNCTION
2011  void
2012  extract(local_ordinal_type partidx,
2013  local_ordinal_type npacks) const {
2014  using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2015  const size_type kps = pack_td_ptr(partidx);
2016  local_ordinal_type kfs[vector_length] = {};
2017  local_ordinal_type ri0[vector_length] = {};
2018  local_ordinal_type nrows[vector_length] = {};
2019 
2020  for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2021  kfs[vi] = flat_td_ptr(partidx);
2022  ri0[vi] = partptr(partidx);
2023  nrows[vi] = partptr(partidx+1) - ri0[vi];
2024  }
2025  for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
2026  for (local_ordinal_type e=0;e<3;++e) {
2027  const impl_scalar_type* block[vector_length] = {};
2028  for (local_ordinal_type vi=0;vi<npacks;++vi) {
2029  const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2030  block[vi] = &A_values(Aj*blocksize_square);
2031  }
2032  const size_type pi = kps + j;
2033  ++j;
2034  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2035  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2036  //const auto idx = ii*blocksize + jj;
2037  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2038  auto& v = internal_vector_values(pi, ii, jj, 0);
2039  for (local_ordinal_type vi=0;vi<npacks;++vi)
2040  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
2041  }
2042  }
2043 
2044  if (nrows[0] == 1) break;
2045  if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
2046  for (local_ordinal_type vi=1;vi<npacks;++vi) {
2047  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
2048  npacks = vi;
2049  break;
2050  }
2051  }
2052  }
2053  }
2054  }
2055 
2056  KOKKOS_INLINE_FUNCTION
2057  void
2058  extract(const member_type &member,
2059  const local_ordinal_type &partidxbeg,
2060  const local_ordinal_type &npacks,
2061  const local_ordinal_type &vbeg) const {
2062  using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2063  local_ordinal_type kfs_vals[internal_vector_length] = {};
2064  local_ordinal_type ri0_vals[internal_vector_length] = {};
2065  local_ordinal_type nrows_vals[internal_vector_length] = {};
2066 
2067  const size_type kps = pack_td_ptr(partidxbeg);
2068  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2069  kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
2070  ri0_vals[vi] = partptr(partidxbeg+vi);
2071  nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
2072  }
2073 
2074  local_ordinal_type j_vals[internal_vector_length] = {};
2075  for (local_ordinal_type tr=0;tr<nrows_vals[0];++tr) {
2076  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2077  const local_ordinal_type nrows = nrows_vals[vi];
2078  if (tr < nrows) {
2079  auto &j = j_vals[vi];
2080  const local_ordinal_type kfs = kfs_vals[vi];
2081  const local_ordinal_type ri0 = ri0_vals[vi];
2082  const local_ordinal_type lbeg = (tr == 0 ? 1 : 0);
2083  const local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
2084  for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
2085  const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
2086  const impl_scalar_type* block = &A_values(Aj*blocksize_square);
2087  const size_type pi = kps + j;
2088  Kokkos::parallel_for
2089  (Kokkos::TeamThreadRange(member,blocksize),
2090  [&](const local_ordinal_type &ii) {
2091  for (local_ordinal_type jj=0;jj<blocksize;++jj)
2092  scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
2093  });
2094  }
2095  }
2096  }
2097  }
2098  }
2099 
2100  template<typename AAViewType,
2101  typename WWViewType>
2102  KOKKOS_INLINE_FUNCTION
2103  void
2104  factorize(const member_type &member,
2105  const local_ordinal_type &i0,
2106  const local_ordinal_type &nrows,
2107  const local_ordinal_type &v,
2108  const AAViewType &AA,
2109  const WWViewType &WW) const {
2110 
2111  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
2112  <typename execution_space::memory_space> default_mode_and_algo_type;
2113 
2114  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2115  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
2116 
2117  // constant
2118  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2119 
2120  // subview pattern
2121  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2122  KB::LU<member_type,
2123  default_mode_type,KB::Algo::LU::Unblocked>
2124  ::invoke(member, A , tiny);
2125 
2126  if (nrows > 1) {
2127  auto B = A;
2128  auto C = A;
2129  local_ordinal_type i = i0;
2130  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2131  B.assign_data( &AA(i+1,0,0,v) );
2132  KB::Trsm<member_type,
2133  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2134  default_mode_type,default_algo_type>
2135  ::invoke(member, one, A, B);
2136  C.assign_data( &AA(i+2,0,0,v) );
2137  KB::Trsm<member_type,
2138  KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2139  default_mode_type,default_algo_type>
2140  ::invoke(member, one, A, C);
2141  A.assign_data( &AA(i+3,0,0,v) );
2142 
2143  member.team_barrier();
2144  KB::Gemm<member_type,
2145  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2146  default_mode_type,default_algo_type>
2147  ::invoke(member, -one, C, B, one, A);
2148  KB::LU<member_type,
2149  default_mode_type,KB::Algo::LU::Unblocked>
2150  ::invoke(member, A, tiny);
2151  }
2152  } else {
2153  // for block jacobi invert a matrix here
2154  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2155  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2156  ::invoke(member, A, W);
2157  KB::SetIdentity<member_type,default_mode_type>
2158  ::invoke(member, A);
2159  member.team_barrier();
2160  KB::Trsm<member_type,
2161  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2162  default_mode_type,default_algo_type>
2163  ::invoke(member, one, W, A);
2164  KB::Trsm<member_type,
2165  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2166  default_mode_type,default_algo_type>
2167  ::invoke(member, one, W, A);
2168  }
2169  }
2170 
2171  public:
2172 
2173  struct ExtractAndFactorizeTag {};
2174 
2175  KOKKOS_INLINE_FUNCTION
2176  void
2177  operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
2178  // btdm is packed and sorted from largest one
2179  const local_ordinal_type packidx = member.league_rank();
2180 
2181  const local_ordinal_type partidx = packptr(packidx);
2182  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2183  const local_ordinal_type i0 = pack_td_ptr(partidx);
2184  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2185 
2186  internal_vector_scratch_type_3d_view
2187  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
2188  if (vector_loop_size == 1) {
2189  extract(partidx, npacks);
2190  factorize(member, i0, nrows, 0, internal_vector_values, WW);
2191  } else {
2192  Kokkos::parallel_for
2193  (Kokkos::ThreadVectorRange(member, vector_loop_size),
2194  [&](const local_ordinal_type &v) {
2195  const local_ordinal_type vbeg = v*internal_vector_length;
2196  if (vbeg < npacks)
2197  extract(member, partidx+vbeg, npacks, vbeg);
2198  // this is not safe if vector loop size is different from vector size of
2199  // the team policy. we always make sure this when constructing the team policy
2200  member.team_barrier();
2201  factorize(member, i0, nrows, v, internal_vector_values, WW);
2202  });
2203  }
2204  }
2205 
2206  void run() {
2207  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2208  const local_ordinal_type team_size =
2209  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2210  recommended_team_size(blocksize, vector_length, internal_vector_length);
2211  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
2212  shmem_size(blocksize, blocksize, vector_loop_size);
2213 
2214  Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
2215  policy(packptr.extent(0)-1, team_size, vector_loop_size);
2216 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2217  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2218  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
2219 #else
2220  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
2221  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2222  policy, *this);
2223 #endif
2224  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2225  }
2226 
2227  };
2228 
2232  template<typename MatrixType>
2233  void
2234  performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
2235  const PartInterface<MatrixType> &interf,
2237  const typename ImplType<MatrixType>::magnitude_type tiny) {
2238  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase");
2239  ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
2240  function.run();
2241  }
2242 
2246  template<typename MatrixType>
2248  public:
2250  using execution_space = typename impl_type::execution_space;
2251  using memory_space = typename impl_type::memory_space;
2252 
2253  using local_ordinal_type = typename impl_type::local_ordinal_type;
2254  using impl_scalar_type = typename impl_type::impl_scalar_type;
2255  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2256  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
2257  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2258  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2259  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2260  using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
2261  static constexpr int vector_length = impl_type::vector_length;
2262 
2263  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
2264 
2265  private:
2266  // part interface
2267  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2268  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2269  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2270  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
2271  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2272  const local_ordinal_type blocksize;
2273  const local_ordinal_type num_vectors;
2274 
2275  // packed multivector output (or input)
2276  vector_type_3d_view packed_multivector;
2277  const_impl_scalar_type_2d_view_tpetra scalar_multivector;
2278 
2279  template<typename TagType>
2280  KOKKOS_INLINE_FUNCTION
2281  void copy_multivectors(const local_ordinal_type &j,
2282  const local_ordinal_type &vi,
2283  const local_ordinal_type &pri,
2284  const local_ordinal_type &ri0) const {
2285  for (local_ordinal_type col=0;col<num_vectors;++col)
2286  for (local_ordinal_type i=0;i<blocksize;++i)
2287  packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2288  }
2289 
2290  public:
2291 
2292  MultiVectorConverter(const PartInterface<MatrixType> &interf,
2293  const vector_type_3d_view &pmv)
2294  : partptr(interf.partptr),
2295  packptr(interf.packptr),
2296  part2packrowidx0(interf.part2packrowidx0),
2297  part2rowidx0(interf.part2rowidx0),
2298  lclrow(interf.lclrow),
2299  blocksize(pmv.extent(1)),
2300  num_vectors(pmv.extent(2)),
2301  packed_multivector(pmv) {}
2302 
2303  // TODO:: modify this routine similar to the team level functions
2304  // inline ---> FIXME HIP: should not need the KOKKOS_INLINE_FUNCTION below...
2305  KOKKOS_INLINE_FUNCTION
2306  void
2307  operator() (const local_ordinal_type &packidx) const {
2308  local_ordinal_type partidx = packptr(packidx);
2309  local_ordinal_type npacks = packptr(packidx+1) - partidx;
2310  const local_ordinal_type pri0 = part2packrowidx0(partidx);
2311 
2312  local_ordinal_type ri0[vector_length] = {};
2313  local_ordinal_type nrows[vector_length] = {};
2314  for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
2315  ri0[v] = part2rowidx0(partidx);
2316  nrows[v] = part2rowidx0(partidx+1) - ri0[v];
2317  }
2318  for (local_ordinal_type j=0;j<nrows[0];++j) {
2319  local_ordinal_type cnt = 1;
2320  for (;cnt<npacks && j!= nrows[cnt];++cnt);
2321  npacks = cnt;
2322  const local_ordinal_type pri = pri0 + j;
2323  for (local_ordinal_type col=0;col<num_vectors;++col)
2324  for (local_ordinal_type i=0;i<blocksize;++i)
2325  for (local_ordinal_type v=0;v<npacks;++v)
2326  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
2327  }
2328  }
2329 
2330  KOKKOS_INLINE_FUNCTION
2331  void
2332  operator() (const member_type &member) const {
2333  const local_ordinal_type packidx = member.league_rank();
2334  const local_ordinal_type partidx_begin = packptr(packidx);
2335  const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
2336  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
2337  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
2338  const local_ordinal_type partidx = partidx_begin + v;
2339  const local_ordinal_type ri0 = part2rowidx0(partidx);
2340  const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
2341 
2342  if (nrows == 1) {
2343  const local_ordinal_type pri = pri0;
2344  for (local_ordinal_type col=0;col<num_vectors;++col) {
2345  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
2346  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
2347  });
2348  }
2349  } else {
2350  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
2351  const local_ordinal_type pri = pri0 + j;
2352  for (local_ordinal_type col=0;col<num_vectors;++col)
2353  for (local_ordinal_type i=0;i<blocksize;++i)
2354  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2355  });
2356  }
2357  });
2358  }
2359 
2360  void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
2361  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2362  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::MultiVectorConverter");
2363 
2364  scalar_multivector = scalar_multivector_;
2366 #if defined(KOKKOS_ENABLE_CUDA)
2367  const local_ordinal_type vl = vector_length;
2368  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
2369  Kokkos::parallel_for
2370  ("MultiVectorConverter::TeamPolicy", policy, *this);
2371 #endif
2372  } else if(is_hip<execution_space>::value) {
2373 #if defined(KOKKOS_ENABLE_HIP)
2374  const local_ordinal_type vl = vector_length;
2375  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
2376  Kokkos::parallel_for
2377  ("MultiVectorConverter::TeamPolicy", policy, *this);
2378 #endif
2379  } else {
2380 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
2381  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
2382 #else
2383  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
2384  Kokkos::parallel_for
2385  ("MultiVectorConverter::RangePolicy", policy, *this);
2386 #endif
2387  }
2388  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2389  }
2390  };
2391 
2395  template<typename ArgActiveExecutionMemorySpace>
2397 
2398  template<>
2399  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2400  typedef KB::Mode::Serial mode_type;
2401  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2402 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2403  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
2404 #else
2405  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
2406 #endif
2407  static int recommended_team_size(const int /* blksize */,
2408  const int /* vector_length */,
2409  const int /* internal_vector_length */) {
2410  return 1;
2411  }
2412  };
2413 
2414 #if defined(KOKKOS_ENABLE_CUDA)
2415  static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
2416  const int vector_length,
2417  const int internal_vector_length) {
2418  const int vector_size = vector_length/internal_vector_length;
2419  int total_team_size(0);
2420  if (blksize <= 5) total_team_size = 32;
2421  else if (blksize <= 9) total_team_size = 32; // 64
2422  else if (blksize <= 12) total_team_size = 96;
2423  else if (blksize <= 16) total_team_size = 128;
2424  else if (blksize <= 20) total_team_size = 160;
2425  else total_team_size = 160;
2426  return total_team_size/vector_size;
2427  }
2428 
2429  template<>
2430  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2431  typedef KB::Mode::Team mode_type;
2432  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2433  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2434  static int recommended_team_size(const int blksize,
2435  const int vector_length,
2436  const int internal_vector_length) {
2437  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2438  }
2439  };
2440  template<>
2441  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2442  typedef KB::Mode::Team mode_type;
2443  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2444  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2445  static int recommended_team_size(const int blksize,
2446  const int vector_length,
2447  const int internal_vector_length) {
2448  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2449  }
2450  };
2451 #endif
2452 
2453 #if defined(KOKKOS_ENABLE_HIP)
2454  static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
2455  const int vector_length,
2456  const int internal_vector_length) {
2457  const int vector_size = vector_length/internal_vector_length;
2458  int total_team_size(0);
2459  if (blksize <= 5) total_team_size = 32;
2460  else if (blksize <= 9) total_team_size = 32; // 64
2461  else if (blksize <= 12) total_team_size = 96;
2462  else if (blksize <= 16) total_team_size = 128;
2463  else if (blksize <= 20) total_team_size = 160;
2464  else total_team_size = 160;
2465  return total_team_size/vector_size;
2466  }
2467 
2468  template<>
2469  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPSpace> {
2470  typedef KB::Mode::Team mode_type;
2471  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2472  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2473  static int recommended_team_size(const int blksize,
2474  const int vector_length,
2475  const int internal_vector_length) {
2476  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2477  }
2478  };
2479  template<>
2480  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPHostPinnedSpace> {
2481  typedef KB::Mode::Team mode_type;
2482  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2483  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2484  static int recommended_team_size(const int blksize,
2485  const int vector_length,
2486  const int internal_vector_length) {
2487  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2488  }
2489  };
2490 #endif
2491 
2492  template<typename MatrixType>
2493  struct SolveTridiags {
2494  public:
2495  using impl_type = ImplType<MatrixType>;
2496  using execution_space = typename impl_type::execution_space;
2497 
2498  using local_ordinal_type = typename impl_type::local_ordinal_type;
2499  using size_type = typename impl_type::size_type;
2500  using impl_scalar_type = typename impl_type::impl_scalar_type;
2501  using magnitude_type = typename impl_type::magnitude_type;
2502  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2503  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2505  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2506  using size_type_1d_view = typename impl_type::size_type_1d_view;
2508  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2509  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2510  //using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2511 
2512  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2513 
2514  using internal_vector_type =typename impl_type::internal_vector_type;
2515  static constexpr int vector_length = impl_type::vector_length;
2516  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2517 
2519  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
2520  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2521 
2523  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2524  using member_type = typename team_policy_type::member_type;
2525 
2526  private:
2527  // part interface
2528  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2529  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2530  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2531  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2532 
2533  // block tridiags
2534  const ConstUnmanaged<size_type_1d_view> pack_td_ptr;
2535 
2536  // block tridiags values
2537  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
2538  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
2539 
2540  const local_ordinal_type vector_loop_size;
2541 
2542  // copy to multivectors : damping factor and Y_scalar_multivector
2543  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
2544 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
2545  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2546 #else
2547  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2548 #endif
2549  const impl_scalar_type df;
2550  const bool compute_diff;
2551 
2552  public:
2553  SolveTridiags(const PartInterface<MatrixType> &interf,
2554  const BlockTridiags<MatrixType> &btdm,
2555  const vector_type_3d_view &pmv,
2556  const impl_scalar_type damping_factor,
2557  const bool is_norm_manager_active)
2558  :
2559  // interface
2560  partptr(interf.partptr),
2561  packptr(interf.packptr),
2562  part2packrowidx0(interf.part2packrowidx0),
2563  lclrow(interf.lclrow),
2564  // block tridiags and multivector
2565  pack_td_ptr(btdm.pack_td_ptr),
2566  D_internal_vector_values((internal_vector_type*)btdm.values.data(),
2567  btdm.values.extent(0),
2568  btdm.values.extent(1),
2569  btdm.values.extent(2),
2570  vector_length/internal_vector_length),
2571  X_internal_vector_values((internal_vector_type*)pmv.data(),
2572  pmv.extent(0),
2573  pmv.extent(1),
2574  pmv.extent(2),
2575  vector_length/internal_vector_length),
2576  vector_loop_size(vector_length/internal_vector_length),
2577  Y_scalar_multivector(),
2578  Z_scalar_vector(),
2579  df(damping_factor),
2580  compute_diff(is_norm_manager_active)
2581  {}
2582 
2583  public:
2584 
2586  KOKKOS_INLINE_FUNCTION
2587  void
2588  copyToFlatMultiVector(const member_type &member,
2589  const local_ordinal_type partidxbeg, // partidx for v = 0
2590  const local_ordinal_type npacks,
2591  const local_ordinal_type pri0,
2592  const local_ordinal_type v, // index with a loop of vector_loop_size
2593  const local_ordinal_type blocksize,
2594  const local_ordinal_type num_vectors) const {
2595  const local_ordinal_type vbeg = v*internal_vector_length;
2596  if (vbeg < npacks) {
2597  local_ordinal_type ri0_vals[internal_vector_length] = {};
2598  local_ordinal_type nrows_vals[internal_vector_length] = {};
2599  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2600  const local_ordinal_type partidx = partidxbeg+vv;
2601  ri0_vals[vi] = partptr(partidx);
2602  nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
2603  }
2604 
2605  impl_scalar_type z_partial_sum(0);
2606  if (nrows_vals[0] == 1) {
2607  const local_ordinal_type j=0, pri=pri0;
2608  {
2609  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2610  const local_ordinal_type ri0 = ri0_vals[vi];
2611  const local_ordinal_type nrows = nrows_vals[vi];
2612  if (j < nrows) {
2613  Kokkos::parallel_for
2614  (Kokkos::TeamThreadRange(member, blocksize),
2615  [&](const local_ordinal_type &i) {
2616  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2617  for (local_ordinal_type col=0;col<num_vectors;++col) {
2618  impl_scalar_type &y = Y_scalar_multivector(row,col);
2619  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2620  y += df*yd;
2621 
2622  {//if (compute_diff) {
2623  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2624  z_partial_sum += yd_abs*yd_abs;
2625  }
2626  }
2627  });
2628  }
2629  }
2630  }
2631  } else {
2632  Kokkos::parallel_for
2633  (Kokkos::TeamThreadRange(member, nrows_vals[0]),
2634  [&](const local_ordinal_type &j) {
2635  const local_ordinal_type pri = pri0 + j;
2636  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2637  const local_ordinal_type ri0 = ri0_vals[vi];
2638  const local_ordinal_type nrows = nrows_vals[vi];
2639  if (j < nrows) {
2640  for (local_ordinal_type col=0;col<num_vectors;++col) {
2641  for (local_ordinal_type i=0;i<blocksize;++i) {
2642  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2643  impl_scalar_type &y = Y_scalar_multivector(row,col);
2644  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2645  y += df*yd;
2646 
2647  {//if (compute_diff) {
2648  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2649  z_partial_sum += yd_abs*yd_abs;
2650  }
2651  }
2652  }
2653  }
2654  }
2655  });
2656  }
2657  //if (compute_diff)
2658  Z_scalar_vector(member.league_rank()) += z_partial_sum;
2659  }
2660  }
2661 
2665  template<typename WWViewType>
2666  KOKKOS_INLINE_FUNCTION
2667  void
2668  solveSingleVector(const member_type &member,
2669  const local_ordinal_type &blocksize,
2670  const local_ordinal_type &i0,
2671  const local_ordinal_type &r0,
2672  const local_ordinal_type &nrows,
2673  const local_ordinal_type &v,
2674  const WWViewType &WW) const {
2675 
2676  typedef SolveTridiagsDefaultModeAndAlgo
2677  <typename execution_space::memory_space> default_mode_and_algo_type;
2678 
2679  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2680  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2681 
2682  // base pointers
2683  auto A = D_internal_vector_values.data();
2684  auto X = X_internal_vector_values.data();
2685 
2686  // constant
2687  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2688  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2689  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2690 
2691  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2692  const local_ordinal_type astep = D_internal_vector_values.stride_0();
2693  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2694  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2695  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2696  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2697 
2698  // for multiple rhs
2699  //const local_ordinal_type xs0 = num_vectors*vector_length; //X_scalar_values.stride_1();
2700  //const local_ordinal_type xs1 = vector_length; //X_scalar_values.stride_2();
2701 
2702  // move to starting point
2703  A += i0*astep + v;
2704  X += r0*xstep + v;
2705 
2706  //for (local_ordinal_type col=0;col<num_vectors;++col)
2707  if (nrows > 1) {
2708  // solve Lx = x
2709  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2710  (default_mode_type,default_algo_type,
2711  member,
2712  KB::Diag::Unit,
2713  blocksize,blocksize,
2714  one,
2715  A, as0, as1,
2716  X, xs0);
2717 
2718  for (local_ordinal_type tr=1;tr<nrows;++tr) {
2719  member.team_barrier();
2720  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2721  (default_mode_type,default_algo_type,
2722  member,
2723  blocksize, blocksize,
2724  -one,
2725  A+2*astep, as0, as1,
2726  X, xs0,
2727  one,
2728  X+1*xstep, xs0);
2729  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2730  (default_mode_type,default_algo_type,
2731  member,
2732  KB::Diag::Unit,
2733  blocksize,blocksize,
2734  one,
2735  A+3*astep, as0, as1,
2736  X+1*xstep, xs0);
2737 
2738  A += 3*astep;
2739  X += 1*xstep;
2740  }
2741 
2742  // solve Ux = x
2743  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2744  (default_mode_type,default_algo_type,
2745  member,
2746  KB::Diag::NonUnit,
2747  blocksize, blocksize,
2748  one,
2749  A, as0, as1,
2750  X, xs0);
2751 
2752  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2753  A -= 3*astep;
2754  member.team_barrier();
2755  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2756  (default_mode_type,default_algo_type,
2757  member,
2758  blocksize, blocksize,
2759  -one,
2760  A+1*astep, as0, as1,
2761  X, xs0,
2762  one,
2763  X-1*xstep, xs0);
2764  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2765  (default_mode_type,default_algo_type,
2766  member,
2767  KB::Diag::NonUnit,
2768  blocksize, blocksize,
2769  one,
2770  A, as0, as1,
2771  X-1*xstep,xs0);
2772  X -= 1*xstep;
2773  }
2774  // for multiple rhs
2775  //X += xs1;
2776  } else {
2777  const local_ordinal_type ws0 = WW.stride_0();
2778  auto W = WW.data() + v;
2779  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2780  (default_mode_type,
2781  member, blocksize, X, xs0, W, ws0);
2782  member.team_barrier();
2783  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2784  (default_mode_type,default_algo_type,
2785  member,
2786  blocksize, blocksize,
2787  one,
2788  A, as0, as1,
2789  W, xs0,
2790  zero,
2791  X, xs0);
2792  }
2793  }
2794 
2795  template<typename WWViewType>
2796  KOKKOS_INLINE_FUNCTION
2797  void
2798  solveMultiVector(const member_type &member,
2799  const local_ordinal_type &/* blocksize */,
2800  const local_ordinal_type &i0,
2801  const local_ordinal_type &r0,
2802  const local_ordinal_type &nrows,
2803  const local_ordinal_type &v,
2804  const WWViewType &WW) const {
2805 
2806  typedef SolveTridiagsDefaultModeAndAlgo
2807  <typename execution_space::memory_space> default_mode_and_algo_type;
2808 
2809  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2810  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2811 
2812  // constant
2813  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2814  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2815 
2816  // subview pattern
2817  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2818  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2819  auto X2 = X1;
2820 
2821  local_ordinal_type i = i0, r = r0;
2822 
2823 
2824  if (nrows > 1) {
2825  // solve Lx = x
2826  KB::Trsm<member_type,
2827  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2828  default_mode_type,default_algo_type>
2829  ::invoke(member, one, A, X1);
2830  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2831  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2832  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2833  member.team_barrier();
2834  KB::Gemm<member_type,
2835  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2836  default_mode_type,default_algo_type>
2837  ::invoke(member, -one, A, X1, one, X2);
2838  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2839  KB::Trsm<member_type,
2840  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2841  default_mode_type,default_algo_type>
2842  ::invoke(member, one, A, X2);
2843  X1.assign_data( X2.data() );
2844  }
2845 
2846  // solve Ux = x
2847  KB::Trsm<member_type,
2848  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2849  default_mode_type,default_algo_type>
2850  ::invoke(member, one, A, X1);
2851  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2852  i -= 3;
2853  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2854  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2855  member.team_barrier();
2856  KB::Gemm<member_type,
2857  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2858  default_mode_type,default_algo_type>
2859  ::invoke(member, -one, A, X1, one, X2);
2860 
2861  A.assign_data( &D_internal_vector_values(i,0,0,v) );
2862  KB::Trsm<member_type,
2863  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2864  default_mode_type,default_algo_type>
2865  ::invoke(member, one, A, X2);
2866  X1.assign_data( X2.data() );
2867  }
2868  } else {
2869  // matrix is already inverted
2870  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2871  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2872  ::invoke(member, X1, W);
2873  member.team_barrier();
2874  KB::Gemm<member_type,
2875  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2876  default_mode_type,default_algo_type>
2877  ::invoke(member, one, A, W, zero, X1);
2878  }
2879  }
2880 
2881  template<int B> struct SingleVectorTag {};
2882  template<int B> struct MultiVectorTag {};
2883 
2884  template<int B>
2885  KOKKOS_INLINE_FUNCTION
2886  void
2887  operator() (const SingleVectorTag<B> &, const member_type &member) const {
2888  const local_ordinal_type packidx = member.league_rank();
2889  const local_ordinal_type partidx = packptr(packidx);
2890  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2891  const local_ordinal_type pri0 = part2packrowidx0(partidx);
2892  const local_ordinal_type i0 = pack_td_ptr(partidx);
2893  const local_ordinal_type r0 = part2packrowidx0(partidx);
2894  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2895  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2896  const local_ordinal_type num_vectors = 1;
2897  internal_vector_scratch_type_3d_view
2898  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
2899  Kokkos::single(Kokkos::PerTeam(member), [&]() {
2900  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2901  });
2902  Kokkos::parallel_for
2903  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2904  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
2905  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2906  });
2907  }
2908 
2909  template<int B>
2910  KOKKOS_INLINE_FUNCTION
2911  void
2912  operator() (const MultiVectorTag<B> &, const member_type &member) const {
2913  const local_ordinal_type packidx = member.league_rank();
2914  const local_ordinal_type partidx = packptr(packidx);
2915  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2916  const local_ordinal_type pri0 = part2packrowidx0(partidx);
2917  const local_ordinal_type i0 = pack_td_ptr(partidx);
2918  const local_ordinal_type r0 = part2packrowidx0(partidx);
2919  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2920  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2921  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2922 
2923  internal_vector_scratch_type_3d_view
2924  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
2925  Kokkos::single(Kokkos::PerTeam(member), [&]() {
2926  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2927  });
2928  Kokkos::parallel_for
2929  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2930  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
2931  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2932  });
2933  }
2934 
2935  void run(const impl_scalar_type_2d_view_tpetra &Y,
2936  const impl_scalar_type_1d_view &Z) {
2937  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2938  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SolveTridiags");
2939 
2941  this->Y_scalar_multivector = Y;
2942  this->Z_scalar_vector = Z;
2943 
2944  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2945  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
2946 
2947  const local_ordinal_type team_size =
2948  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2949  recommended_team_size(blocksize, vector_length, internal_vector_length);
2950  const int per_team_scratch = internal_vector_scratch_type_3d_view
2951  ::shmem_size(blocksize, num_vectors, vector_loop_size);
2952 
2953 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2954 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2955  if (num_vectors == 1) { \
2956  const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2957  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2958  Kokkos::parallel_for \
2959  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2960  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2961  } else { \
2962  const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2963  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2964  Kokkos::parallel_for \
2965  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
2966  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2967  } break
2968 #else
2969 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2970  if (num_vectors == 1) { \
2971  Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2972  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2973  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
2974  Kokkos::parallel_for \
2975  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2976  policy, *this); \
2977  } else { \
2978  Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2979  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2980  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
2981  Kokkos::parallel_for \
2982  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
2983  policy, *this); \
2984  } break
2985 #endif
2986  switch (blocksize) {
2987  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
2988  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
2989  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
2990  case 9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
2991  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
2992  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
2993  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
2994  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
2995  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
2996  default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
2997  }
2998 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
2999 
3000  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3001  }
3002  };
3003 
3007  static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
3008  const int team_size) {
3009  int total_team_size(0);
3010  if (blksize <= 5) total_team_size = 32;
3011  else if (blksize <= 9) total_team_size = 32; // 64
3012  else if (blksize <= 12) total_team_size = 96;
3013  else if (blksize <= 16) total_team_size = 128;
3014  else if (blksize <= 20) total_team_size = 160;
3015  else total_team_size = 160;
3016  return total_team_size/team_size;
3017  }
3018 
3019  static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
3020  const int team_size) {
3021  int total_team_size(0);
3022  if (blksize <= 5) total_team_size = 32;
3023  else if (blksize <= 9) total_team_size = 32; // 64
3024  else if (blksize <= 12) total_team_size = 96;
3025  else if (blksize <= 16) total_team_size = 128;
3026  else if (blksize <= 20) total_team_size = 160;
3027  else total_team_size = 160;
3028  return total_team_size/team_size;
3029  }
3030 
3031  template<typename MatrixType>
3032  struct ComputeResidualVector {
3033  public:
3034  using impl_type = ImplType<MatrixType>;
3035  using node_device_type = typename impl_type::node_device_type;
3036  using execution_space = typename impl_type::execution_space;
3037  using memory_space = typename impl_type::memory_space;
3038 
3039  using local_ordinal_type = typename impl_type::local_ordinal_type;
3040  using size_type = typename impl_type::size_type;
3041  using impl_scalar_type = typename impl_type::impl_scalar_type;
3042  using magnitude_type = typename impl_type::magnitude_type;
3043  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3044  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
3046  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3047  using size_type_1d_view = typename impl_type::size_type_1d_view;
3048  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
3049  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3050  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
3051  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3052  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
3053  static constexpr int vector_length = impl_type::vector_length;
3054 
3056  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3057 
3058  // enum for max blocksize and vector length
3059  enum : int { max_blocksize = 32 };
3060 
3061  private:
3062  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
3063  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
3064  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
3065  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
3066  Unmanaged<vector_type_3d_view> y_packed;
3067  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
3068 
3069  // AmD information
3070  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
3071  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
3072  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
3073 
3074  // block crs graph information
3075  // for cuda (kokkos crs graph uses a different size_type from size_t)
3076  const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_rowptr;
3077  const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
3078 
3079  // blocksize
3080  const local_ordinal_type blocksize_requested;
3081 
3082  // part interface
3083  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3084  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3085  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
3086  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3087  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3088  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
3089  const bool is_dm2cm_active;
3090 
3091  public:
3092  template<typename LocalCrsGraphType>
3093  ComputeResidualVector(const AmD<MatrixType> &amd,
3094  const LocalCrsGraphType &graph,
3095  const local_ordinal_type &blocksize_requested_,
3096  const PartInterface<MatrixType> &interf,
3097  const local_ordinal_type_1d_view &dm2cm_)
3098  : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
3099  colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
3100  tpetra_values(amd.tpetra_values),
3101  A_rowptr(graph.row_map),
3102  A_colind(graph.entries),
3103  blocksize_requested(blocksize_requested_),
3104  part2packrowidx0(interf.part2packrowidx0),
3105  part2rowidx0(interf.part2rowidx0),
3106  rowidx2part(interf.rowidx2part),
3107  partptr(interf.partptr),
3108  lclrow(interf.lclrow),
3109  dm2cm(dm2cm_),
3110  is_dm2cm_active(dm2cm_.span() > 0)
3111  {}
3112 
3113  inline
3114  void
3115  SerialGemv(const local_ordinal_type &blocksize,
3116  const impl_scalar_type * const KOKKOS_RESTRICT AA,
3117  const impl_scalar_type * const KOKKOS_RESTRICT xx,
3118  /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
3119  using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3120  for (local_ordinal_type k0=0;k0<blocksize;++k0) {
3121  impl_scalar_type val = 0;
3122 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
3123 # pragma ivdep
3124 #endif
3125 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
3126 # pragma unroll
3127 #endif
3128  for (local_ordinal_type k1=0;k1<blocksize;++k1)
3129  val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
3130  yy[k0] -= val;
3131  }
3132  }
3133 
3134  template<typename bbViewType, typename yyViewType>
3135  KOKKOS_INLINE_FUNCTION
3136  void
3137  VectorCopy(const member_type &member,
3138  const local_ordinal_type &blocksize,
3139  const bbViewType &bb,
3140  const yyViewType &yy) const {
3141  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
3142  yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
3143  });
3144  }
3145 
3146  template<typename AAViewType, typename xxViewType, typename yyViewType>
3147  KOKKOS_INLINE_FUNCTION
3148  void
3149  TeamVectorGemv(const member_type &member,
3150  const local_ordinal_type &blocksize,
3151  const AAViewType &AA,
3152  const xxViewType &xx,
3153  const yyViewType &yy) const {
3154  Kokkos::parallel_for
3155  (Kokkos::TeamThreadRange(member, blocksize),
3156  [&](const local_ordinal_type &k0) {
3157  impl_scalar_type val = 0;
3158  Kokkos::parallel_for
3159  (Kokkos::ThreadVectorRange(member, blocksize),
3160  [&](const local_ordinal_type &k1) {
3161  val += AA(k0,k1)*xx(k1);
3162  });
3163  Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
3164  });
3165  }
3166 
3167  template<typename AAViewType, typename xxViewType, typename yyViewType>
3168  KOKKOS_INLINE_FUNCTION
3169  void
3170  VectorGemv(const member_type &member,
3171  const local_ordinal_type &blocksize,
3172  const AAViewType &AA,
3173  const xxViewType &xx,
3174  const yyViewType &yy) const {
3175  Kokkos::parallel_for
3176  (Kokkos::ThreadVectorRange(member, blocksize),
3177  [&](const local_ordinal_type &k0) {
3178  impl_scalar_type val(0);
3179  for (local_ordinal_type k1=0;k1<blocksize;++k1) {
3180  val += AA(k0,k1)*xx(k1);
3181  }
3182  Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
3183  });
3184  }
3185 
3186  // template<typename AAViewType, typename xxViewType, typename yyViewType>
3187  // KOKKOS_INLINE_FUNCTION
3188  // void
3189  // VectorGemv(const member_type &member,
3190  // const local_ordinal_type &blocksize,
3191  // const AAViewType &AA,
3192  // const xxViewType &xx,
3193  // const yyViewType &yy) const {
3194  // for (local_ordinal_type k0=0;k0<blocksize;++k0) {
3195  // impl_scalar_type val = 0;
3196  // Kokkos::parallel_for
3197  // (Kokkos::ThreadVectorRange(member, blocksize),
3198  // [&](const local_ordinal_type &k1) {
3199  // val += AA(k0,k1)*xx(k1);
3200  // });
3201  // Kokkos::atomic_fetch_add(&yy(k0), -val);
3202  // }
3203  // }
3204 
3205  struct SeqTag {};
3206 
3207  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3208  KOKKOS_INLINE_FUNCTION
3209  void
3210  operator() (const SeqTag &, const local_ordinal_type& i) const {
3211  const local_ordinal_type blocksize = blocksize_requested;
3212  const local_ordinal_type blocksize_square = blocksize*blocksize;
3213 
3214  // constants
3215  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3216  const local_ordinal_type num_vectors = y.extent(1);
3217  const local_ordinal_type row = i*blocksize;
3218  for (local_ordinal_type col=0;col<num_vectors;++col) {
3219  // y := b
3220  impl_scalar_type *yy = &y(row, col);
3221  const impl_scalar_type * const bb = &b(row, col);
3222  memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
3223 
3224  // y -= Rx
3225  const size_type A_k0 = A_rowptr[i];
3226  for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
3227  const size_type j = A_k0 + colindsub[k];
3228  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3229  const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
3230  SerialGemv(blocksize,AA,xx,yy);
3231  }
3232  }
3233  }
3234 
3235  KOKKOS_INLINE_FUNCTION
3236  void
3237  operator() (const SeqTag &, const member_type &member) const {
3238 
3239  // constants
3240  const local_ordinal_type blocksize = blocksize_requested;
3241  const local_ordinal_type blocksize_square = blocksize*blocksize;
3242 
3243  const local_ordinal_type lr = member.league_rank();
3244  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3245  const local_ordinal_type num_vectors = y.extent(1);
3246 
3247  // subview pattern
3248  auto bb = Kokkos::subview(b, block_range, 0);
3249  auto xx = bb;
3250  auto yy = Kokkos::subview(y, block_range, 0);
3251  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3252 
3253  const local_ordinal_type row = lr*blocksize;
3254  for (local_ordinal_type col=0;col<num_vectors;++col) {
3255  // y := b
3256  yy.assign_data(&y(row, col));
3257  bb.assign_data(&b(row, col));
3258  if (member.team_rank() == 0)
3259  VectorCopy(member, blocksize, bb, yy);
3260  member.team_barrier();
3261 
3262  // y -= Rx
3263  const size_type A_k0 = A_rowptr[lr];
3264  Kokkos::parallel_for
3265  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3266  [&](const local_ordinal_type &k) {
3267  const size_type j = A_k0 + colindsub[k];
3268  A_block.assign_data( &tpetra_values(j*blocksize_square) );
3269  xx.assign_data( &x(A_colind[j]*blocksize, col) );
3270  VectorGemv(member, blocksize, A_block, xx, yy);
3271  });
3272  }
3273  }
3274 
3275  template<int B>
3276  struct AsyncTag {};
3277 
3278  template<int B>
3279  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3280  KOKKOS_INLINE_FUNCTION
3281  void
3282  operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
3283  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3284  const local_ordinal_type blocksize_square = blocksize*blocksize;
3285 
3286  // constants
3287  const local_ordinal_type partidx = rowidx2part(rowidx);
3288  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3289  const local_ordinal_type v = partidx % vector_length;
3290 
3291  const local_ordinal_type num_vectors = y_packed.extent(2);
3292  const local_ordinal_type num_local_rows = lclrow.extent(0);
3293 
3294  // temporary buffer for y flat
3295  impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
3296 
3297  const local_ordinal_type lr = lclrow(rowidx);
3298  const local_ordinal_type row = lr*blocksize;
3299  for (local_ordinal_type col=0;col<num_vectors;++col) {
3300  // y := b
3301  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3302 
3303  // y -= Rx
3304  const size_type A_k0 = A_rowptr[lr];
3305  for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
3306  const size_type j = A_k0 + colindsub[k];
3307  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3308  const local_ordinal_type A_colind_at_j = A_colind[j];
3309  if (A_colind_at_j < num_local_rows) {
3310  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3311  const impl_scalar_type * const xx = &x(loc*blocksize, col);
3312  SerialGemv(blocksize, AA,xx,yy);
3313  } else {
3314  const auto loc = A_colind_at_j - num_local_rows;
3315  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3316  SerialGemv(blocksize, AA,xx_remote,yy);
3317  }
3318  }
3319  // move yy to y_packed
3320  for (local_ordinal_type k=0;k<blocksize;++k)
3321  y_packed(pri, k, col)[v] = yy[k];
3322  }
3323  }
3324 
3325  template<int B>
3326  KOKKOS_INLINE_FUNCTION
3327  void
3328  operator() (const AsyncTag<B> &, const member_type &member) const {
3329  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3330  const local_ordinal_type blocksize_square = blocksize*blocksize;
3331 
3332  // constants
3333  const local_ordinal_type rowidx = member.league_rank();
3334  const local_ordinal_type partidx = rowidx2part(rowidx);
3335  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3336  const local_ordinal_type v = partidx % vector_length;
3337 
3338  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3339  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3340  const local_ordinal_type num_local_rows = lclrow.extent(0);
3341 
3342  // subview pattern
3343  auto bb = Kokkos::subview(b, block_range, 0);
3344  auto xx = Kokkos::subview(x, block_range, 0);
3345  auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
3346  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3347  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3348 
3349  const local_ordinal_type lr = lclrow(rowidx);
3350  const local_ordinal_type row = lr*blocksize;
3351  for (local_ordinal_type col=0;col<num_vectors;++col) {
3352  // y := b
3353  bb.assign_data(&b(row, col));
3354  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3355  if (member.team_rank() == 0)
3356  VectorCopy(member, blocksize, bb, yy);
3357  member.team_barrier();
3358 
3359  // y -= Rx
3360  const size_type A_k0 = A_rowptr[lr];
3361  Kokkos::parallel_for
3362  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3363  [&](const local_ordinal_type &k) {
3364  const size_type j = A_k0 + colindsub[k];
3365  A_block.assign_data( &tpetra_values(j*blocksize_square) );
3366 
3367  const local_ordinal_type A_colind_at_j = A_colind[j];
3368  if (A_colind_at_j < num_local_rows) {
3369  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3370  xx.assign_data( &x(loc*blocksize, col) );
3371  VectorGemv(member, blocksize, A_block, xx, yy);
3372  } else {
3373  const auto loc = A_colind_at_j - num_local_rows;
3374  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3375  VectorGemv(member, blocksize, A_block, xx_remote, yy);
3376  }
3377  });
3378  }
3379  }
3380 
3381  template <int P, int B> struct OverlapTag {};
3382 
3383  template<int P, int B>
3384  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3385  KOKKOS_INLINE_FUNCTION
3386  void
3387  operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
3388  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3389  const local_ordinal_type blocksize_square = blocksize*blocksize;
3390 
3391  // constants
3392  const local_ordinal_type partidx = rowidx2part(rowidx);
3393  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3394  const local_ordinal_type v = partidx % vector_length;
3395 
3396  const local_ordinal_type num_vectors = y_packed.extent(2);
3397  const local_ordinal_type num_local_rows = lclrow.extent(0);
3398 
3399  // temporary buffer for y flat
3400  impl_scalar_type yy[max_blocksize] = {};
3401 
3402  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3403  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3404 
3405  const local_ordinal_type lr = lclrow(rowidx);
3406  const local_ordinal_type row = lr*blocksize;
3407  for (local_ordinal_type col=0;col<num_vectors;++col) {
3408  if (P == 0) {
3409  // y := b
3410  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3411  } else {
3412  // y (temporary) := 0
3413  memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
3414  }
3415 
3416  // y -= Rx
3417  const size_type A_k0 = A_rowptr[lr];
3418  for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
3419  const size_type j = A_k0 + colindsub_used[k];
3420  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3421  const local_ordinal_type A_colind_at_j = A_colind[j];
3422  if (P == 0) {
3423  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3424  const impl_scalar_type * const xx = &x(loc*blocksize, col);
3425  SerialGemv(blocksize,AA,xx,yy);
3426  } else {
3427  const auto loc = A_colind_at_j - num_local_rows;
3428  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3429  SerialGemv(blocksize,AA,xx_remote,yy);
3430  }
3431  }
3432  // move yy to y_packed
3433  if (P == 0) {
3434  for (local_ordinal_type k=0;k<blocksize;++k)
3435  y_packed(pri, k, col)[v] = yy[k];
3436  } else {
3437  for (local_ordinal_type k=0;k<blocksize;++k)
3438  y_packed(pri, k, col)[v] += yy[k];
3439  }
3440  }
3441  }
3442 
3443  template<int P, int B>
3444  KOKKOS_INLINE_FUNCTION
3445  void
3446  operator() (const OverlapTag<P,B> &, const member_type &member) const {
3447  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3448  const local_ordinal_type blocksize_square = blocksize*blocksize;
3449 
3450  // constants
3451  const local_ordinal_type rowidx = member.league_rank();
3452  const local_ordinal_type partidx = rowidx2part(rowidx);
3453  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3454  const local_ordinal_type v = partidx % vector_length;
3455 
3456  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3457  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3458  const local_ordinal_type num_local_rows = lclrow.extent(0);
3459 
3460  // subview pattern
3461  auto bb = Kokkos::subview(b, block_range, 0);
3462  auto xx = bb; //Kokkos::subview(x, block_range, 0);
3463  auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0);
3464  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3465  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3466  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3467  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3468 
3469  const local_ordinal_type lr = lclrow(rowidx);
3470  const local_ordinal_type row = lr*blocksize;
3471  for (local_ordinal_type col=0;col<num_vectors;++col) {
3472  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3473  if (P == 0) {
3474  // y := b
3475  bb.assign_data(&b(row, col));
3476  if (member.team_rank() == 0)
3477  VectorCopy(member, blocksize, bb, yy);
3478  member.team_barrier();
3479  }
3480 
3481  // y -= Rx
3482  const size_type A_k0 = A_rowptr[lr];
3483  Kokkos::parallel_for
3484  (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
3485  [&](const local_ordinal_type &k) {
3486  const size_type j = A_k0 + colindsub_used[k];
3487  A_block.assign_data( &tpetra_values(j*blocksize_square) );
3488 
3489  const local_ordinal_type A_colind_at_j = A_colind[j];
3490  if (P == 0) {
3491  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3492  xx.assign_data( &x(loc*blocksize, col) );
3493  VectorGemv(member, blocksize, A_block, xx, yy);
3494  } else {
3495  const auto loc = A_colind_at_j - num_local_rows;
3496  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3497  VectorGemv(member, blocksize, A_block, xx_remote, yy);
3498  }
3499  });
3500  }
3501  }
3502 
3503  // y = b - Rx; seq method
3504  template<typename MultiVectorLocalViewTypeY,
3505  typename MultiVectorLocalViewTypeB,
3506  typename MultiVectorLocalViewTypeX>
3507  void run(const MultiVectorLocalViewTypeY &y_,
3508  const MultiVectorLocalViewTypeB &b_,
3509  const MultiVectorLocalViewTypeX &x_) {
3510  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3511  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");
3512 
3513  y = y_; b = b_; x = x_;
3514  if (is_cuda<execution_space>::value) {
3515 #if defined(KOKKOS_ENABLE_CUDA)
3516  const local_ordinal_type blocksize = blocksize_requested;
3517  const local_ordinal_type team_size = 8;
3518  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3519  const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
3520  Kokkos::parallel_for
3521  ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
3522 #endif
3523  } else if(is_hip<execution_space>::value) {
3524 #if defined(KOKKOS_ENABLE_HIP)
3525  const local_ordinal_type blocksize = blocksize_requested;
3526  const local_ordinal_type team_size = 8;
3527  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3528  const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
3529  Kokkos::parallel_for
3530  ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
3531 #endif
3532  } else {
3533 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3534  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3535 #else
3536  const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
3537  Kokkos::parallel_for
3538  ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
3539 #endif
3540  }
3541  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3542  }
3543 
3544  // y = b - R (x , x_remote)
3545  template<typename MultiVectorLocalViewTypeB,
3546  typename MultiVectorLocalViewTypeX,
3547  typename MultiVectorLocalViewTypeX_Remote>
3548  void run(const vector_type_3d_view &y_packed_,
3549  const MultiVectorLocalViewTypeB &b_,
3550  const MultiVectorLocalViewTypeX &x_,
3551  const MultiVectorLocalViewTypeX_Remote &x_remote_) {
3552  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3553  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");
3554 
3555  b = b_; x = x_; x_remote = x_remote_;
3556  if (is_cuda<execution_space>::value) {
3557 #if defined(KOKKOS_ENABLE_CUDA)
3558  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3559  y_packed_.extent(0),
3560  y_packed_.extent(1),
3561  y_packed_.extent(2),
3562  vector_length);
3563 #endif
3564  } else if (is_hip<execution_space>::value) {
3565 #if defined(KOKKOS_ENABLE_HIP)
3566  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3567  y_packed_.extent(0),
3568  y_packed_.extent(1),
3569  y_packed_.extent(2),
3570  vector_length);
3571 #endif
3572  } else {
3573  y_packed = y_packed_;
3574  }
3575 
3576  if (is_cuda<execution_space>::value) {
3577 #if defined(KOKKOS_ENABLE_CUDA)
3578  const local_ordinal_type blocksize = blocksize_requested;
3579  const local_ordinal_type team_size = 8;
3580  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3581  // local_ordinal_type vl_power_of_two = 1;
3582  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3583  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3584  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3585 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3586  const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
3587  policy(rowidx2part.extent(0), team_size, vector_size); \
3588  Kokkos::parallel_for \
3589  ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
3590  policy, *this); } break
3591  switch (blocksize_requested) {
3592  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3593  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3594  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3595  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3596  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3597  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3598  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3599  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3600  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3601  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3602  }
3603 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3604 #endif
3605  } else if (is_hip<execution_space>::value) {
3606 #if defined(KOKKOS_ENABLE_HIP)
3607  const local_ordinal_type blocksize = blocksize_requested;
3608  const local_ordinal_type team_size = 8;
3609  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3610  // local_ordinal_type vl_power_of_two = 1;
3611  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3612  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3613  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3614 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3615  const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
3616  policy(rowidx2part.extent(0), team_size, vector_size); \
3617  Kokkos::parallel_for \
3618  ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
3619  policy, *this); } break
3620  switch (blocksize_requested) {
3621  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3622  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3623  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3624  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3625  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3626  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3627  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3628  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3629  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3630  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3631  }
3632 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3633 #endif
3634  } else {
3635 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3636  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3637 #else
3638 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3639  const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
3640  Kokkos::parallel_for \
3641  ("ComputeResidual::RangePolicy::run<AsyncTag>", \
3642  policy, *this); } break
3643  switch (blocksize_requested) {
3644  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3645  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3646  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3647  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3648  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3649  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3650  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3651  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3652  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3653  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3654  }
3655 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3656 #endif
3657  }
3658  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3659  }
3660 
3661  // y = b - R (y , y_remote)
3662  template<typename MultiVectorLocalViewTypeB,
3663  typename MultiVectorLocalViewTypeX,
3664  typename MultiVectorLocalViewTypeX_Remote>
3665  void run(const vector_type_3d_view &y_packed_,
3666  const MultiVectorLocalViewTypeB &b_,
3667  const MultiVectorLocalViewTypeX &x_,
3668  const MultiVectorLocalViewTypeX_Remote &x_remote_,
3669  const bool compute_owned) {
3670  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3671  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");
3672 
3673  b = b_; x = x_; x_remote = x_remote_;
3674  if (is_cuda<execution_space>::value) {
3675 #if defined(KOKKOS_ENABLE_CUDA)
3676  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3677  y_packed_.extent(0),
3678  y_packed_.extent(1),
3679  y_packed_.extent(2),
3680  vector_length);
3681 #endif
3682  } else if (is_hip<execution_space>::value) {
3683 #if defined(KOKKOS_ENABLE_HIP)
3684  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3685  y_packed_.extent(0),
3686  y_packed_.extent(1),
3687  y_packed_.extent(2),
3688  vector_length);
3689 #endif
3690  } else {
3691  y_packed = y_packed_;
3692  }
3693 
3694  if (is_cuda<execution_space>::value) {
3695 #if defined(KOKKOS_ENABLE_CUDA)
3696  const local_ordinal_type blocksize = blocksize_requested;
3697  const local_ordinal_type team_size = 8;
3698  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3699  // local_ordinal_type vl_power_of_two = 1;
3700  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3701  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3702  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3703 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3704  if (compute_owned) { \
3705  const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
3706  policy(rowidx2part.extent(0), team_size, vector_size); \
3707  Kokkos::parallel_for \
3708  ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3709  } else { \
3710  const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
3711  policy(rowidx2part.extent(0), team_size, vector_size); \
3712  Kokkos::parallel_for \
3713  ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3714  } break
3715  switch (blocksize_requested) {
3716  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3717  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3718  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3719  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3720  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3721  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3722  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3723  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3724  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3725  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3726  }
3727 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3728 #endif
3729  } else if (is_hip<execution_space>::value) {
3730 #if defined(KOKKOS_ENABLE_HIP)
3731  const local_ordinal_type blocksize = blocksize_requested;
3732  const local_ordinal_type team_size = 8;
3733  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3734  // local_ordinal_type vl_power_of_two = 1;
3735  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3736  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3737  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3738 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3739  if (compute_owned) { \
3740  const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
3741  policy(rowidx2part.extent(0), team_size, vector_size); \
3742  Kokkos::parallel_for \
3743  ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3744  } else { \
3745  const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
3746  policy(rowidx2part.extent(0), team_size, vector_size); \
3747  Kokkos::parallel_for \
3748  ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3749  } break
3750  switch (blocksize_requested) {
3751  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3752  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3753  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3754  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3755  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3756  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3757  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3758  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3759  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3760  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3761  }
3762 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3763 #endif
3764  } else {
3765 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3766  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3767 #else
3768 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3769  if (compute_owned) { \
3770  const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
3771  policy(0, rowidx2part.extent(0)); \
3772  Kokkos::parallel_for \
3773  ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
3774  } else { \
3775  const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
3776  policy(0, rowidx2part.extent(0)); \
3777  Kokkos::parallel_for \
3778  ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
3779  } break
3780 
3781  switch (blocksize_requested) {
3782  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3783  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3784  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3785  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3786  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3787  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3788  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3789  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3790  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3791  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3792  }
3793 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3794 #endif
3795  }
3796  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3797  }
3798  };
3799 
3800  template<typename MatrixType>
3801  void reduceVector(const ConstUnmanaged<typename ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
3802  /* */ typename ImplType<MatrixType>::magnitude_type *vals) {
3803  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3804  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ReduceVector");
3805 
3806  using impl_type = ImplType<MatrixType>;
3807  using local_ordinal_type = typename impl_type::local_ordinal_type;
3808  using impl_scalar_type = typename impl_type::impl_scalar_type;
3809 #if 0
3810  const auto norm2 = KokkosBlas::nrm1(zz);
3811 #else
3812  impl_scalar_type norm2(0);
3813  Kokkos::parallel_reduce
3814  ("ReduceMultiVector::Device",
3815  Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
3816  KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
3817  update += zz(i);
3818  }, norm2);
3819 #endif
3820  vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);
3821 
3822  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3823  }
3824 
3828  template<typename MatrixType>
3829  struct NormManager {
3830  public:
3832  using host_execution_space = typename impl_type::host_execution_space;
3833  using magnitude_type = typename impl_type::magnitude_type;
3834 
3835  private:
3836  bool collective_;
3837  int sweep_step_, sweep_step_upper_bound_;
3838 #ifdef HAVE_IFPACK2_MPI
3839  MPI_Request mpi_request_;
3840  MPI_Comm comm_;
3841 #endif
3842  magnitude_type work_[3];
3843 
3844  public:
3845  NormManager() = default;
3846  NormManager(const NormManager &b) = default;
3847  NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
3848  sweep_step_ = 1;
3849  sweep_step_upper_bound_ = 1;
3850  collective_ = comm->getSize() > 1;
3851  if (collective_) {
3852 #ifdef HAVE_IFPACK2_MPI
3853  const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
3854  TEUCHOS_ASSERT( ! mpi_comm.is_null());
3855  comm_ = *mpi_comm->getRawMpiComm();
3856 #endif
3857  }
3858  const magnitude_type zero(0), minus_one(-1);
3859  work_[0] = zero;
3860  work_[1] = zero;
3861  work_[2] = minus_one;
3862  }
3863 
3864  // Check the norm every sweep_step sweeps.
3865  void setCheckFrequency(const int sweep_step) {
3866  TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
3867  sweep_step_upper_bound_ = sweep_step;
3868  sweep_step_ = 1;
3869  }
3870 
3871  // Get the buffer into which to store rank-local squared norms.
3872  magnitude_type* getBuffer() { return &work_[0]; }
3873 
3874  // Call MPI_Iallreduce to find the global squared norms.
3875  void ireduce(const int sweep, const bool force = false) {
3876  if ( ! force && sweep % sweep_step_) return;
3877 
3878  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce");
3879 
3880  work_[1] = work_[0];
3881 #ifdef HAVE_IFPACK2_MPI
3882  auto send_data = &work_[1];
3883  auto recv_data = &work_[0];
3884  if (collective_) {
3885 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3886  MPI_Iallreduce(send_data, recv_data, 1,
3887  Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3888  MPI_SUM, comm_, &mpi_request_);
3889 # else
3890  MPI_Allreduce (send_data, recv_data, 1,
3891  Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3892  MPI_SUM, comm_);
3893 # endif
3894  }
3895 #endif
3896  }
3897 
3898  // Check if the norm-based termination criterion is met. tol2 is the
3899  // tolerance squared. Sweep is the sweep index. If not every iteration is
3900  // being checked, this function immediately returns false. If a check must
3901  // be done at this iteration, it waits for the reduction triggered by
3902  // ireduce to complete, then checks the global norm against the tolerance.
3903  bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
3904  // early return
3905  if (sweep <= 0) return false;
3906 
3907  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone");
3908 
3909  TEUCHOS_ASSERT(sweep >= 1);
3910  if ( ! force && (sweep - 1) % sweep_step_) return false;
3911  if (collective_) {
3912 #ifdef HAVE_IFPACK2_MPI
3913 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3914  MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
3915 # else
3916  // Do nothing.
3917 # endif
3918 #endif
3919  }
3920  bool r_val = false;
3921  if (sweep == 1) {
3922  work_[2] = work_[0];
3923  } else {
3924  r_val = (work_[0] < tol2*work_[2]);
3925  }
3926 
3927  // adjust sweep step
3928  const auto adjusted_sweep_step = 2*sweep_step_;
3929  if (adjusted_sweep_step < sweep_step_upper_bound_) {
3930  sweep_step_ = adjusted_sweep_step;
3931  } else {
3932  sweep_step_ = sweep_step_upper_bound_;
3933  }
3934  return r_val;
3935  }
3936 
3937  // After termination has occurred, finalize the norms for use in
3938  // get_norms{0,final}.
3939  void finalize () {
3940  work_[0] = std::sqrt(work_[0]); // after converged
3941  if (work_[2] >= 0)
3942  work_[2] = std::sqrt(work_[2]); // first norm
3943  // if work_[2] is minus one, then norm is not requested.
3944  }
3945 
3946  // Report norms to the caller.
3947  const magnitude_type getNorms0 () const { return work_[2]; }
3948  const magnitude_type getNormsFinal () const { return work_[0]; }
3949  };
3950 
3954  template<typename MatrixType>
3955  int
3956  applyInverseJacobi(// importer
3957  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
3958  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
3959  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
3960  const bool overlap_communication_and_computation,
3961  // tpetra interface
3962  const typename ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
3963  /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
3964  /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
3965  /* */ typename ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
3966  // local object interface
3967  const PartInterface<MatrixType> &interf, // mesh interface
3968  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
3969  const AmD<MatrixType> &amd, // R = A - D
3970  /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
3971  /* */ NormManager<MatrixType> &norm_manager,
3972  // preconditioner parameters
3973  const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
3974  /* */ bool is_y_zero,
3975  const int max_num_sweeps,
3976  const typename ImplType<MatrixType>::magnitude_type tol,
3977  const int check_tol_every) {
3978  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi");
3979 
3980  using impl_type = ImplType<MatrixType>;
3981  using node_memory_space = typename impl_type::node_memory_space;
3982  using local_ordinal_type = typename impl_type::local_ordinal_type;
3983  using size_type = typename impl_type::size_type;
3984  using impl_scalar_type = typename impl_type::impl_scalar_type;
3985  using magnitude_type = typename impl_type::magnitude_type;
3986  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3987  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
3988  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3989  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3990 
3991  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3992 
3993  // either tpetra importer or async importer must be active
3994  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
3995  "Neither Tpetra importer nor Async importer is null.");
3996  // max number of sweeps should be positive number
3997  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
3998  "Maximum number of sweeps must be >= 1.");
3999 
4000  // const parameters
4001  const bool is_seq_method_requested = !tpetra_importer.is_null();
4002  const bool is_async_importer_active = !async_importer.is_null();
4003  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4004  const magnitude_type tolerance = tol*tol;
4005  const local_ordinal_type blocksize = btdm.values.extent(1);
4006  const local_ordinal_type num_vectors = Y.getNumVectors();
4007  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4008 
4009  const impl_scalar_type zero(0.0);
4010 
4011  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4012  "The seq method for applyInverseJacobi, " <<
4013  "which in any case is for developer use only, " <<
4014  "does not support norm-based termination.");
4015  const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4016  Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4017  TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
4018  std::invalid_argument,
4019  "The seq method for applyInverseJacobi, " <<
4020  "which in any case is for developer use only, " <<
4021  "only supports memory spaces accessible from host.");
4022 
4023  // if workspace is needed more, resize it
4024  const size_type work_span_required = num_blockrows*num_vectors*blocksize;
4025  if (work.span() < work_span_required)
4026  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
4027 
4028  // construct W
4029  const local_ordinal_type W_size = interf.packptr.extent(0)-1;
4030  if (local_ordinal_type(W.extent(0)) < W_size)
4031  W = impl_scalar_type_1d_view("W", W_size);
4032 
4033  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4034  {
4035  if (is_seq_method_requested) {
4036  if (Z.getNumVectors() != Y.getNumVectors())
4037  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
4038  } else {
4039  if (is_async_importer_active) {
4040  // create comm data buffer and keep it here
4041  async_importer->createDataBuffer(num_vectors);
4042  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4043  }
4044  }
4045  }
4046 
4047  // wrap the workspace with 3d view
4048  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4049  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4050  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4051  const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4052  if (is_y_zero) Kokkos::deep_copy(YY, zero);
4053 
4054  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
4055  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4056  damping_factor, is_norm_manager_active);
4057 
4058  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4059  ComputeResidualVector<MatrixType>
4060  compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf,
4061  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
4062 
4063  // norm manager workspace resize
4064  if (is_norm_manager_active)
4065  norm_manager.setCheckFrequency(check_tol_every);
4066 
4067  // iterate
4068  int sweep = 0;
4069  for (;sweep<max_num_sweeps;++sweep) {
4070  {
4071  if (is_y_zero) {
4072  // pmv := x(lclrow)
4073  multivector_converter.run(XX);
4074  } else {
4075  if (is_seq_method_requested) {
4076  // SEQ METHOD IS TESTING ONLY
4077 
4078  // y := x - R y
4079  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
4080  compute_residual_vector.run(YY, XX, ZZ);
4081 
4082  // pmv := y(lclrow).
4083  multivector_converter.run(YY);
4084  } else {
4085  // fused y := x - R y and pmv := y(lclrow);
4086  // real use case does not use overlap comp and comm
4087  if (overlap_communication_and_computation || !is_async_importer_active) {
4088  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
4089  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
4090  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
4091  if (is_async_importer_active) async_importer->cancel();
4092  break;
4093  }
4094  if (is_async_importer_active) {
4095  async_importer->syncRecv();
4096  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
4097  }
4098  } else {
4099  if (is_async_importer_active)
4100  async_importer->syncExchange(YY);
4101  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
4102  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
4103  }
4104  }
4105  }
4106  }
4107 
4108  // pmv := inv(D) pmv.
4109  {
4110  solve_tridiags.run(YY, W);
4111  }
4112  {
4113  if (is_norm_manager_active) {
4114  // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
4115  reduceVector<MatrixType>(W, norm_manager.getBuffer());
4116  if (sweep + 1 == max_num_sweeps) {
4117  norm_manager.ireduce(sweep, true);
4118  norm_manager.checkDone(sweep + 1, tolerance, true);
4119  } else {
4120  norm_manager.ireduce(sweep);
4121  }
4122  }
4123  }
4124  is_y_zero = false;
4125  }
4126 
4127  //sqrt the norms for the caller's use.
4128  if (is_norm_manager_active) norm_manager.finalize();
4129 
4130  return sweep;
4131  }
4132 
4133 
4134  template<typename MatrixType>
4135  struct ImplObject {
4136  using impl_type = ImplType<MatrixType>;
4137  using part_interface_type = PartInterface<MatrixType>;
4138  using block_tridiags_type = BlockTridiags<MatrixType>;
4139  using amd_type = AmD<MatrixType>;
4140  using norm_manager_type = NormManager<MatrixType>;
4141  using async_import_type = AsyncableImport<MatrixType>;
4142 
4143  // distructed objects
4144  Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
4145  Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
4146  Teuchos::RCP<async_import_type> async_importer;
4147  bool overlap_communication_and_computation;
4148 
4149  // copy of Y (mutable to penentrate const)
4150  mutable typename impl_type::tpetra_multivector_type Z;
4151  mutable typename impl_type::impl_scalar_type_1d_view W;
4152 
4153  // local objects
4154  part_interface_type part_interface;
4155  block_tridiags_type block_tridiags; // D
4156  amd_type a_minus_d; // R = A - D
4157  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
4158  mutable norm_manager_type norm_manager;
4159  };
4160 
4161  } // namespace BlockTriDiContainerDetails
4162 
4163 } // namespace Ifpack2
4164 
4165 #endif
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:178
BlockTridiags< MatrixType > createBlockTridiags(const PartInterface< MatrixType > &interf)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1346
int applyInverseJacobi(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, const bool overlap_communication_and_computation, const typename ImplType< MatrixType >::tpetra_multivector_type &X, typename ImplType< MatrixType >::tpetra_multivector_type &Y, typename ImplType< MatrixType >::tpetra_multivector_type &Z, typename ImplType< MatrixType >::impl_scalar_type_1d_view &W, const PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const AmD< MatrixType > &amd, typename ImplType< MatrixType >::vector_type_1d_view &work, NormManager< MatrixType > &norm_manager, const typename ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3956
PartInterface< MatrixType > createPartInterface(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const Teuchos::Array< Teuchos::Array< typename ImplType< MatrixType >::local_ordinal_type > > &partitions)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1135
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:252
void performNumericPhase(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, const typename ImplType< MatrixType >::magnitude_type tiny)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:2234
static int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize, const int team_size)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3007
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:399
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:343
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:204
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:187
size_t size_type
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:334
std::string get_msg_prefix(const CommPtrType &comm)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:240
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:279
Kokkos::ViewAllocateWithoutInitializing do_not_initialize_tag
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:121
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:165
KB::Vector< T, l > Vector
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:386
Teuchos::RCP< AsyncableImport< MatrixType > > createBlockCrsAsyncImporter(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1021
node_type::device_type node_device_type
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:357
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3829
Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_import_type > createBlockCrsTpetraImporter(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:423
Kokkos::DefaultHostExecutionSpace host_execution_space
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:352
void performSymbolicPhase(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, AmD< MatrixType > &amd, const bool overlap_communication_and_computation)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1533
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1503
Preconditioners and smoothers for Tpetra sparse matrices.
Definition: Ifpack2_AdditiveSchwarz_decl.hpp:74
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1302
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:2396
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:330
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:2247
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:195