Tpetra parallel linear algebra  Version of the Day
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
42 
43 #include "TpetraCore_config.h"
44 #include "Teuchos_Array.hpp"
45 #include "Teuchos_ArrayView.hpp"
46 #include "Teuchos_OrdinalTraits.hpp"
54 #include "Kokkos_Core.hpp"
55 #include <memory>
56 #include <string>
57 
78 
79 namespace Tpetra {
80 
81 //
82 // Users must never rely on anything in the Details namespace.
83 //
84 namespace Details {
85 
86 namespace UnpackAndCombineCrsMatrixImpl {
87 
97 template<class ST, class LO, class GO>
98 KOKKOS_FUNCTION int
99 unpackRow(const typename PackTraits<GO>::output_array_type& gids_out,
100  const typename PackTraits<int>::output_array_type& pids_out,
101  const typename PackTraits<ST>::output_array_type& vals_out,
102  const char imports[],
103  const size_t offset,
104  const size_t /* num_bytes */,
105  const size_t num_ent,
106  const size_t bytes_per_value)
107 {
108  if (num_ent == 0) {
109  // Empty rows always take zero bytes, to ensure sparsity.
110  return 0;
111  }
112  bool unpack_pids = pids_out.size() > 0;
113 
114  const size_t num_ent_beg = offset;
115  const size_t num_ent_len = PackTraits<LO>::packValueCount (LO (0));
116 
117  const size_t gids_beg = num_ent_beg + num_ent_len;
118  const size_t gids_len =
119  num_ent * PackTraits<GO>::packValueCount (GO (0));
120 
121  const size_t pids_beg = gids_beg + gids_len;
122  const size_t pids_len = unpack_pids ?
123  size_t (num_ent * PackTraits<int>::packValueCount (int (0))) :
124  size_t (0);
125 
126  const size_t vals_beg = gids_beg + gids_len + pids_len;
127  const size_t vals_len = num_ent * bytes_per_value;
128 
129  const char* const num_ent_in = imports + num_ent_beg;
130  const char* const gids_in = imports + gids_beg;
131  const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
132  const char* const vals_in = imports + vals_beg;
133 
134  size_t num_bytes_out = 0;
135  LO num_ent_out;
136  num_bytes_out += PackTraits<LO>::unpackValue (num_ent_out, num_ent_in);
137  if (static_cast<size_t> (num_ent_out) != num_ent) {
138  return 20; // error code
139  }
140 
141  {
142  Kokkos::pair<int, size_t> p;
143  p = PackTraits<GO>::unpackArray (gids_out.data (), gids_in, num_ent);
144  if (p.first != 0) {
145  return 21; // error code
146  }
147  num_bytes_out += p.second;
148 
149  if (unpack_pids) {
150  p = PackTraits<int>::unpackArray (pids_out.data (), pids_in, num_ent);
151  if (p.first != 0) {
152  return 22; // error code
153  }
154  num_bytes_out += p.second;
155  }
156 
157  p = PackTraits<ST>::unpackArray (vals_out.data (), vals_in, num_ent);
158  if (p.first != 0) {
159  return 23; // error code
160  }
161  num_bytes_out += p.second;
162  }
163 
164  const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
165  if (num_bytes_out != expected_num_bytes) {
166  return 24; // error code
167  }
168  return 0; // no errors
169 }
170 
181 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
183  typedef LocalMatrix local_matrix_type;
184  typedef LocalMap local_map_type;
185 
186  typedef typename local_matrix_type::value_type ST;
187  typedef typename local_map_type::local_ordinal_type LO;
188  typedef typename local_map_type::global_ordinal_type GO;
189  typedef typename local_map_type::device_type DT;
190  typedef typename DT::execution_space XS;
191 
192  typedef Kokkos::View<const size_t*, BufferDeviceType>
193  num_packets_per_lid_type;
194  typedef Kokkos::View<const size_t*, DT> offsets_type;
195  typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
196  typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
197 
198  typedef Kokkos::View<int, DT> error_type;
199  using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
200 
201  static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
202  "LocalMap::local_ordinal_type and "
203  "LocalMatrix::ordinal_type must be the same.");
204 
205  local_matrix_type local_matrix;
206  local_map_type local_col_map;
207  input_buffer_type imports;
208  num_packets_per_lid_type num_packets_per_lid;
209  import_lids_type import_lids;
210  Kokkos::View<const LO*[2], DT> batch_info;
211  offsets_type offsets;
212  Tpetra::CombineMode combine_mode;
213  size_t batch_size;
214  size_t bytes_per_value;
215  bool atomic;
216  error_type error_code;
217 
219  const local_matrix_type& local_matrix_in,
220  const local_map_type& local_col_map_in,
221  const input_buffer_type& imports_in,
222  const num_packets_per_lid_type& num_packets_per_lid_in,
223  const import_lids_type& import_lids_in,
224  const Kokkos::View<const LO*[2], DT>& batch_info_in,
225  const offsets_type& offsets_in,
226  const Tpetra::CombineMode combine_mode_in,
227  const size_t batch_size_in,
228  const size_t bytes_per_value_in,
229  const bool atomic_in) :
230  local_matrix (local_matrix_in),
231  local_col_map (local_col_map_in),
232  imports (imports_in),
233  num_packets_per_lid (num_packets_per_lid_in),
234  import_lids (import_lids_in),
235  batch_info (batch_info_in),
236  offsets (offsets_in),
237  combine_mode (combine_mode_in),
238  batch_size (batch_size_in),
239  bytes_per_value (bytes_per_value_in),
240  atomic (atomic_in),
241  error_code("error")
242  {}
243 
244  KOKKOS_INLINE_FUNCTION
245  void operator()(member_type team_member) const
246  {
247  using Kokkos::View;
248  using Kokkos::subview;
249  using Kokkos::MemoryUnmanaged;
250 
251  const LO batch = team_member.league_rank();
252  const LO lid_no = batch_info(batch, 0);
253  const LO batch_no = batch_info(batch, 1);
254 
255  const size_t num_bytes = num_packets_per_lid(lid_no);
256 
257  // Only unpack data if there is a nonzero number of bytes.
258  if (num_bytes == 0)
259  return;
260 
261  // there is actually something in the row
262  const LO import_lid = import_lids(lid_no);
263  const size_t buf_size = imports.size();
264  const size_t offset = offsets(lid_no);
265 
266  // Get the number of entries to expect in the received data for this row.
267  LO num_ent_LO = 0;
268  const char* const in_buf = imports.data() + offset;
269  (void) PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
270  const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
271 
272  // Count the number of bytes expected to unpack
273  size_t expected_num_bytes = 0;
274  {
275  expected_num_bytes += PackTraits<LO>::packValueCount(LO(0));
276  expected_num_bytes += num_entries_in_row * PackTraits<GO>::packValueCount(GO(0));
277  expected_num_bytes += num_entries_in_row * PackTraits<ST>::packValueCount(ST());
278  }
279 
280  if (expected_num_bytes > num_bytes)
281  {
282 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
283 #ifndef KOKKOS_ENABLE_SYCL
284  printf(
285  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
286  "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
287  (int) lid_no, (int) expected_num_bytes, (int) num_bytes
288  );
289 #endif
290  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
291  return;
292  }
293 
294  if (offset > buf_size || offset + num_bytes > buf_size)
295  {
296 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
297 #ifndef KOKKOS_ENABLE_SYCL
298  printf(
299  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
300  "At row %d, the offset (%d) > buffer size (%d)\n",
301  (int) lid_no, (int) offset, (int) buf_size
302  );
303 #endif
304  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
305  return;
306  }
307 
308  // Determine the number of entries to unpack in this batch
309  size_t num_entries_in_batch = 0;
310  if (num_entries_in_row <= batch_size)
311  num_entries_in_batch = num_entries_in_row;
312  else if (num_entries_in_row >= (batch_no + 1) * batch_size)
313  num_entries_in_batch = batch_size;
314  else
315  num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
316 
317  const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
318  const size_t num_ent_start = offset;
319  const size_t num_ent_end = num_ent_start + bytes_per_lid;
320 
321  const size_t bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
322  const size_t gids_start = num_ent_end;
323  const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
324 
325  const size_t vals_start = gids_end;
326 
327  const size_t shift = batch_no * batch_size;
328  const char* const num_ent_in = imports.data() + num_ent_start;
329  const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
330  const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
331 
332  LO num_ent_out;
333  (void)PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
334  if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
335  {
336 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
337 #ifndef KOKKOS_ENABLE_SYCL
338  printf(
339  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
340  "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
341  (int) lid_no, (int) num_entries_in_row, (int) num_ent_out
342  );
343 #endif
344  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
345  }
346 
347  constexpr bool matrix_has_sorted_rows = true; // see #6282
348  Kokkos::parallel_for(
349  Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
350  KOKKOS_LAMBDA(const LO& j)
351  {
352  size_t distance = 0;
353 
354  GO gid_out;
355  distance = j * bytes_per_gid;
356  (void) PackTraits<GO>::unpackValue(gid_out, gids_in + distance);
357  auto lid_out = local_col_map.getLocalElement(gid_out);
358 
359  // Column indices come in as global indices, in case the
360  // source object's column Map differs from the target object's
361  // (this's) column Map, and must be converted local index values
362 
363  // assume that ST is default constructible
364  ST val_out;
365  distance = j * bytes_per_value;
366  (void) PackTraits<ST>::unpackValue(val_out, vals_in + distance);
367 
368  if (combine_mode == ADD) {
369  // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
370  // different threads don't touch the same row (i.e., no
371  // duplicates in incoming LIDs list).
372  const bool use_atomic_updates = atomic;
373  (void)local_matrix.sumIntoValues(
374  import_lid,
375  &lid_out,
376  1,
377  &val_out,
378  matrix_has_sorted_rows,
379  use_atomic_updates
380  );
381  } else if (combine_mode == REPLACE) {
382  // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
383  // combine mode with multiple incoming rows that touch the same
384  // target matrix entries, so we never need atomic updates.
385  const bool use_atomic_updates = false;
386  (void)local_matrix.replaceValues(
387  import_lid,
388  &lid_out,
389  1,
390  &val_out,
391  matrix_has_sorted_rows,
392  use_atomic_updates
393  );
394  } else {
395  // should never get here
396 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
397 #ifndef KOKKOS_ENABLE_SYCL
398  printf(
399  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
400  "At row %d, an unknown error occurred during unpack\n", (int) lid_no
401  );
402 #endif
403  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
404  }
405  }
406  );
407 
408  team_member.team_barrier();
409 
410  }
411 
413  int error() const {
414  auto error_code_h = Kokkos::create_mirror_view_and_copy(
415  Kokkos::HostSpace(), error_code
416  );
417  return error_code_h();
418  }
419 
420 };
421 
422 struct MaxNumEntTag {};
423 struct TotNumEntTag {};
424 
433 template<class LO, class DT, class BDT>
435 public:
436  typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
437  typedef Kokkos::View<const size_t*, DT> offsets_type;
438  typedef Kokkos::View<const char*, BDT> input_buffer_type;
439  // This needs to be public, since it appears in the argument list of
440  // public methods (see below). Otherwise, build errors may happen.
441  typedef size_t value_type;
442 
443 private:
444  num_packets_per_lid_type num_packets_per_lid;
445  offsets_type offsets;
446  input_buffer_type imports;
447 
448 public:
449  NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
450  const offsets_type& offsets_in,
451  const input_buffer_type& imports_in) :
452  num_packets_per_lid (num_packets_per_lid_in),
453  offsets (offsets_in),
454  imports (imports_in)
455  {}
456 
457  KOKKOS_INLINE_FUNCTION void
458  operator() (const MaxNumEntTag, const LO i, value_type& update) const {
459  // Get how many entries to expect in the received data for this row.
460  const size_t num_bytes = num_packets_per_lid(i);
461  if (num_bytes > 0) {
462  LO num_ent_LO = 0; // output argument of unpackValue
463  const char* const in_buf = imports.data () + offsets(i);
464  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
465  const size_t num_ent = static_cast<size_t> (num_ent_LO);
466 
467  update = (update < num_ent) ? num_ent : update;
468  }
469  }
470 
471  KOKKOS_INLINE_FUNCTION void
472  join (const MaxNumEntTag,
473  volatile value_type& dst,
474  const volatile value_type& src) const
475  {
476  if (dst < src) dst = src;
477  }
478 
479  KOKKOS_INLINE_FUNCTION void
480  operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
481  // Get how many entries to expect in the received data for this row.
482  const size_t num_bytes = num_packets_per_lid(i);
483  if (num_bytes > 0) {
484  LO num_ent_LO = 0; // output argument of unpackValue
485  const char* const in_buf = imports.data () + offsets(i);
486  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
487  tot_num_ent += static_cast<size_t> (num_ent_LO);
488  }
489  }
490 };
491 
499 template<class LO, class DT, class BDT>
500 size_t
502  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
503  const Kokkos::View<const size_t*, DT>& offsets,
504  const Kokkos::View<const char*, BDT>& imports)
505 {
506  typedef typename DT::execution_space XS;
507  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
508  MaxNumEntTag> range_policy;
509 
510  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
511  imports);
512  const LO numRowsToUnpack =
513  static_cast<LO> (num_packets_per_lid.extent (0));
514  size_t max_num_ent = 0;
515  Kokkos::parallel_reduce ("Max num entries in CRS",
516  range_policy (0, numRowsToUnpack),
517  functor, max_num_ent);
518  return max_num_ent;
519 }
520 
528 template<class LO, class DT, class BDT>
529 size_t
531  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
532  const Kokkos::View<const size_t*, DT>& offsets,
533  const Kokkos::View<const char*, BDT>& imports)
534 {
535  typedef typename DT::execution_space XS;
536  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
537  size_t tot_num_ent = 0;
538  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
539  imports);
540  const LO numRowsToUnpack =
541  static_cast<LO> (num_packets_per_lid.extent (0));
542  Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
543  range_policy (0, numRowsToUnpack),
544  functor, tot_num_ent);
545  return tot_num_ent;
546 }
547 
548 template<class LO>
549 KOKKOS_INLINE_FUNCTION
550 size_t
551 unpackRowCount(const char imports[],
552  const size_t offset,
553  const size_t num_bytes)
554 {
555  using PT = PackTraits<LO>;
556 
557  LO num_ent_LO = 0;
558  if (num_bytes > 0) {
559  const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
560  if (p_num_bytes > num_bytes) {
561  return OrdinalTraits<size_t>::invalid();
562  }
563  const char* const in_buf = imports + offset;
564  (void) PT::unpackValue(num_ent_LO, in_buf);
565  }
566  return static_cast<size_t>(num_ent_LO);
567 }
568 
573 template<class View1, class View2>
574 inline
575 bool
577  const View1& batches_per_lid,
578  View2& batch_info
579 )
580 {
581  using LO = typename View2::value_type;
582  size_t batch = 0;
583  for (size_t i=0; i<batches_per_lid.extent(0); i++)
584  {
585  for (size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
586  {
587  batch_info(batch, 0) = static_cast<LO>(i);
588  batch_info(batch, 1) = batch_no;
589  batch++;
590  }
591  }
592  return batch == batch_info.extent(0);
593 }
594 
602 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
603 void
605  const LocalMatrix& local_matrix,
606  const LocalMap& local_map,
607  const Kokkos::View<const char*, BufferDeviceType>& imports,
608  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
610  const Tpetra::CombineMode combine_mode)
611 {
612  using ST = typename LocalMatrix::value_type;
613  using LO = typename LocalMap::local_ordinal_type;
614  using DT = typename LocalMap::device_type;
615  using XS = typename DT::execution_space;
616  const char prefix[] =
617  "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
618  "unpackAndCombineIntoCrsMatrix: ";
619 
620  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
621  if (num_import_lids == 0) {
622  // Nothing to unpack
623  return;
624  }
625 
626  {
627  // Check for correct input
628  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
629  std::invalid_argument,
630  prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
631  "static graph (i.e., was constructed with the CrsMatrix constructor "
632  "that takes a const CrsGraph pointer).");
633 
634  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
635  std::invalid_argument,
636  prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
637  "(i.e., was constructed with the CrsMatrix constructor that takes a "
638  "const CrsGraph pointer).");
639 
640  // Unknown combine mode!
641  TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
642  std::invalid_argument,
643  prefix << "Invalid combine mode; should never get "
644  "here! Please report this bug to the Tpetra developers.");
645 
646  // Check that sizes of input objects are consistent.
647  bool bad_num_import_lids =
648  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
649  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
650  std::invalid_argument,
651  prefix << "importLIDs.size() (" << num_import_lids << ") != "
652  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
653  } // end QA error checking
654 
655  // Get the offsets
656  Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
657  computeOffsetsFromCounts(offsets, num_packets_per_lid);
658 
659  // Determine the sizes of the unpack batches
660  size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
661  const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize();
662  const size_t batch_size = std::min(default_batch_size, max_num_ent);
663 
664  // To achieve some balance amongst threads, unpack each row in equal size batches
665  size_t num_batches = 0;
666  Kokkos::View<LO*[2], DT> batch_info("", num_batches);
667  Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
668  // Compute meta data that allows batch unpacking
669  Kokkos::parallel_reduce(
670  Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
671  KOKKOS_LAMBDA(const size_t i, size_t& batches)
672  {
673  const size_t num_entries_in_row = unpackRowCount<LO>(
674  imports.data(), offsets(i), num_packets_per_lid(i)
675  );
676  batches_per_lid(i) =
677  (num_entries_in_row <= batch_size) ?
678  1 :
679  num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
680  batches += batches_per_lid(i);
681  },
682  num_batches
683  );
684  Kokkos::resize(batch_info, num_batches);
685 
686  Kokkos::HostSpace host_space;
687  auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
688  Kokkos::deep_copy(batches_per_lid_h, batches_per_lid);
689 
690  auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
691 
692  (void) compute_batch_info(batches_per_lid_h, batch_info_h);
693  Kokkos::deep_copy(batch_info, batch_info_h);
694 
695  // FIXME (TJF SEP 2017)
696  // The scalar type is not necessarily default constructible
697  size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
698 
699  // Now do the actual unpack!
700  const bool atomic = XS::concurrency() != 1;
702  functor f(
703  local_matrix,
704  local_map,
705  imports,
706  num_packets_per_lid,
707  import_lids,
708  batch_info,
709  offsets,
710  combine_mode,
711  batch_size,
712  bytes_per_value,
713  atomic
714  );
715 
716  using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
718 #if defined(KOKKOS_ENABLE_CUDA)
719  constexpr bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
720 #else
721  constexpr bool is_cuda = false;
722 #endif
723  if (!is_cuda || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
724  {
725  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
726  }
727  else
728  {
729  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
730  }
731 
732  auto error_code = f.error();
733  TEUCHOS_TEST_FOR_EXCEPTION(
734  error_code != 0,
735  std::runtime_error,
736  prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
737  );
738 }
739 
740 template<class LocalMatrix, class BufferDeviceType>
741 size_t
743  const LocalMatrix& local_matrix,
745  const Kokkos::View<const char*, BufferDeviceType>& imports,
746  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
747  const size_t num_same_ids)
748 {
749  using Kokkos::parallel_reduce;
750  typedef typename LocalMatrix::ordinal_type LO;
751  typedef typename LocalMatrix::device_type device_type;
752  typedef typename device_type::execution_space XS;
753  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
754  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
755  typedef BufferDeviceType BDT;
756 
757  size_t count = 0;
758  LO num_items;
759 
760  // Number of matrix entries to unpack (returned by this function).
761  num_items = static_cast<LO>(num_same_ids);
762  if (num_items) {
763  size_t kcnt = 0;
764  parallel_reduce(range_policy(0, num_items),
765  KOKKOS_LAMBDA(const LO lid, size_t& update) {
766  update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
767  -local_matrix.graph.row_map[lid]);
768  }, kcnt);
769  count += kcnt;
770  }
771 
772  // Count entries copied directly from the source matrix with permuting.
773  num_items = static_cast<LO>(permute_from_lids.extent(0));
774  if (num_items) {
775  size_t kcnt = 0;
776  parallel_reduce(range_policy(0, num_items),
777  KOKKOS_LAMBDA(const LO i, size_t& update) {
778  const LO lid = permute_from_lids(i);
779  update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
780  - local_matrix.graph.row_map[lid]);
781  }, kcnt);
782  count += kcnt;
783  }
784 
785  {
786  // Count entries received from other MPI processes.
787  const size_type np = num_packets_per_lid.extent(0);
788  Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
789  computeOffsetsFromCounts(offsets, num_packets_per_lid);
790  count +=
791  compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
792  offsets, imports);
793  }
794 
795  return count;
796 }
797 
799 template<class LO, class DT, class BDT>
800 int
801 setupRowPointersForRemotes(
802  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
803  const typename PackTraits<LO>::input_array_type& import_lids,
804  const Kokkos::View<const char*, BDT>& imports,
805  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
806  const typename PackTraits<size_t>::input_array_type& offsets)
807 {
808  using Kokkos::parallel_reduce;
809  typedef typename DT::execution_space XS;
810  typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
811  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
812 
813  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
814  const size_type N = num_packets_per_lid.extent(0);
815 
816  int errors = 0;
817  parallel_reduce ("Setup row pointers for remotes",
818  range_policy (0, N),
819  KOKKOS_LAMBDA (const size_t i, int& k_error) {
820  typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
821  const size_t num_bytes = num_packets_per_lid(i);
822  const size_t offset = offsets(i);
823  const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
824  if (num_ent == InvalidNum) {
825  k_error += 1;
826  }
827  Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
828  }, errors);
829  return errors;
830 }
831 
832 // Convert array of row lengths to a CRS pointer array
833 template<class DT>
834 void
835 makeCrsRowPtrFromLengths(
836  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
837  const Kokkos::View<size_t*,DT>& new_start_row)
838 {
839  using Kokkos::parallel_scan;
840  typedef typename DT::execution_space XS;
841  typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
842  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
843  const size_type N = new_start_row.extent(0);
844  parallel_scan(range_policy(0, N),
845  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
846  auto cur_val = tgt_rowptr(i);
847  if (final) {
848  tgt_rowptr(i) = update;
849  new_start_row(i) = tgt_rowptr(i);
850  }
851  update += cur_val;
852  }
853  );
854 }
855 
856 template<class LocalMatrix, class LocalMap>
857 void
858 copyDataFromSameIDs(
860  const typename PackTraits<int>::output_array_type& tgt_pids,
862  const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
863  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
864  const typename PackTraits<int>::input_array_type& src_pids,
865  const LocalMatrix& local_matrix,
866  const LocalMap& local_col_map,
867  const size_t num_same_ids,
868  const int my_pid)
869 {
870  using Kokkos::parallel_for;
871  typedef typename LocalMap::device_type DT;
872  typedef typename LocalMap::local_ordinal_type LO;
873  typedef typename DT::execution_space XS;
874  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
875 
876  parallel_for(range_policy(0, num_same_ids),
877  KOKKOS_LAMBDA(const size_t i) {
878  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
879 
880  const LO src_lid = static_cast<LO>(i);
881  size_t src_row = local_matrix.graph.row_map(src_lid);
882 
883  const LO tgt_lid = static_cast<LO>(i);
884  const size_t tgt_row = tgt_rowptr(tgt_lid);
885 
886  const size_t nsr = local_matrix.graph.row_map(src_lid+1)
887  - local_matrix.graph.row_map(src_lid);
888  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
889 
890  for (size_t j=local_matrix.graph.row_map(src_lid);
891  j<local_matrix.graph.row_map(src_lid+1); ++j) {
892  LO src_col = local_matrix.graph.entries(j);
893  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
894  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
895  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
896  }
897  }
898  );
899 }
900 
901 template<class LocalMatrix, class LocalMap>
902 void
903 copyDataFromPermuteIDs(
905  const typename PackTraits<int>::output_array_type& tgt_pids,
907  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
908  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
909  const typename PackTraits<int>::input_array_type& src_pids,
912  const LocalMatrix& local_matrix,
913  const LocalMap& local_col_map,
914  const int my_pid)
915 {
916  using Kokkos::parallel_for;
917  typedef typename LocalMap::device_type DT;
918  typedef typename LocalMap::local_ordinal_type LO;
919  typedef typename DT::execution_space XS;
920  typedef typename PackTraits<LO>::input_array_type::size_type size_type;
921  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
922 
923  const size_type num_permute_to_lids = permute_to_lids.extent(0);
924 
925  parallel_for(range_policy(0, num_permute_to_lids),
926  KOKKOS_LAMBDA(const size_t i) {
927  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
928 
929  const LO src_lid = permute_from_lids(i);
930  const size_t src_row = local_matrix.graph.row_map(src_lid);
931 
932  const LO tgt_lid = permute_to_lids(i);
933  const size_t tgt_row = tgt_rowptr(tgt_lid);
934 
935  size_t nsr = local_matrix.graph.row_map(src_lid+1)
936  - local_matrix.graph.row_map(src_lid);
937  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
938 
939  for (size_t j=local_matrix.graph.row_map(src_lid);
940  j<local_matrix.graph.row_map(src_lid+1); ++j) {
941  LO src_col = local_matrix.graph.entries(j);
942  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
943  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
944  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
945  }
946  }
947  );
948 }
949 
950 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
951 int
952 unpackAndCombineIntoCrsArrays2(
954  const typename PackTraits<int>::output_array_type& tgt_pids,
956  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
957  const typename PackTraits<size_t>::input_array_type& offsets,
959  const Kokkos::View<const char*, BufferDeviceType>& imports,
960  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
961  const LocalMatrix& /* local_matrix */,
962  const LocalMap /*& local_col_map*/,
963  const int my_pid,
964  const size_t bytes_per_value)
965 {
966  using Kokkos::View;
967  using Kokkos::subview;
968  using Kokkos::MemoryUnmanaged;
969  using Kokkos::parallel_reduce;
970  using Kokkos::atomic_fetch_add;
972  typedef typename LocalMap::device_type DT;
973  typedef typename LocalMap::local_ordinal_type LO;
974  typedef typename LocalMap::global_ordinal_type GO;
975  typedef typename LocalMatrix::value_type ST;
976  typedef typename DT::execution_space XS;
977  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
978  typedef typename Kokkos::pair<size_type, size_type> slice;
979  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
980 
981  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
982  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
983  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
984 
985  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
986 
987  int errors = 0;
988  const size_type num_import_lids = import_lids.size();
989 
990  // RemoteIDs: Loop structure following UnpackAndCombine
991  parallel_reduce ("Unpack and combine into CRS",
992  range_policy (0, num_import_lids),
993  KOKKOS_LAMBDA (const size_t i, int& k_error) {
994  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
995  const size_t num_bytes = num_packets_per_lid(i);
996  const size_t offset = offsets(i);
997  if (num_bytes == 0) {
998  // Empty buffer means that the row is empty.
999  return;
1000  }
1001  size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
1002  if (num_ent == InvalidNum) {
1003  k_error += 1;
1004  return;
1005  }
1006  const LO lcl_row = import_lids(i);
1007  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1008  const size_t end_row = start_row + num_ent;
1009 
1010  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1011  vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1012  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1013 
1014  k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1015  imports.data(), offset, num_bytes,
1016  num_ent, bytes_per_value);
1017 
1018  // Correct target PIDs.
1019  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1020  const int pid = pids_out(j);
1021  pids_out(j) = (pid != my_pid) ? pid : -1;
1022  }
1023  }, errors);
1024 
1025  return errors;
1026 }
1027 
1028 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
1029 void
1031  const LocalMatrix & local_matrix,
1032  const LocalMap & local_col_map,
1034  const Kokkos::View<const char*, BufferDeviceType>& imports,
1035  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1038  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1041  const typename PackTraits<int>::input_array_type& src_pids,
1042  const typename PackTraits<int>::output_array_type& tgt_pids,
1043  const size_t num_same_ids,
1044  const size_t tgt_num_rows,
1045  const size_t tgt_num_nonzeros,
1046  const int my_tgt_pid,
1047  const size_t bytes_per_value)
1048 {
1049  using Kokkos::View;
1050  using Kokkos::subview;
1051  using Kokkos::parallel_for;
1052  using Kokkos::MemoryUnmanaged;
1054  typedef typename LocalMap::device_type DT;
1055  typedef typename LocalMap::local_ordinal_type LO;
1056  typedef typename DT::execution_space XS;
1057  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1058  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1059  typedef BufferDeviceType BDT;
1060 
1061  const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
1062 
1063  const size_t N = tgt_num_rows;
1064 
1065  // In the case of reduced communicators, the sourceMatrix won't have
1066  // the right "my_pid", so thus we have to supply it.
1067  const int my_pid = my_tgt_pid;
1068 
1069  // Zero the rowptr
1070  parallel_for(range_policy(0, N+1),
1071  KOKKOS_LAMBDA(const size_t i) {
1072  tgt_rowptr(i) = 0;
1073  }
1074  );
1075 
1076  // same IDs: Always first, always in the same place
1077  parallel_for(range_policy(0, num_same_ids),
1078  KOKKOS_LAMBDA(const size_t i) {
1079  const LO tgt_lid = static_cast<LO>(i);
1080  const LO src_lid = static_cast<LO>(i);
1081  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1082  - local_matrix.graph.row_map(src_lid);
1083  }
1084  );
1085 
1086  // Permute IDs: Still local, but reordered
1087  const size_type num_permute_to_lids = permute_to_lids.extent(0);
1088  parallel_for(range_policy(0, num_permute_to_lids),
1089  KOKKOS_LAMBDA(const size_t i) {
1090  const LO tgt_lid = permute_to_lids(i);
1091  const LO src_lid = permute_from_lids(i);
1092  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1093  - local_matrix.graph.row_map(src_lid);
1094  }
1095  );
1096 
1097  // Get the offsets from the number of packets per LID
1098  const size_type num_import_lids = import_lids.extent(0);
1099  View<size_t*, DT> offsets("offsets", num_import_lids+1);
1100  computeOffsetsFromCounts(offsets, num_packets_per_lid);
1101 
1102 #ifdef HAVE_TPETRA_DEBUG
1103  {
1104  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1105  const bool condition =
1106  nth_offset_h != static_cast<size_t>(imports.extent (0));
1107  TEUCHOS_TEST_FOR_EXCEPTION
1108  (condition, std::logic_error, prefix
1109  << "The final offset in bytes " << nth_offset_h
1110  << " != imports.size() = " << imports.extent(0)
1111  << ". Please report this bug to the Tpetra developers.");
1112  }
1113 #endif // HAVE_TPETRA_DEBUG
1114 
1115  // Setup row pointers for remotes
1116  int k_error =
1117  setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1118  import_lids, imports, num_packets_per_lid, offsets);
1119  TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1120  << " Error transferring data to target row pointers. "
1121  "Please report this bug to the Tpetra developers.");
1122 
1123  // If multiple processes contribute to the same row, we may need to
1124  // update row offsets. This tracks that.
1125  View<size_t*, DT> new_start_row ("new_start_row", N+1);
1126 
1127  // Turn row length into a real CRS row pointer
1128  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1129 
1130  // SameIDs: Copy the data over
1131  copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1132  tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1133 
1134  copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1135  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1136  local_matrix, local_col_map, my_pid);
1137 
1138  if (imports.extent(0) <= 0) {
1139  return;
1140  }
1141 
1142  int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1143  tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1144  local_matrix, local_col_map, my_pid, bytes_per_value);
1145  TEUCHOS_TEST_FOR_EXCEPTION(
1146  unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1147  "should never happen. Please report this bug to the Tpetra developers.");
1148 
1149  return;
1150 }
1151 
1152 } // namespace UnpackAndCombineCrsMatrixImpl
1153 
1193 template<typename ST, typename LO, typename GO, typename Node>
1194 void
1196  const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1197  const Teuchos::ArrayView<const char>& imports,
1198  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1199  const Teuchos::ArrayView<const LO>& importLIDs,
1200  size_t /* constantNumPackets */,
1201  CombineMode combineMode)
1202 {
1203  using Kokkos::View;
1204  typedef typename Node::device_type device_type;
1205  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
1206  static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1207  "Node::device_type and LocalMatrix::device_type must be the same.");
1208 
1209  // Convert all Teuchos::Array to Kokkos::View.
1210  device_type outputDevice;
1211 
1212  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1213  // them to device. Since unpacking is done directly in to the local matrix
1214  // (lclMatrix), no copying needs to be performed after unpacking.
1215  auto num_packets_per_lid_d =
1216  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1217  numPacketsPerLID.size(), true, "num_packets_per_lid");
1218 
1219  auto import_lids_d =
1220  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1221  importLIDs.size(), true, "import_lids");
1222 
1223  auto imports_d =
1224  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1225  imports.size(), true, "imports");
1226 
1227  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1228  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1229 
1230 //KDDKDD This loop doesn't appear to do anything; what is it?
1231 //KDDKDD for (int i=0; i<importLIDs.size(); i++)
1232 //KDDKDD {
1233 //KDDKDD auto lclRow = importLIDs[i];
1234 //KDDKDD Teuchos::ArrayView<const LO> A_indices;
1235 //KDDKDD Teuchos::ArrayView<const ST> A_values;
1236 //KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1237 //KDDKDD }
1238  // Now do the actual unpack!
1239  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1240  local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1241  import_lids_d, combineMode);
1242 
1243 }
1244 
1245 template<typename ST, typename LO, typename GO, typename NT>
1246 void
1247 unpackCrsMatrixAndCombineNew(
1248  const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1249  Kokkos::DualView<char*,
1251  Kokkos::DualView<size_t*,
1252  typename DistObject<char, LO, GO, NT>::buffer_device_type> numPacketsPerLID,
1253  const Kokkos::DualView<const LO*,
1255  const size_t /* constantNumPackets */,
1256  const CombineMode combineMode)
1257 {
1258  using Kokkos::View;
1259  using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1260  using dist_object_type = DistObject<char, LO, GO, NT>;
1261  using device_type = typename crs_matrix_type::device_type;
1262  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1263  using buffer_device_type = typename dist_object_type::buffer_device_type;
1264 
1265  static_assert
1266  (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1267  "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1268  "must be the same.");
1269 
1270  if (numPacketsPerLID.need_sync_device()) {
1271  numPacketsPerLID.sync_device ();
1272  }
1273  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1274 
1275  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1276  auto import_lids_d = importLIDs.view_device ();
1277 
1278  if (imports.need_sync_device()) {
1279  imports.sync_device ();
1280  }
1281  auto imports_d = imports.view_device ();
1282 
1283  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1284  auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1285  typedef decltype (local_col_map) local_map_type;
1286 
1287  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1288  local_matrix_device_type,
1289  local_map_type,
1290  buffer_device_type
1291  > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1292  import_lids_d, combineMode);
1293 }
1294 
1341 //
1350 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1351 size_t
1354  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1355  const Teuchos::ArrayView<const char> &imports,
1356  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1357  size_t /* constantNumPackets */,
1358  CombineMode /* combineMode */,
1359  size_t numSameIDs,
1360  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1361  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1362 {
1363  using Kokkos::MemoryUnmanaged;
1364  using Kokkos::View;
1365  typedef typename Node::device_type DT;
1367  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1368 
1369  TEUCHOS_TEST_FOR_EXCEPTION
1370  (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1371  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1372  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1373  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1374  // process, then the matrix is neither locally nor globally indexed.
1375  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1376  TEUCHOS_TEST_FOR_EXCEPTION
1377  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1378  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1379  TEUCHOS_TEST_FOR_EXCEPTION
1380  (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1381  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1382  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1383 
1384  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1385  auto permute_from_lids_d =
1387  permuteFromLIDs.getRawPtr (),
1388  permuteFromLIDs.size (), true,
1389  "permute_from_lids");
1390  auto imports_d =
1392  imports.getRawPtr (),
1393  imports.size (), true,
1394  "imports");
1395  auto num_packets_per_lid_d =
1397  numPacketsPerLID.getRawPtr (),
1398  numPacketsPerLID.size (), true,
1399  "num_packets_per_lid");
1400 
1401  return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1402  local_matrix, permute_from_lids_d, imports_d,
1403  num_packets_per_lid_d, numSameIDs);
1404 }
1405 
1420 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1421 void
1424  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1425  const Teuchos::ArrayView<const char>& imports,
1426  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1427  const size_t /* constantNumPackets */,
1428  const CombineMode /* combineMode */,
1429  const size_t numSameIDs,
1430  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1431  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1432  size_t TargetNumRows,
1433  size_t TargetNumNonzeros,
1434  const int MyTargetPID,
1435  const Teuchos::ArrayView<size_t>& CRS_rowptr,
1436  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1437  const Teuchos::ArrayView<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>& CRS_vals,
1438  const Teuchos::ArrayView<const int>& SourcePids,
1439  Teuchos::Array<int>& TargetPids)
1440 {
1442 
1443  using Kokkos::View;
1444  using Kokkos::deep_copy;
1445 
1446  using Teuchos::ArrayView;
1447  using Teuchos::outArg;
1448  using Teuchos::REDUCE_MAX;
1449  using Teuchos::reduceAll;
1450 
1451  typedef LocalOrdinal LO;
1452 
1453  typedef typename Node::device_type DT;
1454 
1456  typedef typename matrix_type::impl_scalar_type ST;
1457  typedef typename ArrayView<const LO>::size_type size_type;
1458 
1459  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1460 
1461  TEUCHOS_TEST_FOR_EXCEPTION(
1462  TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1463  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1464  CRS_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1465 
1466  TEUCHOS_TEST_FOR_EXCEPTION(
1467  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1468  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
1469  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
1470  const size_type numImportLIDs = importLIDs.size ();
1471 
1472  TEUCHOS_TEST_FOR_EXCEPTION(
1473  numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1474  prefix << "importLIDs.size() = " << numImportLIDs << " != "
1475  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1476 
1477  // Preseed TargetPids with -1 for local
1478  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1479  TargetPids.resize (TargetNumNonzeros);
1480  }
1481  TargetPids.assign (TargetNumNonzeros, -1);
1482 
1483  // Grab pointers for sourceMatrix
1484  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1485  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1486 
1487  // Convert input arrays to Kokkos::View
1488  DT outputDevice;
1489  auto import_lids_d =
1490  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1491  importLIDs.size(), true, "import_lids");
1492 
1493  auto imports_d =
1494  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1495  imports.size(), true, "imports");
1496 
1497  auto num_packets_per_lid_d =
1498  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1499  numPacketsPerLID.size(), true, "num_packets_per_lid");
1500 
1501  auto permute_from_lids_d =
1502  create_mirror_view_from_raw_host_array(outputDevice, permuteFromLIDs.getRawPtr(),
1503  permuteFromLIDs.size(), true, "permute_from_lids");
1504 
1505  auto permute_to_lids_d =
1506  create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(),
1507  permuteToLIDs.size(), true, "permute_to_lids");
1508 
1509  auto crs_rowptr_d =
1510  create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1511  CRS_rowptr.size(), true, "crs_rowptr");
1512 
1513  auto crs_colind_d =
1514  create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1515  CRS_colind.size(), true, "crs_colidx");
1516 
1517 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1518  static_assert (! std::is_same<
1519  typename std::remove_const<
1520  typename std::decay<
1521  decltype (CRS_vals)
1522  >::type::value_type
1523  >::type,
1524  std::complex<double> >::value,
1525  "CRS_vals::value_type is std::complex<double>; this should never happen"
1526  ", since std::complex does not work in Kokkos::View objects.");
1527 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1528 
1529  auto crs_vals_d =
1530  create_mirror_view_from_raw_host_array(outputDevice, CRS_vals.getRawPtr(),
1531  CRS_vals.size(), true, "crs_vals");
1532 
1533 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1534  static_assert (! std::is_same<
1535  typename decltype (crs_vals_d)::non_const_value_type,
1536  std::complex<double> >::value,
1537  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1538  "never happen, since std::complex does not work in Kokkos::View objects.");
1539 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1540 
1541  auto src_pids_d =
1542  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1543  SourcePids.size(), true, "src_pids");
1544 
1545  auto tgt_pids_d =
1546  create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1547  TargetPids.size(), true, "tgt_pids");
1548 
1549  size_t bytes_per_value = 0;
1551  // assume that ST is default constructible
1552  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1553  }
1554  else {
1555  // Since the packed data come from the source matrix, we can use the source
1556  // matrix to get the number of bytes per Scalar value stored in the matrix.
1557  // This assumes that all Scalar values in the source matrix require the same
1558  // number of bytes. If the source matrix has no entries on the calling
1559  // process, then we hope that some process does have some idea how big
1560  // a Scalar value is. Of course, if no processes have any entries, then no
1561  // values should be packed (though this does assume that in our packing
1562  // scheme, rows with zero entries take zero bytes).
1563  size_t bytes_per_value_l = 0;
1564  if (local_matrix.values.extent(0) > 0) {
1565  const ST& val = local_matrix.values(0);
1566  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1567  } else {
1568  const ST& val = crs_vals_d(0);
1569  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1570  }
1571  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1572  Teuchos::REDUCE_MAX,
1573  bytes_per_value_l,
1574  outArg(bytes_per_value));
1575  }
1576 
1577 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1578  static_assert (! std::is_same<
1579  typename decltype (crs_vals_d)::non_const_value_type,
1580  std::complex<double> >::value,
1581  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1582  "never happen, since std::complex does not work in Kokkos::View objects.");
1583 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1584 
1585  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1586  local_matrix, local_col_map, import_lids_d, imports_d,
1587  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1588  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1589  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1590  bytes_per_value);
1591 
1592  // Copy outputs back to host
1593  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1594  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1595  deep_copy(crs_rowptr_h, crs_rowptr_d);
1596 
1597  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1598  CRS_colind.getRawPtr(), CRS_colind.size());
1599  deep_copy(crs_colind_h, crs_colind_d);
1600 
1601  typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1602  CRS_vals.getRawPtr(), CRS_vals.size());
1603  deep_copy(crs_vals_h, crs_vals_d);
1604 
1605  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1606  TargetPids.getRawPtr(), TargetPids.size());
1607  deep_copy(tgt_pids_h, tgt_pids_d);
1608 
1609 }
1610 
1611 } // namespace Details
1612 } // namespace Tpetra
1613 
1614 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1615  template void \
1616  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1617  const CrsMatrix<ST, LO, GO, NT>&, \
1618  const Teuchos::ArrayView<const char>&, \
1619  const Teuchos::ArrayView<const size_t>&, \
1620  const Teuchos::ArrayView<const LO>&, \
1621  size_t, \
1622  CombineMode); \
1623  template void \
1624  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1625  const CrsMatrix<ST, LO, GO, NT>&, \
1626  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1627  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1628  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1629  const size_t, \
1630  const CombineMode); \
1631  template void \
1632  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1633  const CrsMatrix<ST, LO, GO, NT> &, \
1634  const Teuchos::ArrayView<const LO>&, \
1635  const Teuchos::ArrayView<const char>&, \
1636  const Teuchos::ArrayView<const size_t>&, \
1637  const size_t, \
1638  const CombineMode, \
1639  const size_t, \
1640  const Teuchos::ArrayView<const LO>&, \
1641  const Teuchos::ArrayView<const LO>&, \
1642  size_t, \
1643  size_t, \
1644  const int, \
1645  const Teuchos::ArrayView<size_t>&, \
1646  const Teuchos::ArrayView<GO>&, \
1647  const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1648  const Teuchos::ArrayView<const int>&, \
1649  Teuchos::Array<int>&); \
1650  template size_t \
1651  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1652  const CrsMatrix<ST, LO, GO, NT> &, \
1653  const Teuchos::ArrayView<const LO> &, \
1654  const Teuchos::ArrayView<const char> &, \
1655  const Teuchos::ArrayView<const size_t>&, \
1656  size_t, \
1657  CombineMode, \
1658  size_t, \
1659  const Teuchos::ArrayView<const LO>&, \
1660  const Teuchos::ArrayView<const LO>&);
1661 
1662 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
"Local" part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Implementation details of Tpetra.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don&#39;t currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
DeviceType device_type
The device type.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...