40 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP 41 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP 43 #include "TpetraCore_config.h" 44 #include "Teuchos_Array.hpp" 45 #include "Teuchos_ArrayView.hpp" 46 #include "Teuchos_OrdinalTraits.hpp" 54 #include "Kokkos_Core.hpp" 86 namespace UnpackAndCombineCrsMatrixImpl {
97 template<
class ST,
class LO,
class GO>
102 const char imports[],
105 const size_t num_ent,
106 const size_t bytes_per_value)
112 bool unpack_pids = pids_out.size() > 0;
114 const size_t num_ent_beg = offset;
117 const size_t gids_beg = num_ent_beg + num_ent_len;
118 const size_t gids_len =
121 const size_t pids_beg = gids_beg + gids_len;
122 const size_t pids_len = unpack_pids ?
126 const size_t vals_beg = gids_beg + gids_len + pids_len;
127 const size_t vals_len = num_ent * bytes_per_value;
129 const char*
const num_ent_in = imports + num_ent_beg;
130 const char*
const gids_in = imports + gids_beg;
131 const char*
const pids_in = unpack_pids ? imports + pids_beg :
nullptr;
132 const char*
const vals_in = imports + vals_beg;
134 size_t num_bytes_out = 0;
137 if (static_cast<size_t> (num_ent_out) != num_ent) {
142 Kokkos::pair<int, size_t> p;
147 num_bytes_out += p.second;
154 num_bytes_out += p.second;
161 num_bytes_out += p.second;
164 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
165 if (num_bytes_out != expected_num_bytes) {
181 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
183 typedef LocalMatrix local_matrix_type;
186 typedef typename local_matrix_type::value_type ST;
190 typedef typename DT::execution_space XS;
192 typedef Kokkos::View<const size_t*, BufferDeviceType>
193 num_packets_per_lid_type;
194 typedef Kokkos::View<const size_t*, DT> offsets_type;
195 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
196 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
198 typedef Kokkos::View<int, DT> error_type;
199 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
201 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
202 "LocalMap::local_ordinal_type and " 203 "LocalMatrix::ordinal_type must be the same.");
205 local_matrix_type local_matrix;
207 input_buffer_type imports;
208 num_packets_per_lid_type num_packets_per_lid;
209 import_lids_type import_lids;
210 Kokkos::View<const LO*[2], DT> batch_info;
211 offsets_type offsets;
214 size_t bytes_per_value;
216 error_type error_code;
219 const local_matrix_type& local_matrix_in,
221 const input_buffer_type& imports_in,
222 const num_packets_per_lid_type& num_packets_per_lid_in,
223 const import_lids_type& import_lids_in,
224 const Kokkos::View<
const LO*[2], DT>& batch_info_in,
225 const offsets_type& offsets_in,
227 const size_t batch_size_in,
228 const size_t bytes_per_value_in,
229 const bool atomic_in) :
230 local_matrix (local_matrix_in),
231 local_col_map (local_col_map_in),
232 imports (imports_in),
233 num_packets_per_lid (num_packets_per_lid_in),
234 import_lids (import_lids_in),
235 batch_info (batch_info_in),
236 offsets (offsets_in),
237 combine_mode (combine_mode_in),
238 batch_size (batch_size_in),
239 bytes_per_value (bytes_per_value_in),
244 KOKKOS_INLINE_FUNCTION
245 void operator()(member_type team_member)
const 248 using Kokkos::subview;
249 using Kokkos::MemoryUnmanaged;
251 const LO batch = team_member.league_rank();
252 const LO lid_no = batch_info(batch, 0);
253 const LO batch_no = batch_info(batch, 1);
255 const size_t num_bytes = num_packets_per_lid(lid_no);
262 const LO import_lid = import_lids(lid_no);
263 const size_t buf_size = imports.size();
264 const size_t offset = offsets(lid_no);
268 const char*
const in_buf = imports.data() + offset;
270 const size_t num_entries_in_row =
static_cast<size_t>(num_ent_LO);
273 size_t expected_num_bytes = 0;
280 if (expected_num_bytes > num_bytes)
283 #ifndef KOKKOS_ENABLE_SYCL 285 "*** Error: UnpackCrsMatrixAndCombineFunctor: " 286 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
287 (
int) lid_no, (
int) expected_num_bytes, (
int) num_bytes
290 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
294 if (offset > buf_size || offset + num_bytes > buf_size)
297 #ifndef KOKKOS_ENABLE_SYCL 299 "*** Error: UnpackCrsMatrixAndCombineFunctor: " 300 "At row %d, the offset (%d) > buffer size (%d)\n",
301 (
int) lid_no, (
int) offset, (
int) buf_size
304 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
309 size_t num_entries_in_batch = 0;
310 if (num_entries_in_row <= batch_size)
311 num_entries_in_batch = num_entries_in_row;
312 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
313 num_entries_in_batch = batch_size;
315 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
318 const size_t num_ent_start = offset;
319 const size_t num_ent_end = num_ent_start + bytes_per_lid;
322 const size_t gids_start = num_ent_end;
323 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
325 const size_t vals_start = gids_end;
327 const size_t shift = batch_no * batch_size;
328 const char*
const num_ent_in = imports.data() + num_ent_start;
329 const char*
const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
330 const char*
const vals_in = imports.data() + vals_start + shift * bytes_per_value;
334 if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
337 #ifndef KOKKOS_ENABLE_SYCL 339 "*** Error: UnpackCrsMatrixAndCombineFunctor: " 340 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
341 (
int) lid_no, (
int) num_entries_in_row, (
int) num_ent_out
344 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
347 constexpr
bool matrix_has_sorted_rows =
true;
348 Kokkos::parallel_for(
349 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
350 KOKKOS_LAMBDA(
const LO& j)
355 distance = j * bytes_per_gid;
365 distance = j * bytes_per_value;
368 if (combine_mode ==
ADD) {
372 const bool use_atomic_updates = atomic;
373 (void)local_matrix.sumIntoValues(
378 matrix_has_sorted_rows,
381 }
else if (combine_mode ==
REPLACE) {
385 const bool use_atomic_updates =
false;
386 (void)local_matrix.replaceValues(
391 matrix_has_sorted_rows,
397 #ifndef KOKKOS_ENABLE_SYCL 399 "*** Error: UnpackCrsMatrixAndCombineFunctor: " 400 "At row %d, an unknown error occurred during unpack\n", (
int) lid_no
403 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
408 team_member.team_barrier();
414 auto error_code_h = Kokkos::create_mirror_view_and_copy(
415 Kokkos::HostSpace(), error_code
417 return error_code_h();
422 struct MaxNumEntTag {};
423 struct TotNumEntTag {};
433 template<
class LO,
class DT,
class BDT>
436 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
437 typedef Kokkos::View<const size_t*, DT> offsets_type;
438 typedef Kokkos::View<const char*, BDT> input_buffer_type;
441 typedef size_t value_type;
444 num_packets_per_lid_type num_packets_per_lid;
445 offsets_type offsets;
446 input_buffer_type imports;
450 const offsets_type& offsets_in,
451 const input_buffer_type& imports_in) :
452 num_packets_per_lid (num_packets_per_lid_in),
453 offsets (offsets_in),
457 KOKKOS_INLINE_FUNCTION
void 458 operator() (
const MaxNumEntTag,
const LO i, value_type& update)
const {
460 const size_t num_bytes = num_packets_per_lid(i);
463 const char*
const in_buf = imports.data () + offsets(i);
465 const size_t num_ent =
static_cast<size_t> (num_ent_LO);
467 update = (update < num_ent) ? num_ent : update;
471 KOKKOS_INLINE_FUNCTION
void 472 join (
const MaxNumEntTag,
473 volatile value_type& dst,
474 const volatile value_type& src)
const 476 if (dst < src) dst = src;
479 KOKKOS_INLINE_FUNCTION
void 480 operator() (
const TotNumEntTag,
const LO i, value_type& tot_num_ent)
const {
482 const size_t num_bytes = num_packets_per_lid(i);
485 const char*
const in_buf = imports.data () + offsets(i);
487 tot_num_ent +=
static_cast<size_t> (num_ent_LO);
499 template<
class LO,
class DT,
class BDT>
502 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
503 const Kokkos::View<const size_t*, DT>& offsets,
504 const Kokkos::View<const char*, BDT>& imports)
506 typedef typename DT::execution_space XS;
507 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
508 MaxNumEntTag> range_policy;
512 const LO numRowsToUnpack =
513 static_cast<LO
> (num_packets_per_lid.extent (0));
514 size_t max_num_ent = 0;
515 Kokkos::parallel_reduce (
"Max num entries in CRS",
516 range_policy (0, numRowsToUnpack),
517 functor, max_num_ent);
528 template<
class LO,
class DT,
class BDT>
531 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
532 const Kokkos::View<const size_t*, DT>& offsets,
533 const Kokkos::View<const char*, BDT>& imports)
535 typedef typename DT::execution_space XS;
536 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
537 size_t tot_num_ent = 0;
540 const LO numRowsToUnpack =
541 static_cast<LO
> (num_packets_per_lid.extent (0));
542 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
543 range_policy (0, numRowsToUnpack),
544 functor, tot_num_ent);
549 KOKKOS_INLINE_FUNCTION
551 unpackRowCount(
const char imports[],
553 const size_t num_bytes)
559 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
560 if (p_num_bytes > num_bytes) {
561 return OrdinalTraits<size_t>::invalid();
563 const char*
const in_buf = imports + offset;
564 (void) PT::unpackValue(num_ent_LO, in_buf);
566 return static_cast<size_t>(num_ent_LO);
573 template<
class View1,
class View2>
577 const View1& batches_per_lid,
581 using LO =
typename View2::value_type;
583 for (
size_t i=0; i<batches_per_lid.extent(0); i++)
585 for (
size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
587 batch_info(batch, 0) =
static_cast<LO
>(i);
588 batch_info(batch, 1) = batch_no;
592 return batch == batch_info.extent(0);
602 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
605 const LocalMatrix& local_matrix,
607 const Kokkos::View<const char*, BufferDeviceType>& imports,
608 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
612 using ST =
typename LocalMatrix::value_type;
615 using XS =
typename DT::execution_space;
616 const char prefix[] =
617 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::" 618 "unpackAndCombineIntoCrsMatrix: ";
620 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
621 if (num_import_lids == 0) {
628 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
ABSMAX,
629 std::invalid_argument,
630 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a " 631 "static graph (i.e., was constructed with the CrsMatrix constructor " 632 "that takes a const CrsGraph pointer).");
634 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
635 std::invalid_argument,
636 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph " 637 "(i.e., was constructed with the CrsMatrix constructor that takes a " 638 "const CrsGraph pointer).");
641 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode ==
ADD || combine_mode ==
REPLACE),
642 std::invalid_argument,
643 prefix <<
"Invalid combine mode; should never get " 644 "here! Please report this bug to the Tpetra developers.");
647 bool bad_num_import_lids =
648 num_import_lids !=
static_cast<size_t>(num_packets_per_lid.extent(0));
649 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
650 std::invalid_argument,
651 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != " 652 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
656 Kokkos::View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
660 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
662 const size_t batch_size = std::min(default_batch_size, max_num_ent);
665 size_t num_batches = 0;
666 Kokkos::View<LO*[2], DT> batch_info(
"", num_batches);
667 Kokkos::View<size_t*, DT> batches_per_lid(
"", num_import_lids);
669 Kokkos::parallel_reduce(
670 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
671 KOKKOS_LAMBDA(
const size_t i,
size_t& batches)
673 const size_t num_entries_in_row = unpackRowCount<LO>(
674 imports.data(), offsets(i), num_packets_per_lid(i)
677 (num_entries_in_row <= batch_size) ?
679 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
680 batches += batches_per_lid(i);
684 Kokkos::resize(batch_info, num_batches);
686 Kokkos::HostSpace host_space;
687 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
690 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
700 const bool atomic = XS::concurrency() != 1;
716 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
718 #if defined(KOKKOS_ENABLE_CUDA) 719 constexpr
bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
721 constexpr
bool is_cuda =
false;
723 if (!is_cuda || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
725 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
729 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
732 auto error_code = f.error();
733 TEUCHOS_TEST_FOR_EXCEPTION(
736 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
740 template<
class LocalMatrix,
class BufferDeviceType>
743 const LocalMatrix& local_matrix,
745 const Kokkos::View<const char*, BufferDeviceType>& imports,
746 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
747 const size_t num_same_ids)
749 using Kokkos::parallel_reduce;
750 typedef typename LocalMatrix::ordinal_type LO;
751 typedef typename LocalMatrix::device_type device_type;
752 typedef typename device_type::execution_space XS;
753 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
754 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
755 typedef BufferDeviceType BDT;
761 num_items =
static_cast<LO
>(num_same_ids);
764 parallel_reduce(range_policy(0, num_items),
765 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
766 update +=
static_cast<size_t>(local_matrix.graph.row_map[lid+1]
767 -local_matrix.graph.row_map[lid]);
773 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
776 parallel_reduce(range_policy(0, num_items),
777 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
778 const LO lid = permute_from_lids(i);
779 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
780 - local_matrix.graph.row_map[lid]);
787 const size_type np = num_packets_per_lid.extent(0);
788 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
791 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
799 template<
class LO,
class DT,
class BDT>
801 setupRowPointersForRemotes(
804 const Kokkos::View<const char*, BDT>& imports,
805 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
808 using Kokkos::parallel_reduce;
809 typedef typename DT::execution_space XS;
811 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
813 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
814 const size_type N = num_packets_per_lid.extent(0);
817 parallel_reduce (
"Setup row pointers for remotes",
819 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
820 typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
821 const size_t num_bytes = num_packets_per_lid(i);
822 const size_t offset = offsets(i);
823 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
824 if (num_ent == InvalidNum) {
827 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
835 makeCrsRowPtrFromLengths(
837 const Kokkos::View<size_t*,DT>& new_start_row)
839 using Kokkos::parallel_scan;
840 typedef typename DT::execution_space XS;
841 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
842 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
843 const size_type N = new_start_row.extent(0);
844 parallel_scan(range_policy(0, N),
845 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
846 auto cur_val = tgt_rowptr(i);
848 tgt_rowptr(i) = update;
849 new_start_row(i) = tgt_rowptr(i);
856 template<
class LocalMatrix,
class LocalMap>
862 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
865 const LocalMatrix& local_matrix,
866 const LocalMap& local_col_map,
867 const size_t num_same_ids,
870 using Kokkos::parallel_for;
873 typedef typename DT::execution_space XS;
874 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
876 parallel_for(range_policy(0, num_same_ids),
877 KOKKOS_LAMBDA(
const size_t i) {
878 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
880 const LO src_lid =
static_cast<LO
>(i);
881 size_t src_row = local_matrix.graph.row_map(src_lid);
883 const LO tgt_lid =
static_cast<LO
>(i);
884 const size_t tgt_row = tgt_rowptr(tgt_lid);
886 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
887 - local_matrix.graph.row_map(src_lid);
888 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
890 for (
size_t j=local_matrix.graph.row_map(src_lid);
891 j<local_matrix.graph.row_map(src_lid+1); ++j) {
892 LO src_col = local_matrix.graph.entries(j);
893 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
894 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
895 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
901 template<
class LocalMatrix,
class LocalMap>
903 copyDataFromPermuteIDs(
907 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
912 const LocalMatrix& local_matrix,
913 const LocalMap& local_col_map,
916 using Kokkos::parallel_for;
919 typedef typename DT::execution_space XS;
920 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
921 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
923 const size_type num_permute_to_lids = permute_to_lids.extent(0);
925 parallel_for(range_policy(0, num_permute_to_lids),
926 KOKKOS_LAMBDA(
const size_t i) {
927 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
929 const LO src_lid = permute_from_lids(i);
930 const size_t src_row = local_matrix.graph.row_map(src_lid);
932 const LO tgt_lid = permute_to_lids(i);
933 const size_t tgt_row = tgt_rowptr(tgt_lid);
935 size_t nsr = local_matrix.graph.row_map(src_lid+1)
936 - local_matrix.graph.row_map(src_lid);
937 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
939 for (
size_t j=local_matrix.graph.row_map(src_lid);
940 j<local_matrix.graph.row_map(src_lid+1); ++j) {
941 LO src_col = local_matrix.graph.entries(j);
942 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
943 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
944 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
950 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
952 unpackAndCombineIntoCrsArrays2(
956 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
959 const Kokkos::View<const char*, BufferDeviceType>& imports,
960 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
964 const size_t bytes_per_value)
967 using Kokkos::subview;
968 using Kokkos::MemoryUnmanaged;
969 using Kokkos::parallel_reduce;
970 using Kokkos::atomic_fetch_add;
975 typedef typename LocalMatrix::value_type ST;
976 typedef typename DT::execution_space XS;
977 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
978 typedef typename Kokkos::pair<size_type, size_type> slice;
979 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
981 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
982 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
983 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
985 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
988 const size_type num_import_lids = import_lids.size();
991 parallel_reduce (
"Unpack and combine into CRS",
992 range_policy (0, num_import_lids),
993 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
994 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
995 const size_t num_bytes = num_packets_per_lid(i);
996 const size_t offset = offsets(i);
997 if (num_bytes == 0) {
1001 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
1002 if (num_ent == InvalidNum) {
1006 const LO lcl_row = import_lids(i);
1007 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1008 const size_t end_row = start_row + num_ent;
1010 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1011 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1012 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1014 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1015 imports.data(), offset, num_bytes,
1016 num_ent, bytes_per_value);
1019 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1020 const int pid = pids_out(j);
1021 pids_out(j) = (pid != my_pid) ? pid : -1;
1028 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1031 const LocalMatrix & local_matrix,
1032 const LocalMap & local_col_map,
1034 const Kokkos::View<const char*, BufferDeviceType>& imports,
1035 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1043 const size_t num_same_ids,
1044 const size_t tgt_num_rows,
1045 const size_t tgt_num_nonzeros,
1046 const int my_tgt_pid,
1047 const size_t bytes_per_value)
1050 using Kokkos::subview;
1051 using Kokkos::parallel_for;
1052 using Kokkos::MemoryUnmanaged;
1056 typedef typename DT::execution_space XS;
1057 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1058 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1059 typedef BufferDeviceType BDT;
1061 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
1063 const size_t N = tgt_num_rows;
1067 const int my_pid = my_tgt_pid;
1070 parallel_for(range_policy(0, N+1),
1071 KOKKOS_LAMBDA(
const size_t i) {
1077 parallel_for(range_policy(0, num_same_ids),
1078 KOKKOS_LAMBDA(
const size_t i) {
1079 const LO tgt_lid =
static_cast<LO
>(i);
1080 const LO src_lid =
static_cast<LO
>(i);
1081 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1082 - local_matrix.graph.row_map(src_lid);
1087 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1088 parallel_for(range_policy(0, num_permute_to_lids),
1089 KOKKOS_LAMBDA(
const size_t i) {
1090 const LO tgt_lid = permute_to_lids(i);
1091 const LO src_lid = permute_from_lids(i);
1092 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1093 - local_matrix.graph.row_map(src_lid);
1098 const size_type num_import_lids = import_lids.extent(0);
1099 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
1102 #ifdef HAVE_TPETRA_DEBUG 1104 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1105 const bool condition =
1106 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1107 TEUCHOS_TEST_FOR_EXCEPTION
1108 (condition, std::logic_error, prefix
1109 <<
"The final offset in bytes " << nth_offset_h
1110 <<
" != imports.size() = " << imports.extent(0)
1111 <<
". Please report this bug to the Tpetra developers.");
1113 #endif // HAVE_TPETRA_DEBUG 1117 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1118 import_lids, imports, num_packets_per_lid, offsets);
1119 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1120 <<
" Error transferring data to target row pointers. " 1121 "Please report this bug to the Tpetra developers.");
1125 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1128 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1131 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1132 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1134 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1135 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1136 local_matrix, local_col_map, my_pid);
1138 if (imports.extent(0) <= 0) {
1142 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1143 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1144 local_matrix, local_col_map, my_pid, bytes_per_value);
1145 TEUCHOS_TEST_FOR_EXCEPTION(
1146 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This " 1147 "should never happen. Please report this bug to the Tpetra developers.");
1193 template<
typename ST,
typename LO,
typename GO,
typename Node>
1197 const Teuchos::ArrayView<const char>& imports,
1198 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1199 const Teuchos::ArrayView<const LO>& importLIDs,
1204 typedef typename Node::device_type device_type;
1206 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1207 "Node::device_type and LocalMatrix::device_type must be the same.");
1210 device_type outputDevice;
1215 auto num_packets_per_lid_d =
1217 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1219 auto import_lids_d =
1221 importLIDs.size(),
true,
"import_lids");
1225 imports.size(),
true,
"imports");
1228 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1239 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1240 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1241 import_lids_d, combineMode);
1245 template<
typename ST,
typename LO,
typename GO,
typename NT>
1247 unpackCrsMatrixAndCombineNew(
1249 Kokkos::DualView<
char*,
1251 Kokkos::DualView<
size_t*,
1253 const Kokkos::DualView<
const LO*,
1261 using device_type =
typename crs_matrix_type::device_type;
1262 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
1263 using buffer_device_type =
typename dist_object_type::buffer_device_type;
1266 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1267 "crs_matrix_type::device_type and local_matrix_device_type::device_type " 1268 "must be the same.");
1270 if (numPacketsPerLID.need_sync_device()) {
1271 numPacketsPerLID.sync_device ();
1273 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1275 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1276 auto import_lids_d = importLIDs.view_device ();
1278 if (imports.need_sync_device()) {
1279 imports.sync_device ();
1281 auto imports_d = imports.view_device ();
1284 auto local_col_map = sourceMatrix.
getColMap ()->getLocalMap ();
1285 typedef decltype (local_col_map) local_map_type;
1287 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1288 local_matrix_device_type,
1291 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1292 import_lids_d, combineMode);
1350 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1354 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1355 const Teuchos::ArrayView<const char> &imports,
1356 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1360 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1361 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1363 using Kokkos::MemoryUnmanaged;
1365 typedef typename Node::device_type DT;
1367 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
1369 TEUCHOS_TEST_FOR_EXCEPTION
1370 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1371 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size () <<
" != " 1372 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1376 TEUCHOS_TEST_FOR_EXCEPTION
1377 (! locallyIndexed, std::invalid_argument, prefix <<
"The input " 1378 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1379 TEUCHOS_TEST_FOR_EXCEPTION
1380 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1381 prefix <<
"importLIDs.size() = " << importLIDs.size () <<
" != " 1382 "numPacketsPerLID.size() = " << numPacketsPerLID.size () <<
".");
1385 auto permute_from_lids_d =
1387 permuteFromLIDs.getRawPtr (),
1388 permuteFromLIDs.size (),
true,
1389 "permute_from_lids");
1392 imports.getRawPtr (),
1393 imports.size (),
true,
1395 auto num_packets_per_lid_d =
1397 numPacketsPerLID.getRawPtr (),
1398 numPacketsPerLID.size (),
true,
1399 "num_packets_per_lid");
1401 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1402 local_matrix, permute_from_lids_d, imports_d,
1403 num_packets_per_lid_d, numSameIDs);
1420 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1424 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1425 const Teuchos::ArrayView<const char>& imports,
1426 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1429 const size_t numSameIDs,
1430 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1431 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1432 size_t TargetNumRows,
1433 size_t TargetNumNonzeros,
1434 const int MyTargetPID,
1435 const Teuchos::ArrayView<size_t>& CRS_rowptr,
1436 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1438 const Teuchos::ArrayView<const int>& SourcePids,
1439 Teuchos::Array<int>& TargetPids)
1446 using Teuchos::ArrayView;
1447 using Teuchos::outArg;
1448 using Teuchos::REDUCE_MAX;
1449 using Teuchos::reduceAll;
1451 typedef LocalOrdinal LO;
1453 typedef typename Node::device_type DT;
1456 typedef typename matrix_type::impl_scalar_type ST;
1457 typedef typename ArrayView<const LO>::size_type size_type;
1459 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1461 TEUCHOS_TEST_FOR_EXCEPTION(
1462 TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1463 std::invalid_argument, prefix <<
"CRS_rowptr.size() = " <<
1464 CRS_rowptr.size () <<
"!= TargetNumRows+1 = " << TargetNumRows+1 <<
".");
1466 TEUCHOS_TEST_FOR_EXCEPTION(
1467 permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1468 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size ()
1469 <<
"!= permuteFromLIDs.size() = " << permuteFromLIDs.size () <<
".");
1470 const size_type numImportLIDs = importLIDs.size ();
1472 TEUCHOS_TEST_FOR_EXCEPTION(
1473 numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1474 prefix <<
"importLIDs.size() = " << numImportLIDs <<
" != " 1475 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
1478 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1479 TargetPids.resize (TargetNumNonzeros);
1481 TargetPids.assign (TargetNumNonzeros, -1);
1485 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1489 auto import_lids_d =
1491 importLIDs.size(),
true,
"import_lids");
1495 imports.size(),
true,
"imports");
1497 auto num_packets_per_lid_d =
1499 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1501 auto permute_from_lids_d =
1503 permuteFromLIDs.size(),
true,
"permute_from_lids");
1505 auto permute_to_lids_d =
1507 permuteToLIDs.size(),
true,
"permute_to_lids");
1511 CRS_rowptr.size(),
true,
"crs_rowptr");
1515 CRS_colind.size(),
true,
"crs_colidx");
1517 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE 1518 static_assert (! std::is_same<
1519 typename std::remove_const<
1520 typename std::decay<
1524 std::complex<double> >::value,
1525 "CRS_vals::value_type is std::complex<double>; this should never happen" 1526 ", since std::complex does not work in Kokkos::View objects.");
1527 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE 1531 CRS_vals.size(),
true,
"crs_vals");
1533 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE 1534 static_assert (! std::is_same<
1535 typename decltype (crs_vals_d)::non_const_value_type,
1536 std::complex<double> >::value,
1537 "crs_vals_d::non_const_value_type is std::complex<double>; this should " 1538 "never happen, since std::complex does not work in Kokkos::View objects.");
1539 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE 1543 SourcePids.size(),
true,
"src_pids");
1547 TargetPids.size(),
true,
"tgt_pids");
1549 size_t bytes_per_value = 0;
1563 size_t bytes_per_value_l = 0;
1564 if (local_matrix.values.extent(0) > 0) {
1565 const ST& val = local_matrix.values(0);
1568 const ST& val = crs_vals_d(0);
1571 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1572 Teuchos::REDUCE_MAX,
1574 outArg(bytes_per_value));
1577 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE 1578 static_assert (! std::is_same<
1579 typename decltype (crs_vals_d)::non_const_value_type,
1580 std::complex<double> >::value,
1581 "crs_vals_d::non_const_value_type is std::complex<double>; this should " 1582 "never happen, since std::complex does not work in Kokkos::View objects.");
1583 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE 1585 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1586 local_matrix, local_col_map, import_lids_d, imports_d,
1587 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1588 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1589 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1593 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1594 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1597 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1598 CRS_colind.getRawPtr(), CRS_colind.size());
1601 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1602 CRS_vals.getRawPtr(), CRS_vals.size());
1605 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1606 TargetPids.getRawPtr(), TargetPids.size());
1614 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \ 1616 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \ 1617 const CrsMatrix<ST, LO, GO, NT>&, \ 1618 const Teuchos::ArrayView<const char>&, \ 1619 const Teuchos::ArrayView<const size_t>&, \ 1620 const Teuchos::ArrayView<const LO>&, \ 1624 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \ 1625 const CrsMatrix<ST, LO, GO, NT>&, \ 1626 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \ 1627 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \ 1628 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \ 1630 const CombineMode); \ 1632 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \ 1633 const CrsMatrix<ST, LO, GO, NT> &, \ 1634 const Teuchos::ArrayView<const LO>&, \ 1635 const Teuchos::ArrayView<const char>&, \ 1636 const Teuchos::ArrayView<const size_t>&, \ 1638 const CombineMode, \ 1640 const Teuchos::ArrayView<const LO>&, \ 1641 const Teuchos::ArrayView<const LO>&, \ 1645 const Teuchos::ArrayView<size_t>&, \ 1646 const Teuchos::ArrayView<GO>&, \ 1647 const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \ 1648 const Teuchos::ArrayView<const int>&, \ 1649 Teuchos::Array<int>&); \ 1651 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \ 1652 const CrsMatrix<ST, LO, GO, NT> &, \ 1653 const Teuchos::ArrayView<const LO> &, \ 1654 const Teuchos::ArrayView<const char> &, \ 1655 const Teuchos::ArrayView<const size_t>&, \ 1659 const Teuchos::ArrayView<const LO>&, \ 1660 const Teuchos::ArrayView<const LO>&); 1662 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
int error() const
Host function for getting the error.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
"Local" part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Implementation details of Tpetra.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
DeviceType device_type
The device type.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
Unpacks and combines a single row of the CrsMatrix.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...