49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_ 50 #define _ZOLTAN2_ALGMultiJagged_HPP_ 59 #include <Tpetra_Distributor.hpp> 60 #include <Teuchos_StandardParameterEntryValidators.hpp> 61 #include <Teuchos_ParameterList.hpp> 62 #include <Kokkos_Sort.hpp> 66 #include <unordered_map> 68 #ifdef ZOLTAN2_USEZOLTANCOMM 69 #ifdef HAVE_ZOLTAN2_MPI 70 #define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION 71 #include "zoltan_comm_cpp.h" 72 #include "zoltan_types.h" 81 template <
typename Ordinal,
typename T>
105 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const {
106 for(Ordinal i = 0; i < count; i++) {
108 inoutBuffer[i] = inBuffer[i];
124 template <
typename IT,
typename CT,
typename WT>
144 this->index = index_;
145 this->count = count_;
153 void set(IT index_ ,CT count_, WT *vals_) {
154 this->index = index_;
155 this->count = count_;
159 bool operator<(const uMultiSortItem<IT,CT,WT>& other)
const {
160 assert(this->count == other.count);
161 for(CT i = 0; i < this->
count; ++i) {
163 if(std::abs(this->val[i] - other.val[i]) < this->epsilon) {
167 if(this->val[i] < other.val[i]) {
176 return this->index < other.index;
182 template <
class IT,
class WT>
193 template <
class IT,
class WT>
197 IT i, ir=n, j, k, l=1;
198 IT jstack=0, istack[50];
205 for(j=l+1;j<=ir;j++) {
208 for(i=j-1;i>=1;i--) {
209 if(arr[i].val <= aval)
222 std::swap(arr[k],arr[l+1]);
223 if(arr[l+1].val > arr[ir].val) {
224 std::swap(arr[l+1],arr[ir]);
226 if(arr[l].val > arr[ir].val) {
227 std::swap(arr[l],arr[ir]);
229 if(arr[l+1].val > arr[l].val) {
230 std::swap(arr[l+1],arr[l]);
237 do i++;
while (arr[i].val < aval);
238 do j--;
while (arr[j].val > aval);
240 std::swap(arr[i],arr[j]);
245 if(jstack > NSTACK) {
246 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
263 template <
class IT,
class WT,
class SIGN>
269 bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs)
const {
271 if(this->signbit < rhs.signbit) {
275 else if(this->signbit == rhs.signbit) {
276 if(this->val < rhs.val) {
280 else if(this->val > rhs.val) {
294 bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs) {
295 return (this->val == rhs.val && this->signbit == rhs.signbit) || (*
this < rhs);
302 template <
class IT,
class WT,
class SIGN>
306 IT i, ir=n, j, k, l=1;
307 IT jstack=0, istack[50];
313 for(j=l+1;j<=ir;j++) {
315 for(i=j-1;i>=1;i--) {
331 std::swap(arr[k],arr[l+1]);
332 if(arr[ir] < arr[l+1]) {
333 std::swap(arr[l+1],arr[ir]);
335 if(arr[ir] < arr[l] ) {
336 std::swap(arr[l],arr[ir]);
338 if(arr[l] < arr[l+1]) {
339 std::swap(arr[l+1],arr[l]);
345 do i++;
while (arr[i] < a);
346 do j--;
while (a < arr[j]);
348 std::swap(arr[i],arr[j]);
353 if(jstack > NSTACK) {
354 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
390 static int counter = 0;
394 static int counter = 0;
401 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
402 typename mj_part_t,
typename mj_node_t>
406 typedef typename mj_node_t::device_type device_t;
408 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
413 static constexpr
size_t future_reduceall_cutoff = 1500000;
417 static constexpr mj_lno_t min_work_last_dim = 1000;
419 static constexpr mj_scalar_t least_signifiance = 0.0001;
420 static constexpr
int significance_mul = 1000;
422 std::string mj_timer_base_string;
424 RCP<const Environment> mj_env;
425 RCP<const Comm<int> > mj_problemComm;
426 RCP<Comm<int> > comm;
427 double imbalance_tolerance;
430 int num_weights_per_coord;
431 size_t initial_num_loc_coords;
433 mj_lno_t num_local_coords;
434 mj_gno_t num_global_coords;
435 mj_scalar_t sEpsilon;
438 bool distribute_points_on_cut_lines;
441 mj_part_t max_concurrent_part_calculation;
444 int mj_user_recursion_depth;
445 bool mj_keep_part_boxes;
448 int check_migrate_avoid_migration_option;
455 double minimum_migration_imbalance;
472 mj_part_t num_first_level_parts;
476 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
478 mj_part_t total_num_cut ;
479 mj_part_t total_num_part;
481 mj_part_t max_num_part_along_dim ;
482 mj_part_t max_num_cut_along_dim;
485 size_t max_num_total_part_along_dim;
487 mj_part_t total_dim_num_reduce_all;
491 mj_part_t last_dim_num_part;
494 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
498 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
502 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
505 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
508 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
512 size_t num_global_parts;
515 RCP<mj_partBoxVector_t> kept_boxes;
517 RCP<mj_partBox_t> global_box;
522 bool divide_to_prime_first;
525 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
528 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
531 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
534 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
537 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
540 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
543 Kokkos::View<mj_lno_t *, device_t> part_xadj;
546 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
548 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
551 Kokkos::View<mj_scalar_t *, device_t>
552 process_cut_line_weight_to_put_left;
555 Kokkos::View<mj_scalar_t *, device_t>
556 thread_cut_line_weight_to_put_left;
562 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
565 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
568 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
571 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
574 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
577 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
580 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
584 Kokkos::View<mj_scalar_t *, device_t>
585 process_local_min_max_coord_total_weight;
588 Kokkos::View<mj_scalar_t *, device_t>
589 global_min_max_coord_total_weight;
593 Kokkos::View<bool *, device_t> is_cut_line_determined;
599 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600 typename decltype(device_incomplete_cut_count)::HostMirror
601 incomplete_cut_count;
604 typename decltype (part_xadj)::HostMirror host_part_xadj;
607 Kokkos::View<double *, device_t>
611 Kokkos::View<double *, device_t>
612 thread_part_weight_work;
616 Kokkos::View<mj_scalar_t *, device_t>
617 thread_cut_left_closest_point;
621 Kokkos::View<mj_scalar_t *, device_t>
622 thread_cut_right_closest_point;
625 Kokkos::View<mj_lno_t *, device_t>
628 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
635 Kokkos::View<mj_scalar_t *, device_t>
636 total_part_weight_left_right_closests;
637 Kokkos::View<mj_scalar_t *, device_t>
638 global_total_part_weight_left_right_closests;
640 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641 typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642 host_num_partitioning_in_current_dim;
649 KOKKOS_INLINE_FUNCTION
650 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651 return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
660 void set_part_specifications();
667 inline mj_part_t get_part_count(
668 mj_part_t num_total_future,
677 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
698 mj_part_t update_part_num_arrays(
699 std::vector<mj_part_t> *future_num_part_in_parts,
700 std::vector<mj_part_t> *next_future_num_parts_in_parts,
701 mj_part_t &future_num_parts,
702 mj_part_t current_num_parts,
703 int current_iteration,
704 RCP<mj_partBoxVector_t> input_part_boxes,
705 RCP<mj_partBoxVector_t> output_part_boxes,
706 mj_part_t atomic_part_count);
720 KOKKOS_INLINE_FUNCTION
721 void mj_calculate_new_cut_position (
722 mj_scalar_t cut_upper_bound,
723 mj_scalar_t cut_lower_bound,
724 mj_scalar_t cut_upper_weight,
725 mj_scalar_t cut_lower_weight,
726 mj_scalar_t expected_weight,
727 mj_scalar_t &new_cut_position,
728 mj_scalar_t sEpsilon);
754 bool mj_perform_migration(
755 mj_part_t in_num_parts,
756 mj_part_t &out_num_parts,
757 std::vector<mj_part_t> *next_future_num_parts_in_parts,
758 mj_part_t &output_part_begin_index,
759 size_t migration_reduce_all_population,
760 mj_lno_t num_coords_for_last_dim_part,
761 std::string iteration,
762 RCP<mj_partBoxVector_t> &input_part_boxes,
763 RCP<mj_partBoxVector_t> &output_part_boxes);
782 bool mj_check_to_migrate(
783 size_t migration_reduce_all_population,
784 mj_lno_t num_coords_for_last_dim_part,
787 mj_gno_t *num_points_in_all_processor_parts);
813 void mj_migration_part_proc_assignment(
814 mj_gno_t * num_points_in_all_processor_parts,
817 mj_lno_t *send_count_to_each_proc,
818 std::vector<mj_part_t> &processor_ranks_for_subcomm,
819 std::vector<mj_part_t> *next_future_num_parts_in_parts,
820 mj_part_t &out_num_part,
821 std::vector<mj_part_t> &out_part_indices,
822 mj_part_t &output_part_numbering_begin_index,
823 int *coordinate_destinations);
850 void mj_assign_proc_to_parts(
851 mj_gno_t * num_points_in_all_processor_parts,
854 mj_lno_t *send_count_to_each_proc,
855 std::vector<mj_part_t> &processor_ranks_for_subcomm,
856 std::vector<mj_part_t> *next_future_num_parts_in_parts,
857 mj_part_t &out_part_index,
858 mj_part_t &output_part_numbering_begin_index,
859 int *coordinate_destinations);
876 void assign_send_destinations(
878 mj_part_t *part_assignment_proc_begin_indices,
879 mj_part_t *processor_chains_in_parts,
880 mj_lno_t *send_count_to_each_proc,
881 int *coordinate_destinations);
897 void assign_send_destinations2(
900 int *coordinate_destinations,
901 mj_part_t &output_part_numbering_begin_index,
902 std::vector<mj_part_t> *next_future_num_parts_in_parts);
926 void mj_assign_parts_to_procs(
927 mj_gno_t * num_points_in_all_processor_parts,
930 mj_lno_t *send_count_to_each_proc,
931 std::vector<mj_part_t> *next_future_num_parts_in_parts,
932 mj_part_t &out_num_part,
933 std::vector<mj_part_t> &out_part_indices,
934 mj_part_t &output_part_numbering_begin_index,
935 int *coordinate_destinations);
950 void mj_migrate_coords(
952 mj_lno_t &num_new_local_points,
953 std::string iteration,
954 int *coordinate_destinations,
955 mj_part_t num_parts);
962 void create_sub_communicator(
963 std::vector<mj_part_t> &processor_ranks_for_subcomm);
969 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970 mj_part_t largest_factor = 1;
971 mj_part_t n = num_parts;
972 mj_part_t divisor = 2;
974 while (n % divisor == 0) {
976 largest_factor = divisor;
979 if(divisor * divisor > n) {
986 return largest_factor;
1019 const RCP<const Environment> &env,
1020 RCP<
const Comm<int> > &problemComm,
1021 double imbalance_tolerance,
1023 size_t num_global_parts,
1024 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025 int recursion_depth,
1027 mj_lno_t num_local_coords,
1028 mj_gno_t num_global_coords,
1029 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1031 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032 int num_weights_per_coord,
1033 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1052 bool distribute_points_on_cut_lines_,
1053 int max_concurrent_part_calculation_,
1054 int check_migrate_avoid_migration_option_,
1055 double minimum_migration_imbalance_,
1056 int migration_type_ = 0);
1073 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1114 const RCP<const Environment> &env,
1115 mj_lno_t num_total_coords,
1116 mj_lno_t num_selected_coords,
1117 size_t num_target_part,
1120 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121 Kokkos::View<mj_lno_t *, device_t> &
1122 initial_selected_coords_output_permutation,
1123 mj_lno_t *output_xadj,
1124 int recursion_depth_,
1125 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126 bool partition_along_longest_dim,
1127 int num_ranks_per_node,
1128 bool divide_to_prime_first_,
1129 mj_part_t num_first_level_parts_ = 1,
1130 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1133 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 1141 void allocate_set_work_memory();
1144 void compute_global_box();
1153 void mj_get_local_min_max_coord_totW(
1154 mj_part_t current_work_part,
1155 mj_part_t current_concurrent_num_parts,
1156 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1170 void mj_get_global_min_max_coord_totW(
1171 mj_part_t current_concurrent_num_parts,
1172 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1173 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1205 void mj_get_initial_cut_coords_target_weights(
1206 mj_scalar_t min_coord,
1207 mj_scalar_t max_coord,
1208 mj_part_t num_cuts ,
1209 mj_scalar_t global_weight,
1210 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1211 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1212 std::vector <mj_part_t> *future_num_part_in_parts,
1213 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1214 mj_part_t concurrent_current_part,
1215 mj_part_t obtained_part_index,
1216 mj_part_t num_target_first_level_parts = 1,
1217 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1218 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1236 void set_initial_coordinate_parts(
1237 mj_scalar_t &max_coordinate,
1238 mj_scalar_t &min_coordinate,
1239 mj_lno_t coordinate_begin_index,
1240 mj_lno_t coordinate_end_index,
1241 Kokkos::View<mj_lno_t *, device_t> &
1242 mj_current_coordinate_permutations,
1243 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1244 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1245 mj_part_t &partition_count);
1264 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1265 double imbalanceTolerance,
1266 mj_part_t current_work_part,
1267 mj_part_t current_concurrent_num_parts,
1268 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1269 mj_part_t total_incomplete_cut_count,
1270 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1271 Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1278 void mj_1D_part_get_part_weights(
1279 mj_part_t current_concurrent_num_parts,
1280 mj_part_t current_work_part,
1281 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1291 void mj_combine_rightleft_and_weights(
1292 mj_part_t current_work_part,
1293 mj_part_t current_concurrent_num_parts);
1307 void mj_create_new_partitions(
1308 mj_part_t num_parts,
1309 mj_part_t current_concurrent_work_part,
1310 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1311 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1312 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1313 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1350 void mj_get_new_cut_coordinates(
1351 mj_part_t current_concurrent_num_parts,
1353 const mj_part_t &num_cuts,
1354 const double &used_imbalance_tolerance,
1355 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1356 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1357 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1358 Kokkos::View<bool *, device_t> & current_cut_line_determined,
1359 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1360 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1361 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1362 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1363 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1364 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1365 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1366 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1367 Kokkos::View<mj_scalar_t *, device_t> &
1368 current_part_cut_line_weight_to_put_left,
1369 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1380 void get_processor_num_points_in_parts(
1381 mj_part_t num_procs,
1382 mj_part_t num_parts,
1383 mj_gno_t *&num_points_in_all_processor_parts);
1389 void fill_permutation_array(
1390 mj_part_t output_num_parts,
1391 mj_part_t num_parts);
1414 void create_consistent_chunks(
1415 mj_part_t num_parts,
1416 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1417 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1418 mj_lno_t coordinate_begin,
1419 mj_lno_t coordinate_end,
1420 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1421 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1423 bool longest_dim_part,
1434 void set_final_parts(
1435 mj_part_t current_num_parts,
1436 mj_part_t output_part_begin_index,
1437 RCP<mj_partBoxVector_t> &output_part_boxes,
1438 bool is_data_ever_migrated);
1443 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1444 typename mj_part_t,
typename mj_node_t>
1446 mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1447 recursion_depth(0), coord_dim(0),
1448 num_weights_per_coord(0), initial_num_loc_coords(0),
1449 initial_num_glob_coords(0),
1450 num_local_coords(0), num_global_coords(0),
1451 sEpsilon(
std::numeric_limits<mj_scalar_t>::
epsilon() * 100),
1452 distribute_points_on_cut_lines(true),
1453 max_concurrent_part_calculation(1),
1454 mj_run_as_rcb(false), mj_user_recursion_depth(0),
1455 mj_keep_part_boxes(false),
1456 check_migrate_avoid_migration_option(0), migration_type(0),
1457 minimum_migration_imbalance(0.30),
1458 num_first_level_parts(1),
1459 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1460 max_num_cut_along_dim(0),
1461 max_num_total_part_along_dim(0),
1462 total_dim_num_reduce_all(0),
1463 last_dim_num_part(0),
1465 num_global_parts(1),
1466 kept_boxes(), global_box(),
1467 myRank(0), myActualRank(0),
1468 divide_to_prime_first(false)
1515 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1516 typename mj_part_t,
typename mj_node_t>
1519 const RCP<const Environment> &env,
1520 mj_lno_t num_total_coords,
1521 mj_lno_t num_selected_coords,
1522 size_t num_target_part,
1525 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1527 Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1528 mj_lno_t *output_xadj,
1529 int recursion_depth_,
1530 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1531 bool partition_along_longest_dim,
1532 int num_ranks_per_node,
1533 bool divide_to_prime_first_,
1534 mj_part_t num_first_level_parts_,
1535 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1538 const RCP<Comm<int> > commN;
1539 this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1540 this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1541 this->myActualRank = this->myRank = 1;
1543 this->divide_to_prime_first = divide_to_prime_first_;
1548 this->imbalance_tolerance = 0;
1549 this->num_global_parts = num_target_part;
1550 this->part_no_array = part_no_array_;
1551 this->recursion_depth = recursion_depth_;
1555 this->num_first_level_parts = num_first_level_parts_;
1557 this->first_level_distribution = first_level_distribution_;
1559 this->coord_dim = coord_dim_;
1560 this->num_local_coords = num_total_coords;
1562 this->num_global_coords = num_total_coords;
1563 this->mj_coordinates = mj_coordinates_;
1566 this->initial_mj_gnos =
1567 Kokkos::View<mj_gno_t*, device_t>(
"gids", this->num_local_coords);
1569 this->num_weights_per_coord = 0;
1571 this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1572 "uniform weights", 1);
1573 this->mj_uniform_weights(0) =
true;
1575 this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1578 this->mj_uniform_parts =
1579 Kokkos::View<bool*, Kokkos::HostSpace>(
"uniform parts", 1);
1580 this->mj_uniform_parts(0) =
true;
1582 this->set_part_specifications();
1584 this->allocate_set_work_memory();
1587 auto local_part_xadj = this->part_xadj;
1588 Kokkos::parallel_for(
1589 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1590 KOKKOS_LAMBDA (
int dummy) {
1591 local_part_xadj(0) =
static_cast<mj_lno_t
>(num_selected_coords);
1594 Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1596 mj_part_t current_num_parts = 1;
1598 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1599 this->all_cut_coordinates;
1601 mj_part_t future_num_parts = this->total_num_part;
1603 std::vector<mj_part_t> *future_num_part_in_parts =
1604 new std::vector<mj_part_t>();
1605 std::vector<mj_part_t> *next_future_num_parts_in_parts =
1606 new std::vector<mj_part_t>();
1607 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1608 RCP<mj_partBoxVector_t> t1;
1609 RCP<mj_partBoxVector_t> t2;
1611 std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1612 coord_dimension_range_sorted(this->coord_dim);
1614 &(coord_dimension_range_sorted[0]);
1615 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1616 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1620 Kokkos::View<mj_part_t*, device_t>
1621 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
1622 Kokkos::View<size_t*, device_t>
1623 view_total_reduction_size(
"view_total_reduction_size", 1);
1625 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
1631 std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1632 future_num_part_in_parts = next_future_num_parts_in_parts;
1633 next_future_num_parts_in_parts = tmpPartVect;
1637 next_future_num_parts_in_parts->clear();
1640 mj_part_t output_part_count_in_dimension =
1641 this->update_part_num_arrays(
1642 future_num_part_in_parts,
1643 next_future_num_parts_in_parts,
1648 t2, num_ranks_per_node);
1653 if(output_part_count_in_dimension == current_num_parts) {
1654 tmpPartVect = future_num_part_in_parts;
1655 future_num_part_in_parts = next_future_num_parts_in_parts;
1656 next_future_num_parts_in_parts = tmpPartVect;
1661 std::string istring = std::to_string(rd);
1665 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1666 "new part xadj", output_part_count_in_dimension);
1670 mj_part_t output_part_index = 0;
1674 mj_part_t output_coordinate_end_index = 0;
1676 mj_part_t current_work_part = 0;
1677 mj_part_t current_concurrent_num_parts = 1;
1679 mj_part_t obtained_part_index = 0;
1682 int coordInd = rd % this->coord_dim;
1684 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1685 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1687 auto host_process_local_min_max_coord_total_weight =
1688 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1689 auto host_global_min_max_coord_total_weight =
1690 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1693 for(; current_work_part < current_num_parts;
1694 current_work_part += current_concurrent_num_parts) {
1696 mj_part_t actual_work_part_count = 0;
1701 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1702 mj_part_t current_work_part_in_concurrent_parts =
1703 current_work_part + kk;
1707 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1708 current_work_part_in_concurrent_parts);
1709 if(partition_count == 1) {
1712 ++actual_work_part_count;
1713 if(partition_along_longest_dim) {
1714 auto local_process_local_min_max_coord_total_weight =
1715 this->process_local_min_max_coord_total_weight;
1716 for(
int coord_traverse_ind = 0;
1717 coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1719 Kokkos::View<mj_scalar_t *, device_t> coords =
1720 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1722 this->mj_get_local_min_max_coord_totW(
1724 current_concurrent_num_parts,
1727 coord_dimension_range_sorted[coord_traverse_ind].id =
1729 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1731 Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1732 process_local_min_max_coord_total_weight);
1734 coord_dim_mins[coord_traverse_ind] =
1735 host_process_local_min_max_coord_total_weight(kk);
1736 coord_dim_maxs[coord_traverse_ind] =
1737 host_process_local_min_max_coord_total_weight(
1738 kk + current_concurrent_num_parts);
1739 coord_dimension_range_sorted[coord_traverse_ind].val =
1740 host_process_local_min_max_coord_total_weight(
1741 kk + current_concurrent_num_parts) -
1742 host_process_local_min_max_coord_total_weight(kk);
1745 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1746 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1747 auto set_min = coord_dim_mins[coordInd];
1748 auto set_max = coord_dim_maxs[coordInd];
1749 Kokkos::parallel_for(
1750 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1751 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1752 local_process_local_min_max_coord_total_weight(kk) = set_min;
1753 local_process_local_min_max_coord_total_weight(
1754 kk + current_concurrent_num_parts) = set_max;
1757 mj_current_dim_coords =
1758 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1761 Kokkos::View<mj_scalar_t *, device_t> coords =
1762 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1763 this->mj_get_local_min_max_coord_totW(
1765 current_concurrent_num_parts,
1771 if(actual_work_part_count > 0) {
1773 this->mj_get_global_min_max_coord_totW(
1774 current_concurrent_num_parts,
1775 this->process_local_min_max_coord_total_weight,
1776 this->global_min_max_coord_total_weight);
1779 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1780 global_min_max_coord_total_weight);
1784 mj_part_t total_incomplete_cut_count = 0;
1789 mj_part_t concurrent_part_cut_shift = 0;
1790 mj_part_t concurrent_part_part_shift = 0;
1791 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1792 mj_scalar_t min_coordinate =
1793 host_global_min_max_coord_total_weight(kk);
1794 mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1795 kk + current_concurrent_num_parts);
1796 mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1797 kk + 2*current_concurrent_num_parts);
1799 mj_part_t concurrent_current_part_index = current_work_part + kk;
1801 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1802 concurrent_current_part_index);
1804 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1805 Kokkos::subview(current_cut_coordinates,
1806 std::pair<mj_lno_t, mj_lno_t>(
1807 concurrent_part_cut_shift,
1808 current_cut_coordinates.size()));
1809 Kokkos::View<mj_scalar_t *, device_t>
1810 current_target_part_weights =
1811 Kokkos::subview(target_part_weights,
1812 std::pair<mj_lno_t, mj_lno_t>(
1813 concurrent_part_part_shift,
1814 target_part_weights.size()));
1817 concurrent_part_cut_shift += partition_count - 1;
1819 concurrent_part_part_shift += partition_count;
1822 if(partition_count > 1 && min_coordinate <= max_coordinate) {
1825 total_incomplete_cut_count += partition_count - 1;
1827 this->incomplete_cut_count(kk) = partition_count - 1;
1835 this->mj_get_initial_cut_coords_target_weights(
1838 partition_count - 1,
1839 global_total_weight,
1841 current_target_part_weights,
1842 future_num_part_in_parts,
1843 next_future_num_parts_in_parts,
1844 concurrent_current_part_index,
1845 obtained_part_index,
1846 rd == 0 ? this->num_first_level_parts : 1,
1847 this->first_level_distribution);
1849 mj_lno_t coordinate_end_index =
1850 host_part_xadj(concurrent_current_part_index);
1851 mj_lno_t coordinate_begin_index =
1852 (concurrent_current_part_index==0) ? 0 :
1853 host_part_xadj[concurrent_current_part_index - 1];
1856 this->set_initial_coordinate_parts(
1859 coordinate_begin_index, coordinate_end_index,
1860 this->coordinate_permutations,
1861 mj_current_dim_coords,
1862 this->assigned_part_ids,
1868 this->incomplete_cut_count(kk) = 0;
1870 obtained_part_index += partition_count;
1875 double used_imbalance = 0;
1879 mj_timer_base_string +
"mj_1D_part()");
1882 mj_current_dim_coords,
1885 current_concurrent_num_parts,
1886 current_cut_coordinates,
1887 total_incomplete_cut_count,
1888 view_rectilinear_cut_count,
1889 view_total_reduction_size);
1892 mj_timer_base_string +
"mj_1D_part()");
1895 obtained_part_index += current_concurrent_num_parts;
1899 mj_part_t output_array_shift = 0;
1900 mj_part_t cut_shift = 0;
1901 size_t tlr_shift = 0;
1902 size_t partweight_array_shift = 0;
1904 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1905 mj_part_t current_concurrent_work_part = current_work_part + kk;
1907 mj_part_t num_parts = host_num_partitioning_in_current_dim(
1908 current_concurrent_work_part);
1911 int coordinateA_bigger_than_coordinateB =
1912 host_global_min_max_coord_total_weight(kk) >
1913 host_global_min_max_coord_total_weight(
1914 kk + current_concurrent_num_parts);
1916 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1919 auto local_new_part_xadj = this->new_part_xadj;
1920 Kokkos::parallel_for(
1921 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
1922 mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1923 local_new_part_xadj(
1924 output_part_index + output_array_shift + jj) = 0;
1927 cut_shift += num_parts - 1;
1928 tlr_shift += (4 *(num_parts - 1) + 1);
1929 output_array_shift += num_parts;
1930 partweight_array_shift += (2 * (num_parts - 1) + 1);
1933 mj_lno_t coordinate_end =
1934 host_part_xadj(current_concurrent_work_part);
1935 mj_lno_t coordinate_begin =
1936 current_concurrent_work_part==0 ? 0 :
1937 host_part_xadj(current_concurrent_work_part-1);
1939 Kokkos::View<mj_scalar_t *, device_t>
1940 current_concurrent_cut_coordinate =
1941 Kokkos::subview(current_cut_coordinates,
1942 std::pair<mj_lno_t, mj_lno_t>(
1944 current_cut_coordinates.size()));
1945 Kokkos::View<mj_scalar_t *, device_t>
1946 used_local_cut_line_weight_to_left =
1947 Kokkos::subview(process_cut_line_weight_to_put_left,
1948 std::pair<mj_lno_t, mj_lno_t>(
1950 process_cut_line_weight_to_put_left.size()));
1952 this->thread_part_weight_work =
1954 this->thread_part_weights,
1955 std::pair<mj_lno_t, mj_lno_t>(
1956 partweight_array_shift,
1957 this->thread_part_weights.size()));
1961 Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1962 Kokkos::subview(this->new_part_xadj,
1963 std::pair<mj_lno_t, mj_lno_t>(
1964 output_part_index + output_array_shift,
1965 this->new_part_xadj.size()));
1967 this->create_consistent_chunks(
1969 mj_current_dim_coords,
1970 current_concurrent_cut_coordinate,
1973 used_local_cut_line_weight_to_left,
1974 subview_new_part_xadj,
1976 partition_along_longest_dim,
1977 p_coord_dimension_range_sorted);
1982 mj_lno_t part_size = coordinate_end - coordinate_begin;
1984 auto local_new_part_xadj = this->new_part_xadj;
1985 Kokkos::parallel_for(
1986 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1987 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1988 local_new_part_xadj(output_part_index + output_array_shift)
1992 auto subview_new_coordinate_permutations =
1993 Kokkos::subview(this->new_coordinate_permutations,
1994 std::pair<mj_lno_t, mj_lno_t>(
1996 coordinate_begin + part_size));
1997 auto subview_coordinate_permutations =
1998 Kokkos::subview(this->coordinate_permutations,
1999 std::pair<mj_lno_t, mj_lno_t>(
2001 coordinate_begin + part_size));
2002 Kokkos::deep_copy(subview_new_coordinate_permutations,
2003 subview_coordinate_permutations);
2006 cut_shift += num_parts - 1;
2007 tlr_shift += (4 *(num_parts - 1) + 1);
2008 output_array_shift += num_parts;
2009 partweight_array_shift += (2 * (num_parts - 1) + 1);
2018 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
2019 mj_part_t num_parts =
2020 host_num_partitioning_in_current_dim(current_work_part + kk);
2021 auto local_new_part_xadj = this->new_part_xadj;
2022 auto local_mj_current_dim_coords = mj_current_dim_coords;
2023 auto local_new_coordinate_permutations =
2024 new_coordinate_permutations;
2025 Kokkos::parallel_for(
2026 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
2027 0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
2029 local_new_part_xadj(output_part_index+ii) +=
2030 output_coordinate_end_index;
2033 mj_lno_t coordinate_end =
2034 local_new_part_xadj(output_part_index+ii);
2035 mj_lno_t coordinate_begin =
2036 local_new_part_xadj(output_part_index);
2038 for(mj_lno_t task_traverse = coordinate_begin;
2039 task_traverse < coordinate_end; ++task_traverse) {
2040 mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2042 local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2048 mj_part_t get_single;
2049 Kokkos::parallel_reduce(
"Read new_part_xadj",
2050 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2051 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
2052 set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2055 output_coordinate_end_index = get_single;
2057 output_part_index += num_parts;
2064 current_num_parts = output_part_count_in_dimension;
2067 Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2068 this->coordinate_permutations = this->new_coordinate_permutations;
2069 this->new_coordinate_permutations = tmp;
2071 this->part_xadj = this->new_part_xadj;
2072 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2073 Kokkos::deep_copy(host_part_xadj, part_xadj);
2074 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
2077 Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2081 for(
size_t i = 0; i < this->num_global_parts ; ++i) {
2082 output_xadj[i+1] = host_part_xadj(i);
2085 delete future_num_part_in_parts;
2086 delete next_future_num_parts_in_parts;
2092 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2093 typename mj_part_t,
typename mj_node_t>
2095 <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2099 return this->global_box;
2104 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2105 typename mj_part_t,
typename mj_node_t>
2106 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2107 mj_node_t>::set_to_keep_part_boxes()
2109 this->mj_keep_part_boxes =
true;
2119 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2120 typename mj_part_t,
typename mj_node_t>
2124 this->total_num_cut = 0;
2125 this->total_num_part = 1;
2126 this->max_num_part_along_dim = 0;
2127 this->total_dim_num_reduce_all = 0;
2128 this->last_dim_num_part = 1;
2131 this->max_num_cut_along_dim = 0;
2132 this->max_num_total_part_along_dim = 0;
2134 if(this->part_no_array.size()) {
2135 auto local_recursion_depth = this->recursion_depth;
2137 this->total_dim_num_reduce_all =
2138 this->total_num_part * this->recursion_depth;
2140 this->total_num_part = 1;
2141 for(
int i = 0; i < local_recursion_depth; ++i) {
2142 this->total_num_part *= this->part_no_array(i);
2145 mj_part_t track_max = 0;
2146 for(
int i = 0; i < local_recursion_depth; ++i) {
2147 if(part_no_array(i) > track_max) {
2148 track_max = this->part_no_array(i);
2152 this->last_dim_num_part = this->total_num_part /
2153 this->part_no_array(local_recursion_depth-1);
2155 this->max_num_part_along_dim = track_max;
2156 this->num_global_parts = this->total_num_part;
2158 mj_part_t future_num_parts = this->num_global_parts;
2162 if (this->first_level_distribution.size() != 0 &&
2163 this->num_first_level_parts > 1) {
2164 this->max_num_part_along_dim = this->num_first_level_parts;
2169 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
2170 mj_part_t maxNoPartAlongI = 0;
2171 mj_part_t nfutureNumParts = 0;
2177 this->first_level_distribution.size() != 0 &&
2178 this->num_first_level_parts > 1) {
2180 maxNoPartAlongI = this->num_first_level_parts;
2181 this->max_num_part_along_dim = this->num_first_level_parts;
2183 mj_part_t sum_first_level_dist = 0;
2184 mj_part_t max_part = 0;
2187 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2188 sum_first_level_dist += this->first_level_distribution(i);
2189 if (this->first_level_distribution(i) > max_part)
2190 max_part = this->first_level_distribution(i);
2196 this->num_global_parts * max_part / sum_first_level_dist;
2200 maxNoPartAlongI = this->get_part_count(future_num_parts,
2201 1.0f / (this->recursion_depth - rd));
2202 if (maxNoPartAlongI > this->max_num_part_along_dim)
2203 this->max_num_part_along_dim = maxNoPartAlongI;
2204 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2205 if (future_num_parts % maxNoPartAlongI) {
2209 future_num_parts = nfutureNumParts;
2211 this->total_num_part = this->num_global_parts;
2213 if(this->divide_to_prime_first) {
2214 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2215 this->last_dim_num_part = this->num_global_parts;
2222 for(
int i = 0; i < this->recursion_depth; ++i) {
2223 this->total_dim_num_reduce_all += p;
2224 p *= this->max_num_part_along_dim;
2227 if(p / this->max_num_part_along_dim > this->num_global_parts) {
2228 this->last_dim_num_part = this->num_global_parts;
2231 this->last_dim_num_part = p / this->max_num_part_along_dim;
2236 this->total_num_cut = this->total_num_part - 1;
2237 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2238 this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2239 size_t(this->max_num_cut_along_dim);
2244 if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2245 if(this->mj_problemComm->getRank() == 0) {
2246 std::cerr <<
"Warning: Concurrent part count (" <<
2247 this->max_concurrent_part_calculation <<
2248 ") has been set bigger than maximum amount that can be used." <<
2249 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2251 this->max_concurrent_part_calculation = this->last_dim_num_part;
2260 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2261 typename mj_part_t,
typename mj_node_t>
2262 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2263 get_part_count(mj_part_t num_total_future,
double root)
2265 double fp = pow(num_total_future,
root);
2266 mj_part_t ip = mj_part_t(fp);
2294 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2295 typename mj_part_t,
typename mj_node_t>
2296 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2297 update_part_num_arrays(
2298 std::vector<mj_part_t> *future_num_part_in_parts,
2299 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2300 mj_part_t &future_num_parts,
2301 mj_part_t current_num_parts,
2302 int current_iteration,
2303 RCP<mj_partBoxVector_t> input_part_boxes,
2304 RCP<mj_partBoxVector_t> output_part_boxes,
2305 mj_part_t atomic_part_count)
2307 std::vector<mj_part_t> num_partitioning_in_current_dim;
2310 mj_part_t output_num_parts = 0;
2311 if(this->part_no_array.size()) {
2315 mj_part_t current_part_no_array =
2316 this->part_no_array(current_iteration);
2318 if(current_part_no_array < 1) {
2319 std::cout <<
"Current recursive iteration: " << current_iteration <<
2320 " part_no_array[" << current_iteration <<
"] is given as:" <<
2321 current_part_no_array << std::endl;
2324 if(current_part_no_array == 1) {
2325 return current_num_parts;
2329 if (this->first_level_distribution.size() != 0 &&
2330 current_iteration == 0 &&
2331 current_part_no_array != this->num_first_level_parts) {
2332 std::cout <<
"Current recursive iteration: " << current_iteration
2333 <<
" part_no_array[" << current_iteration <<
"] is given as: " <<
2334 current_part_no_array <<
" and contradicts num_first_level_parts: " <<
2335 this->num_first_level_parts << std::endl;
2339 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2340 num_partitioning_in_current_dim.push_back(current_part_no_array);
2357 future_num_parts /= num_partitioning_in_current_dim[0];
2358 output_num_parts = current_num_parts *
2359 num_partitioning_in_current_dim[0];
2360 if(this->mj_keep_part_boxes) {
2361 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2363 for(mj_part_t j = 0; j <
2364 num_partitioning_in_current_dim[0]; ++j) {
2365 output_part_boxes->push_back((*input_part_boxes)[k]);
2373 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2374 next_future_num_parts_in_parts->push_back(future_num_parts);
2384 future_num_parts = 1;
2387 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2389 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2393 mj_part_t num_partitions_in_current_dim =
2394 this->get_part_count(future_num_parts_of_part_ii,
2395 1.0 / (this->recursion_depth - current_iteration)
2397 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2398 std::cerr <<
"ERROR: maxPartNo calculation is wrong." 2399 " num_partitions_in_current_dim: " 2400 << num_partitions_in_current_dim <<
" this->max_num_part_along_dim: " 2401 << this->max_num_part_along_dim <<
2402 " this->recursion_depth: " << this->recursion_depth <<
2403 " current_iteration:" << current_iteration <<
2404 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2405 " might need to fix max part no calculation for " 2406 "largest_prime_first partitioning." <<
2418 if (current_iteration == 0 &&
2419 this->first_level_distribution.size() != 0 &&
2420 this->num_first_level_parts > 1) {
2423 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2426 output_num_parts = this->num_first_level_parts;
2429 future_num_parts /= this->num_first_level_parts;
2431 mj_part_t max_part = 0;
2432 mj_part_t sum_first_level_dist = 0;
2436 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2437 sum_first_level_dist += this->first_level_distribution(i);
2439 if (this->first_level_distribution(i) > max_part)
2440 max_part = this->first_level_distribution(i);
2444 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2448 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2449 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2450 this->num_global_parts / sum_first_level_dist);
2453 else if (this->divide_to_prime_first) {
2455 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2457 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2460 output_num_parts += num_partitions_in_current_dim;
2462 if (future_num_parts_of_part_ii == atomic_part_count ||
2463 future_num_parts_of_part_ii % atomic_part_count != 0) {
2464 atomic_part_count = 1;
2467 largest_prime_factor =
2468 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2475 if (largest_prime_factor < num_partitions_in_current_dim) {
2476 largest_prime_factor = num_partitions_in_current_dim;
2479 mj_part_t ideal_num_future_parts_in_part =
2480 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2482 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2490 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2492 mj_part_t my_ideal_primescale = ideal_prime_scale;
2494 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2495 ++my_ideal_primescale;
2498 mj_part_t num_future_parts_for_part_iii =
2499 ideal_num_future_parts_in_part * my_ideal_primescale;
2502 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2504 ++num_future_parts_for_part_iii;
2507 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2510 if (this->mj_keep_part_boxes) {
2511 output_part_boxes->push_back((*input_part_boxes)[ii]);
2515 if (num_future_parts_for_part_iii > future_num_parts)
2516 future_num_parts = num_future_parts_for_part_iii;
2522 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2525 output_num_parts += num_partitions_in_current_dim;
2527 if((future_num_parts_of_part_ii == atomic_part_count) ||
2528 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2529 atomic_part_count = 1;
2532 mj_part_t ideal_num_future_parts_in_part =
2533 (future_num_parts_of_part_ii / atomic_part_count) /
2534 num_partitions_in_current_dim;
2535 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2536 mj_part_t num_future_parts_for_part_iii =
2537 ideal_num_future_parts_in_part;
2540 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2541 num_partitions_in_current_dim) {
2543 ++num_future_parts_for_part_iii;
2546 next_future_num_parts_in_parts->push_back(
2547 num_future_parts_for_part_iii * atomic_part_count);
2551 if(this->mj_keep_part_boxes) {
2552 output_part_boxes->push_back((*input_part_boxes)[ii]);
2555 if(num_future_parts_for_part_iii > future_num_parts)
2556 future_num_parts = num_future_parts_for_part_iii;
2562 device_num_partitioning_in_current_dim = Kokkos::View<
2563 mj_part_t*, device_t>(
"test", num_partitioning_in_current_dim.size());
2564 host_num_partitioning_in_current_dim =
2565 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2566 for(
size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2567 host_num_partitioning_in_current_dim(n) =
2568 num_partitioning_in_current_dim[n];
2573 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2574 host_num_partitioning_in_current_dim);
2575 return output_num_parts;
2580 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2581 typename mj_part_t,
typename mj_node_t>
2582 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2583 allocate_set_work_memory()
2588 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589 Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
2590 this->num_local_coords);
2591 auto local_coordinate_permutations = coordinate_permutations;
2592 Kokkos::parallel_for(
2593 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595 local_coordinate_permutations(i) = i;
2599 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600 Kokkos::ViewAllocateWithoutInitializing(
"num_local_coords"),
2601 this->num_local_coords);
2603 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604 Kokkos::ViewAllocateWithoutInitializing(
"assigned parts"), 0);
2605 if(this->num_local_coords > 0) {
2606 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607 Kokkos::ViewAllocateWithoutInitializing(
"assigned part ids"),
2608 this->num_local_coords);
2615 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616 Kokkos::ViewAllocateWithoutInitializing(
"part xadj"), 1);
2617 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618 host_part_xadj(0) = num_local_coords;
2619 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2622 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2626 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627 Kokkos::ViewAllocateWithoutInitializing(
"all cut coordinates"),
2628 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2631 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632 device_t>(Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2636 this->thread_cut_line_weight_to_put_left =
2637 Kokkos::View<mj_scalar_t*, device_t>(
2638 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2640 if(this->distribute_points_on_cut_lines) {
2641 this->process_cut_line_weight_to_put_left =
2642 Kokkos::View<mj_scalar_t *, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
2644 "process_cut_line_weight_to_put_left"),
2645 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646 this->thread_cut_line_weight_to_put_left =
2647 Kokkos::View<mj_scalar_t *, device_t>(
2648 Kokkos::ViewAllocateWithoutInitializing(
2649 "thread_cut_line_weight_to_put_left"),
2650 this->max_num_cut_along_dim);
2651 this->process_rectilinear_cut_weight =
2652 Kokkos::View<mj_scalar_t *, device_t>(
2653 Kokkos::ViewAllocateWithoutInitializing(
"process_rectilinear_cut_weight"),
2654 this->max_num_cut_along_dim);
2655 this->global_rectilinear_cut_weight =
2656 Kokkos::View<mj_scalar_t *, device_t>(
2657 Kokkos::ViewAllocateWithoutInitializing(
"global_rectilinear_cut_weight"),
2658 this->max_num_cut_along_dim);
2665 this->cut_coordinates_work_array =
2666 Kokkos::View<mj_scalar_t *, device_t>(
2667 Kokkos::ViewAllocateWithoutInitializing(
"cut_coordinates_work_array"),
2668 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2671 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672 Kokkos::ViewAllocateWithoutInitializing(
"target_part_weights"),
2673 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2676 this->cut_upper_bound_coordinates =
2677 Kokkos::View<mj_scalar_t*, device_t>(
2678 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_coordinates"),
2679 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2682 this->cut_lower_bound_coordinates =
2683 Kokkos::View<mj_scalar_t*, device_t>(
2684 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_coordinates"),
2685 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2688 this->cut_lower_bound_weights =
2689 Kokkos::View<mj_scalar_t*, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_weights"),
2691 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2694 this->cut_upper_bound_weights =
2695 Kokkos::View<mj_scalar_t*, device_t>(
2696 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_weights"),
2697 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2701 this->process_local_min_max_coord_total_weight =
2702 Kokkos::View<mj_scalar_t*, device_t>(
2703 Kokkos::ViewAllocateWithoutInitializing(
2704 "process_local_min_max_coord_total_weight"),
2705 3 * this->max_concurrent_part_calculation);
2708 this->global_min_max_coord_total_weight =
2709 Kokkos::View<mj_scalar_t*, device_t>(
2710 Kokkos::ViewAllocateWithoutInitializing(
"global_min_max_coord_total_weight"),
2711 3 * this->max_concurrent_part_calculation);
2716 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717 Kokkos::ViewAllocateWithoutInitializing(
"is_cut_line_determined"),
2718 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2724 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725 Kokkos::ViewAllocateWithoutInitializing(
"device_incomplete_cut_count"),
2726 this->max_concurrent_part_calculation);
2727 this->incomplete_cut_count =
2728 Kokkos::create_mirror_view(device_incomplete_cut_count);
2731 this->thread_part_weights = Kokkos::View<double *, device_t>(
2732 Kokkos::ViewAllocateWithoutInitializing(
"thread_part_weights"),
2733 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2735 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_left_closest_point"),
2737 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2741 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_right_closest_point"),
2743 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2746 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747 Kokkos::ViewAllocateWithoutInitializing(
"thread_point_counts"),
2748 this->max_num_part_along_dim);
2754 this->total_part_weight_left_right_closests =
2755 Kokkos::View<mj_scalar_t*, device_t>(
2756 Kokkos::ViewAllocateWithoutInitializing(
2757 "total_part_weight_left_right_closests"),
2758 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759 this->max_concurrent_part_calculation);
2761 this->global_total_part_weight_left_right_closests =
2762 Kokkos::View<mj_scalar_t*, device_t>(
2763 Kokkos::ViewAllocateWithoutInitializing(
2764 "global_total_part_weight_left_right_closests"),
2765 (this->max_num_total_part_along_dim +
2766 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2768 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_local_coords);
2771 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
2778 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2780 auto local_current_mj_gnos = current_mj_gnos;
2781 auto local_initial_mj_gnos = initial_mj_gnos;
2782 Kokkos::parallel_for(
2783 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2791 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2792 typename mj_part_t,
typename mj_node_t>
2793 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794 mj_node_t>::compute_global_box()
2797 mj_scalar_t *mins =
new mj_scalar_t[this->coord_dim];
2799 mj_scalar_t *gmins =
new mj_scalar_t[this->coord_dim];
2801 mj_scalar_t *maxs =
new mj_scalar_t[this->coord_dim];
2803 mj_scalar_t *gmaxs =
new mj_scalar_t[this->coord_dim];
2805 auto local_mj_coordinates = this->mj_coordinates;
2809 for(
int i = 0; i < this->coord_dim; ++i) {
2814 for(
int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815 Kokkos::parallel_reduce(
"MinReduce",
2816 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817 (0, this->num_local_coords),
2818 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819 if(local_mj_coordinates(j,i) < running_min) {
2820 running_min = local_mj_coordinates(j,i);
2822 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823 Kokkos::parallel_reduce(
"MaxReduce",
2824 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825 (0, this->num_local_coords),
2826 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827 if(local_mj_coordinates(j,i) > running_max) {
2828 running_max = local_mj_coordinates(j,i);
2830 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2833 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834 this->coord_dim, mins, gmins
2837 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838 this->coord_dim, maxs, gmaxs
2842 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2856 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2857 typename mj_part_t,
typename mj_node_t>
2858 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859 mj_node_t>::init_part_boxes(
2860 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2862 mj_partBox_t tmp_box(*global_box);
2863 initial_partitioning_boxes->push_back(tmp_box);
2870 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2873 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2874 mj_get_local_min_max_coord_totW(
2875 mj_part_t current_work_part,
2876 mj_part_t current_concurrent_num_parts,
2877 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2879 auto local_coordinate_permutations = this->coordinate_permutations;
2880 auto local_process_local_min_max_coord_total_weight =
2881 this->process_local_min_max_coord_total_weight;
2882 auto local_mj_weights = this->mj_weights;
2884 bool bUniformWeights = mj_uniform_weights(0);
2886 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2888 mj_part_t concurrent_current_part = current_work_part + kk;
2889 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2890 host_part_xadj(concurrent_current_part - 1);
2891 mj_lno_t coordinate_end_index =
2892 host_part_xadj(concurrent_current_part);
2894 mj_scalar_t my_min_coord = 0;
2895 mj_scalar_t my_max_coord = 0;
2896 mj_scalar_t my_total_weight;
2899 if(coordinate_begin_index >= coordinate_end_index)
2901 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2902 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2903 my_total_weight = 0;
2907 Kokkos::parallel_reduce(
"get min",
2908 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2909 (coordinate_begin_index, coordinate_end_index),
2910 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2911 int i = local_coordinate_permutations(j);
2912 if(mj_current_dim_coords(i) < running_min)
2913 running_min = mj_current_dim_coords(i);
2914 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2916 Kokkos::parallel_reduce(
"get max",
2917 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2918 (coordinate_begin_index, coordinate_end_index),
2919 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2920 int i = local_coordinate_permutations(j);
2921 if(mj_current_dim_coords(i) > running_max)
2922 running_max = mj_current_dim_coords(i);
2923 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2924 if(bUniformWeights) {
2925 my_total_weight = coordinate_end_index - coordinate_begin_index;
2928 my_total_weight = 0;
2929 Kokkos::parallel_reduce(
"get weight",
2930 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2931 (coordinate_begin_index, coordinate_end_index),
2932 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2933 int i = local_coordinate_permutations(j);
2934 lsum += local_mj_weights(i,0);
2935 }, my_total_weight);
2940 Kokkos::parallel_for(
2941 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2942 (0, 1), KOKKOS_LAMBDA (
int dummy) {
2943 local_process_local_min_max_coord_total_weight(kk) =
2945 local_process_local_min_max_coord_total_weight(
2946 kk + current_concurrent_num_parts) = my_max_coord;
2947 local_process_local_min_max_coord_total_weight(
2948 kk + 2*current_concurrent_num_parts) = my_total_weight;
2965 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2966 typename mj_part_t,
typename mj_node_t>
2967 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968 mj_node_t>::mj_get_global_min_max_coord_totW(
2969 mj_part_t current_concurrent_num_parts,
2970 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2975 if(this->comm->getSize() > 1) {
2978 auto host_local_min_max_total =
2979 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980 auto host_global_min_max_total =
2981 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2984 reductionOp(current_concurrent_num_parts,
2985 current_concurrent_num_parts, current_concurrent_num_parts);
2987 reduceAll<int, mj_scalar_t>(
2990 3 * current_concurrent_num_parts,
2991 host_local_min_max_total.data(),
2992 host_global_min_max_total.data());
2995 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2998 mj_part_t s = 3 * current_concurrent_num_parts;
2999 Kokkos::parallel_for(
3000 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002 global_min_max_total(i) = local_min_max_total(i);
3039 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3040 typename mj_part_t,
typename mj_node_t>
3041 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3042 mj_get_initial_cut_coords_target_weights(
3043 mj_scalar_t min_coord,
3044 mj_scalar_t max_coord,
3045 mj_part_t num_cuts ,
3046 mj_scalar_t global_weight,
3048 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3050 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051 std::vector <mj_part_t> *future_num_part_in_parts,
3052 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053 mj_part_t concurrent_current_part,
3054 mj_part_t obtained_part_index,
3055 mj_part_t num_target_first_level_parts,
3056 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3058 mj_scalar_t coord_range = max_coord - min_coord;
3065 bool bUniformPartsCheck =
3066 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3068 if(!bUniformPartsCheck) {
3069 bool bValidNonUniformTargetWeights =
3070 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071 if(!bValidNonUniformTargetWeights) {
3072 std::cerr <<
"MJ does not support non uniform part weights beyond the first partition" << std::endl;
3077 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078 "device_cumulative", num_cuts);
3079 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3081 mj_scalar_t cumulative = 0;
3083 if(bUniformPartsCheck) {
3085 mj_scalar_t total_future_part_count_in_part =
3086 static_cast<mj_scalar_t
>((*future_num_part_in_parts)[concurrent_current_part]);
3089 mj_scalar_t unit_part_weight =
3090 global_weight / total_future_part_count_in_part;
3092 for(mj_part_t i = 0; i < num_cuts; ++i) {
3093 cumulative += unit_part_weight *
static_cast<mj_scalar_t
>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094 host_cumulative(i) = cumulative;
3099 mj_scalar_t sum_target_first_level_dist = 0.0;
3100 for (
int i = 0; i < num_target_first_level_parts; ++i) {
3101 sum_target_first_level_dist += target_first_level_dist(i);
3104 for(mj_part_t i = 0; i < num_cuts; ++i) {
3105 cumulative += global_weight * target_first_level_dist(i) /
3106 sum_target_first_level_dist;
3107 host_cumulative(i) = cumulative;
3111 Kokkos::deep_copy(device_cumulative, host_cumulative);
3113 Kokkos::parallel_for(
"Write num in parts",
3114 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3117 current_target_part_weights(cut) = device_cumulative(cut);
3118 initial_cut_coords(cut) = min_coord +
3119 (coord_range * device_cumulative(cut)) / global_weight;
3121 current_target_part_weights(num_cuts) = global_weight;
3127 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128 Kokkos::parallel_for(
3129 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3131 KOKKOS_LAMBDA (mj_part_t i) {
3132 current_target_part_weights(i) =
3133 long(current_target_part_weights(i) + 0.5);
3154 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3155 typename mj_part_t,
typename mj_node_t>
3156 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3157 set_initial_coordinate_parts(
3158 mj_scalar_t &max_coordinate,
3159 mj_scalar_t &min_coordinate,
3160 mj_lno_t coordinate_begin_index,
3161 mj_lno_t coordinate_end_index,
3162 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165 mj_part_t &partition_count)
3167 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3171 if(std::abs(coordinate_range) < this->sEpsilon ) {
3172 Kokkos::parallel_for(
3173 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174 (coordinate_begin_index, coordinate_end_index),
3175 KOKKOS_LAMBDA (mj_lno_t ii) {
3176 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3182 mj_scalar_t slice = coordinate_range / partition_count;
3183 Kokkos::parallel_for(
3184 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185 (coordinate_begin_index, coordinate_end_index),
3186 KOKKOS_LAMBDA (mj_lno_t ii) {
3187 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3189 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190 if(pp >= partition_count) {
3191 pp = partition_count - 1;
3193 mj_part_ids[iii] = 2 * pp;
3212 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3213 typename mj_part_t,
typename mj_node_t>
3214 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216 double used_imbalance_tolerance,
3217 mj_part_t current_work_part,
3218 mj_part_t current_concurrent_num_parts,
3219 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220 mj_part_t total_incomplete_cut_count,
3221 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3224 this->temp_cut_coords = current_cut_coordinates;
3227 *reductionOp = NULL;
3229 bool bSingleProcess = (this->comm->getSize() == 1);
3231 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232 if(!bSingleProcess) {
3233 for(
size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234 temp[n] = host_num_partitioning_in_current_dim(n);
3237 <mj_part_t, mj_scalar_t>(
3240 current_concurrent_num_parts);
3243 auto local_cut_lower_bound_coordinates =
3244 cut_lower_bound_coordinates;
3245 auto local_cut_upper_bound_coordinates =
3246 cut_upper_bound_coordinates;
3247 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250 auto local_process_cut_line_weight_to_put_left =
3251 process_cut_line_weight_to_put_left;
3252 auto local_temp_cut_coords = temp_cut_coords;
3253 auto local_global_total_part_weight_left_right_closests =
3254 global_total_part_weight_left_right_closests;
3255 auto local_cut_coordinates_work_array =
3256 cut_coordinates_work_array;
3257 auto local_part_xadj = part_xadj;
3258 auto local_global_min_max_coord_total_weight =
3259 global_min_max_coord_total_weight;
3260 auto local_target_part_weights =
3261 target_part_weights;
3262 auto local_global_rectilinear_cut_weight =
3263 global_rectilinear_cut_weight;
3264 auto local_process_rectilinear_cut_weight =
3265 process_rectilinear_cut_weight;
3267 auto local_is_cut_line_determined = this->is_cut_line_determined;
3268 auto local_device_num_partitioning_in_current_dim =
3269 device_num_partitioning_in_current_dim;
3271 Kokkos::parallel_for(
3272 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273 KOKKOS_LAMBDA (
int dummy) {
3276 view_rectilinear_cut_count(0) = 0;
3277 view_total_reduction_size(0) = 0;
3281 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282 mj_part_t num_part_in_dim =
3283 local_device_num_partitioning_in_current_dim(current_work_part + i);
3284 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3285 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3287 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288 local_is_cut_line_determined(next) =
false;
3290 local_cut_lower_bound_coordinates(next) =
3291 local_global_min_max_coord_total_weight(i);
3293 local_cut_upper_bound_coordinates(next) =
3294 local_global_min_max_coord_total_weight(
3295 i + current_concurrent_num_parts);
3297 local_cut_upper_bound_weights(next) =
3298 local_global_min_max_coord_total_weight(
3299 i + 2 * current_concurrent_num_parts);
3300 local_cut_lower_bound_weights(next) = 0;
3301 if(local_distribute_points_on_cut_lines) {
3302 local_process_cut_line_weight_to_put_left(next) = 0;
3313 while (total_incomplete_cut_count != 0) {
3314 this->mj_1D_part_get_part_weights(
3315 current_concurrent_num_parts,
3317 mj_current_dim_coords,
3321 this->mj_combine_rightleft_and_weights(
3323 current_concurrent_num_parts);
3326 if(!bSingleProcess) {
3329 auto host_total_part_weight_left_right_closests =
3330 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331 total_part_weight_left_right_closests);
3332 auto host_global_total_part_weight_left_right_closests =
3333 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334 global_total_part_weight_left_right_closests);
3336 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337 total_part_weight_left_right_closests);
3339 size_t host_view_total_reduction_size;
3340 Kokkos::parallel_reduce(
"Read single",
3341 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342 KOKKOS_LAMBDA(
int dummy,
size_t & set_single) {
3343 set_single = view_total_reduction_size(0);
3344 }, host_view_total_reduction_size);
3346 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347 host_view_total_reduction_size,
3348 host_total_part_weight_left_right_closests.data(),
3349 host_global_total_part_weight_left_right_closests.data());
3350 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351 host_global_total_part_weight_left_right_closests);
3354 local_global_total_part_weight_left_right_closests =
3355 this->total_part_weight_left_right_closests;
3360 mj_part_t cut_shift = 0;
3364 size_t tlr_shift = 0;
3366 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367 save_initial_incomplete_cut_count(
"save_initial_incomplete_cut_count",
3368 current_concurrent_num_parts);
3370 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3372 mj_part_t num_parts =
3373 host_num_partitioning_in_current_dim(current_work_part + kk);
3375 mj_part_t num_cuts = num_parts - 1;
3376 size_t num_total_part = num_parts + size_t (num_cuts);
3381 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3383 if(kk_incomplete_cut_count == 0) {
3384 cut_shift += num_cuts;
3385 tlr_shift += (num_total_part + 2 * num_cuts);
3389 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390 Kokkos::subview(this->total_part_weight_left_right_closests,
3391 std::pair<mj_lno_t, mj_lno_t>(
3393 this->total_part_weight_left_right_closests.size()));
3395 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3397 local_global_total_part_weight_left_right_closests,
3398 std::pair<mj_lno_t, mj_lno_t>(
3400 local_global_total_part_weight_left_right_closests.size()));
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_global_left_closest_points =
3403 Kokkos::subview(current_global_tlr,
3404 std::pair<mj_lno_t, mj_lno_t>(
3406 current_global_tlr.size()));
3407 Kokkos::View<mj_scalar_t *, device_t>
3408 current_global_right_closest_points =
3409 Kokkos::subview(current_global_tlr,
3410 std::pair<mj_lno_t, mj_lno_t>(
3411 num_total_part + num_cuts,
3412 current_global_tlr.size()));
3413 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3416 Kokkos::View<bool *, device_t> current_cut_line_determined =
3417 Kokkos::subview(this->is_cut_line_determined,
3418 std::pair<mj_lno_t, mj_lno_t>(
3420 this->is_cut_line_determined.size()));
3421 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422 Kokkos::subview(local_target_part_weights,
3423 std::pair<mj_lno_t, mj_lno_t>(
3425 local_target_part_weights.size()));
3426 Kokkos::View<mj_scalar_t *, device_t>
3427 current_part_cut_line_weight_to_put_left =
3428 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429 std::pair<mj_lno_t, mj_lno_t>(
3431 local_process_cut_line_weight_to_put_left.size()));
3433 save_initial_incomplete_cut_count(kk) =
3434 kk_incomplete_cut_count;
3436 Kokkos::View<mj_scalar_t *, device_t>
3437 current_cut_lower_bound_weights =
3438 Kokkos::subview(local_cut_lower_bound_weights,
3439 std::pair<mj_lno_t, mj_lno_t>(
3441 local_cut_lower_bound_weights.size()));
3442 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443 Kokkos::subview(local_cut_upper_bound_weights,
3444 std::pair<mj_lno_t, mj_lno_t>(
3446 local_cut_upper_bound_weights.size()));
3447 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448 Kokkos::subview(local_cut_upper_bound_coordinates,
3449 std::pair<mj_lno_t, mj_lno_t>(
3451 local_cut_upper_bound_coordinates.size()));
3452 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453 Kokkos::subview(local_cut_lower_bound_coordinates,
3454 std::pair<mj_lno_t, mj_lno_t>(
3456 local_cut_lower_bound_coordinates.size()));
3459 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460 Kokkos::subview(this->temp_cut_coords,
3461 std::pair<mj_lno_t, mj_lno_t>(
3462 cut_shift, this->temp_cut_coords.size()));
3463 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464 Kokkos::subview(this->cut_coordinates_work_array,
3465 std::pair<mj_lno_t, mj_lno_t>(
3466 cut_shift, this->cut_coordinates_work_array.size()));
3468 this->mj_get_new_cut_coordinates(
3469 current_concurrent_num_parts,
3472 used_imbalance_tolerance,
3473 current_global_part_weights,
3474 current_local_part_weights,
3475 current_part_target_weights,
3476 current_cut_line_determined,
3477 sub_temp_cut_coords,
3478 current_cut_upper_bounds,
3479 current_cut_lower_bounds,
3480 current_global_left_closest_points,
3481 current_global_right_closest_points,
3482 current_cut_lower_bound_weights,
3483 current_cut_upper_weights,
3484 sub_cut_coordinates_work_array,
3485 current_part_cut_line_weight_to_put_left,
3486 view_rectilinear_cut_count);
3488 cut_shift += num_cuts;
3489 tlr_shift += (num_total_part + 2 * num_cuts);
3492 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493 mj_part_t iteration_complete_cut_count =
3494 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495 total_incomplete_cut_count -= iteration_complete_cut_count;
3498 Kokkos::parallel_for(
3499 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(
int n) {
3501 auto t = local_temp_cut_coords(n);
3502 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503 local_cut_coordinates_work_array(n) = t;
3511 if(current_cut_coordinates != local_temp_cut_coords) {
3512 Kokkos::parallel_for(
3513 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514 (0, 1), KOKKOS_LAMBDA(
int dummy) {
3516 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517 mj_part_t num_parts = -1;
3518 num_parts = local_device_num_partitioning_in_current_dim(
3519 current_work_part + i);
3520 mj_part_t num_cuts = num_parts - 1;
3521 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3527 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3536 template<
class scalar_t>
3542 KOKKOS_INLINE_FUNCTION
3545 KOKKOS_INLINE_FUNCTION
3554 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) 3556 template<
class policy_t,
class scalar_t,
class part_t>
3567 scalar_t mj_max_scalar,
3569 int mj_value_count_rightleft,
3570 int mj_value_count_weights) :
3577 KOKKOS_INLINE_FUNCTION
3582 KOKKOS_INLINE_FUNCTION
3585 dst.
ptr[n] += src.
ptr[n];
3590 if(src.
ptr[n] > dst.
ptr[n]) {
3593 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3594 dst.
ptr[n+1] = src.
ptr[n+1];
3599 KOKKOS_INLINE_FUNCTION
3602 dst.
ptr[n] += src.
ptr[n];
3607 if(src.
ptr[n] > dst.
ptr[n]) {
3610 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3611 dst.
ptr[n+1] = src.
ptr[n+1];
3630 #endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP 3632 template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
3633 class device_t,
class array_t>
3638 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) 3661 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3662 Kokkos::View<double *, device_t> current_part_weights;
3663 Kokkos::View<scalar_t *, device_t> current_left_closest;
3664 Kokkos::View<scalar_t *, device_t> current_right_closest;
3665 #endif // KOKKOS_ENABLE_CUDA || defined(KOKKOS_ENABLE_HIP) 3669 array_t mj_max_scalar,
3670 part_t mj_concurrent_current_part,
3672 part_t mj_current_work_part,
3673 part_t mj_current_concurrent_num_parts,
3674 part_t mj_left_right_array_size,
3675 part_t mj_weight_array_size,
3676 Kokkos::View<index_t*, device_t> & mj_permutations,
3677 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3678 Kokkos::View<scalar_t**, device_t> & mj_weights,
3679 Kokkos::View<part_t*, device_t> & mj_parts,
3680 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3681 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3682 bool mj_uniform_weights0,
3683 scalar_t mj_sEpsilon
3684 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3685 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3686 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3687 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3698 value_count(mj_weight_array_size+mj_left_right_array_size),
3707 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3708 ,current_part_weights(mj_current_part_weights),
3709 current_left_closest(mj_current_left_closest),
3710 current_right_closest(mj_current_right_closest)
3716 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3717 int result =
sizeof(array_t) *
3720 int result =
sizeof(array_t) *
3725 int remainder = result % 8;
3726 if(remainder != 0) {
3727 result += 8 - remainder;
3732 KOKKOS_INLINE_FUNCTION
3733 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3743 index_t num_working_points = all_end - all_begin;
3744 int num_teams = teamMember.league_size();
3746 index_t stride = num_working_points / num_teams;
3747 if((num_working_points % num_teams) > 0) {
3754 index_t begin = all_begin + stride * teamMember.league_rank();
3755 index_t end = begin + stride;
3760 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3764 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3768 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3778 teamMember.team_barrier();
3780 Kokkos::parallel_for(
3781 Kokkos::TeamThreadRange(teamMember, begin, end),
3783 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 3788 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3801 Kokkos::parallel_reduce(
3802 Kokkos::TeamThreadRange(teamMember, begin, end),
3804 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 3811 index_t part =
parts(i)/2;
3822 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3823 Kokkos::atomic_add(&shared_ptr[part*2], w);
3825 threadSum.
ptr[part*2] += w;
3831 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3832 array_t new_value = (array_t) coord;
3834 while(new_value < prev_value) {
3835 prev_value = Kokkos::atomic_compare_exchange(
3837 prev_value, new_value);
3840 while(new_value > prev_value) {
3841 prev_value = Kokkos::atomic_compare_exchange(
3843 prev_value, new_value);
3861 if(coord < b + sEpsilon && coord > b -
sEpsilon) {
3865 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3866 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3870 threadSum.
ptr[part*2+1] += w;
3875 parts(i) = part*2+1;
3886 scalar_t delta = b - base_coord;
3887 if(delta < 0) delta = -delta;
3892 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3893 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3897 threadSum.
ptr[part*2+1] += w;
3908 scalar_t delta = b - base_coord;
3909 if(delta < 0) delta = -delta;
3914 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3915 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3919 threadSum.
ptr[part*2+1] += w;
3944 if(part == lower + 1) {
3949 part -= (part - lower)/2;
3952 else if(part == upper - 1) {
3957 part += (upper - part)/2;
3961 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3963 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 3964 }, arraySumReducer);
3965 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 3967 teamMember.team_barrier();
3970 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3972 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3973 Kokkos::atomic_add(¤t_part_weights(n),
3974 static_cast<double>(shared_ptr[n]));
3975 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 3976 teamSum[n] += array.ptr[n];
3977 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 3980 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3981 int insert_left = 0;
3982 int insert_right = 0;
3987 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 3988 scalar_t new_value = shared_ptr[n+1];
3989 scalar_t prev_value = current_right_closest(insert_right);
3990 while(new_value < prev_value) {
3991 prev_value = Kokkos::atomic_compare_exchange(
3992 ¤t_right_closest(insert_right), prev_value, new_value);
3995 new_value = shared_ptr[n];
3996 prev_value = current_left_closest(insert_left);
3997 while(new_value > prev_value) {
3998 prev_value = Kokkos::atomic_compare_exchange(
3999 ¤t_left_closest(insert_left), prev_value, new_value);
4004 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 4005 if(array.ptr[n] > teamSum[n]) {
4006 teamSum[n] = array.ptr[n];
4008 if(array.ptr[n+1] < teamSum[n+1]) {
4009 teamSum[n+1] = array.ptr[n+1];
4011 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 4015 teamMember.team_barrier();
4018 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) 4019 KOKKOS_INLINE_FUNCTION
4027 if(src[n] > dst[n]) {
4030 if(src[n+1] < dst[n+1]) {
4031 dst[n+1] = src[n+1];
4036 KOKKOS_INLINE_FUNCTION
4044 if(src[n] > dst[n]) {
4047 if(src[n+1] < dst[n+1]) {
4048 dst[n+1] = src[n+1];
4064 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 4074 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4075 typename mj_part_t,
typename mj_node_t>
4076 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4077 mj_1D_part_get_part_weights(
4080 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4083 auto local_is_cut_line_determined = is_cut_line_determined;
4084 auto local_thread_part_weights = thread_part_weights;
4085 auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4086 auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4090 auto local_sEpsilon = this->
sEpsilon;
4091 auto local_assigned_part_ids = this->assigned_part_ids;
4092 auto local_coordinate_permutations = this->coordinate_permutations;
4093 auto local_mj_weights = this->mj_weights;
4095 auto local_global_min_max_coord_total_weight =
4096 this->global_min_max_coord_total_weight;
4098 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4100 auto local_device_num_partitioning_in_current_dim =
4101 device_num_partitioning_in_current_dim;
4103 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4104 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4106 mj_part_t total_part_shift = 0;
4108 mj_part_t concurrent_cut_shifts = 0;
4110 Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4111 Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4112 concurrent_cut_shifts, temp_cut_coords.size()));
4114 mj_part_t num_parts =
4116 mj_part_t
num_cuts = num_parts - 1;
4117 mj_part_t total_part_count = num_parts +
num_cuts;
4118 mj_part_t weight_array_length =
num_cuts + num_parts;
4121 mj_part_t right_left_array_length = (
num_cuts + 2) * 2;
4123 if(this->incomplete_cut_count(kk) == 0) {
4124 total_part_shift += total_part_count;
4130 auto policy_ReduceWeightsFunctor = policy_t(
4131 mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4133 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) 4134 int total_array_length =
4135 weight_array_length + right_left_array_length;
4142 typedef mj_scalar_t array_t;
4144 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) 4145 array_t * reduce_array =
4146 new array_t[
static_cast<size_t>(total_array_length)];
4147 #endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP 4149 int offset_cuts = 0;
4150 for(
int kk2 = 0; kk2 < kk; ++kk2) {
4154 Kokkos::View<double *, device_t> my_current_part_weights =
4155 Kokkos::subview(local_thread_part_weights,
4156 std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4157 total_part_shift + total_part_count));
4158 Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4159 Kokkos::subview(local_thread_cut_left_closest_point,
4160 std::pair<mj_lno_t, mj_lno_t>(
4162 local_thread_cut_left_closest_point.size()));
4163 Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4164 Kokkos::subview(local_thread_cut_right_closest_point,
4165 std::pair<mj_lno_t, mj_lno_t>(
4167 local_thread_cut_right_closest_point.size()));
4169 array_t
max_scalar = std::numeric_limits<array_t>::max();
4171 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4173 Kokkos::parallel_for(
4174 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4175 KOKKOS_LAMBDA (
int dummy) {
4176 for(
int n = 0; n < weight_array_length; ++n) {
4177 my_current_part_weights(n) = 0;
4179 for(
int n = 0; n <
num_cuts; ++n) {
4190 typename mj_node_t::device_type, array_t>
4198 right_left_array_length,
4199 weight_array_length,
4200 coordinate_permutations,
4201 mj_current_dim_coords,
4204 local_temp_cut_coords,
4206 mj_uniform_weights(0),
4208 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4209 ,my_current_part_weights,
4210 my_current_left_closest,
4211 my_current_right_closest
4215 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4216 Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4218 Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4219 teamFunctor, reduce_array);
4222 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) 4223 auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4225 for(
int i = 0; i < static_cast<int>(total_part_count); ++i) {
4226 hostArray(i) = reduce_array[i];
4229 Kokkos::deep_copy(my_current_part_weights, hostArray);
4231 auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4232 auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
4233 for(mj_part_t cut = 0; cut <
num_cuts; ++cut) {
4234 hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4235 hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4237 Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4238 Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4240 delete [] reduce_array;
4243 total_part_shift += total_part_count;
4247 auto local_temp_cut_coords = temp_cut_coords;
4249 Kokkos::parallel_for(
4250 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4252 mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4254 mj_part_t
num_cuts = num_parts - 1;
4255 mj_part_t total_part_count = num_parts +
num_cuts;
4257 if(local_device_incomplete_cut_count(kk) > 0) {
4261 size_t offset_cuts = 0;
4262 for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4263 auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4265 offset += num_parts_kk2 * 2 - 1;
4266 offset_cuts += num_parts_kk2 - 1;
4269 for(mj_part_t i = 1; i < total_part_count; ++i) {
4273 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4274 std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4275 local_temp_cut_coords(offset_cuts + i /2 - 1))
4280 local_thread_part_weights(offset + i)
4281 = local_thread_part_weights(offset + i-2);
4286 local_thread_part_weights(offset + i) +=
4287 local_thread_part_weights(offset + i-1);
4300 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4301 typename mj_part_t,
typename mj_node_t>
4302 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4303 mj_combine_rightleft_and_weights(
4307 auto local_thread_part_weights = this->thread_part_weights;
4308 auto local_is_cut_line_determined = this->is_cut_line_determined;
4309 auto local_thread_cut_left_closest_point =
4310 this->thread_cut_left_closest_point;
4311 auto local_thread_cut_right_closest_point =
4312 this->thread_cut_right_closest_point;
4313 auto local_total_part_weight_left_right_closests =
4314 this->total_part_weight_left_right_closests;
4315 auto local_device_num_partitioning_in_current_dim =
4316 device_num_partitioning_in_current_dim;
4317 Kokkos::parallel_for(
4318 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4319 KOKKOS_LAMBDA (
int dummy) {
4321 size_t tlr_array_shift = 0;
4322 mj_part_t cut_shift = 0;
4323 size_t total_part_array_shift = 0;
4329 mj_part_t num_parts_in_part =
4331 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4332 size_t num_total_part_in_part =
4333 num_parts_in_part + size_t (num_cuts_in_part);
4336 for(
int ii = 0; ii < num_cuts_in_part; ++ii) {
4337 mj_part_t next = tlr_array_shift + ii;
4338 mj_part_t cut_index = cut_shift + ii;
4340 if(!local_is_cut_line_determined(cut_index)) {
4341 mj_scalar_t left_closest_in_process =
4342 local_thread_cut_left_closest_point(cut_index);
4343 mj_scalar_t right_closest_in_process =
4344 local_thread_cut_right_closest_point(cut_index);
4347 local_total_part_weight_left_right_closests(
4348 num_total_part_in_part + next) = left_closest_in_process;
4350 local_total_part_weight_left_right_closests(
4351 num_total_part_in_part + num_cuts_in_part + next) =
4352 right_closest_in_process;
4356 for(
size_t j = 0; j < num_total_part_in_part; ++j) {
4357 mj_part_t cut_ind = j / 2 + cut_shift;
4363 if(j == num_total_part_in_part - 1 ||
4364 !local_is_cut_line_determined(cut_ind)) {
4365 double pwj = local_thread_part_weights(total_part_array_shift + j);
4366 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4371 cut_shift += num_cuts_in_part;
4372 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4373 total_part_array_shift += num_total_part_in_part;
4390 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4391 typename mj_part_t,
typename mj_node_t>
4392 KOKKOS_INLINE_FUNCTION
4393 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4394 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4395 mj_scalar_t cut_lower_bound,
4396 mj_scalar_t cut_upper_weight,
4397 mj_scalar_t cut_lower_weight,
4398 mj_scalar_t expected_weight,
4399 mj_scalar_t &new_cut_position,
4402 if(std::abs(cut_upper_bound - cut_lower_bound) <
sEpsilon) {
4403 new_cut_position = cut_upper_bound;
4406 if(std::abs(cut_upper_weight - cut_lower_weight) <
sEpsilon) {
4407 new_cut_position = cut_lower_bound;
4410 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4411 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4412 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4414 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4415 int scale_constant = 20;
4416 int shiftint= int (required_shift * scale_constant);
4417 if(shiftint == 0) shiftint = 1;
4418 required_shift = mj_scalar_t (shiftint) / scale_constant;
4419 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4422 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) 4424 template<
class policy_t,
class scalar_t>
4434 int mj_value_count) :
4439 KOKKOS_INLINE_FUNCTION
4444 KOKKOS_INLINE_FUNCTION
4447 dst.
ptr[n] += src.
ptr[n];
4451 KOKKOS_INLINE_FUNCTION
4454 dst.
ptr[n] += src.
ptr[n];
4468 template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
4469 class device_t,
class array_t>
4474 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) 4486 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4487 Kokkos::View<int *, device_t> local_point_counts;
4488 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 4491 part_t mj_concurrent_current_part,
4492 part_t mj_weight_array_size,
4493 Kokkos::View<index_t*, device_t> & mj_permutations,
4494 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4495 Kokkos::View<part_t*, device_t> & mj_parts,
4496 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4497 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4498 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4499 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4508 track_on_cuts(mj_track_on_cuts)
4509 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4510 ,local_point_counts(mj_local_point_counts)
4516 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4519 int result =
sizeof(array_t) * (
value_count) * team_size;
4523 int remainder = result % 8;
4524 if(remainder != 0) {
4525 result += 8 - remainder;
4530 KOKKOS_INLINE_FUNCTION
4531 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4540 index_t num_working_points = all_end - all_begin;
4541 int num_teams = teamMember.league_size();
4543 index_t stride = num_working_points / num_teams;
4544 if((num_working_points % num_teams) > 0) {
4548 index_t begin = all_begin + stride * teamMember.league_rank();
4549 index_t end = begin + stride;
4554 int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4557 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4558 size_t sh_mem_size =
sizeof(array_t) * (
value_count);
4560 size_t sh_mem_size =
4561 sizeof(array_t) * (
value_count) * teamMember.team_size();
4564 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4567 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4569 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4574 teamMember.team_barrier();
4576 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4578 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 4586 Kokkos::parallel_reduce(
4587 Kokkos::TeamThreadRange(teamMember, begin, end),
4589 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 4594 if(place % 2 == 0) {
4595 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4596 Kokkos::atomic_add(&shared_ptr[part], 1);
4598 threadSum.
ptr[part] += 1;
4601 parts(coordinate_index) = part;
4606 index_t set_index = Kokkos::atomic_fetch_add(
4607 &track_on_cuts(track_on_cuts_insert_index), 1);
4608 track_on_cuts(set_index) = ii;
4610 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4612 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 4614 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 4616 teamMember.team_barrier();
4619 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4621 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4622 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4623 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 4624 teamSum[n] += array.ptr[n];
4625 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP 4629 teamMember.team_barrier();
4632 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) 4634 KOKKOS_INLINE_FUNCTION
4641 KOKKOS_INLINE_FUNCTION
4671 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4672 typename mj_part_t,
typename mj_node_t>
4673 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4674 mj_create_new_partitions(
4675 mj_part_t num_parts,
4676 mj_part_t current_concurrent_work_part,
4677 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4678 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4679 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4680 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4683 auto local_thread_part_weight_work = this->thread_part_weight_work;
4684 auto local_point_counts = this->thread_point_counts;
4685 auto local_distribute_points_on_cut_lines =
4686 this->distribute_points_on_cut_lines;
4687 auto local_thread_cut_line_weight_to_put_left =
4688 this->thread_cut_line_weight_to_put_left;
4689 auto local_sEpsilon = this->
sEpsilon;
4690 auto local_coordinate_permutations = this->coordinate_permutations;
4691 auto local_mj_weights = this->mj_weights;
4692 auto local_assigned_part_ids = this->assigned_part_ids;
4693 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4695 mj_part_t
num_cuts = num_parts - 1;
4697 Kokkos::parallel_for(
4698 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4699 KOKKOS_LAMBDA(
int dummy) {
4701 if(local_distribute_points_on_cut_lines) {
4702 for(
int i = 0; i <
num_cuts; ++i) {
4703 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4704 if(left_weight > local_sEpsilon) {
4706 mj_scalar_t thread_ii_weight_on_cut =
4707 local_thread_part_weight_work(i * 2 + 1) -
4708 local_thread_part_weight_work(i * 2);
4710 if(thread_ii_weight_on_cut < left_weight) {
4712 local_thread_cut_line_weight_to_put_left(i) =
4713 thread_ii_weight_on_cut;
4717 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4719 left_weight -= thread_ii_weight_on_cut;
4722 local_thread_cut_line_weight_to_put_left(i) = 0;
4728 for(mj_part_t i =
num_cuts - 1; i > 0 ; --i) {
4729 if(std::abs(current_concurrent_cut_coordinate(i) -
4730 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4731 local_thread_cut_line_weight_to_put_left(i) -=
4732 local_thread_cut_line_weight_to_put_left(i - 1);
4734 local_thread_cut_line_weight_to_put_left(i) =
4735 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4736 least_signifiance) * significance_mul) /
4737 static_cast<mj_scalar_t
>(significance_mul);
4741 for(mj_part_t i = 0; i < num_parts; ++i) {
4742 local_point_counts(i) = 0;
4746 mj_lno_t coordinate_begin_index =
4747 current_concurrent_work_part == 0 ? 0 :
4748 host_part_xadj(current_concurrent_work_part - 1);
4749 mj_lno_t coordinate_end_index =
4750 host_part_xadj(current_concurrent_work_part);
4752 mj_lno_t total_on_cut;
4753 Kokkos::parallel_reduce(
"Get total_on_cut",
4754 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4755 coordinate_begin_index, coordinate_end_index),
4756 KOKKOS_LAMBDA(
int ii, mj_lno_t & val) {
4757 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4758 mj_part_t coordinate_assigned_place =
4759 local_assigned_part_ids(coordinate_index);
4760 if(coordinate_assigned_place % 2 == 1) {
4765 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4766 if(total_on_cut > 0) {
4767 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4778 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4781 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4783 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4784 typedef int array_t;
4788 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) 4789 array_t * reduce_array =
new array_t[
static_cast<size_t>(num_parts)];
4792 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4793 typename mj_node_t::device_type, array_t>teamFunctor(
4794 current_concurrent_work_part,
4796 coordinate_permutations,
4797 mj_current_dim_coords,
4801 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4806 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4807 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4809 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4812 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) 4813 for(mj_part_t part = 0; part < num_parts; ++part) {
4814 local_point_counts(part) = reduce_array[part];
4816 delete [] reduce_array;
4821 if(track_on_cuts.size() > 0) {
4822 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4823 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1));
4824 Kokkos::sort(track_on_cuts_sort);
4828 Kokkos::parallel_for(
4829 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4830 KOKKOS_LAMBDA (
int dummy) {
4832 for(
int j = 0; j < total_on_cut; ++j) {
4833 int ii = track_on_cuts(j);
4834 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4836 local_mj_weights(coordinate_index,0);
4837 mj_part_t coordinate_assigned_place =
4838 local_assigned_part_ids(coordinate_index);
4839 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4841 if(local_distribute_points_on_cut_lines &&
4842 local_thread_cut_line_weight_to_put_left(
4843 coordinate_assigned_part) > local_sEpsilon) {
4847 local_thread_cut_line_weight_to_put_left(
4848 coordinate_assigned_part) -= coordinate_weight;
4853 if(local_thread_cut_line_weight_to_put_left(
4854 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4856 std::abs(current_concurrent_cut_coordinate(
4857 coordinate_assigned_part+1) -
4858 current_concurrent_cut_coordinate(
4859 coordinate_assigned_part)) < local_sEpsilon)
4861 local_thread_cut_line_weight_to_put_left(
4862 coordinate_assigned_part + 1) +=
4863 local_thread_cut_line_weight_to_put_left(
4864 coordinate_assigned_part);
4866 ++local_point_counts(coordinate_assigned_part);
4867 local_assigned_part_ids(coordinate_index) =
4868 coordinate_assigned_part;
4873 ++coordinate_assigned_part;
4876 while(local_distribute_points_on_cut_lines &&
4877 coordinate_assigned_part <
num_cuts)
4880 if(std::abs(current_concurrent_cut_coordinate(
4881 coordinate_assigned_part) -
4882 current_concurrent_cut_coordinate(
4883 coordinate_assigned_part - 1)) < local_sEpsilon)
4886 if(local_thread_cut_line_weight_to_put_left(
4887 coordinate_assigned_part) > local_sEpsilon &&
4888 local_thread_cut_line_weight_to_put_left(
4889 coordinate_assigned_part) >=
4890 std::abs(local_thread_cut_line_weight_to_put_left(
4891 coordinate_assigned_part) - coordinate_weight))
4893 local_thread_cut_line_weight_to_put_left(
4894 coordinate_assigned_part) -= coordinate_weight;
4898 if(local_thread_cut_line_weight_to_put_left(
4899 coordinate_assigned_part) < 0 &&
4900 coordinate_assigned_part <
num_cuts - 1 &&
4901 std::abs(current_concurrent_cut_coordinate(
4902 coordinate_assigned_part+1) -
4903 current_concurrent_cut_coordinate(
4904 coordinate_assigned_part)) < local_sEpsilon)
4906 local_thread_cut_line_weight_to_put_left(
4907 coordinate_assigned_part + 1) +=
4908 local_thread_cut_line_weight_to_put_left(
4909 coordinate_assigned_part);
4917 ++coordinate_assigned_part;
4919 local_point_counts(coordinate_assigned_part) += 1;
4920 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4924 for(
int j = 0; j < num_parts; ++j) {
4925 out_part_xadj(j) = local_point_counts(j);
4926 local_point_counts(j) = 0;
4929 out_part_xadj(j) += out_part_xadj(j - 1);
4930 local_point_counts(j) += out_part_xadj(j - 1);
4938 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) 4943 Kokkos::parallel_for(
4944 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4945 coordinate_begin_index, coordinate_end_index),
4946 KOKKOS_LAMBDA (mj_lno_t ii) {
4947 mj_lno_t i = local_coordinate_permutations(ii);
4948 mj_part_t p = local_assigned_part_ids(i);
4949 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4950 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4955 #ifdef KOKKOS_ENABLE_OPENMP 4957 const int num_threads = 1;
4959 const int num_threads = 1;
4962 const int num_teams = 1;
4965 Kokkos::View<mj_lno_t*, device_t>
4966 point_counter(
"insert indices", num_teams * num_threads * num_parts);
4970 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4971 block_policy(num_teams, num_threads);
4972 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4974 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4975 mj_lno_t block_size = range / num_teams + 1;
4976 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(
member_type team_member) {
4977 int team = team_member.league_rank();
4978 int team_offset = team * num_threads * num_parts;
4979 mj_lno_t begin = coordinate_begin_index + team * block_size;
4980 mj_lno_t end = begin + block_size;
4981 if(end > coordinate_end_index) {
4982 end = coordinate_end_index;
4985 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4987 int thread = team_member.team_rank();
4988 mj_lno_t i = local_coordinate_permutations(ii);
4989 mj_part_t p = local_assigned_part_ids(i);
4990 int index = team_offset + thread * num_parts + p;
4991 ++point_counter(index);
4999 Kokkos::parallel_for(
5000 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5001 KOKKOS_LAMBDA (
int dummy) {
5002 int num_sets = point_counter.size() / num_parts;
5003 for(
int set = num_sets - 1;
set >= 1;
set -=1) {
5004 int base =
set * num_parts;
5005 for(
int part = 0; part < num_parts; ++part) {
5006 point_counter(base + part) = point_counter(base + part - num_parts);
5010 for(
int part = 0; part < num_parts; ++part) {
5011 point_counter(part) = 0;
5014 for(
int set = 1;
set < num_sets; ++
set) {
5015 int base =
set * num_parts;
5016 for(
int part = 0; part < num_parts; ++part) {
5017 point_counter(base + part) += point_counter(base + part - num_parts);
5023 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(
member_type team_member) {
5024 int team = team_member.league_rank();
5025 int team_offset = team * num_threads * num_parts;
5026 mj_lno_t begin = coordinate_begin_index + team * block_size;
5027 mj_lno_t end = begin + block_size;
5028 if(end > coordinate_end_index) {
5029 end = coordinate_end_index;
5031 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
5033 int thread = team_member.team_rank();
5034 mj_lno_t i = local_coordinate_permutations(ii);
5035 mj_part_t p = local_assigned_part_ids(i);
5036 int index = team_offset + thread * num_parts + p;
5037 int set_counter = (point_counter(index)++) + local_point_counts(p);
5038 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5087 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5088 typename mj_part_t,
typename mj_node_t>
5089 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5090 mj_node_t>::mj_get_new_cut_coordinates(
5094 const double &used_imbalance_tolerance,
5095 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5096 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5097 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5098 Kokkos::View<bool *, device_t> & current_cut_line_determined,
5099 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5100 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5101 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5102 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5103 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5104 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5105 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5106 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5107 Kokkos::View<mj_scalar_t *, device_t> &
5108 current_part_cut_line_weight_to_put_left,
5109 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5111 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5113 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5115 auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5116 auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5117 auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5118 auto local_global_min_max_coord_total_weight =
5119 global_min_max_coord_total_weight;
5121 const auto _sEpsilon = this->
sEpsilon;
5129 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5130 policy_one_team(1, Kokkos::AUTO());
5131 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5133 Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(
member_type team_member) {
5135 mj_scalar_t min_coordinate =
5136 local_global_min_max_coord_total_weight(kk);
5137 mj_scalar_t max_coordinate =
5138 local_global_min_max_coord_total_weight(
5140 mj_scalar_t global_total_weight =
5141 local_global_min_max_coord_total_weight(
5144 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member,
num_cuts),
5149 current_global_left_closest_points(i) > local_sEpsilon) {
5150 current_global_left_closest_points(i) =
5151 current_cut_coordinates(i);
5153 if(current_global_right_closest_points(i) -
5154 max_coordinate > local_sEpsilon) {
5155 current_global_right_closest_points(i) =
5156 current_cut_coordinates(i);
5159 team_member.team_barrier();
5161 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member,
num_cuts),
5163 using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5166 mj_scalar_t seen_weight_in_part = 0;
5168 mj_scalar_t expected_weight_in_part = 0;
5170 double imbalance_on_left = 0, imbalance_on_right = 0;
5171 if(local_distribute_points_on_cut_lines) {
5173 local_global_rectilinear_cut_weight(i) = 0;
5174 local_process_rectilinear_cut_weight(i) = 0;
5176 bool bContinue =
false;
5179 if(current_cut_line_determined(i)) {
5180 new_current_cut_coordinates(i) =
5181 current_cut_coordinates(i);
5186 seen_weight_in_part = current_global_part_weights(i * 2);
5189 expected_weight_in_part = current_part_target_weights(i);
5192 imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5193 expected_weight_in_part);
5196 imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5197 seen_weight_in_part, global_total_weight - expected_weight_in_part);
5198 bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5199 used_imbalance_tolerance < local_sEpsilon ;
5200 bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5201 used_imbalance_tolerance < local_sEpsilon;
5203 if(is_left_imbalance_valid && is_right_imbalance_valid) {
5204 current_cut_line_determined(i) =
true;
5205 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5206 new_current_cut_coordinates(i) = current_cut_coordinates(i);
5208 else if(imbalance_on_left < 0) {
5210 if(local_distribute_points_on_cut_lines) {
5215 if(current_global_part_weights(i * 2 + 1) ==
5216 expected_weight_in_part) {
5218 current_cut_line_determined(i) =
true;
5219 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5222 new_current_cut_coordinates(i) =
5223 current_cut_coordinates(i);
5225 current_part_cut_line_weight_to_put_left(i) =
5226 current_local_part_weights(i * 2 + 1) -
5227 current_local_part_weights(i * 2);
5230 else if(current_global_part_weights(i * 2 + 1) >
5231 expected_weight_in_part) {
5234 current_cut_line_determined(i) =
true;
5235 Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5239 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5240 new_current_cut_coordinates(i) =
5241 current_cut_coordinates(i);
5242 local_process_rectilinear_cut_weight[i] =
5243 current_local_part_weights(i * 2 + 1) -
5244 current_local_part_weights(i * 2);
5253 current_cut_lower_bounds(i) =
5254 current_global_right_closest_points(i);
5257 current_cut_lower_bound_weights(i) = seen_weight_in_part;
5262 for(mj_part_t ii = i + 1; ii <
num_cuts ; ++ii) {
5263 mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5264 mj_scalar_t line_weight =
5265 current_global_part_weights(ii * 2 + 1);
5266 if(p_weight >= expected_weight_in_part) {
5271 if(p_weight == expected_weight_in_part) {
5272 current_cut_upper_bounds(i) =
5273 current_cut_coordinates(ii);
5274 current_cut_upper_weights(i) = p_weight;
5275 current_cut_lower_bounds(i) =
5276 current_cut_coordinates(ii);
5277 current_cut_lower_bound_weights(i) = p_weight;
5278 }
else if(p_weight < current_cut_upper_weights(i)) {
5281 current_cut_upper_bounds(i) =
5282 current_global_left_closest_points(ii);
5283 current_cut_upper_weights(i) = p_weight;
5289 if(line_weight >= expected_weight_in_part) {
5293 current_cut_upper_bounds(i) =
5294 current_cut_coordinates(ii);
5295 current_cut_upper_weights(i) = line_weight;
5296 current_cut_lower_bounds(i) =
5297 current_cut_coordinates(ii);
5298 current_cut_lower_bound_weights(i) = p_weight;
5303 if(p_weight <= expected_weight_in_part && p_weight >=
5304 current_cut_lower_bound_weights(i)) {
5305 current_cut_lower_bounds(i) =
5306 current_global_right_closest_points(ii);
5307 current_cut_lower_bound_weights(i) = p_weight;
5311 mj_scalar_t new_cut_position = 0;
5312 algMJ_t::mj_calculate_new_cut_position(
5313 current_cut_upper_bounds(i),
5314 current_cut_lower_bounds(i),
5315 current_cut_upper_weights(i),
5316 current_cut_lower_bound_weights(i),
5317 expected_weight_in_part, new_cut_position,
5322 if(std::abs(current_cut_coordinates(i) -
5323 new_cut_position) < local_sEpsilon) {
5324 current_cut_line_determined(i) =
true;
5325 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5328 new_current_cut_coordinates(i) =
5329 current_cut_coordinates(i);
5331 new_current_cut_coordinates(i) = new_cut_position;
5337 current_cut_upper_bounds(i) =
5338 current_global_left_closest_points(i);
5339 current_cut_upper_weights(i) =
5340 seen_weight_in_part;
5343 for(
int ii = i - 1; ii >= 0; --ii) {
5344 mj_scalar_t p_weight =
5345 current_global_part_weights(ii * 2);
5346 mj_scalar_t line_weight =
5347 current_global_part_weights(ii * 2 + 1);
5348 if(p_weight <= expected_weight_in_part) {
5349 if(p_weight == expected_weight_in_part) {
5352 current_cut_upper_bounds(i) =
5353 current_cut_coordinates(ii);
5354 current_cut_upper_weights(i) = p_weight;
5355 current_cut_lower_bounds(i) =
5356 current_cut_coordinates(ii);
5357 current_cut_lower_bound_weights(i) = p_weight;
5359 else if(p_weight > current_cut_lower_bound_weights(i)) {
5362 current_cut_lower_bounds(i) =
5363 current_global_right_closest_points(ii);
5364 current_cut_lower_bound_weights(i) = p_weight;
5370 if(line_weight > expected_weight_in_part) {
5371 current_cut_upper_bounds(i) =
5372 current_global_right_closest_points(ii);
5373 current_cut_upper_weights(i) = line_weight;
5383 if(p_weight >= expected_weight_in_part &&
5384 (p_weight < current_cut_upper_weights(i) ||
5385 (p_weight == current_cut_upper_weights(i) &&
5386 current_cut_upper_bounds(i) >
5387 current_global_left_closest_points(ii)))) {
5388 current_cut_upper_bounds(i) =
5389 current_global_left_closest_points(ii);
5390 current_cut_upper_weights(i) = p_weight;
5393 mj_scalar_t new_cut_position = 0;
5394 algMJ_t::mj_calculate_new_cut_position(
5395 current_cut_upper_bounds(i),
5396 current_cut_lower_bounds(i),
5397 current_cut_upper_weights(i),
5398 current_cut_lower_bound_weights(i),
5399 expected_weight_in_part,
5404 if(std::abs(current_cut_coordinates(i) -
5405 new_cut_position) < local_sEpsilon) {
5406 current_cut_line_determined(i) =
true;
5407 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5409 new_current_cut_coordinates(i) =
5410 current_cut_coordinates(i);
5412 new_current_cut_coordinates(i) =
5419 team_member.team_barrier();
5423 mj_part_t rectilinear_cut_count;
5424 Kokkos::parallel_reduce(
"Read bDoingWork",
5425 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5426 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
5427 set_single = view_rectilinear_cut_count(0);
5428 }, rectilinear_cut_count);
5430 if(rectilinear_cut_count > 0) {
5431 auto host_local_process_rectilinear_cut_weight =
5432 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5433 local_process_rectilinear_cut_weight);
5434 auto host_local_global_rectilinear_cut_weight =
5435 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5436 local_global_rectilinear_cut_weight);
5437 Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5438 local_process_rectilinear_cut_weight);
5439 Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5440 local_global_rectilinear_cut_weight);
5441 Teuchos::scan<int,mj_scalar_t>(
5442 *comm, Teuchos::REDUCE_SUM,
5444 host_local_process_rectilinear_cut_weight.data(),
5445 host_local_global_rectilinear_cut_weight.data());
5446 Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5447 host_local_process_rectilinear_cut_weight);
5448 Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5449 host_local_global_rectilinear_cut_weight);
5451 Kokkos::parallel_for(
"finish up mj_get_new_cut_coordinates",
5452 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5453 KOKKOS_LAMBDA(
int dummy) {
5454 for(mj_part_t i = 0; i <
num_cuts; ++i) {
5456 if(local_global_rectilinear_cut_weight(i) > 0) {
5458 mj_scalar_t expected_part_weight = current_part_target_weights(i);
5460 mj_scalar_t necessary_weight_on_line_for_left =
5461 expected_part_weight - current_global_part_weights(i * 2);
5464 mj_scalar_t my_weight_on_line =
5465 local_process_rectilinear_cut_weight(i);
5469 mj_scalar_t weight_on_line_upto_process_inclusive =
5470 local_global_rectilinear_cut_weight(i);
5474 mj_scalar_t space_to_put_left =
5475 necessary_weight_on_line_for_left -
5476 weight_on_line_upto_process_inclusive;
5479 mj_scalar_t space_left_to_me =
5480 space_to_put_left + my_weight_on_line;
5493 if(space_left_to_me < 0) {
5496 current_part_cut_line_weight_to_put_left(i) = 0;
5498 else if(space_left_to_me >= my_weight_on_line) {
5502 current_part_cut_line_weight_to_put_left(i) =
5509 current_part_cut_line_weight_to_put_left(i) =
5516 view_rectilinear_cut_count(0) = 0;
5520 Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5532 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5533 typename mj_part_t,
typename mj_node_t>
5534 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5535 get_processor_num_points_in_parts(
5536 mj_part_t num_procs,
5537 mj_part_t num_parts,
5538 mj_gno_t *&num_points_in_all_processor_parts)
5541 size_t allocation_size = num_parts * (num_procs + 1);
5548 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5549 new mj_gno_t[allocation_size];
5553 mj_gno_t *my_local_points_to_reduce_sum =
5554 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5558 mj_gno_t *my_local_point_counts_in_each_part =
5559 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5562 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5563 sizeof(mj_gno_t)*allocation_size);
5565 auto local_new_part_xadj = this->new_part_xadj;
5566 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5567 Kokkos::ViewAllocateWithoutInitializing(
"points per part"), num_parts);
5568 Kokkos::parallel_for(
"get vals on device",
5569 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5570 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5571 points_per_part(i) =
5572 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5574 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5575 Kokkos::deep_copy(host_points_per_part, points_per_part);
5576 for(
int i = 0; i < num_parts; ++i) {
5577 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5582 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5583 sizeof(mj_gno_t) * (num_parts) );
5590 reduceAll<int, mj_gno_t>(
5592 Teuchos::REDUCE_SUM,
5594 num_local_points_in_each_part_to_reduce_sum,
5595 num_points_in_all_processor_parts);
5599 delete [] num_local_points_in_each_part_to_reduce_sum;
5617 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5618 typename mj_part_t,
typename mj_node_t>
5619 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5620 mj_check_to_migrate(
5621 size_t migration_reduce_all_population,
5622 mj_lno_t num_coords_for_last_dim_part,
5623 mj_part_t num_procs,
5624 mj_part_t num_parts,
5625 mj_gno_t *num_points_in_all_processor_parts)
5628 if(migration_reduce_all_population > future_reduceall_cutoff) {
5633 if(num_coords_for_last_dim_part < min_work_last_dim) {
5638 if(this->check_migrate_avoid_migration_option == 0) {
5639 double global_imbalance = 0;
5641 size_t global_shift = num_procs * num_parts;
5643 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5644 for(mj_part_t i = 0; i < num_parts; ++i) {
5645 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5646 / double(num_procs);
5648 global_imbalance += std::abs(ideal_num -
5649 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5652 global_imbalance /= num_parts;
5653 global_imbalance /= num_procs;
5655 if(global_imbalance <= this->minimum_migration_imbalance) {
5681 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5682 typename mj_part_t,
typename mj_node_t>
5683 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5684 assign_send_destinations(
5685 mj_part_t num_parts,
5686 mj_part_t *part_assignment_proc_begin_indices,
5687 mj_part_t *processor_chains_in_parts,
5688 mj_lno_t *send_count_to_each_proc,
5689 int *coordinate_destinations) {
5691 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5692 deep_copy(host_new_part_xadj, this->new_part_xadj);
5694 auto host_new_coordinate_permutations =
5695 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5696 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5698 for(mj_part_t p = 0; p < num_parts; ++p) {
5699 mj_lno_t part_begin = 0;
5700 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5701 mj_lno_t part_end = host_new_part_xadj(p);
5703 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5705 mj_lno_t num_total_send = 0;
5706 for(mj_lno_t j=part_begin; j < part_end; j++) {
5707 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5708 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5712 part_assignment_proc_begin_indices[p] =
5713 processor_chains_in_parts[proc_to_sent];
5715 processor_chains_in_parts[proc_to_sent] = -1;
5717 proc_to_sent = part_assignment_proc_begin_indices[p];
5720 coordinate_destinations[local_ind] = proc_to_sent;
5746 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5747 typename mj_part_t,
typename mj_node_t>
5748 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5749 mj_assign_proc_to_parts(
5750 mj_gno_t * num_points_in_all_processor_parts,
5751 mj_part_t num_parts,
5752 mj_part_t num_procs,
5753 mj_lno_t *send_count_to_each_proc,
5754 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5755 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5756 mj_part_t &out_part_index,
5757 mj_part_t &output_part_numbering_begin_index,
5758 int * coordinate_destinations) {
5759 mj_gno_t *global_num_points_in_parts =
5760 num_points_in_all_processor_parts + num_procs * num_parts;
5761 mj_part_t *num_procs_assigned_to_each_part =
new mj_part_t[num_parts];
5764 bool did_i_find_my_group =
false;
5766 mj_part_t num_free_procs = num_procs;
5767 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5769 double max_imbalance_difference = 0;
5770 mj_part_t max_differing_part = 0;
5773 for(mj_part_t i = 0; i < num_parts; i++) {
5776 double scalar_required_proc = num_procs *
5777 (double (global_num_points_in_parts[i]) /
5778 double (this->num_global_coords));
5781 mj_part_t required_proc =
5782 static_cast<mj_part_t
> (0.5 + scalar_required_proc);
5783 if(required_proc == 0) required_proc = 1;
5789 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5790 required_proc = num_free_procs -
5791 (minimum_num_procs_required_for_rest_of_parts);
5795 num_free_procs -= required_proc;
5799 --minimum_num_procs_required_for_rest_of_parts;
5802 num_procs_assigned_to_each_part[i] = required_proc;
5807 double imbalance_wrt_ideal =
5808 (scalar_required_proc - required_proc) / required_proc;
5809 if(imbalance_wrt_ideal > max_imbalance_difference) {
5810 max_imbalance_difference = imbalance_wrt_ideal;
5811 max_differing_part = i;
5817 if(num_free_procs > 0) {
5818 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5825 mj_part_t *part_assignment_proc_begin_indices =
new mj_part_t[num_parts];
5829 mj_part_t *processor_chains_in_parts =
new mj_part_t [num_procs];
5830 mj_part_t *processor_part_assignments =
new mj_part_t[num_procs];
5839 for(
int i = 0; i < num_procs; ++i ) {
5840 processor_part_assignments[i] = -1;
5841 processor_chains_in_parts[i] = -1;
5843 for(
int i = 0; i < num_parts; ++i ) {
5844 part_assignment_proc_begin_indices[i] = -1;
5850 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5851 sort_item_num_part_points_in_procs =
5852 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5854 for(mj_part_t i = 0; i < num_parts; ++i) {
5859 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5860 sort_item_num_part_points_in_procs[ii].id = ii;
5863 if(processor_part_assignments[ii] == -1) {
5864 sort_item_num_part_points_in_procs[ii].val =
5865 num_points_in_all_processor_parts[ii * num_parts + i];
5867 sort_item_num_part_points_in_procs[ii].signbit = 1;
5880 sort_item_num_part_points_in_procs[ii].val =
5881 num_points_in_all_processor_parts[ii * num_parts + i];
5882 sort_item_num_part_points_in_procs[ii].signbit = 0;
5887 uqSignsort<mj_part_t, mj_gno_t,char>
5888 (num_procs, sort_item_num_part_points_in_procs);
5900 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5901 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5902 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5903 ceil(total_num_points_in_part /
double (required_proc_count)));
5906 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5907 mj_part_t next_proc_to_send_id =
5908 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5909 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5910 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5914 for(mj_part_t ii = num_procs - 1;
5915 ii >= num_procs - required_proc_count; --ii) {
5916 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5918 processor_part_assignments[proc_id] = i;
5921 bool did_change_sign =
false;
5923 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5926 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5927 did_change_sign =
true;
5928 sort_item_num_part_points_in_procs[ii].signbit = 1;
5935 if(did_change_sign) {
5938 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5939 sort_item_num_part_points_in_procs);
5954 if(!did_i_find_my_group) {
5955 for(mj_part_t ii = num_procs - 1; ii >=
5956 num_procs - required_proc_count; --ii) {
5958 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5961 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5963 if(proc_id_to_assign == this->myRank) {
5965 did_i_find_my_group =
true;
5968 part_assignment_proc_begin_indices[i] = this->myRank;
5969 processor_chains_in_parts[this->myRank] = -1;
5973 send_count_to_each_proc[this->myRank] =
5974 sort_item_num_part_points_in_procs[ii].val;
5978 for(mj_part_t in = 0; in < i; ++in) {
5979 output_part_numbering_begin_index +=
5980 (*next_future_num_parts_in_parts)[in];
5988 if(!did_i_find_my_group) {
5989 processor_ranks_for_subcomm.clear();
5997 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5998 mj_part_t nonassigned_proc_id =
5999 sort_item_num_part_points_in_procs[ii].id;
6000 mj_lno_t num_points_to_sent =
6001 sort_item_num_part_points_in_procs[ii].val;
6007 if(num_points_to_sent < 0) {
6008 cout <<
"Migration - processor assignments - for part:" << i
6009 <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:" 6010 << num_points_to_sent << std::endl;
6015 switch (migration_type) {
6019 while (num_points_to_sent > 0) {
6021 if(num_points_to_sent <= space_left_in_sent_proc) {
6023 space_left_in_sent_proc -= num_points_to_sent;
6025 if(this->myRank == nonassigned_proc_id) {
6027 send_count_to_each_proc[next_proc_to_send_id] =
6032 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6033 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6034 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6036 num_points_to_sent = 0;
6040 if(space_left_in_sent_proc > 0) {
6041 num_points_to_sent -= space_left_in_sent_proc;
6044 if(this->myRank == nonassigned_proc_id) {
6046 send_count_to_each_proc[next_proc_to_send_id] =
6047 space_left_in_sent_proc;
6048 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6049 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6050 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6054 ++next_proc_to_send_index;
6057 if(next_part_to_send_index < nprocs - required_proc_count ) {
6058 cout <<
"Migration - processor assignments - for part:" 6060 <<
" next_part_to_send :" << next_part_to_send_index
6061 <<
" nprocs:" << nprocs
6062 <<
" required_proc_count:" << required_proc_count
6063 <<
" Error: next_part_to_send_index <" <<
6064 <<
" nprocs - required_proc_count" << std::endl;
6069 next_proc_to_send_id =
6070 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6072 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6073 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6084 if(this->myRank == nonassigned_proc_id) {
6086 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6090 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6091 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6092 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6094 num_points_to_sent = 0;
6095 ++next_proc_to_send_index;
6099 if(next_proc_to_send_index == num_procs) {
6100 next_proc_to_send_index = num_procs - required_proc_count;
6103 next_proc_to_send_id =
6104 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6106 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6107 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6120 this->assign_send_destinations(
6122 part_assignment_proc_begin_indices,
6123 processor_chains_in_parts,
6124 send_count_to_each_proc,
6125 coordinate_destinations);
6126 delete [] part_assignment_proc_begin_indices;
6127 delete [] processor_chains_in_parts;
6128 delete [] processor_part_assignments;
6129 delete [] sort_item_num_part_points_in_procs;
6130 delete [] num_procs_assigned_to_each_part;
6148 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6149 typename mj_part_t,
typename mj_node_t>
6150 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6151 assign_send_destinations2(
6152 mj_part_t num_parts,
6153 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6154 int *coordinate_destinations,
6155 mj_part_t &output_part_numbering_begin_index,
6156 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6158 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6159 mj_part_t previous_processor = -1;
6161 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6162 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6164 auto local_new_coordinate_permutations =
6165 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6166 Kokkos::deep_copy(local_new_coordinate_permutations,
6167 this->new_coordinate_permutations);
6169 for(mj_part_t i = 0; i < num_parts; ++i) {
6170 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6173 mj_lno_t part_begin_index = 0;
6176 part_begin_index = local_new_part_xadj(p - 1);
6179 mj_lno_t part_end_index = local_new_part_xadj(p);
6181 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6182 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6183 output_part_numbering_begin_index = part_shift_amount;
6185 previous_processor = assigned_proc;
6186 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6188 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6189 mj_lno_t localInd = local_new_coordinate_permutations(j);
6190 coordinate_destinations[localInd] = assigned_proc;
6216 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6217 typename mj_part_t,
typename mj_node_t>
6218 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6219 mj_assign_parts_to_procs(
6220 mj_gno_t * num_points_in_all_processor_parts,
6221 mj_part_t num_parts,
6222 mj_part_t num_procs,
6223 mj_lno_t *send_count_to_each_proc,
6224 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6225 mj_part_t &out_num_part,
6226 std::vector<mj_part_t> &out_part_indices,
6227 mj_part_t &output_part_numbering_begin_index,
6228 int *coordinate_destinations) {
6231 mj_gno_t *global_num_points_in_parts =
6232 num_points_in_all_processor_parts + num_procs * num_parts;
6233 out_part_indices.clear();
6237 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6238 new uSortItem<mj_part_t, mj_part_t>[num_parts];
6239 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6240 new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6244 mj_lno_t work_each =
6245 mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
6249 mj_lno_t *space_in_each_processor =
new mj_lno_t[num_procs];
6252 for(mj_part_t i = 0; i < num_procs; ++i) {
6253 space_in_each_processor[i] = work_each;
6260 mj_part_t *num_parts_proc_assigned =
new mj_part_t[num_procs];
6261 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
6262 int empty_proc_count = num_procs;
6266 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6267 new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6272 for(mj_part_t i = 0; i < num_parts; ++i) {
6273 sort_item_point_counts_in_parts[i].id = i;
6274 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6278 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6283 for(mj_part_t j = 0; j < num_parts; ++j) {
6285 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6288 mj_gno_t load = global_num_points_in_parts[i];
6291 mj_part_t assigned_proc = -1;
6294 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6295 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6300 if(empty_proc_count < num_parts - j ||
6301 num_parts_proc_assigned[ii] == 0) {
6303 sort_item_num_points_of_proc_in_part_i[ii].val =
6304 num_points_in_all_processor_parts[ii * num_parts + i];
6307 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6311 uqsort<mj_part_t, mj_gno_t>(num_procs,
6312 sort_item_num_points_of_proc_in_part_i);
6315 for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6316 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6317 if(assigned_proc == -1 ||
6318 (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6321 else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6322 if(ii < assigned_proc) {
6338 if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6342 space_in_each_processor[assigned_proc] -= load;
6344 sort_item_part_to_proc_assignment[j].id = i;
6347 sort_item_part_to_proc_assignment[j].val = assigned_proc;
6350 if(assigned_proc == this->myRank) {
6352 out_part_indices.push_back(i);
6358 send_count_to_each_proc[assigned_proc] +=
6359 num_points_in_all_processor_parts[this->myRank * num_parts + i];
6362 delete [] num_parts_proc_assigned;
6363 delete [] sort_item_num_points_of_proc_in_part_i;
6364 delete [] sort_item_point_counts_in_parts;
6365 delete [] space_in_each_processor;
6368 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6371 this->assign_send_destinations2(
6373 sort_item_part_to_proc_assignment,
6374 coordinate_destinations,
6375 output_part_numbering_begin_index,
6376 next_future_num_parts_in_parts);
6378 delete [] sort_item_part_to_proc_assignment;
6405 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6406 typename mj_part_t,
typename mj_node_t>
6407 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6408 mj_migration_part_proc_assignment(
6409 mj_gno_t * num_points_in_all_processor_parts,
6410 mj_part_t num_parts,
6411 mj_part_t num_procs,
6412 mj_lno_t *send_count_to_each_proc,
6413 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6414 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6415 mj_part_t &out_num_part,
6416 std::vector<mj_part_t> &out_part_indices,
6417 mj_part_t &output_part_numbering_begin_index,
6418 int *coordinate_destinations)
6420 processor_ranks_for_subcomm.clear();
6422 if(num_procs > num_parts) {
6427 mj_part_t out_part_index = 0;
6429 this->mj_assign_proc_to_parts(
6430 num_points_in_all_processor_parts,
6433 send_count_to_each_proc,
6434 processor_ranks_for_subcomm,
6435 next_future_num_parts_in_parts,
6437 output_part_numbering_begin_index,
6438 coordinate_destinations
6442 out_part_indices.clear();
6443 out_part_indices.push_back(out_part_index);
6450 processor_ranks_for_subcomm.push_back(this->myRank);
6455 this->mj_assign_parts_to_procs(
6456 num_points_in_all_processor_parts,
6459 send_count_to_each_proc,
6460 next_future_num_parts_in_parts,
6463 output_part_numbering_begin_index,
6464 coordinate_destinations);
6481 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6482 typename mj_part_t,
typename mj_node_t>
6483 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6485 mj_part_t num_procs,
6486 mj_lno_t &num_new_local_points,
6487 std::string iteration,
6488 int *coordinate_destinations,
6489 mj_part_t num_parts)
6492 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION 6493 if(
sizeof(mj_lno_t) <=
sizeof(
int)) {
6497 ZOLTAN_COMM_OBJ *plan = NULL;
6498 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6499 int num_incoming_gnos = 0;
6500 int message_tag = 7859;
6503 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6504 int ierr = Zoltan_Comm_Create(
6506 int(this->num_local_coords),
6507 coordinate_destinations,
6510 &num_incoming_gnos);
6514 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6517 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6523 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6524 Kokkos::HostSpace(), this->current_mj_gnos);
6525 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6526 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6527 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), num_incoming_gnos);
6528 auto host_dst_gnos = Kokkos::create_mirror_view(
6529 Kokkos::HostSpace(), dst_gnos);
6531 ierr = Zoltan_Comm_Do(
6534 (
char *) host_current_mj_gnos.data(),
6536 (
char *) host_dst_gnos.data());
6538 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6539 this->current_mj_gnos = dst_gnos;
6545 auto host_src_coordinates = Kokkos::create_mirror_view(
6546 Kokkos::HostSpace(), this->mj_coordinates);
6547 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6548 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6549 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
6550 num_incoming_gnos, this->coord_dim);
6551 auto host_dst_coordinates = Kokkos::create_mirror_view(
6552 Kokkos::HostSpace(), dst_coordinates);
6553 for(
int i = 0; i < this->coord_dim; ++i) {
6554 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6555 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6556 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6557 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6560 ierr = Zoltan_Comm_Do(
6563 (
char *) sub_host_src_coordinates.data(),
6564 sizeof(mj_scalar_t),
6565 (
char *) sub_host_dst_coordinates.data());
6568 deep_copy(dst_coordinates, host_dst_coordinates);
6569 this->mj_coordinates = dst_coordinates;
6574 auto host_src_weights = Kokkos::create_mirror_view(
6575 Kokkos::HostSpace(), this->mj_weights);
6576 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6577 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6578 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
6579 num_incoming_gnos, this->num_weights_per_coord);
6580 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6581 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6582 auto sub_host_src_weights
6583 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6584 auto sub_host_dst_weights
6585 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6586 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6588 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6589 sent_weight[n] = sub_host_src_weights(n);
6591 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6593 ierr = Zoltan_Comm_Do(
6596 (
char *) sent_weight.getRawPtr(),
6597 sizeof(mj_scalar_t),
6598 (
char *) received_weight.getRawPtr());
6601 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6602 sub_host_dst_weights(n) = received_weight[n];
6605 deep_copy(dst_weights, host_dst_weights);
6606 this->mj_weights = dst_weights;
6612 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6613 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6616 ierr = Zoltan_Comm_Do(
6619 (
char *) owner_of_coordinate.data(),
6621 (
char *) dst_owners_of_coordinate.data());
6623 this->owner_of_coordinate = dst_owners_of_coordinate;
6630 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6631 Kokkos::HostSpace(), this->assigned_part_ids);
6632 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6633 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6634 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
6636 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6637 Kokkos::HostSpace(), dst_assigned_part_ids);
6638 mj_part_t *new_parts =
new mj_part_t[num_incoming_gnos];
6639 if(num_procs < num_parts) {
6641 ierr = Zoltan_Comm_Do(
6644 (
char *) host_src_assigned_part_ids.data(),
6646 (
char *) host_dst_assigned_part_ids.data());
6648 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6652 this->assigned_part_ids = dst_assigned_part_ids;
6655 ierr = Zoltan_Comm_Destroy(&plan);
6657 num_new_local_points = num_incoming_gnos;
6659 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6662 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION 6664 this->mj_env->timerStart(
MACRO_TIMERS, mj_timer_base_string +
6665 "Migration DistributorPlanCreating-" + iteration);
6667 Tpetra::Distributor distributor(this->comm);
6668 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6669 this->num_local_coords);
6670 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6671 this->mj_env->timerStop(
MACRO_TIMERS, mj_timer_base_string +
6672 "Migration DistributorPlanCreating-" + iteration);
6673 this->mj_env->timerStart(
MACRO_TIMERS, mj_timer_base_string +
6674 "Migration DistributorMigration-" + iteration);
6681 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
6682 auto src_host_current_mj_gnos =
6683 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->current_mj_gnos);
6684 Kokkos::deep_copy(src_host_current_mj_gnos, this->current_mj_gnos);
6685 ArrayView<mj_gno_t> sent_gnos(
6686 src_host_current_mj_gnos.data(), this->num_local_coords);
6687 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6688 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6689 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_incoming_gnos);
6690 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6691 this->current_mj_gnos);
6692 memcpy(host_current_mj_gnos.data(),
6693 received_gnos.getRawPtr(), num_incoming_gnos *
sizeof(mj_gno_t));
6694 Kokkos::deep_copy(this->current_mj_gnos, host_current_mj_gnos);
6699 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6700 dst_coordinates(
"mj_coordinates", num_incoming_gnos, this->coord_dim);
6701 auto host_dst_coordinates = Kokkos::create_mirror_view(dst_coordinates);
6702 auto host_src_coordinates = Kokkos::create_mirror_view(
6703 Kokkos::HostSpace(), this->mj_coordinates);
6704 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6705 for(
int i = 0; i < this->coord_dim; ++i) {
6706 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6707 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6708 auto sub_host_dst_coordinates
6709 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6722 ArrayRCP<mj_scalar_t> sent_coord(this->num_local_coords);
6723 for(
int n = 0; n < this->num_local_coords; ++n) {
6724 sent_coord[n] = sub_host_src_coordinates[n];
6727 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
6728 distributor.doPostsAndWaits<mj_scalar_t>(
6729 sent_coord(), 1, received_coord());
6730 memcpy(sub_host_dst_coordinates.data(),
6731 received_coord.getRawPtr(), num_incoming_gnos *
sizeof(mj_scalar_t));
6733 deep_copy(dst_coordinates, host_dst_coordinates);
6734 this->mj_coordinates = dst_coordinates;
6737 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6738 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6739 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6740 auto host_src_weights = Kokkos::create_mirror_view(
6741 Kokkos::HostSpace(), this->mj_weights);
6742 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6743 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6744 auto sub_host_src_weights
6745 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6746 auto sub_host_dst_weights
6747 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6748 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6754 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6755 sent_weight[n] = sub_host_src_weights(n);
6757 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6758 distributor.doPostsAndWaits<mj_scalar_t>(
6759 sent_weight(), 1, received_weight());
6762 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6763 sub_host_dst_weights(n) = received_weight[n];
6766 Kokkos::deep_copy(dst_weights, host_dst_weights);
6767 this->mj_weights = dst_weights;
6772 ArrayView<int> sent_owners(
6773 owner_of_coordinate.data(), this->num_local_coords);
6774 ArrayRCP<int> received_owners(num_incoming_gnos);
6775 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
6776 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>
6777 (
"owner_of_coordinate", num_incoming_gnos);
6778 memcpy(this->owner_of_coordinate.data(),
6779 received_owners.getRawPtr(), num_incoming_gnos *
sizeof(int));
6785 if(num_procs < num_parts) {
6786 auto src_host_assigned_part_ids =
6787 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->assigned_part_ids);
6788 Kokkos::deep_copy(src_host_assigned_part_ids, assigned_part_ids);
6789 ArrayView<mj_part_t> sent_partids(
6790 src_host_assigned_part_ids.data(), this->num_local_coords);
6791 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
6792 distributor.doPostsAndWaits<mj_part_t>(
6793 sent_partids, 1, received_partids());
6794 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6795 (
"assigned_part_ids", num_incoming_gnos);
6796 auto host_assigned_part_ids = Kokkos::create_mirror_view(
6797 this->assigned_part_ids);
6799 host_assigned_part_ids.data(),
6800 received_partids.getRawPtr(),
6801 num_incoming_gnos *
sizeof(mj_part_t));
6802 Kokkos::deep_copy(this->assigned_part_ids, host_assigned_part_ids);
6805 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6806 (
"assigned_part_ids", num_incoming_gnos);
6808 this->mj_env->timerStop(
MACRO_TIMERS,
"" + mj_timer_base_string +
6809 "Migration DistributorMigration-" + iteration);
6811 num_new_local_points = num_incoming_gnos;
6820 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6821 typename mj_part_t,
typename mj_node_t>
6822 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6823 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6825 mj_part_t group_size = processor_ranks_for_subcomm.size();
6826 mj_part_t *ids =
new mj_part_t[group_size];
6827 for(mj_part_t i = 0; i < group_size; ++i) {
6828 ids[i] = processor_ranks_for_subcomm[i];
6830 ArrayView<const mj_part_t> idView(ids, group_size);
6831 this->comm = this->comm->createSubcommunicator(idView);
6840 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6841 typename mj_part_t,
typename mj_node_t>
6842 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6843 fill_permutation_array(
6844 mj_part_t output_num_parts,
6845 mj_part_t num_parts)
6848 if(output_num_parts == 1) {
6849 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6850 Kokkos::parallel_for(
6851 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6852 (0, this->num_local_coords),
6853 KOKKOS_LAMBDA(mj_lno_t i) {
6854 local_new_coordinate_permutations(i) = i;
6856 auto local_new_part_xadj = this->new_part_xadj;
6857 auto local_num_local_coords = this->num_local_coords;
6858 Kokkos::parallel_for(
6859 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6860 KOKKOS_LAMBDA(
int dummy) {
6861 local_new_part_xadj(0) = local_num_local_coords;
6865 auto local_num_local_coords = this->num_local_coords;
6866 auto local_assigned_part_ids = this->assigned_part_ids;
6867 auto local_new_part_xadj = this->new_part_xadj;
6868 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6871 Kokkos::View<mj_part_t*, device_t> part_shifts(
"part_shifts", num_parts);
6876 Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6877 "num_points_in_parts", num_parts);
6879 Kokkos::parallel_for(
6880 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6881 KOKKOS_LAMBDA(
int dummy) {
6883 for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6884 mj_part_t ii = local_assigned_part_ids(i);
6885 ++num_points_in_parts(ii);
6890 mj_lno_t prev_index = 0;
6891 for(mj_part_t i = 0; i < num_parts; ++i) {
6892 if(num_points_in_parts(i) > 0) {
6893 local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6894 prev_index += num_points_in_parts(i);
6895 part_shifts(i) = p++;
6900 mj_part_t assigned_num_parts = p - 1;
6901 for(;p < num_parts; ++p) {
6902 local_new_part_xadj(p) =
6903 local_new_part_xadj(assigned_num_parts);
6905 for(mj_part_t i = 0; i < output_num_parts; ++i) {
6906 num_points_in_parts(i) = local_new_part_xadj(i);
6912 for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6914 part_shifts[mj_part_t(local_assigned_part_ids(i))];
6915 local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6945 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6946 typename mj_part_t,
typename mj_node_t>
6947 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6948 mj_perform_migration(
6949 mj_part_t input_num_parts,
6950 mj_part_t &output_num_parts,
6951 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6952 mj_part_t &output_part_begin_index,
6953 size_t migration_reduce_all_population,
6954 mj_lno_t num_coords_for_last_dim_part,
6955 std::string iteration,
6956 RCP<mj_partBoxVector_t> &input_part_boxes,
6957 RCP<mj_partBoxVector_t> &output_part_boxes)
6959 mj_part_t num_procs = this->comm->getSize();
6960 this->myRank = this->comm->getRank();
6965 mj_gno_t *num_points_in_all_processor_parts =
6966 new mj_gno_t[input_num_parts * (num_procs + 1)];
6969 this->get_processor_num_points_in_parts(
6972 num_points_in_all_processor_parts);
6975 if(!this->mj_check_to_migrate(
6976 migration_reduce_all_population,
6977 num_coords_for_last_dim_part,
6980 num_points_in_all_processor_parts)) {
6981 delete [] num_points_in_all_processor_parts;
6985 mj_lno_t *send_count_to_each_proc = NULL;
6986 int *coordinate_destinations =
new int[this->num_local_coords];
6987 send_count_to_each_proc =
new mj_lno_t[num_procs];
6989 for(
int i = 0; i < num_procs; ++i) {
6990 send_count_to_each_proc[i] = 0;
6993 std::vector<mj_part_t> processor_ranks_for_subcomm;
6994 std::vector<mj_part_t> out_part_indices;
6997 this->mj_migration_part_proc_assignment(
6998 num_points_in_all_processor_parts,
7001 send_count_to_each_proc,
7002 processor_ranks_for_subcomm,
7003 next_future_num_parts_in_parts,
7006 output_part_begin_index,
7007 coordinate_destinations);
7009 delete [] send_count_to_each_proc;
7010 std::vector <mj_part_t> tmpv;
7012 std::sort (out_part_indices.begin(), out_part_indices.end());
7013 mj_part_t outP = out_part_indices.size();
7014 mj_gno_t new_global_num_points = 0;
7015 mj_gno_t *global_num_points_in_parts =
7016 num_points_in_all_processor_parts + num_procs * input_num_parts;
7018 if(this->mj_keep_part_boxes) {
7019 input_part_boxes->clear();
7024 for(mj_part_t i = 0; i < outP; ++i) {
7025 mj_part_t ind = out_part_indices[i];
7026 new_global_num_points += global_num_points_in_parts[ind];
7027 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7028 if(this->mj_keep_part_boxes) {
7029 input_part_boxes->push_back((*output_part_boxes)[ind]);
7034 if(this->mj_keep_part_boxes) {
7035 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7036 input_part_boxes = output_part_boxes;
7037 output_part_boxes = tmpPartBoxes;
7039 next_future_num_parts_in_parts->clear();
7040 for(mj_part_t i = 0; i < outP; ++i) {
7041 mj_part_t p = tmpv[i];
7042 next_future_num_parts_in_parts->push_back(p);
7045 delete [] num_points_in_all_processor_parts;
7047 mj_lno_t num_new_local_points = 0;
7049 this->mj_migrate_coords(
7051 num_new_local_points,
7053 coordinate_destinations,
7056 delete [] coordinate_destinations;
7057 if(this->num_local_coords != num_new_local_points) {
7058 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7059 (Kokkos::ViewAllocateWithoutInitializing(
"new_coordinate_permutations"),
7060 num_new_local_points);
7061 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7062 (Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
7063 num_new_local_points);
7065 this->num_local_coords = num_new_local_points;
7066 this->num_global_coords = new_global_num_points;
7069 this->create_sub_communicator(processor_ranks_for_subcomm);
7071 processor_ranks_for_subcomm.clear();
7074 this->fill_permutation_array(output_num_parts, input_num_parts);
7097 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7098 typename mj_part_t,
typename mj_node_t>
7099 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7100 create_consistent_chunks(
7101 mj_part_t num_parts,
7102 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7103 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7104 mj_lno_t coordinate_begin,
7105 mj_lno_t coordinate_end,
7106 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7107 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7109 bool longest_dim_part,
7110 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7120 mj_part_t no_cuts = num_parts - 1;
7124 if(this->distribute_points_on_cut_lines) {
7125 auto local_thread_cut_line_weight_to_put_left =
7126 this->thread_cut_line_weight_to_put_left;
7127 auto local_thread_part_weight_work =
7128 this->thread_part_weight_work;
7129 auto local_sEpsilon = this->
sEpsilon;
7131 Kokkos::parallel_for(
7132 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
7133 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7135 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7136 if(left_weight > local_sEpsilon) {
7138 mj_scalar_t thread_ii_weight_on_cut =
7139 local_thread_part_weight_work(i * 2 + 1) -
7140 local_thread_part_weight_work(i * 2);
7141 if(thread_ii_weight_on_cut < left_weight) {
7142 local_thread_cut_line_weight_to_put_left(i) =
7143 thread_ii_weight_on_cut;
7146 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7150 local_thread_cut_line_weight_to_put_left(i) = 0;
7155 auto local_least_signifiance = least_signifiance;
7156 auto local_significance_mul = significance_mul;
7157 Kokkos::parallel_for(
7158 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7159 (0, 1), KOKKOS_LAMBDA (
int dummy) {
7163 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7164 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7165 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7166 mj_scalar_t delta = cut2 - cut1;
7167 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7168 if(abs_delta < local_sEpsilon) {
7169 local_thread_cut_line_weight_to_put_left(i) -=
7170 local_thread_cut_line_weight_to_put_left(i - 1);
7172 local_thread_cut_line_weight_to_put_left(i) =
7173 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7174 local_least_signifiance) * local_significance_mul) /
7175 static_cast<mj_scalar_t
>(local_significance_mul);
7181 auto local_thread_point_counts = this->thread_point_counts;
7182 Kokkos::parallel_for(
7183 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7184 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7185 local_thread_point_counts(i) = 0;
7196 mj_part_t *cut_map =
new mj_part_t[no_cuts];
7198 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7199 typedef std::vector< multiSItem > multiSVector;
7200 typedef std::vector<multiSVector> multiS2Vector;
7203 std::vector<mj_scalar_t *>allocated_memory;
7206 multiS2Vector sort_vector_points_on_cut;
7209 mj_part_t different_cut_count = 1;
7215 multiSVector tmpMultiSVector;
7216 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7218 auto local_current_concurrent_cut_coordinate =
7219 current_concurrent_cut_coordinate;
7220 auto host_current_concurrent_cut_coordinate =
7221 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7222 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7223 local_current_concurrent_cut_coordinate);
7225 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7228 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7229 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7230 cut_map[i] = cut_map[i-1];
7233 cut_map[i] = different_cut_count++;
7234 multiSVector tmp2MultiSVector;
7235 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7238 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7239 host_current_concurrent_cut_coordinate);
7242 auto host_coordinate_permutations =
7243 Kokkos::create_mirror_view(coordinate_permutations);
7244 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7246 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7247 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7249 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7250 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7252 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7253 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7255 auto local_coord_dim = this->coord_dim;
7257 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7258 mj_lno_t i = host_coordinate_permutations(ii);
7259 mj_part_t pp = host_assigned_part_ids(i);
7260 mj_part_t p = pp / 2;
7263 mj_scalar_t *vals =
new mj_scalar_t[local_coord_dim -1];
7264 allocated_memory.push_back(vals);
7269 if(longest_dim_part) {
7271 for(
int dim = local_coord_dim - 2; dim >= 0; --dim) {
7274 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7279 host_mj_coordinates(i,next_largest_coord_dim);
7283 for(
int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7284 vals[val_ind++] = host_mj_coordinates(i,dim);
7286 for(
int dim = 0; dim < coordInd; ++dim) {
7287 vals[val_ind++] = host_mj_coordinates(i,dim);
7291 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7293 mj_part_t cmap = cut_map[p];
7294 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7298 ++host_thread_point_counts(p);
7299 host_assigned_part_ids(i) = p;
7304 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7305 std::sort (sort_vector_points_on_cut[i].begin(),
7306 sort_vector_points_on_cut[i].end());
7309 mj_part_t previous_cut_map = cut_map[0];
7311 auto host_thread_cut_line_weight_to_put_left =
7312 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7313 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7314 thread_cut_line_weight_to_put_left);
7316 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7317 Kokkos::deep_copy(host_mj_weights, mj_weights);
7327 mj_scalar_t weight_stolen_from_previous_part = 0;
7328 for(mj_part_t p = 0; p < no_cuts; ++p) {
7329 mj_part_t mapped_cut = cut_map[p];
7333 if(previous_cut_map != mapped_cut) {
7334 mj_lno_t sort_vector_end = (mj_lno_t)
7335 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7336 for(; sort_vector_end >= 0; --sort_vector_end) {
7338 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7339 mj_lno_t i = t.index;
7340 ++host_thread_point_counts(p);
7341 host_assigned_part_ids(i) = p;
7343 sort_vector_points_on_cut[previous_cut_map].clear();
7347 mj_lno_t sort_vector_end = (mj_lno_t)
7348 sort_vector_points_on_cut[mapped_cut].size() - 1;
7354 for(; sort_vector_end >= 0; --sort_vector_end) {
7357 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7359 mj_lno_t i = t.index;
7360 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7361 this->mj_weights(i,0);
7363 if(host_thread_cut_line_weight_to_put_left(p) +
7364 weight_stolen_from_previous_part> this->sEpsilon &&
7365 host_thread_cut_line_weight_to_put_left(p) +
7366 weight_stolen_from_previous_part -
7367 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7368 weight_stolen_from_previous_part - w)> this->sEpsilon)
7370 host_thread_cut_line_weight_to_put_left(p) -= w;
7372 sort_vector_points_on_cut[mapped_cut].pop_back();
7374 ++host_thread_point_counts(p);
7375 host_assigned_part_ids(i) = p;
7379 if(p < no_cuts - 1 &&
7380 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7381 if(mapped_cut == cut_map[p + 1] ) {
7384 if(previous_cut_map != mapped_cut) {
7385 weight_stolen_from_previous_part =
7386 host_thread_cut_line_weight_to_put_left(p);
7391 weight_stolen_from_previous_part +=
7392 host_thread_cut_line_weight_to_put_left(p);
7396 weight_stolen_from_previous_part =
7397 -host_thread_cut_line_weight_to_put_left(p);
7406 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7407 if(previous_cut_map != mapped_cut) {
7408 weight_stolen_from_previous_part =
7409 host_thread_cut_line_weight_to_put_left(p);
7412 weight_stolen_from_previous_part +=
7413 host_thread_cut_line_weight_to_put_left(p);
7417 weight_stolen_from_previous_part =
7418 -host_thread_cut_line_weight_to_put_left(p);
7424 previous_cut_map = mapped_cut;
7429 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7430 previous_cut_map].size() - 1;
7436 for(; sort_vector_end >= 0; --sort_vector_end) {
7438 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7441 mj_lno_t i = t.index;
7442 ++host_thread_point_counts(no_cuts);
7443 host_assigned_part_ids(i) = no_cuts;
7446 sort_vector_points_on_cut[previous_cut_map].clear();
7450 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7451 for(mj_lno_t i = 0; i < vSize; ++i) {
7452 delete [] allocated_memory[i];
7455 auto local_out_part_xadj = out_part_xadj;
7456 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7457 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7460 for(mj_part_t j = 0; j < num_parts; ++j) {
7461 host_out_part_xadj(j) = host_thread_point_counts(j);
7462 host_thread_point_counts(j) = 0;
7466 for(mj_part_t j = 1; j < num_parts; ++j) {
7467 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7472 for(mj_part_t j = 1; j < num_parts; ++j) {
7473 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7476 auto host_new_coordinate_permutations =
7477 Kokkos::create_mirror_view(new_coordinate_permutations);
7478 Kokkos::deep_copy(host_new_coordinate_permutations,
7479 new_coordinate_permutations);
7483 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7484 mj_lno_t i = host_coordinate_permutations(ii);
7485 mj_part_t p = host_assigned_part_ids(i);
7486 host_new_coordinate_permutations(coordinate_begin +
7487 host_thread_point_counts(p)++) = i;
7490 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7491 Kokkos::deep_copy(new_coordinate_permutations,
7492 host_new_coordinate_permutations);
7493 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7505 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7506 typename mj_part_t,
typename mj_node_t>
7507 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7509 mj_part_t current_num_parts,
7510 mj_part_t output_part_begin_index,
7511 RCP<mj_partBoxVector_t> &output_part_boxes,
7512 bool is_data_ever_migrated)
7515 mj_timer_base_string +
"Part_Assignment");
7518 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7519 auto local_coordinate_permutations = coordinate_permutations;
7520 auto local_assigned_part_ids = assigned_part_ids;
7522 if(local_mj_keep_part_boxes) {
7523 for(
int i = 0; i < current_num_parts; ++i) {
7524 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7528 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7529 current_num_parts, Kokkos::AUTO());
7530 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7532 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(
member_type team_member) {
7533 int i = team_member.league_rank();
7534 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7535 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7537 mj_lno_t k = local_coordinate_permutations(ii);
7538 local_assigned_part_ids(k) = i + output_part_begin_index;
7542 if(is_data_ever_migrated) {
7543 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION 7544 if(
sizeof(mj_lno_t) <=
sizeof(int)) {
7551 ZOLTAN_COMM_OBJ *plan = NULL;
7552 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7555 int message_tag = 7856;
7558 mj_timer_base_string +
"Final Z1PlanCreating");
7561 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
7562 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7566 mj_timer_base_string +
"Final Z1PlanCreating" );
7569 mj_timer_base_string +
"Final Z1PlanComm");
7574 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7575 Kokkos::HostSpace(), this->current_mj_gnos);
7576 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7577 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7578 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), incoming);
7579 auto host_dst_gnos = Kokkos::create_mirror_view(
7580 Kokkos::HostSpace(), dst_gnos);
7582 ierr = Zoltan_Comm_Do( plan, message_tag,
7583 (
char *) host_current_mj_gnos.data(),
7584 sizeof(mj_gno_t), (
char *) host_dst_gnos.data());
7586 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7587 this->current_mj_gnos = dst_gnos;
7590 auto host_src_part_ids = Kokkos::create_mirror_view(
7591 Kokkos::HostSpace(), this->assigned_part_ids);
7592 deep_copy(host_src_part_ids, this->assigned_part_ids);
7593 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7594 Kokkos::ViewAllocateWithoutInitializing(
"dst_part_ids"), incoming);
7595 auto host_dst_part_ids = Kokkos::create_mirror_view(
7596 Kokkos::HostSpace(), dst_part_ids);
7598 ierr = Zoltan_Comm_Do( plan, message_tag,
7599 (
char *) host_src_part_ids.data(),
7600 sizeof(mj_part_t), (
char *) host_dst_part_ids.data());
7602 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7603 this->assigned_part_ids = dst_part_ids;
7605 ierr = Zoltan_Comm_Destroy(&plan);
7608 this->num_local_coords = incoming;
7611 mj_timer_base_string +
"Final Z1PlanComm");
7614 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION 7618 mj_timer_base_string +
"Final DistributorPlanCreating");
7619 Tpetra::Distributor distributor(this->mj_problemComm);
7620 ArrayView<const mj_part_t> owners_of_coords(
7621 this->owner_of_coordinate.data(), this->num_local_coords);
7622 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7624 mj_timer_base_string +
"Final DistributorPlanCreating" );
7627 mj_timer_base_string +
"Final DistributorPlanComm");
7632 auto src_host_current_mj_gnos =
7633 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->current_mj_gnos);
7634 Kokkos::deep_copy(src_host_current_mj_gnos, this->current_mj_gnos);
7635 ArrayRCP<mj_gno_t> received_gnos(incoming);
7636 ArrayView<mj_gno_t> sent_gnos(src_host_current_mj_gnos.data(),
7637 this->num_local_coords);
7638 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7639 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7640 Kokkos::ViewAllocateWithoutInitializing(
"current_mj_gnos"), incoming);
7641 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7642 this->current_mj_gnos);
7643 memcpy(host_current_mj_gnos.data(),
7644 received_gnos.getRawPtr(), incoming *
sizeof(mj_gno_t));
7645 Kokkos::deep_copy(this->current_mj_gnos, host_current_mj_gnos);
7648 auto src_host_assigned_part_ids =
7649 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->assigned_part_ids);
7650 Kokkos::deep_copy(src_host_assigned_part_ids, this->assigned_part_ids);
7651 ArrayView<mj_part_t> sent_partids(src_host_assigned_part_ids.data(),
7652 this->num_local_coords);
7653 ArrayRCP<mj_part_t> received_partids(incoming);
7654 distributor.doPostsAndWaits<mj_part_t>(
7655 sent_partids, 1, received_partids());
7656 this->assigned_part_ids =
7657 Kokkos::View<mj_part_t*, device_t>(
7658 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
7660 auto host_assigned_part_ids = Kokkos::create_mirror_view(
7661 this->assigned_part_ids);
7662 memcpy( host_assigned_part_ids.data(),
7663 received_partids.getRawPtr(), incoming *
sizeof(mj_part_t));
7664 deep_copy(this->assigned_part_ids, host_assigned_part_ids);
7665 this->num_local_coords = incoming;
7668 mj_timer_base_string +
"Final DistributorPlanComm");
7673 mj_timer_base_string +
"Part_Assignment");
7676 mj_timer_base_string +
"Solution_Part_Assignment");
7681 if(this->mj_keep_part_boxes) {
7682 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7686 mj_timer_base_string +
"Solution_Part_Assignment");
7701 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7702 typename mj_part_t,
typename mj_node_t>
7705 bool distribute_points_on_cut_lines_,
7706 int max_concurrent_part_calculation_,
7707 int check_migrate_avoid_migration_option_,
7708 double minimum_migration_imbalance_,
7709 int migration_type_)
7711 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7712 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7713 this->check_migrate_avoid_migration_option =
7714 check_migrate_avoid_migration_option_;
7715 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7716 this->migration_type = migration_type_;
7746 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7747 typename mj_part_t,
typename mj_node_t>
7750 const RCP<const Environment> &env,
7751 RCP<
const Comm<int> > &problemComm,
7752 double imbalance_tolerance_,
7754 size_t num_global_parts_,
7755 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7756 int recursion_depth_,
7758 mj_lno_t num_local_coords_,
7759 mj_gno_t num_global_coords_,
7760 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7762 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7763 int num_weights_per_coord_,
7764 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7765 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7766 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7767 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7768 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7773 this->mj_timer_base_string =
"MJ(" + std::to_string(execute_counter) +
") - ";
7776 this->mj_problemComm = problemComm;
7777 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7779 mj_timer_base_string +
"Total");
7780 this->mj_env->debug(3,
"In MultiJagged Jagged");
7781 this->imbalance_tolerance = imbalance_tolerance_;
7782 this->mj_num_teams = num_teams_;
7783 this->num_global_parts = num_global_parts_;
7784 this->part_no_array = part_no_array_;
7785 this->recursion_depth = recursion_depth_;
7786 this->coord_dim = coord_dim_;
7787 this->num_local_coords = num_local_coords_;
7788 this->num_global_coords = num_global_coords_;
7789 this->mj_coordinates = mj_coordinates_;
7790 this->initial_mj_gnos = initial_mj_gnos_;
7791 this->num_weights_per_coord = num_weights_per_coord_;
7792 this->mj_uniform_weights = mj_uniform_weights_;
7793 this->mj_weights = mj_weights_;
7794 this->mj_uniform_parts = mj_uniform_parts_;
7798 this->set_part_specifications();
7801 mj_timer_base_string +
"Allocate Views");
7802 this->allocate_set_work_memory();
7804 mj_timer_base_string +
"Allocate Views");
7808 this->comm = this->mj_problemComm->duplicate();
7811 if(comm->getRank() == 0) {
7812 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
7813 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
7814 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
7819 mj_part_t current_num_parts = 1;
7820 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7821 this->all_cut_coordinates;
7823 mj_timer_base_string +
"Problem_Partitioning");
7824 mj_part_t output_part_begin_index = 0;
7825 mj_part_t future_num_parts = this->total_num_part;
7826 bool is_data_ever_migrated =
false;
7828 std::vector<mj_part_t> *future_num_part_in_parts =
7829 new std::vector<mj_part_t> ();
7830 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7831 new std::vector<mj_part_t> ();
7833 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7835 RCP<mj_partBoxVector_t> input_part_boxes;
7836 RCP<mj_partBoxVector_t> output_part_boxes;
7838 if(this->mj_keep_part_boxes) {
7839 input_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7840 output_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7841 compute_global_box();
7842 this->init_part_boxes(output_part_boxes);
7849 Kokkos::View<mj_part_t*, device_t>
7850 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
7851 Kokkos::View<size_t*, device_t>
7852 view_total_reduction_size(
"view_total_reduction_size", 1);
7854 for(
int i = 0; i < this->recursion_depth; ++i) {
7857 std::string istring = std::to_string(i);
7863 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7864 future_num_part_in_parts = next_future_num_parts_in_parts;
7865 next_future_num_parts_in_parts = tmpPartVect;
7869 next_future_num_parts_in_parts->clear();
7870 if(this->mj_keep_part_boxes) {
7871 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7872 input_part_boxes = output_part_boxes;
7873 output_part_boxes = tmpPartBoxes;
7874 output_part_boxes->clear();
7878 mj_part_t output_part_count_in_dimension =
7879 this->update_part_num_arrays(
7880 future_num_part_in_parts,
7881 next_future_num_parts_in_parts,
7886 output_part_boxes, 1);
7891 if(output_part_count_in_dimension == current_num_parts) {
7893 tmpPartVect= future_num_part_in_parts;
7894 future_num_part_in_parts = next_future_num_parts_in_parts;
7895 next_future_num_parts_in_parts = tmpPartVect;
7897 if(this->mj_keep_part_boxes) {
7898 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7899 input_part_boxes = output_part_boxes;
7900 output_part_boxes = tmpPartBoxes;
7906 int coordInd = i % this->coord_dim;
7908 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7909 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7912 mj_timer_base_string +
"Problem_Partitioning_" + istring);
7916 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7917 "new part xadj", output_part_count_in_dimension);
7920 mj_part_t output_part_index = 0;
7924 mj_part_t output_coordinate_end_index = 0;
7929 this->max_concurrent_part_calculation);
7931 mj_part_t obtained_part_index = 0;
7933 auto host_process_local_min_max_coord_total_weight =
7934 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7935 auto host_global_min_max_coord_total_weight =
7936 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7944 this->max_concurrent_part_calculation);
7947 auto local_device_num_partitioning_in_current_dim =
7948 device_num_partitioning_in_current_dim;
7949 Kokkos::parallel_reduce(
"Read bDoingWork",
7950 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7951 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
7954 if(local_device_num_partitioning_in_current_dim(
7961 bool bDoingWork = (bDoingWork_int != 0) ?
true :
false;
7963 this->mj_get_local_min_max_coord_totW(
7966 mj_current_dim_coords);
7971 this->mj_get_global_min_max_coord_totW(
7973 this->process_local_min_max_coord_total_weight,
7974 this->global_min_max_coord_total_weight);
7978 mj_part_t total_incomplete_cut_count = 0;
7983 mj_part_t concurrent_part_cut_shift = 0;
7984 mj_part_t concurrent_part_part_shift = 0;
7988 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7989 global_min_max_coord_total_weight);
7991 mj_scalar_t min_coordinate =
7992 host_global_min_max_coord_total_weight(kk);
7993 mj_scalar_t max_coordinate =
7994 host_global_min_max_coord_total_weight(
7997 mj_scalar_t global_total_weight =
7998 host_global_min_max_coord_total_weight(
8003 mj_part_t partition_count = host_num_partitioning_in_current_dim(
8004 concurrent_current_part_index);
8006 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
8007 Kokkos::subview(current_cut_coordinates,
8008 std::pair<mj_lno_t, mj_lno_t>(
8009 concurrent_part_cut_shift, current_cut_coordinates.size()));
8010 Kokkos::View<mj_scalar_t *, device_t>
8011 current_target_part_weights =
8012 Kokkos::subview(target_part_weights,
8013 std::pair<mj_lno_t, mj_lno_t>(
8014 concurrent_part_part_shift, target_part_weights.size()));
8017 concurrent_part_cut_shift += partition_count - 1;
8019 concurrent_part_part_shift += partition_count;
8023 if(partition_count > 1 && min_coordinate <= max_coordinate) {
8027 total_incomplete_cut_count += partition_count - 1;
8029 this->incomplete_cut_count(kk) = partition_count - 1;
8032 this->mj_get_initial_cut_coords_target_weights(
8035 partition_count - 1,
8036 global_total_weight,
8038 current_target_part_weights,
8039 future_num_part_in_parts,
8040 next_future_num_parts_in_parts,
8041 concurrent_current_part_index,
8042 obtained_part_index);
8044 mj_lno_t coordinate_end_index =
8045 host_part_xadj(concurrent_current_part_index);
8046 mj_lno_t coordinate_begin_index =
8047 concurrent_current_part_index==0 ? 0 :
8048 host_part_xadj(concurrent_current_part_index - 1);
8050 this->set_initial_coordinate_parts(
8053 coordinate_begin_index, coordinate_end_index,
8054 this->coordinate_permutations,
8055 mj_current_dim_coords,
8056 this->assigned_part_ids,
8062 this->incomplete_cut_count(kk) = 0;
8065 obtained_part_index += partition_count;
8070 double used_imbalance = 0;
8073 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8076 mj_current_dim_coords,
8080 current_cut_coordinates,
8081 total_incomplete_cut_count,
8082 view_rectilinear_cut_count,
8083 view_total_reduction_size);
8086 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8091 mj_part_t output_array_shift = 0;
8092 mj_part_t cut_shift = 0;
8093 size_t tlr_shift = 0;
8094 size_t partweight_array_shift = 0;
8099 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8100 current_concurrent_work_part);
8103 int coordinateA_bigger_than_coordinateB =
8104 host_global_min_max_coord_total_weight(kk) >
8105 host_global_min_max_coord_total_weight(
8108 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8111 auto local_new_part_xadj = this->new_part_xadj;
8112 Kokkos::parallel_for(
8113 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8114 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8115 local_new_part_xadj(
8116 output_part_index + output_array_shift + jj) = 0;
8119 cut_shift += num_parts - 1;
8120 tlr_shift += (4 *(num_parts - 1) + 1);
8121 output_array_shift += num_parts;
8122 partweight_array_shift += (2 * (num_parts - 1) + 1);
8126 Kokkos::View<mj_scalar_t *, device_t>
8127 current_concurrent_cut_coordinate =
8128 Kokkos::subview(current_cut_coordinates,
8129 std::pair<mj_lno_t, mj_lno_t>(
8131 current_cut_coordinates.size()));
8132 Kokkos::View<mj_scalar_t *, device_t>
8133 used_local_cut_line_weight_to_left =
8134 Kokkos::subview(process_cut_line_weight_to_put_left,
8135 std::pair<mj_lno_t, mj_lno_t>(
8137 process_cut_line_weight_to_put_left.size()));
8139 this->thread_part_weight_work =
8141 this->thread_part_weights,
8142 std::pair<mj_lno_t, mj_lno_t>(
8143 partweight_array_shift,
8144 this->thread_part_weights.extent(0)));
8147 if(this->mj_keep_part_boxes) {
8149 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8150 mj_scalar_t temp_get_val;
8151 Kokkos::parallel_reduce(
"Read single",
8152 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8153 KOKKOS_LAMBDA(
int dummy, mj_scalar_t & set_single) {
8154 set_single = current_concurrent_cut_coordinate(j);
8156 (*output_part_boxes)
8157 [output_array_shift + output_part_index + j].
8158 updateMinMax(temp_get_val, 1 , coordInd);
8159 (*output_part_boxes)
8160 [output_array_shift + output_part_index + j + 1].
8161 updateMinMax(temp_get_val, 0 , coordInd);
8166 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8167 Kokkos::subview(this->new_part_xadj,
8168 std::pair<mj_lno_t, mj_lno_t>(
8169 output_part_index + output_array_shift,
8170 this->new_part_xadj.size()));
8172 this->mj_create_new_partitions(
8174 current_concurrent_work_part,
8175 mj_current_dim_coords,
8176 current_concurrent_cut_coordinate,
8177 used_local_cut_line_weight_to_left,
8182 mj_lno_t coordinate_end = host_part_xadj(
8183 current_concurrent_work_part);
8184 mj_lno_t coordinate_begin =
8185 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8186 current_concurrent_work_part - 1);
8190 mj_lno_t part_size = coordinate_end - coordinate_begin;
8194 auto local_new_part_xadj = this->new_part_xadj;
8195 Kokkos::parallel_for(
8196 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8197 (0, 1), KOKKOS_LAMBDA (
int dummy) {
8198 local_new_part_xadj(
8199 output_part_index + output_array_shift) = part_size;
8202 auto subview_new_coordinate_permutations =
8203 Kokkos::subview(this->new_coordinate_permutations,
8204 std::pair<mj_lno_t, mj_lno_t>(
8206 coordinate_begin + part_size));
8207 auto subview_coordinate_permutations =
8208 Kokkos::subview(this->coordinate_permutations,
8209 std::pair<mj_lno_t, mj_lno_t>(
8211 coordinate_begin + part_size));
8212 Kokkos::deep_copy(subview_new_coordinate_permutations,
8213 subview_coordinate_permutations);
8215 cut_shift += num_parts - 1;
8216 output_array_shift += num_parts;
8217 partweight_array_shift += (2 * (num_parts - 1) + 1);
8227 mj_part_t num_parts =
8232 auto local_new_part_xadj = this->new_part_xadj;
8233 Kokkos::parallel_for(
8234 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8235 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8236 local_new_part_xadj(output_part_index+ii) +=
8237 output_coordinate_end_index;
8242 Kokkos::parallel_reduce(
"Read single",
8243 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8244 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
8246 local_new_part_xadj(output_part_index + num_parts - 1);
8248 output_coordinate_end_index = temp_get;
8250 output_part_index += num_parts;
8256 int current_world_size = this->comm->getSize();
8257 long migration_reduce_all_population =
8258 this->total_dim_num_reduce_all * current_world_size;
8259 bool is_migrated_in_current_dimension =
false;
8264 if(future_num_parts > 1 &&
8265 this->check_migrate_avoid_migration_option >= 0 &&
8266 current_world_size > 1) {
8268 mj_timer_base_string +
"Problem_Migration-" + istring);
8269 mj_part_t num_parts = output_part_count_in_dimension;
8271 if(this->mj_perform_migration(
8274 next_future_num_parts_in_parts,
8275 output_part_begin_index,
8276 migration_reduce_all_population,
8277 this->num_global_coords / (future_num_parts * current_num_parts),
8279 input_part_boxes, output_part_boxes) )
8281 is_migrated_in_current_dimension =
true;
8282 is_data_ever_migrated =
true;
8284 mj_timer_base_string +
"Problem_Migration-" + istring);
8287 this->total_dim_num_reduce_all /= num_parts;
8290 is_migrated_in_current_dimension =
false;
8292 mj_timer_base_string +
"Problem_Migration-" + istring);
8297 Kokkos::View<mj_lno_t*, device_t> tmp =
8298 this->coordinate_permutations;
8299 this->coordinate_permutations =
8300 this->new_coordinate_permutations;
8302 this->new_coordinate_permutations = tmp;
8303 if(!is_migrated_in_current_dimension) {
8304 this->total_dim_num_reduce_all -= current_num_parts;
8305 current_num_parts = output_part_count_in_dimension;
8309 this->part_xadj = this->new_part_xadj;
8310 local_part_xadj = this->new_part_xadj;
8311 this->host_part_xadj = Kokkos::create_mirror_view(
part_xadj);
8312 Kokkos::deep_copy(host_part_xadj,
part_xadj);
8314 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
8316 mj_timer_base_string +
"Problem_Partitioning_" + istring);
8321 delete future_num_part_in_parts;
8322 delete next_future_num_parts_in_parts;
8324 mj_timer_base_string +
"Problem_Partitioning");
8330 this->set_final_parts(
8332 output_part_begin_index,
8334 is_data_ever_migrated);
8336 result_assigned_part_ids_ = this->assigned_part_ids;
8337 result_mj_gnos_ = this->current_mj_gnos;
8339 mj_timer_base_string +
"Total");
8340 this->mj_env->debug(3,
"Out of MultiJagged");
8343 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8344 typename mj_part_t,
typename mj_node_t>
8345 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8350 if(this->mj_keep_part_boxes) {
8351 return this->kept_boxes;
8354 throw std::logic_error(
"Error: part boxes are not stored.");
8358 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8359 typename mj_part_t,
typename mj_node_t>
8360 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8366 mj_part_t ntasks = this->num_global_parts;
8367 int dim = (*localPartBoxes)[0].getDim();
8368 coord_t *localPartBoundaries =
new coord_t[ntasks * 2 *dim];
8370 memset(localPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8372 coord_t *globalPartBoundaries =
new coord_t[ntasks * 2 *dim];
8373 memset(globalPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8375 coord_t *localPartMins = localPartBoundaries;
8376 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8378 coord_t *globalPartMins = globalPartBoundaries;
8379 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8381 mj_part_t boxCount = localPartBoxes->size();
8382 for(mj_part_t i = 0; i < boxCount; ++i) {
8383 mj_part_t pId = (*localPartBoxes)[i].getpId();
8387 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8388 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8390 for(
int j = 0; j < dim; ++j) {
8391 localPartMins[dim * pId + j] = lmins[j];
8392 localPartMaxs[dim * pId + j] = lmaxs[j];
8405 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8406 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8408 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
8409 for(mj_part_t i = 0; i < ntasks; ++i) {
8411 globalPartMins + dim * i,
8412 globalPartMaxs + dim * i);
8425 delete []localPartBoundaries;
8426 delete []globalPartBoundaries;
8433 template <
typename Adapter>
8439 #ifndef DOXYGEN_SHOULD_SKIP_THIS 8445 typedef typename Adapter::scalar_t adapter_scalar_t;
8448 typedef float default_mj_scalar_t;
8454 (std::is_same<adapter_scalar_t, float>::value ||
8455 std::is_same<adapter_scalar_t, double>::value),
8456 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8461 typedef typename Adapter::node_t mj_node_t;
8463 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8464 typedef typename mj_node_t::device_type device_t;
8469 RCP<const Environment> mj_env;
8470 RCP<const Comm<int> > mj_problemComm;
8471 RCP<const coordinateModel_t> mj_coords;
8474 double imbalance_tolerance;
8478 size_t num_global_parts;
8481 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8484 int recursion_depth;
8487 mj_lno_t num_local_coords;
8488 mj_gno_t num_global_coords;
8491 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8495 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8498 int num_weights_per_coord;
8501 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8504 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8507 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8517 mj_part_t num_first_level_parts;
8521 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8525 bool distribute_points_on_cut_lines;
8528 mj_part_t max_concurrent_part_calculation;
8531 int check_migrate_avoid_migration_option;
8539 double minimum_migration_imbalance;
8540 bool mj_keep_part_boxes;
8544 int mj_premigration_option;
8545 int min_coord_per_rank_for_premigration;
8548 ArrayRCP<mj_part_t> comXAdj_;
8551 ArrayRCP<mj_part_t> comAdj_;
8556 void set_input_parameters(
const Teuchos::ParameterList &p);
8558 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
8560 bool mj_premigrate_to_subset(
8562 int migration_selection_option,
8563 RCP<const Environment> mj_env_,
8564 RCP<
const Comm<int> > mj_problemComm_,
8566 mj_lno_t num_local_coords_,
8567 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8568 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8570 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8572 int num_weights_per_coord_,
8573 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8575 RCP<
const Comm<int> > &result_problemComm_,
8576 mj_lno_t & result_num_local_coords_,
8577 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8579 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8580 result_mj_coordinates_,
8581 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8582 int * &result_actual_owner_rank_);
8587 RCP<
const Comm<int> > &problemComm,
8588 const RCP<const coordinateModel_t> &coords) :
8591 mj_problemComm(problemComm),
8593 imbalance_tolerance(0),
8595 num_global_parts(1),
8598 num_local_coords(0),
8599 num_global_coords(0),
8600 num_weights_per_coord(0),
8601 num_first_level_parts(1),
8602 distribute_points_on_cut_lines(true),
8603 max_concurrent_part_calculation(1),
8604 check_migrate_avoid_migration_option(0),
8606 minimum_migration_imbalance(0.30),
8607 mj_keep_part_boxes(false),
8608 mj_run_as_rcb(false),
8609 mj_premigration_option(0),
8610 min_coord_per_rank_for_premigration(32000),
8624 const bool bUnsorted =
true;
8625 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8627 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning " 8628 "algorithm. As many as the dimension count.", mj_parts_Validator);
8630 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut " 8631 "coordinates will be calculated concurently.",
8634 pl.set(
"mj_minimum_migration_imbalance", 1.1,
8635 "mj_minimum_migration_imbalance, the minimum imbalance of the " 8636 "processors to avoid migration",
8639 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8640 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8641 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision " 8642 "depending on the imbalance, 1 for forcing migration, 2 for " 8643 "avoiding migration", mj_migration_option_validator);
8645 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8646 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8647 pl.set(
"mj_migration_type", 0,
8648 "Migration type, 0 for migration to minimize the imbalance " 8649 "1 for migration to minimize messages exchanged the migration.",
8650 mj_migration_option_validator);
8653 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the " 8657 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
8660 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be " 8663 RCP<Teuchos::EnhancedNumberValidator<int>>
8664 mj_num_teams_validator =
8665 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(
8666 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8667 pl.set(
"mj_num_teams", 0,
8668 "How many teams for the main kernel loop" 8669 , mj_num_teams_validator);
8671 RCP<Teuchos::EnhancedNumberValidator<int>>
8672 mj_premigration_option_validator =
8673 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8675 pl.set(
"mj_premigration_option", 0,
8676 "Whether to do premigration or not. 0 for no migration " 8677 "x > 0 for migration to consecutive processors, " 8678 "the subset will be 0,x,2x,3x,...subset ranks." 8679 , mj_premigration_option_validator);
8681 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to " 8682 "assign each rank in multijagged after premigration" 8695 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8699 mj_part_t pointAssign(
int dim, adapter_scalar_t *point)
const;
8701 void boxAssign(
int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8702 size_t &nPartsFound, mj_part_t **partsFound)
const;
8706 void getCommunicationGraph(
8708 ArrayRCP<mj_part_t> &comXAdj,
8709 ArrayRCP<mj_part_t> &comAdj);
8711 void set_up_partitioning_data(
8715 std::string timer_base_string;
8723 template<
class dst_t,
class src_t>
8724 typename std::enable_if<std::is_same<
typename dst_t::value_type,
8725 typename src_t::value_type>::value>::type
8726 assign_if_same(dst_t & dst,
const src_t & src) {
8729 template<
class dst_t,
class src_t>
8730 typename std::enable_if<!std::is_same<
typename dst_t::value_type,
8731 typename src_t::value_type>::value>::type
8732 assign_if_same(dst_t & dst,
const src_t & src) {
8737 template <
typename Adapter>
8738 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8740 int migration_selection_option,
8741 RCP<const Environment> mj_env_,
8742 RCP<
const Comm<int> > mj_problemComm_,
8744 mj_lno_t num_local_coords_,
8745 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8746 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8748 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8749 int num_weights_per_coord_,
8750 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8752 RCP<
const Comm<int> > & result_problemComm_,
8753 mj_lno_t &result_num_local_coords_,
8754 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8756 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8757 result_mj_coordinates_,
8758 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8759 int * &result_actual_owner_rank_)
8762 timer_base_string +
"PreMigration DistributorPlanCreating");
8764 int myRank = mj_problemComm_->getRank();
8765 int worldSize = mj_problemComm_->getSize();
8767 mj_part_t groupsize = worldSize / used_num_ranks;
8769 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8771 mj_part_t i_am_sending_to = 0;
8772 bool am_i_a_receiver =
false;
8774 for(
int i = 0; i < used_num_ranks; ++i) {
8775 group_begins[i+ 1] = group_begins[i] + groupsize;
8776 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8777 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8778 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8779 i_am_sending_to = group_begins[i];
8781 if(myRank == group_begins[i]) {
8782 am_i_a_receiver =
true;
8786 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8787 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8789 Tpetra::Distributor distributor(mj_problemComm_);
8791 std::vector<mj_part_t>
8792 coordinate_destinations(num_local_coords_, i_am_sending_to);
8794 ArrayView<const mj_part_t>
8795 destinations(&(coordinate_destinations[0]), num_local_coords_);
8796 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8797 result_num_local_coords_ = num_incoming_gnos;
8799 timer_base_string +
"PreMigration DistributorPlanCreating");
8802 timer_base_string +
"PreMigration DistributorMigration");
8808 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
8809 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_initial_mj_gnos(
8810 Kokkos::ViewAllocateWithoutInitializing(
"host_initial_mj_gnos"),
8811 initial_mj_gnos_.size());
8812 Kokkos::deep_copy(host_initial_mj_gnos, initial_mj_gnos_);
8813 ArrayView<const mj_gno_t> sent_gnos(host_initial_mj_gnos.data(),
8815 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
8816 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8817 Kokkos::ViewAllocateWithoutInitializing(
"result_initial_mj_gnos_"),
8819 auto host_result_initial_mj_gnos_ = Kokkos::create_mirror_view(
8820 result_initial_mj_gnos_);
8821 memcpy(host_result_initial_mj_gnos_.data(),
8822 received_gnos.getRawPtr(), num_incoming_gnos *
sizeof(mj_gno_t));
8823 Kokkos::deep_copy(result_initial_mj_gnos_, host_result_initial_mj_gnos_);
8828 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8829 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8830 num_incoming_gnos, this->coord_dim);
8831 auto host_dst_coordinates = Kokkos::create_mirror_view(
8833 auto host_src_coordinates =
8834 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->mj_coordinates);
8835 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8836 for(
int i = 0; i < this->coord_dim; ++i) {
8837 auto sub_host_src_coordinates
8838 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8839 auto sub_host_dst_coordinates
8840 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
8842 ArrayView<mj_scalar_t> sent_coord(
8843 sub_host_src_coordinates.data(), this->num_local_coords);
8844 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
8845 distributor.doPostsAndWaits<mj_scalar_t>(
8846 sent_coord, 1, received_coord());
8847 memcpy(sub_host_dst_coordinates.data(),
8848 received_coord.getRawPtr(), num_incoming_gnos *
sizeof(mj_scalar_t));
8850 deep_copy(dst_coordinates, host_dst_coordinates);
8851 result_mj_coordinates_ = dst_coordinates;
8854 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8855 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
8856 num_incoming_gnos, this->num_weights_per_coord);
8857 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8858 auto host_src_weights = Kokkos::create_mirror_view(this->mj_weights);
8859 Kokkos::deep_copy(host_src_weights, this->mj_weights);
8860 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
8861 auto sub_host_src_weights
8862 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8863 auto sub_host_dst_weights
8864 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8865 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
8873 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8874 sent_weight[n] = sub_host_src_weights(n);
8876 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
8877 distributor.doPostsAndWaits<mj_scalar_t>(
8878 sent_weight(), 1, received_weight());
8881 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8882 sub_host_dst_weights(n) = received_weight[n];
8885 Kokkos::deep_copy(dst_weights, host_dst_weights);
8886 result_mj_weights_ = dst_weights;
8890 std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
8891 ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
8892 ArrayRCP<int> received_owners(num_incoming_gnos);
8893 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
8894 result_actual_owner_rank_ =
new int[num_incoming_gnos];
8896 result_actual_owner_rank_,
8897 received_owners.getRawPtr(),
8898 num_incoming_gnos *
sizeof(int));
8902 timer_base_string +
"PreMigration DistributorMigration");
8903 return am_i_a_receiver;
8913 template <
typename Adapter>
8922 int execute_counter =
8924 timer_base_string =
"partition(" + std::to_string(execute_counter) +
") - ";
8926 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"all");
8928 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"setup");
8930 this->set_up_partitioning_data(solution);
8932 this->set_input_parameters(this->mj_env->getParameters());
8933 if(this->mj_keep_part_boxes) {
8934 this->mj_partitioner.set_to_keep_part_boxes();
8937 this->mj_partitioner.set_partitioning_parameters(
8938 this->distribute_points_on_cut_lines,
8939 this->max_concurrent_part_calculation,
8940 this->check_migrate_avoid_migration_option,
8941 this->minimum_migration_imbalance, this->migration_type);
8943 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8944 mj_lno_t result_num_local_coords = this->num_local_coords;
8945 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8947 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8948 result_mj_coordinates = this->mj_coordinates;
8949 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8951 int *result_actual_owner_rank = NULL;
8953 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8954 this->initial_mj_gnos;
8972 int current_world_size = this->mj_problemComm->getSize();
8973 mj_lno_t threshold_num_local_coords =
8974 this->min_coord_per_rank_for_premigration;
8975 bool is_pre_migrated =
false;
8976 bool am_i_in_subset =
true;
8982 if(mj_premigration_option > 0 &&
8983 size_t (current_world_size) > this->num_global_parts &&
8984 this->num_global_coords < mj_gno_t (
8985 current_world_size * threshold_num_local_coords))
8987 if(this->mj_keep_part_boxes) {
8988 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and " 8989 "mj_premigration_option are not supported together yet.");
8992 is_pre_migrated =
true;
8993 int migration_selection_option = mj_premigration_option;
8994 if(migration_selection_option * this->num_global_parts >
8995 (
size_t) (current_world_size)) {
8996 migration_selection_option =
8997 current_world_size / this->num_global_parts;
9000 int used_num_ranks = int (this->num_global_coords /
9001 float (threshold_num_local_coords) + 0.5);
9003 if(used_num_ranks == 0) {
9007 am_i_in_subset = this->mj_premigrate_to_subset(
9009 migration_selection_option,
9011 this->mj_problemComm,
9013 this->num_local_coords,
9014 this->num_global_coords,
9015 this->num_global_parts,
9016 this->initial_mj_gnos,
9017 this->mj_coordinates,
9018 this->num_weights_per_coord,
9022 result_num_local_coords,
9023 result_initial_mj_gnos,
9024 result_mj_coordinates,
9026 result_actual_owner_rank);
9028 result_initial_mj_gnos_ = result_initial_mj_gnos;
9031 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9032 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9034 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"setup");
9036 if(am_i_in_subset) {
9037 this->mj_partitioner.multi_jagged_part(
9040 this->imbalance_tolerance,
9042 this->num_global_parts,
9043 this->part_no_array,
9044 this->recursion_depth,
9046 result_num_local_coords,
9047 this->num_global_coords,
9048 result_initial_mj_gnos_,
9049 result_mj_coordinates,
9050 this->num_weights_per_coord,
9051 this->mj_uniform_weights,
9053 this->mj_uniform_parts,
9054 result_assigned_part_ids,
9059 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"cleanup");
9062 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9063 localGidToLid.reserve(result_num_local_coords);
9064 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9065 Kokkos::ViewAllocateWithoutInitializing(
"host_result_initial_mj_gnos"),
9066 result_initial_mj_gnos_.size());
9067 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9068 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9069 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9072 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
9073 0, result_num_local_coords,
true);
9074 auto host_result_assigned_part_ids =
9075 Kokkos::create_mirror_view(result_assigned_part_ids);
9076 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9077 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9078 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9079 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9080 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9081 partId[origLID] = host_result_assigned_part_ids(i);
9086 if(is_pre_migrated) {
9087 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
9088 "PostMigration DistributorPlanCreating");
9089 Tpetra::Distributor distributor(this->mj_problemComm);
9090 ArrayView<const mj_part_t> actual_owner_destinations(
9091 result_actual_owner_rank , result_num_local_coords);
9092 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9093 actual_owner_destinations);
9094 if(num_incoming_gnos != this->num_local_coords) {
9095 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - " 9096 "num incoming is not equal to num local coords");
9100 "PostMigration DistributorPlanCreating");
9102 "PostMigration DistributorMigration");
9103 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
9104 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
9106 ArrayView<const mj_gno_t> sent_gnos(host_result_initial_mj_gnos.data(),
9107 result_num_local_coords);
9108 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
9112 ArrayView<mj_part_t> sent_partnos(partId());
9113 distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1,
9114 received_partids());
9117 partId = arcp(
new mj_part_t[this->num_local_coords],
9118 0, this->num_local_coords,
true);
9121 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9122 localGidToLid2.reserve(this->num_local_coords);
9123 auto host_initial_mj_gnos =
9124 Kokkos::create_mirror_view(this->initial_mj_gnos);
9125 Kokkos::deep_copy(host_initial_mj_gnos,
9126 this->initial_mj_gnos);
9127 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9128 localGidToLid2[host_initial_mj_gnos(i)] = i;
9131 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9132 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9133 partId[origLID] = received_partids[i];
9138 delete [] result_actual_owner_rank;
9141 timer_base_string +
"PostMigration DistributorMigration");
9143 solution->setParts(partId);
9144 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"cleanup");
9147 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"all");
9152 template <
typename Adapter>
9157 this->coord_dim = this->mj_coords->getCoordinateDim();
9158 this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
9159 this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
9160 this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
9161 int criteria_dim = (this->num_weights_per_coord ?
9162 this->num_weights_per_coord : 1);
9166 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9169 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9170 "uniform parts", criteria_dim);
9171 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9172 "uniform weights", criteria_dim);
9174 Kokkos::View<const mj_gno_t *, device_t> gnos;
9175 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9177 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9178 this->mj_coords->getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
9180 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9181 Kokkos::View<mj_scalar_t **, device_t> wgts;
9185 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9188 assign_if_same(xyz, xyz_adapter);
9189 assign_if_same(wgts, wgts_adapter);
9194 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9195 (Kokkos::ViewAllocateWithoutInitializing(
9196 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9197 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9198 Kokkos::ViewAllocateWithoutInitializing(
"wgts"),
9199 wgts_adapter.extent(0), wgts_adapter.extent(1));
9201 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9202 Kokkos::parallel_for(
9203 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9204 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (
int i) {
9205 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9206 xyz(i, n) =
static_cast<mj_scalar_t
>(xyz_adapter(i, n));
9209 Kokkos::parallel_for(
9210 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9211 (0, wgts.extent(0)), KOKKOS_LAMBDA (
int i) {
9212 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9213 wgts(i, n) =
static_cast<mj_scalar_t
>(wgts_adapter(i, n));
9219 this->initial_mj_gnos = gnos;
9221 this->mj_coordinates = xyz;
9224 if(this->num_weights_per_coord == 0) {
9225 this->mj_uniform_weights(0) =
true;
9226 Kokkos::resize(this->mj_weights, 0, 0);
9229 this->mj_weights = wgts;
9230 for(
int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9231 this->mj_uniform_weights(wdim) =
false;
9235 for(
int wdim = 0; wdim < criteria_dim; ++wdim) {
9236 if(solution->criteriaHasUniformPartSizes(wdim)) {
9237 this->mj_uniform_parts(wdim) =
true;
9240 printf(
"Error: MJ does not support non uniform target part weights\n");
9249 template <
typename Adapter>
9251 const Teuchos::ParameterList &pl)
9253 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
9256 tol = pe->getValue(&tol);
9257 this->imbalance_tolerance = tol - 1.0;
9261 if(this->imbalance_tolerance <= 0) {
9262 this->imbalance_tolerance= 10e-4;
9266 Kokkos::resize(this->part_no_array, 0);
9269 this->recursion_depth = 0;
9271 if(pl.getPtr<
int>(
"mj_num_teams")) {
9272 this->num_teams = pl.get<
int>(
"mj_num_teams");
9275 if(pl.getPtr<Array <mj_part_t> >(
"mj_parts")) {
9276 auto mj_parts = pl.get<Array <mj_part_t> >(
"mj_parts");
9277 int mj_parts_size =
static_cast<int>(mj_parts.size());
9280 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9281 "part_no_array", mj_parts_size);
9282 for(
int i = 0; i < mj_parts_size; ++i) {
9283 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9286 this->recursion_depth = mj_parts_size - 1;
9287 this->mj_env->debug(2,
"mj_parts provided by user");
9291 this->distribute_points_on_cut_lines =
true;
9292 this->max_concurrent_part_calculation = 1;
9294 this->mj_run_as_rcb =
false;
9295 this->mj_premigration_option = 0;
9296 this->min_coord_per_rank_for_premigration = 32000;
9298 int mj_user_recursion_depth = -1;
9299 this->mj_keep_part_boxes =
false;
9300 this->check_migrate_avoid_migration_option = 0;
9301 this->migration_type = 0;
9302 this->minimum_migration_imbalance = 0.35;
9304 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
9307 imb = pe->getValue(&imb);
9308 this->minimum_migration_imbalance = imb - 1.0;
9311 pe = pl.getEntryPtr(
"mj_migration_option");
9313 this->check_migrate_avoid_migration_option =
9314 pe->getValue(&this->check_migrate_avoid_migration_option);
9316 this->check_migrate_avoid_migration_option = 0;
9318 if(this->check_migrate_avoid_migration_option > 1) {
9319 this->check_migrate_avoid_migration_option = -1;
9323 pe = pl.getEntryPtr(
"mj_migration_type");
9325 this->migration_type = pe->getValue(&this->migration_type);
9327 this->migration_type = 0;
9333 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
9335 this->max_concurrent_part_calculation =
9336 pe->getValue(&this->max_concurrent_part_calculation);
9338 this->max_concurrent_part_calculation = 1;
9341 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
9343 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9345 this->mj_keep_part_boxes =
false;
9356 if(this->mj_keep_part_boxes ==
false) {
9357 pe = pl.getEntryPtr(
"mapping_type");
9359 int mapping_type = -1;
9360 mapping_type = pe->getValue(&mapping_type);
9361 if(mapping_type == 0) {
9362 mj_keep_part_boxes =
true;
9368 pe = pl.getEntryPtr(
"mj_enable_rcb");
9370 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9372 this->mj_run_as_rcb =
false;
9375 pe = pl.getEntryPtr(
"mj_premigration_option");
9377 mj_premigration_option = pe->getValue(&mj_premigration_option);
9379 mj_premigration_option = 0;
9382 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
9384 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9386 min_coord_per_rank_for_premigration = 32000;
9389 pe = pl.getEntryPtr(
"mj_recursion_depth");
9391 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9393 mj_user_recursion_depth = -1;
9397 pe = pl.getEntryPtr(
"rectilinear");
9399 val = pe->getValue(&val);
9402 this->distribute_points_on_cut_lines =
false;
9404 this->distribute_points_on_cut_lines =
true;
9407 if(this->mj_run_as_rcb) {
9408 mj_user_recursion_depth =
9409 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9411 if(this->recursion_depth < 1) {
9412 if(mj_user_recursion_depth > 0) {
9413 this->recursion_depth = mj_user_recursion_depth;
9416 this->recursion_depth = this->coord_dim;
9422 template <
typename Adapter>
9425 adapter_scalar_t *lower,
9426 adapter_scalar_t *upper,
9427 size_t &nPartsFound,
9437 if(this->mj_keep_part_boxes) {
9440 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9442 size_t nBoxes = (*partBoxes).size();
9444 throw std::logic_error(
"no part boxes exist");
9448 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9450 if(globalBox->boxesOverlap(dim, lower, upper)) {
9452 std::vector<typename Adapter::part_t> partlist;
9455 for(
size_t i = 0; i < nBoxes; i++) {
9457 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9459 partlist.push_back((*partBoxes)[i].getpId());
9481 *partsFound =
new mj_part_t[nPartsFound];
9482 for(
size_t i = 0; i < nPartsFound; i++)
9483 (*partsFound)[i] = partlist[i];
9495 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
9500 template <
typename Adapter>
9503 adapter_scalar_t *point)
const 9509 if(this->mj_keep_part_boxes) {
9513 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9515 size_t nBoxes = (*partBoxes).size();
9517 throw std::logic_error(
"no part boxes exist");
9521 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9523 if(globalBox->pointInBox(dim, point)) {
9527 for(i = 0; i < nBoxes; i++) {
9529 if((*partBoxes)[i].pointInBox(dim, point)) {
9530 foundPart = (*partBoxes)[i].getpId();
9544 std::ostringstream oss;
9546 for(
int j = 0; j < dim; j++) oss << point[j] <<
" ";
9547 oss <<
") not found in domain";
9548 throw std::logic_error(oss.str());
9558 size_t closestBox = 0;
9559 coord_t minDistance = std::numeric_limits<coord_t>::max();
9560 coord_t *centroid =
new coord_t[dim];
9561 for(
size_t i = 0; i < nBoxes; i++) {
9562 (*partBoxes)[i].computeCentroid(centroid);
9565 for(
int j = 0; j < dim; j++) {
9566 diff = centroid[j] - point[j];
9569 if(sum < minDistance) {
9574 foundPart = (*partBoxes)[closestBox].getpId();
9581 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
9585 template <
typename Adapter>
9591 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9592 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9593 mj_part_t ntasks = (*pBoxes).size();
9594 int dim = (*pBoxes)[0].getDim();
9595 GridHash grid(pBoxes, ntasks, dim);
9602 template <
typename Adapter>
9603 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9606 return this->mj_partitioner.get_kept_boxes();
Zoltan2_MJArrayType< scalar_t > value_type
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
Kokkos::View< index_t *, device_t > part_xadj
GridHash Class, Hashing Class for part boxes.
Created by mbenlioglu on Aug 31, 2020.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
Time an algorithm (or other entity) as a whole.
int value_count_rightleft
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
Kokkos::View< index_t *, device_t > track_on_cuts
static int get_counter_AlgMJ()
Defines Parameter related enumerators, declares functions.
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Kokkos::View< scalar_t *, device_t > coordinates
Sort items for quick sort function.
KOKKOS_INLINE_FUNCTION value_type & reference() const
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals...
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
Kokkos::View< index_t *, device_t > permutations
map_t::global_ordinal_type gno_t
KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, const volatile value_type src) const
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
Kokkos::View< scalar_t **, device_t > weights
ArrayCombinationReducer reducer
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
size_t team_shmem_size(int team_size) const
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
A ParameterList validator for integer range lists.
static int get_counter_Zoltan2_AlgMJ()
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
SparseMatrixAdapter_t::part_t part_t
Multi Jagged coordinate partitioning algorithm.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
policy_t::member_type member_type
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
Kokkos::View< index_t *, device_t > permutations
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
Kokkos::View< part_t *, device_t > parts
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
size_t team_shmem_size(int team_size) const
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
KOKKOS_INLINE_FUNCTION value_type & reference() const
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, const volatile value_type src) const
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
part_t concurrent_current_part
Kokkos::View< part_t *, device_t > parts
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Define IntegerRangeList validator.
Defines the CoordinateModel classes.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
part_t concurrent_current_part
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
Tpetra::global_size_t global_size_t
Zoltan2_MJArrayType< scalar_t > value_type
part_t current_concurrent_num_parts
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
int value_count_rightleft
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Kokkos::View< scalar_t *, device_t > coordinates
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Kokkos::View< index_t *, device_t > part_xadj
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals. DOCWORK: Document input params.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Zoltan2_MJArrayType< scalar_t > & operator=(const volatile Zoltan2_MJArrayType< scalar_t > &zmj)
Multi Jagged coordinate partitioning algorithm.
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...