Zoltan2
Zoltan2_AlgMultiJagged.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
51 
54 #include <Zoltan2_Parameters.hpp>
55 #include <Zoltan2_Algorithm.hpp>
58 #include <Zoltan2_Util.hpp>
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_StandardParameterEntryValidators.hpp>
61 #include <Teuchos_ParameterList.hpp>
62 #include <Kokkos_Sort.hpp>
63 
64 #include <algorithm> // std::sort
65 #include <vector>
66 #include <unordered_map>
67 
68 #ifdef ZOLTAN2_USEZOLTANCOMM
69 #ifdef HAVE_ZOLTAN2_MPI
70 #define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
71 #include "zoltan_comm_cpp.h"
72 #include "zoltan_types.h" // for error codes
73 #endif
74 #endif
75 
76 namespace Teuchos{
77 
81 template <typename Ordinal, typename T>
82 class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
83 {
84 private:
85  Ordinal size;
86  T epsilon;
87 
88 public:
91  Zoltan2_BoxBoundaries() : size(0),
92  epsilon(std::numeric_limits<T>::epsilon()) {}
93 
97  Zoltan2_BoxBoundaries(Ordinal s_):
98  size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}
99 
105  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const {
106  for(Ordinal i = 0; i < count; i++) {
107  if(Z2_ABS(inBuffer[i]) > epsilon) {
108  inoutBuffer[i] = inBuffer[i];
109  }
110  }
111  }
112 };
113 
114 } // namespace Teuchos
115 
116 namespace Zoltan2{
117 
/*! \brief Item for lexicographic (multi-criteria) sorting.
 *
 *  Holds a non-owning pointer to \c count comparison values; two values
 *  closer than \c epsilon are treated as equal and the comparison moves
 *  to the next value. A full tie is broken by \c index, making the sort
 *  stable with respect to the original position.
 *
 *  NOTE(review): the class name and constructor/destructor names were
 *  stripped by documentation extraction; restored here from the
 *  surrounding template and usage.
 */
template <typename IT, typename CT, typename WT>
class uMultiSortItem
{
public:
  // TODO: Why volatile?
  // no idea, another intel compiler failure.
  volatile IT index;   // original position of the item (tie-breaker)
  volatile CT count;   // number of comparison values in val
  volatile WT *val;    // comparison values, not owned by this item
  volatile WT epsilon; // tolerance for treating two values as equal

  /*! \brief Default constructor: empty item with no values. */
  uMultiSortItem() {
    this->index = 0;
    this->count = 0;
    this->val = NULL;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  /*! \brief Constructor.
   *  \param index_ original position of the item.
   *  \param count_ number of values pointed to by vals_.
   *  \param vals_  comparison values (not copied, not owned).
   */
  uMultiSortItem(IT index_, CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  ~uMultiSortItem() {
  }

  /*! \brief Reassign the item without touching epsilon. */
  void set(IT index_, CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
  }

  /*! \brief Lexicographic less-than with epsilon tolerance; both items
   *  must have the same count. Ties on all values fall back to index.
   */
  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const {
    assert(this->count == other.count);
    for(CT i = 0; i < this->count; ++i) {
      // if the values are equal go to next one.
      if(std::abs(this->val[i] - other.val[i]) < this->epsilon) {
        continue;
      }
      // if next value is smaller return true;
      if(this->val[i] < other.val[i]) {
        return true;
      }
      // if next value is bigger return false;
      else {
        return false;
      }
    }
    // if they are totally equal.
    return this->index < other.index;
  }
};
179 
/*! \brief Simple (id, value) pair used as a sort record. */
template <class IT, class WT>
struct uSortItem
{
  IT id;  // identifier carried alongside the sort key
  WT val; // sort key
};

/*! \brief In-place ascending sort of arr by val.
 *
 *  Iterative quicksort with median-of-three pivoting and an explicit
 *  stack (Numerical-Recipes style); subarrays smaller than M are
 *  finished with insertion sort.
 *
 *  Fix: restores the local pivot/temporary declaration
 *  (uSortItem<IT,WT> a) that was dropped by documentation extraction;
 *  'a' is used throughout the routine.
 *
 *  \param n   number of elements in arr.
 *  \param arr array of n items, sorted ascending by val on return.
 */
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr) {
  const int NSTACK = 50; // capacity of the explicit recursion stack
  const int M = 7;       // threshold below which insertion sort is used
  IT i, ir = n, j, k, l = 1;
  IT jstack = 0, istack[50];
  uSortItem<IT, WT> a;   // temporary for the inserted element / pivot
  WT aval;

  --arr; // shift so indexing is 1-based, as the algorithm expects
  for(;;) {
    if(ir - l < M) {
      // small subarray: straight insertion sort
      for(j = l + 1; j <= ir; j++) {
        a = arr[j];
        aval = a.val;
        for(i = j - 1; i >= 1; i--) {
          if(arr[i].val <= aval)
            break;
          arr[i+1] = arr[i];
        }
        arr[i+1] = a;
      }
      if(jstack == 0)
        break;
      // pop the next pending subarray
      ir = istack[jstack--];
      l = istack[jstack--];
    }
    else {
      // median-of-three: order arr[l], arr[l+1], arr[ir]
      k = (l + ir) >> 1;
      std::swap(arr[k], arr[l+1]);
      if(arr[l+1].val > arr[ir].val) {
        std::swap(arr[l+1], arr[ir]);
      }
      if(arr[l].val > arr[ir].val) {
        std::swap(arr[l], arr[ir]);
      }
      if(arr[l+1].val > arr[l].val) {
        std::swap(arr[l+1], arr[l]);
      }
      i = l + 1;
      j = ir;
      a = arr[l];
      aval = a.val;
      // partition around the pivot value aval
      for(;;) {
        do i++; while (arr[i].val < aval);
        do j--; while (arr[j].val > aval);
        if(j < i) break;
        std::swap(arr[i], arr[j]);
      }
      arr[l] = arr[j];
      arr[j] = a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      // push the larger half, iterate on the smaller half
      if(ir - i + 1 >= j - l) {
        istack[jstack] = ir;
        istack[jstack-1] = i;
        ir = j - 1;
      }
      else {
        istack[jstack] = j - 1;
        istack[jstack-1] = l;
        l = i;
      }
    }
  }
}
262 
/*! \brief Sort item carrying an explicit sign bit.
 *
 *  signbit == 1 encodes a positive value, signbit == 0 a negative value
 *  whose magnitude is val. The ordering therefore places negatives
 *  first (larger magnitude earlier) and positives after (smaller
 *  magnitude earlier), i.e. normal ascending signed order.
 *
 *  NOTE(review): the struct name was stripped by documentation
 *  extraction; restored from the uqSignsort usage below.
 */
template <class IT, class WT, class SIGN>
struct uSignedSortItem
{
  IT id;
  WT val;
  SIGN signbit; // 1 means positive, 0 means negative.

  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    /*if I am negative, the other is positive*/
    if(this->signbit < rhs.signbit) {
      return true;
    }
    /*if both have the same sign*/
    else if(this->signbit == rhs.signbit) {
      if(this->val < rhs.val) {//if my value is smaller,
        return this->signbit;//then if we both are positive return true.
        //if we both are negative, return false.
      }
      else if(this->val > rhs.val) {//if my value is larger,
        return !this->signbit; //then if we both are positive return false.
        //if we both are negative, return true.
      }
      else { //if both are equal.
        return false;
      }
    }
    else {
      /*if I am positive, the other is negative*/
      return false;
    }
  }

  // Made const for const-correctness; callers are unaffected.
  bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    return (this->val == rhs.val && this->signbit == rhs.signbit) || (*this < rhs);
  }
};

/*! \brief In-place ascending sort of signed sort items.
 *
 *  Same iterative median-of-three quicksort as uqsort, but comparing
 *  whole items via uSignedSortItem::operator< / operator<=.
 *
 *  Fix: restores the function signature and the local temporary
 *  declaration (uSignedSortItem<IT,WT,SIGN> a) dropped by
 *  documentation extraction.
 *
 *  \param n   number of elements in arr.
 *  \param arr array of n items, sorted ascending on return.
 */
template <class IT, class WT, class SIGN>
void uqSignsort(IT n, uSignedSortItem<IT, WT, SIGN> * arr) {
  const IT NSTACK = 50; // capacity of the explicit recursion stack
  const IT M = 7;       // threshold below which insertion sort is used
  IT i, ir = n, j, k, l = 1;
  IT jstack = 0, istack[50];
  uSignedSortItem<IT, WT, SIGN> a; // temporary for insertion / pivot

  --arr; // shift so indexing is 1-based, as the algorithm expects
  for(;;) {
    if(ir < M + l) {
      // small subarray: straight insertion sort
      for(j = l + 1; j <= ir; j++) {
        a = arr[j];
        for(i = j - 1; i >= 1; i--) {
          if(arr[i] <= a) {
            break;
          }
          arr[i+1] = arr[i];
        }
        arr[i+1] = a;
      }
      if(jstack == 0) {
        break;
      }
      // pop the next pending subarray
      ir = istack[jstack--];
      l = istack[jstack--];
    }
    else {
      // median-of-three: order arr[l], arr[l+1], arr[ir]
      k = (l + ir) >> 1;
      std::swap(arr[k], arr[l+1]);
      if(arr[ir] < arr[l+1]) {
        std::swap(arr[l+1], arr[ir]);
      }
      if(arr[ir] < arr[l]) {
        std::swap(arr[l], arr[ir]);
      }
      if(arr[l] < arr[l+1]) {
        std::swap(arr[l+1], arr[l]);
      }
      i = l + 1;
      j = ir;
      a = arr[l];
      // partition around the pivot item a
      for(;;) {
        do i++; while (arr[i] < a);
        do j--; while (a < arr[j]);
        if(j < i) break;
        std::swap(arr[i], arr[j]);
      }
      arr[l] = arr[j];
      arr[j] = a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      // push the larger half, iterate on the smaller half
      if(ir + l + 1 >= j + i) {
        istack[jstack] = ir;
        istack[jstack-1] = i;
        ir = j - 1;
      }
      else {
        istack[jstack] = j - 1;
        istack[jstack-1] = l;
        l = i;
      }
    }
  }
}
370 
// This exists only so we can track how many times the MJ algorithm is
// called and put each of those into different timer names.
// Currently the MultiJaggedTest.cpp will actually call it twice.
// First time with data from a Tpetra MultiVector and then a second time using
// a BasicVectorAdapter which allows us to turn UVM off for some tests. The
// results of the two runs are compared which helps to catch a lot of bugs. For
// profiling I'm mostly just interested in the UVM off case and need it to be
// in separate timers. Passing a value through would mess up the API. Possibly
// we could check the Adapter and use that. The statics have to be outside the
// templated class as the two called instances will be different template
// parameters. Another complication is that MultiJagged.cpp will call through
// the Zoltan2_AlgMJ class and we want to time things in both classes. However
// TaskMapper will directly call AlgMJ so I made two counters for the two
// classes to make sure it was always correct. This does not impact any
// behavior and has the sole purpose of generating unique timer names. If you
// run an MJ test you'll see MJ(0) and MJ(1) in the names to distinguish the
// 1st and 2nd run. Right now only MultijaggedTest.cpp cares about this.
//
// NOTE(review): the struct opener and the second method's signature were
// stripped by documentation extraction; restored here (name taken from
// the comment's described purpose and Zoltan2 usage -- verify against
// the original header).
struct Zoltan2_AlgMJ_TrackCallsCounter {
  /*! \brief Returns 0, 1, 2, ... on successive calls; one counter for
   *  direct AlgMJ invocations (e.g. from TaskMapper).
   */
  static int get_counter_AlgMJ() {
    static int counter = 0;
    return counter++;
  }
  /*! \brief Returns 0, 1, 2, ... on successive calls; separate counter
   *  for invocations routed through the Zoltan2_AlgMJ wrapper class.
   */
  static int get_counter_Zoltan2_AlgMJ() {
    static int counter = 0;
    return counter++;
  }
};
398 
401 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
402  typename mj_part_t, typename mj_node_t>
403 class AlgMJ
404 {
405 private:
406  typedef typename mj_node_t::device_type device_t; // for views
408  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
409 
410  //if the (last dimension reduce all count) x the mpi world size
411  //estimated to be bigger than this number then migration will be forced
412  //in earlier iterations.
413  static constexpr size_t future_reduceall_cutoff = 1500000;
414 
415  //if parts right before last dimension are estimated to have less than
416  //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
417  static constexpr mj_lno_t min_work_last_dim = 1000;
418 
419  static constexpr mj_scalar_t least_signifiance = 0.0001;
420  static constexpr int significance_mul = 1000;
421 
422  std::string mj_timer_base_string; // for convenience making timer names
423 
424  RCP<const Environment> mj_env; // the environment object
425  RCP<const Comm<int> > mj_problemComm; // initial comm object
426  RCP<Comm<int> > comm; // comm object than can be altered during execution
427  double imbalance_tolerance; // input imbalance tolerance.
428  int recursion_depth; // number of steps that partitioning will be solved in.
429  int coord_dim; // coordinate dim
430  int num_weights_per_coord; // # of weights per coord
431  size_t initial_num_loc_coords; // initial num local coords.
432  global_size_t initial_num_glob_coords; // initial num global coords.
433  mj_lno_t num_local_coords; // number of local coords.
434  mj_gno_t num_global_coords; // number of global coords.
435  mj_scalar_t sEpsilon; // epsilon for mj_scalar_t
436 
437  // can distribute points on same coordinant to different parts.
438  bool distribute_points_on_cut_lines;
439 
440  // how many parts we can calculate concurrently.
441  mj_part_t max_concurrent_part_calculation;
442 
443  bool mj_run_as_rcb; // means recursion depth is adjusted to maximum value.
444  int mj_user_recursion_depth; // the recursion depth value provided by user.
445  bool mj_keep_part_boxes; // if the boxes need to be kept.
446 
447  // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
448  int check_migrate_avoid_migration_option;
449 
450  // when doing the migration, 0 will aim for perfect load-imbalance, 1 - will
451  // aim for minimized number of messages with possibly bad load-imbalance
452  int migration_type;
453 
454  // when MJ decides whether to migrate, the minimum imbalance for migration.
455  double minimum_migration_imbalance;
456 
457  // Nonuniform first level partitioning
458  // (Currently available only for sequential_task_partitioning):
459  // Used for Dragonfly task mapping by partitioning Dragonfly RCA
460  // machine coordinates and application coordinates.
461  // An optimization that completely partitions the most important machine dimension
462  // first (i.e. the Dragonfly group coordinate, or RCA's x coordinate). The standard
463  // MJ alg follows after the nonuniform first level partitioning.
464  //
465  // Ex. (first level partitioning): If we have 120 elements,
466  // num_first_level_parts = 3, first_level_distribution = [4, 10, 6], then
467  // part sizes after first level will be [24, 60, 36]. Standard uniform MJ
468  // continues for all subsequent levels.
469 
470  // If used, number of parts requested for a nonuniform
471  // first level partitioning
472  mj_part_t num_first_level_parts;
473 
474  // If used, the requested distribution of parts for the
475  // nonuniform first level partitioning
476  Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
477 
478  mj_part_t total_num_cut ; // how many cuts will be totally
479  mj_part_t total_num_part; // how many parts will be totally
480 
481  mj_part_t max_num_part_along_dim ; // maximum part count along a dimension.
482  mj_part_t max_num_cut_along_dim; // maximum cut count along a dimension.
483 
484  // maximum part+cut count along a dimension.
485  size_t max_num_total_part_along_dim;
486 
487  mj_part_t total_dim_num_reduce_all; // estimate on #reduceAlls can be done.
488 
489  // max no of parts that might occur during the partition before the last
490  // partitioning dimension.
491  mj_part_t last_dim_num_part;
492 
493  // input part array specifying num part to divide along each dim.
494  Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
495 
496  // two dimension coordinate array
497  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
498  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
499  mj_coordinates;
500 
501  // two dimension weight array
502  Kokkos::View<mj_scalar_t **, device_t> mj_weights;
503 
504  // if the target parts are uniform
505  Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
506 
507  // if the coordinates have uniform weights
508  Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
509 
510  int mj_num_teams; // the number of teams
511 
512  size_t num_global_parts; // the targeted number of parts
513 
514  // vector of all boxes for all parts, constructed if mj_keep_part_boxes true
515  RCP<mj_partBoxVector_t> kept_boxes;
516 
517  RCP<mj_partBox_t> global_box;
518 
519  int myRank; // processor rank
520  int myActualRank; // initial rank
521 
522  bool divide_to_prime_first;
523 
524  // initial global ids of the coordinates.
525  Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
526 
527  // current global ids of the coordinates, might change during migration.
528  Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
529 
530  // the actual processor owner of the coordinate, to track after migrations.
531  Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
532 
533  // permutation of coordinates, for partitioning.
534  Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
535 
536  // permutation work array.
537  Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
538 
539  // the part ids assigned to coordinates.
540  Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
541 
542  // beginning and end of each part.
543  Kokkos::View<mj_lno_t *, device_t> part_xadj;
544 
545  // work array for beginning and end of each part.
546  Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
547 
548  Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
549 
550  // how much weight should a MPI put left side of the each cutline
551  Kokkos::View<mj_scalar_t *, device_t>
552  process_cut_line_weight_to_put_left;
553 
554  // weight percentage each thread in MPI puts left side of the each outline
555  Kokkos::View<mj_scalar_t *, device_t>
556  thread_cut_line_weight_to_put_left;
557 
558  // work array to manipulate coordinate of cutlines in different iterations.
559  // necessary because previous cut line information is used for determining
560  // the next cutline information. therefore, cannot update the cut work array
561  // until all cutlines are determined.
562  Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
563 
564  // Used for swapping above cut_coordinates_work_array
565  Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
566 
567  // cumulative part weight array.
568  Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
569 
570  // upper bound coordinate of a cut line
571  Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
572 
573  // lower bound coordinate of a cut line
574  Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
575 
576  // lower bound weight of a cut line
577  Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
578 
579  // upper bound weight of a cut line
580  Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
581 
582  // combined array to exchange the min and max coordinate, and total
583  // weight of part.
584  Kokkos::View<mj_scalar_t *, device_t>
585  process_local_min_max_coord_total_weight;
586 
587  // global combined array with the results for min, max and total weight.
588  Kokkos::View<mj_scalar_t *, device_t>
589  global_min_max_coord_total_weight;
590 
591  // isDone is used to determine if a cutline is determined already. If a cut
592  // line is already determined, the next iterations will skip this cut line.
593  Kokkos::View<bool *, device_t> is_cut_line_determined;
594 
595  // incomplete_cut_count count holds the number of cutlines that have not
596  // been finalized for each part when concurrentPartCount>1, using this
597  // information, if incomplete_cut_count[x]==0, then no work is done
598  // for this part.
599  Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600  typename decltype(device_incomplete_cut_count)::HostMirror
601  incomplete_cut_count;
602 
603  // Need a quick accessor for this on host
604  typename decltype (part_xadj)::HostMirror host_part_xadj;
605 
606  // local part weights of each thread.
607  Kokkos::View<double *, device_t>
608  thread_part_weights;
609 
610  // the work manupulation array for partweights.
611  Kokkos::View<double *, device_t>
612  thread_part_weight_work;
613 
614  // thread_cut_left_closest_point to hold the closest coordinate
615  // to a cutline from left (for each thread).
616  Kokkos::View<mj_scalar_t *, device_t>
617  thread_cut_left_closest_point;
618 
619  // thread_cut_right_closest_point to hold the closest coordinate
620  // to a cutline from right (for each thread)
621  Kokkos::View<mj_scalar_t *, device_t>
622  thread_cut_right_closest_point;
623 
624  // to store how many points in each part a thread has.
625  Kokkos::View<mj_lno_t *, device_t>
626  thread_point_counts;
627 
628  Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629  Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
630 
631  // for faster communication, concatanation of
632  // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
633  // leftClosest distances sized P-1, since P-1 cut lines
634  // rightClosest distances size P-1, since P-1 cut lines.
635  Kokkos::View<mj_scalar_t *, device_t>
636  total_part_weight_left_right_closests;
637  Kokkos::View<mj_scalar_t *, device_t>
638  global_total_part_weight_left_right_closests;
639 
640  Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641  typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642  host_num_partitioning_in_current_dim; // for quick access on host
643 
644  /* \brief helper functio to calculate imbalance.
645  * \param achieved balance we achieved.
646  * \param expected balance expected.
647  */
648  static
649  KOKKOS_INLINE_FUNCTION
650  double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651  return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
652  }
653 
654  /* \brief Either the mj array (part_no_array) or num_global_parts should be
655  * provided in the input. part_no_array takes precedence if both are
656  * provided. Depending on these parameters, total cut/part number, maximum
657  * part/cut number along a dimension, estimated number of reduceAlls,
658  * and the number of parts before the last dimension is calculated.
659  * */
660  void set_part_specifications();
661 
662  /* \brief Tries to determine the part number for current dimension,
663  * by trying to make the partitioning as square as possible.
664  * \param num_total_future how many more partitionings are required.
665  * \param root how many more recursion depth is left.
666  */
667  inline mj_part_t get_part_count(
668  mj_part_t num_total_future,
669  double root);
670 
671  /* \brief for part communication we keep track of the box boundaries.
672  * This is performed when either asked specifically, or when geometric
673  * mapping is performed afterwards. This function initializes a single box
674  * with all global min and max coordinates.
675  * \param initial_partitioning_boxes the input and output vector for boxes.
676  */
677  void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
678 
679  /* \brief Function returns how many parts that will be obtained after this
680  * dimension partitioning. It sets how many parts each current part will be
681  * partitioned into in this dimension to device_num_partitioning_in_current_dim
682  * vector, sets how many total future parts each obtained part will be
683  * partitioned into in next_future_num_parts_in_parts vector, If part boxes
684  * are kept, then sets initializes the output_part_boxes as its ancestor.
685  * \param future_num_part_in_parts: input, how many future parts each
686  * current part will be partitioned into.
687  * \param next_future_num_parts_in_parts: output, how many future parts
688  * each obtained part will be partitioned into.
689  * \param future_num_parts: output, max number of future parts that will be
690  * obtained from a single
691  * \param current_num_parts: input, how many parts are there currently.
692  * \param current_iteration: input, current dimension iteration number.
693  * \param input_part_boxes: input, if boxes are kept, current boxes.
694  * \param output_part_boxes: output, if boxes are kept, the initial box
695  * boundaries for obtained parts.
696  * \param atomic_part_count // DOCWORK: Documentation
697  */
698  mj_part_t update_part_num_arrays(
699  std::vector<mj_part_t> *future_num_part_in_parts,
700  std::vector<mj_part_t> *next_future_num_parts_in_parts,
701  mj_part_t &future_num_parts,
702  mj_part_t current_num_parts,
703  int current_iteration,
704  RCP<mj_partBoxVector_t> input_part_boxes,
705  RCP<mj_partBoxVector_t> output_part_boxes,
706  mj_part_t atomic_part_count);
707 
719  static
720  KOKKOS_INLINE_FUNCTION
721  void mj_calculate_new_cut_position (
722  mj_scalar_t cut_upper_bound,
723  mj_scalar_t cut_lower_bound,
724  mj_scalar_t cut_upper_weight,
725  mj_scalar_t cut_lower_weight,
726  mj_scalar_t expected_weight,
727  mj_scalar_t &new_cut_position,
728  mj_scalar_t sEpsilon);
729 
754  bool mj_perform_migration(
755  mj_part_t in_num_parts, //current number of parts
756  mj_part_t &out_num_parts, //output number of parts.
757  std::vector<mj_part_t> *next_future_num_parts_in_parts,
758  mj_part_t &output_part_begin_index,
759  size_t migration_reduce_all_population,
760  mj_lno_t num_coords_for_last_dim_part,
761  std::string iteration,
762  RCP<mj_partBoxVector_t> &input_part_boxes,
763  RCP<mj_partBoxVector_t> &output_part_boxes);
764 
782  bool mj_check_to_migrate(
783  size_t migration_reduce_all_population,
784  mj_lno_t num_coords_for_last_dim_part,
785  mj_part_t num_procs,
786  mj_part_t num_parts,
787  mj_gno_t *num_points_in_all_processor_parts);
788 
813  void mj_migration_part_proc_assignment(
814  mj_gno_t * num_points_in_all_processor_parts,
815  mj_part_t num_parts,
816  mj_part_t num_procs,
817  mj_lno_t *send_count_to_each_proc,
818  std::vector<mj_part_t> &processor_ranks_for_subcomm,
819  std::vector<mj_part_t> *next_future_num_parts_in_parts,
820  mj_part_t &out_num_part,
821  std::vector<mj_part_t> &out_part_indices,
822  mj_part_t &output_part_numbering_begin_index,
823  int *coordinate_destinations);
824 
850  void mj_assign_proc_to_parts(
851  mj_gno_t * num_points_in_all_processor_parts,
852  mj_part_t num_parts,
853  mj_part_t num_procs,
854  mj_lno_t *send_count_to_each_proc,
855  std::vector<mj_part_t> &processor_ranks_for_subcomm,
856  std::vector<mj_part_t> *next_future_num_parts_in_parts,
857  mj_part_t &out_part_index,
858  mj_part_t &output_part_numbering_begin_index,
859  int *coordinate_destinations);
860 
876  void assign_send_destinations(
877  mj_part_t num_parts,
878  mj_part_t *part_assignment_proc_begin_indices,
879  mj_part_t *processor_chains_in_parts,
880  mj_lno_t *send_count_to_each_proc,
881  int *coordinate_destinations);
882 
897  void assign_send_destinations2(
898  mj_part_t num_parts,
899  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
900  int *coordinate_destinations,
901  mj_part_t &output_part_numbering_begin_index,
902  std::vector<mj_part_t> *next_future_num_parts_in_parts);
903 
926  void mj_assign_parts_to_procs(
927  mj_gno_t * num_points_in_all_processor_parts,
928  mj_part_t num_parts,
929  mj_part_t num_procs,
930  mj_lno_t *send_count_to_each_proc,
931  std::vector<mj_part_t> *next_future_num_parts_in_parts,
932  mj_part_t &out_num_part,
933  std::vector<mj_part_t> &out_part_indices,
934  mj_part_t &output_part_numbering_begin_index,
935  int *coordinate_destinations);
936 
950  void mj_migrate_coords(
951  mj_part_t num_procs,
952  mj_lno_t &num_new_local_points,
953  std::string iteration,
954  int *coordinate_destinations,
955  mj_part_t num_parts);
956 
962  void create_sub_communicator(
963  std::vector<mj_part_t> &processor_ranks_for_subcomm);
964 
969  mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970  mj_part_t largest_factor = 1;
971  mj_part_t n = num_parts;
972  mj_part_t divisor = 2;
973  while (n > 1) {
974  while (n % divisor == 0) {
975  n = n / divisor;
976  largest_factor = divisor;
977  }
978  ++divisor;
979  if(divisor * divisor > n) {
980  if(n > 1) {
981  largest_factor = n;
982  }
983  break;
984  }
985  }
986  return largest_factor;
987  }
988 
989 public:
990  AlgMJ();
991 
992  // DOCWORK: Make param documentation use : consistently
1018  void multi_jagged_part(
1019  const RCP<const Environment> &env,
1020  RCP<const Comm<int> > &problemComm,
1021  double imbalance_tolerance,
1022  int num_teams,
1023  size_t num_global_parts,
1024  Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025  int recursion_depth,
1026  int coord_dim,
1027  mj_lno_t num_local_coords,
1028  mj_gno_t num_global_coords,
1029  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1030  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1031  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032  int num_weights_per_coord,
1033  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034  Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036  Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037  Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1038 
1052  bool distribute_points_on_cut_lines_,
1053  int max_concurrent_part_calculation_,
1054  int check_migrate_avoid_migration_option_,
1055  double minimum_migration_imbalance_,
1056  int migration_type_ = 0);
1057 
1060  void set_to_keep_part_boxes();
1061 
1064  RCP<mj_partBox_t> get_global_box() const;
1065 
1068  RCP<mj_partBoxVector_t> get_kept_boxes() const;
1069 
1072  RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1073  RCP<mj_partBoxVector_t> &localPartBoxes) const;
1074 
1114  const RCP<const Environment> &env,
1115  mj_lno_t num_total_coords,
1116  mj_lno_t num_selected_coords,
1117  size_t num_target_part,
1118  int coord_dim,
1119  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1120  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121  Kokkos::View<mj_lno_t *, device_t> &
1122  initial_selected_coords_output_permutation,
1123  mj_lno_t *output_xadj,
1124  int recursion_depth_,
1125  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126  bool partition_along_longest_dim,
1127  int num_ranks_per_node,
1128  bool divide_to_prime_first_,
1129  mj_part_t num_first_level_parts_ = 1,
1130  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131  = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1132 
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
  // NOTE(review): presumably public on CUDA/HIP builds because nvcc/hipcc
  // require methods that launch device lambdas (KOKKOS_LAMBDA) to be
  // publicly accessible - confirm against the Kokkos programming guide.
  public:
#else
  private:
#endif

  /* \brief Allocates all required memory for the mj partitioning algorithm.
   */
  void allocate_set_work_memory();

  /* \brief compute global bounding box: min/max coords of global domain */
  void compute_global_box();

  // DOCWORK: Inconsisent use of ! for descriptive/brief commenting - decide.

  /* \brief Computes, for each of the current_concurrent_num_parts parts
   * starting at current_work_part, the local min coordinate, max coordinate
   * and total weight along mj_current_dim_coords.  (Callers in this file
   * read min at index kk, max at kk + concurrent, weight at
   * kk + 2*concurrent of process_local_min_max_coord_total_weight.)
   */
  void mj_get_local_min_max_coord_totW(
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);

  /* \brief Reduces the local min/max/total-weight triples in
   * local_min_max_total into global values across all processes, writing
   * the result to global_min_max_total.
   */
  void mj_get_global_min_max_coord_totW(
    mj_part_t current_concurrent_num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
    Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);

  /* \brief Computes the initial guesses for the num_cuts cut coordinates in
   * [min_coord, max_coord] and the cumulative target weight of each part.
   * When num_target_first_level_parts > 1 the targets follow the requested
   * nonuniform first-level distribution in target_first_level_dist.
   */
  void mj_get_initial_cut_coords_target_weights(
    mj_scalar_t min_coord,
    mj_scalar_t max_coord,
    mj_part_t num_cuts/*p-1*/ ,
    mj_scalar_t global_weight,
    Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
    Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
    std::vector <mj_part_t> *future_num_part_in_parts,
    std::vector <mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t concurrent_current_part,
    mj_part_t obtained_part_index,
    mj_part_t num_target_first_level_parts = 1,
    const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
      Kokkos::View<mj_part_t *, Kokkos::HostSpace>());

  /* \brief Assigns an initial estimated part id (written to mj_part_ids)
   * to every coordinate in [coordinate_begin_index, coordinate_end_index),
   * based on its position between min_coordinate and max_coordinate.
   */
  void set_initial_coordinate_parts(
    mj_scalar_t &max_coordinate,
    mj_scalar_t &min_coordinate,
    mj_lno_t coordinate_begin_index,
    mj_lno_t coordinate_end_index,
    Kokkos::View<mj_lno_t *, device_t> &
      mj_current_coordinate_permutations,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
    mj_part_t &partition_count);

  /* \brief Iteratively determines the cut-line coordinates for the 1D
   * partitioning of the current dimension until all
   * total_incomplete_cut_count cuts are resolved within
   * imbalanceTolerance.
   */
  void mj_1D_part(
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    double imbalanceTolerance,
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
    mj_part_t total_incomplete_cut_count,
    Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
    Kokkos::View<size_t*, device_t> & view_total_reduction_size);

  /* \brief Helper of mj_1D_part - presumably accumulates the part weights
   * for the current cut-line candidates; loop_count is the refinement
   * iteration index.  (Body not visible in this chunk - confirm.)
   */
  void mj_1D_part_get_part_weights(
    mj_part_t current_concurrent_num_parts,
    mj_part_t current_work_part,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    int loop_count);

  /* \brief Presumably combines the per-thread/team left/right
   * closest-point arrays and part weights into process-level results.
   * (Body not visible in this chunk - confirm.)
   */
  void mj_combine_rightleft_and_weights(
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts);

  /* \brief Given the converged cut lines, redistributes the coordinates of
   * current_concurrent_work_part into num_parts new parts; part boundaries
   * are recorded in out_part_xadj.
   */
  void mj_create_new_partitions(
    mj_part_t num_parts,
    mj_part_t current_concurrent_work_part,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
    Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
    Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);

  /* \brief One refinement step of the cut search for concurrent part kk:
   * given current global/local part weights and target weights, updates
   * the undetermined cut lines (bounds in current_cut_lower/upper_bounds)
   * and writes the moved positions to new_current_cut_coordinates.
   */
  void mj_get_new_cut_coordinates(
    mj_part_t current_concurrent_num_parts,
    mj_part_t kk,
    const mj_part_t &num_cuts,
    const double &used_imbalance_tolerance,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
    Kokkos::View<bool *, device_t> & current_cut_line_determined,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
    Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
    Kokkos::View<mj_scalar_t *, device_t> &
      current_part_cut_line_weight_to_put_left,
    Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);

  /* \brief Fills num_points_in_all_processor_parts (returned through the
   * reference-to-pointer) with the point count of each processor in each
   * part.  (Body not visible in this chunk - confirm allocation ownership.)
   */
  void get_processor_num_points_in_parts(
    mj_part_t num_procs,
    mj_part_t num_parts,
    mj_gno_t *&num_points_in_all_processor_parts);

  /* \brief Rebuilds the coordinate permutation array so coordinates of the
   * same part are stored consecutively.  (Body not visible in this chunk.)
   */
  void fill_permutation_array(
    mj_part_t output_num_parts,
    mj_part_t num_parts);

  /* \brief Task-mapper variant of mj_create_new_partitions: splits the
   * coordinates in [coordinate_begin, coordinate_end) into num_parts
   * chunks along dimension coordInd; when longest_dim_part is set the
   * dimension ordering comes from p_coord_dimension_range_sorted.
   */
  void create_consistent_chunks(
    mj_part_t num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
    mj_lno_t coordinate_begin,
    mj_lno_t coordinate_end,
    Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
    Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
    int coordInd,
    bool longest_dim_part,
    uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);

  /* \brief Finalizes the result: assigns the computed part ids (shifted by
   * output_part_begin_index) to the coordinates, handling the case where
   * the data was migrated during partitioning.
   */
  void set_final_parts(
    mj_part_t current_num_parts,
    mj_part_t output_part_begin_index,
    RCP<mj_partBoxVector_t> &output_part_boxes,
    bool is_data_ever_migrated);
};
1440 
/* \brief Default constructor.  Zero/empty-initializes all handles, views
 * and counters.  Notable non-zero defaults, all visible in the initializer
 * list below: sEpsilon = 100 * machine epsilon of mj_scalar_t,
 * distribute_points_on_cut_lines = true,
 * max_concurrent_part_calculation = 1,
 * minimum_migration_imbalance = 0.30, num_first_level_parts = 1 and
 * num_global_parts = 1.
 * (NOTE: the constructor's signature line is elided in this view by the
 * doc extraction; only the member-initializer list and empty body follow.)
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
  mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
  recursion_depth(0), coord_dim(0),
  num_weights_per_coord(0), initial_num_loc_coords(0),
  initial_num_glob_coords(0),
  num_local_coords(0), num_global_coords(0),
  sEpsilon(std::numeric_limits<mj_scalar_t>::epsilon() * 100),
  distribute_points_on_cut_lines(true),
  max_concurrent_part_calculation(1),
  mj_run_as_rcb(false), mj_user_recursion_depth(0),
  mj_keep_part_boxes(false),
  check_migrate_avoid_migration_option(0), migration_type(0),
  minimum_migration_imbalance(0.30),
  num_first_level_parts(1),
  total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
  max_num_cut_along_dim(0),
  max_num_total_part_along_dim(0),
  total_dim_num_reduce_all(0),
  last_dim_num_part(0),
  mj_num_teams(0),
  num_global_parts(1),
  kept_boxes(), global_box(),
  myRank(0), myActualRank(0),
  divide_to_prime_first(false)
{
}
1471 
/* \brief Serial (single-process) multi-jagged partitioning used by the task
 * mapper.  Partitions num_selected_coords of the given coordinates into
 * num_target_part parts, recursing recursion_depth_ times, and returns the
 * result as a permutation array (initial_adjList_output_adjlist, modified
 * in place) plus a CSR-style offset array output_xadj.
 *
 * \param env environment for timing/debug output.
 * \param num_total_coords total number of local coordinates.
 * \param num_selected_coords number of coordinates (a prefix of the
 *        permutation array) actually being partitioned.
 * \param num_target_part requested number of output parts.
 * \param coord_dim_ spatial dimension of the coordinates.
 * \param mj_coordinates_ coordinate values; LayoutLeft to match Tpetra.
 * \param initial_adjList_output_adjlist in: initial permutation of the
 *        coordinate indices; out: permutation grouped by part.
 * \param output_xadj caller-allocated array of num_target_part + 1 entries;
 *        filled here so part i owns permutation entries
 *        [output_xadj[i], output_xadj[i+1]).
 * \param recursion_depth_ number of partitioning dimensions/steps.
 * \param part_no_array_ optional per-step part counts (may be empty).
 * \param partition_along_longest_dim if true, each step re-picks the axis
 *        with the largest coordinate range instead of cycling rd % dim.
 * \param num_ranks_per_node forwarded to update_part_num_arrays.
 * \param divide_to_prime_first_ see set_part_specifications().
 * \param num_first_level_parts_, first_level_distribution_ optional
 *        nonuniform first-level part count/distribution.
 *
 * NOTE(review): the function's signature line is elided in this view by
 * the doc extraction - presumably sequential_task_partitioning; confirm
 * in the full header.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
  const RCP<const Environment> &env,
  mj_lno_t num_total_coords,
  mj_lno_t num_selected_coords,
  size_t num_target_part,
  int coord_dim_,
  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
    mj_coordinates_,
  Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
  mj_lno_t *output_xadj,
  int recursion_depth_,
  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
  bool partition_along_longest_dim,
  int num_ranks_per_node,
  bool divide_to_prime_first_,
  mj_part_t num_first_level_parts_,
  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
{
  // Run on a self-contained serial communicator: this routine partitions
  // purely local data for the task mapper.
  this->mj_env = env;
  const RCP<Comm<int> > commN;
  this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
  this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
  // NOTE(review): ranks forced to 1 (not 0) here - matches upstream but
  // confirm this is intentional for the serial path.
  this->myActualRank = this->myRank = 1;

  this->divide_to_prime_first = divide_to_prime_first_;
  //weights are uniform for task mapping

  //parts are uniform for task mapping
  //as input indices.
  this->imbalance_tolerance = 0;
  this->num_global_parts = num_target_part;
  this->part_no_array = part_no_array_;
  this->recursion_depth = recursion_depth_;

  // If nonuniform first level partitioning, the requested num of parts and the
  // requested distribution of elements for each part
  this->num_first_level_parts = num_first_level_parts_;

  this->first_level_distribution = first_level_distribution_;

  this->coord_dim = coord_dim_;
  this->num_local_coords = num_total_coords;

  this->num_global_coords = num_total_coords;
  this->mj_coordinates = mj_coordinates_;


  this->initial_mj_gnos =
    Kokkos::View<mj_gno_t*, device_t>("gids", this->num_local_coords);

  this->num_weights_per_coord = 0;

  this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
    "uniform weights", 1);
  this->mj_uniform_weights(0) = true;

  this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
    ("weights", 1, 1);

  this->mj_uniform_parts =
    Kokkos::View<bool*, Kokkos::HostSpace>("uniform parts", 1);
  this->mj_uniform_parts(0) = true;

  this->set_part_specifications();

  this->allocate_set_work_memory();

  // Do single init
  // (part_xadj lives in device memory, so even a single scalar write goes
  // through a one-iteration parallel_for.)
  auto local_part_xadj = this->part_xadj;
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
    KOKKOS_LAMBDA (int dummy) {
    local_part_xadj(0) = static_cast<mj_lno_t>(num_selected_coords);
  });

  Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);

  mj_part_t current_num_parts = 1;

  Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
    this->all_cut_coordinates;

  mj_part_t future_num_parts = this->total_num_part;

  // Raw new/delete pair; both vectors are released at the end of this
  // function (the pointers are swapped each recursion round).
  std::vector<mj_part_t> *future_num_part_in_parts =
    new std::vector<mj_part_t>();
  std::vector<mj_part_t> *next_future_num_parts_in_parts =
    new std::vector<mj_part_t>();
  next_future_num_parts_in_parts->push_back(this->num_global_parts);
  RCP<mj_partBoxVector_t> t1;
  RCP<mj_partBoxVector_t> t2;

  // Scratch used only when partition_along_longest_dim: per-dimension
  // coordinate ranges, sorted to find the widest axis.
  std::vector <uSignedSortItem<int, mj_scalar_t, char>>
    coord_dimension_range_sorted(this->coord_dim);
  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted =
    &(coord_dimension_range_sorted[0]);
  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);

  // Need a device counter - how best to allocate?
  // Putting this allocation in the loops is very costly so moved out here.
  Kokkos::View<mj_part_t*, device_t>
    view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
  Kokkos::View<size_t*, device_t>
    view_total_reduction_size("view_total_reduction_size", 1);

  // Main recursion: one pass per partitioning dimension/step.
  for(int rd = 0; rd < this->recursion_depth; ++rd) {
    // next_future_num_parts_in_parts will be as the size of outnumParts,
    // and this will hold how many more parts that each output part
    // should be divided. this array will also be used to determine the weight
    // ratios of the parts.
    // swap the arrays to use iteratively..
    std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
    future_num_part_in_parts = next_future_num_parts_in_parts;
    next_future_num_parts_in_parts = tmpPartVect;

    // clear next_future_num_parts_in_parts array as
    // getPartitionArrays expects it to be empty.
    next_future_num_parts_in_parts->clear();

    // returns the total number of output parts for this dimension partitioning.
    mj_part_t output_part_count_in_dimension =
      this->update_part_num_arrays(
        future_num_part_in_parts,
        next_future_num_parts_in_parts,
        future_num_parts,
        current_num_parts,
        rd,
        t1,
        t2, num_ranks_per_node);

    // if the number of obtained parts equal to current number of parts,
    // skip this dimension. For example, this happens when 1 is given in
    // the input part array is given. P=4,5,1,2
    // (swap back first so the next round sees consistent arrays)
    if(output_part_count_in_dimension == current_num_parts) {
      tmpPartVect = future_num_part_in_parts;
      future_num_part_in_parts = next_future_num_parts_in_parts;
      next_future_num_parts_in_parts = tmpPartVect;
      continue;
    }

    //convert i to string to be used for debugging purposes.
    std::string istring = std::to_string(rd);

    // alloc Memory to point the indices
    // of the parts in the permutation array.
    this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
      "new part xadj", output_part_count_in_dimension);

    // the index where in the outtotalCounts will be written.

    mj_part_t output_part_index = 0;

    // whatever is written to outTotalCounts will be added with previousEnd
    // so that the points will be shifted.
    mj_part_t output_coordinate_end_index = 0;

    mj_part_t current_work_part = 0;
    mj_part_t current_concurrent_num_parts = 1;

    mj_part_t obtained_part_index = 0;

    // get the coordinate axis along which the partitioning will be done.
    int coordInd = rd % this->coord_dim;

    Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
      Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);

    auto host_process_local_min_max_coord_total_weight =
      Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
    auto host_global_min_max_coord_total_weight =
      Kokkos::create_mirror_view(global_min_max_coord_total_weight);

    // run for all available parts.
    for(; current_work_part < current_num_parts;
      current_work_part += current_concurrent_num_parts) {

      mj_part_t actual_work_part_count = 0;

      // initialization for 1D partitioning.
      // get the min and max coordinates of each part
      // together with the part weights of each part.
      for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
        mj_part_t current_work_part_in_concurrent_parts =
          current_work_part + kk;

        // if this part wont be partitioned any further
        // dont do any work for this part.
        mj_part_t partition_count = host_num_partitioning_in_current_dim(
          current_work_part_in_concurrent_parts);
        if(partition_count == 1) {
          continue;
        }
        ++actual_work_part_count;
        if(partition_along_longest_dim) {
          // Scan every dimension's local min/max; pick the widest axis as
          // this step's partitioning dimension.
          auto local_process_local_min_max_coord_total_weight =
            this->process_local_min_max_coord_total_weight;
          for(int coord_traverse_ind = 0;
            coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {

            Kokkos::View<mj_scalar_t *, device_t> coords =
              Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);

            this->mj_get_local_min_max_coord_totW(
              current_work_part,
              current_concurrent_num_parts,
              coords);

            coord_dimension_range_sorted[coord_traverse_ind].id =
              coord_traverse_ind;
            coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;

            Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
              process_local_min_max_coord_total_weight);

            // layout: min at kk, max at kk + current_concurrent_num_parts
            coord_dim_mins[coord_traverse_ind] =
              host_process_local_min_max_coord_total_weight(kk);
            coord_dim_maxs[coord_traverse_ind] =
              host_process_local_min_max_coord_total_weight(
                kk + current_concurrent_num_parts);
            coord_dimension_range_sorted[coord_traverse_ind].val =
              host_process_local_min_max_coord_total_weight(
                kk + current_concurrent_num_parts) -
              host_process_local_min_max_coord_total_weight(kk);
          }

          // sort ascending by range; last entry is the widest dimension
          uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
          coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
          auto set_min = coord_dim_mins[coordInd];
          auto set_max = coord_dim_maxs[coordInd];
          // restore the chosen dimension's min/max into the device view
          // (it currently holds the last scanned dimension's values)
          Kokkos::parallel_for(
            Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
            (0, 1), KOKKOS_LAMBDA (int dummy) {
            local_process_local_min_max_coord_total_weight(kk) = set_min;
            local_process_local_min_max_coord_total_weight(
              kk + current_concurrent_num_parts) = set_max;
          });

          mj_current_dim_coords =
            Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
        }
        else {
          Kokkos::View<mj_scalar_t *, device_t> coords =
            Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
          this->mj_get_local_min_max_coord_totW(
            current_work_part,
            current_concurrent_num_parts,
            coords);
        }
      }

      // 1D partitioning
      if(actual_work_part_count > 0) {
        // obtain global Min max of the part.
        this->mj_get_global_min_max_coord_totW(
          current_concurrent_num_parts,
          this->process_local_min_max_coord_total_weight,
          this->global_min_max_coord_total_weight);

        // update host copy
        Kokkos::deep_copy(host_global_min_max_coord_total_weight,
          global_min_max_coord_total_weight);

        // represents the total number of cutlines
        // whose coordinate should be determined.
        mj_part_t total_incomplete_cut_count = 0;

        //Compute weight ratios for parts & cuts:
        //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1.0
        //      part0 cut0 part1 cut1 part2 cut2 part3
        mj_part_t concurrent_part_cut_shift = 0;
        mj_part_t concurrent_part_part_shift = 0;
        for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
          // min/max/weight layout: kk, kk + n, kk + 2n
          mj_scalar_t min_coordinate =
            host_global_min_max_coord_total_weight(kk);
          mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
            kk + current_concurrent_num_parts);
          mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
            kk + 2*current_concurrent_num_parts);

          mj_part_t concurrent_current_part_index = current_work_part + kk;

          mj_part_t partition_count = host_num_partitioning_in_current_dim(
            concurrent_current_part_index);

          Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
            Kokkos::subview(current_cut_coordinates,
              std::pair<mj_lno_t, mj_lno_t>(
                concurrent_part_cut_shift,
                current_cut_coordinates.size()));
          Kokkos::View<mj_scalar_t *, device_t>
            current_target_part_weights =
            Kokkos::subview(target_part_weights,
              std::pair<mj_lno_t, mj_lno_t>(
                concurrent_part_part_shift,
                target_part_weights.size()));

          // shift the usedCutCoordinate array as noCuts.
          concurrent_part_cut_shift += partition_count - 1;
          // shift the partRatio array as noParts.
          concurrent_part_part_shift += partition_count;
          // calculate only if part is not empty,
          // and part will be further partitioend.
          if(partition_count > 1 && min_coordinate <= max_coordinate) {
            // increase allDone by the number of cuts of the current
            // part's cut line number.
            total_incomplete_cut_count += partition_count - 1;

            this->incomplete_cut_count(kk) = partition_count - 1;

            // When num_first_level_parts != 1 we have
            // nonuniform partitioning on the first level, providing
            // requested number of parts (num_first_level_parts) and
            // requested distribution in parts (first_level_distribution)

            // Get the target part weights given a desired distribution
            this->mj_get_initial_cut_coords_target_weights(
              min_coordinate,
              max_coordinate,
              partition_count - 1,
              global_total_weight,
              usedCutCoordinate,
              current_target_part_weights,
              future_num_part_in_parts,
              next_future_num_parts_in_parts,
              concurrent_current_part_index,
              obtained_part_index,
              rd == 0 ? this->num_first_level_parts : 1,
              this->first_level_distribution);

            mj_lno_t coordinate_end_index =
              host_part_xadj(concurrent_current_part_index);
            mj_lno_t coordinate_begin_index =
              (concurrent_current_part_index==0) ? 0 :
              host_part_xadj[concurrent_current_part_index - 1];

            // get the initial estimated part assignments of the coordinates.
            this->set_initial_coordinate_parts(
              max_coordinate,
              min_coordinate,
              coordinate_begin_index, coordinate_end_index,
              this->coordinate_permutations,
              mj_current_dim_coords,
              this->assigned_part_ids,
              partition_count);
          }
          else {
            // e.g., if have fewer coordinates than parts, don't need to do
            // next dim.
            this->incomplete_cut_count(kk) = 0;
          }
          obtained_part_index += partition_count;
        }

        // used imbalance, it is always 0, as it is difficult
        // to estimate a range.
        double used_imbalance = 0;

        // Determine cut lines for k parts here.
        this->mj_env->timerStart(MACRO_TIMERS,
          mj_timer_base_string + "mj_1D_part()");

        this->mj_1D_part(
          mj_current_dim_coords,
          used_imbalance,
          current_work_part,
          current_concurrent_num_parts,
          current_cut_coordinates,
          total_incomplete_cut_count,
          view_rectilinear_cut_count,
          view_total_reduction_size);

        this->mj_env->timerStop(MACRO_TIMERS,
          mj_timer_base_string + "mj_1D_part()");
      }
      else {
        obtained_part_index += current_concurrent_num_parts;
      }
      // create part chunks
      {
        mj_part_t output_array_shift = 0;
        mj_part_t cut_shift = 0;
        size_t tlr_shift = 0;
        size_t partweight_array_shift = 0;

        for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_part_t current_concurrent_work_part = current_work_part + kk;

          mj_part_t num_parts = host_num_partitioning_in_current_dim(
            current_concurrent_work_part);

          // if the part is empty, skip the part.
          // (min > max means the part had no coordinates)
          int coordinateA_bigger_than_coordinateB =
            host_global_min_max_coord_total_weight(kk) >
            host_global_min_max_coord_total_weight(
              kk + current_concurrent_num_parts);

          if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
            // we still need to write the begin and end point of the empty part.
            // simply set it zero, the array indices will be shifted later
            auto local_new_part_xadj = this->new_part_xadj;
            Kokkos::parallel_for(
              Kokkos::RangePolicy<typename mj_node_t::execution_space,
                mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
              local_new_part_xadj(
                output_part_index + output_array_shift + jj) = 0;
            });

            cut_shift += num_parts - 1;
            tlr_shift += (4 *(num_parts - 1) + 1);
            output_array_shift += num_parts;
            partweight_array_shift += (2 * (num_parts - 1) + 1);
            continue;
          }
          mj_lno_t coordinate_end =
            host_part_xadj(current_concurrent_work_part);
          mj_lno_t coordinate_begin =
            current_concurrent_work_part==0 ? 0 :
            host_part_xadj(current_concurrent_work_part-1);

          Kokkos::View<mj_scalar_t *, device_t>
            current_concurrent_cut_coordinate =
            Kokkos::subview(current_cut_coordinates,
              std::pair<mj_lno_t, mj_lno_t>(
                cut_shift,
                current_cut_coordinates.size()));
          Kokkos::View<mj_scalar_t *, device_t>
            used_local_cut_line_weight_to_left =
            Kokkos::subview(process_cut_line_weight_to_put_left,
              std::pair<mj_lno_t, mj_lno_t>(
                cut_shift,
                process_cut_line_weight_to_put_left.size()));

          this->thread_part_weight_work =
            Kokkos::subview(
              this->thread_part_weights,
              std::pair<mj_lno_t, mj_lno_t>(
                partweight_array_shift,
                this->thread_part_weights.size()));

          if(num_parts > 1) {
            // Rewrite the indices based on the computed cuts.
            Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
              Kokkos::subview(this->new_part_xadj,
                std::pair<mj_lno_t, mj_lno_t>(
                  output_part_index + output_array_shift,
                  this->new_part_xadj.size()));

            this->create_consistent_chunks(
              num_parts,
              mj_current_dim_coords,
              current_concurrent_cut_coordinate,
              coordinate_begin,
              coordinate_end,
              used_local_cut_line_weight_to_left,
              subview_new_part_xadj,
              coordInd,
              partition_along_longest_dim,
              p_coord_dimension_range_sorted);
          }
          else {
            // if this part is partitioned into 1 then just copy
            // the old values.
            mj_lno_t part_size = coordinate_end - coordinate_begin;

            auto local_new_part_xadj = this->new_part_xadj;
            Kokkos::parallel_for(
              Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
              (0, 1), KOKKOS_LAMBDA (int dummy) {
              local_new_part_xadj(output_part_index + output_array_shift)
                = part_size;
            });

            auto subview_new_coordinate_permutations =
              Kokkos::subview(this->new_coordinate_permutations,
                std::pair<mj_lno_t, mj_lno_t>(
                  coordinate_begin,
                  coordinate_begin + part_size));
            auto subview_coordinate_permutations =
              Kokkos::subview(this->coordinate_permutations,
                std::pair<mj_lno_t, mj_lno_t>(
                  coordinate_begin,
                  coordinate_begin + part_size));
            Kokkos::deep_copy(subview_new_coordinate_permutations,
              subview_coordinate_permutations);
          }

          cut_shift += num_parts - 1;
          tlr_shift += (4 *(num_parts - 1) + 1);
          output_array_shift += num_parts;
          partweight_array_shift += (2 * (num_parts - 1) + 1);
        }

        // shift cut coordinates so that all cut coordinates are stored.
        // current_cut_coordinates += cutShift;

        // getChunks from coordinates partitioned the parts and
        // wrote the indices as if there were a single part.
        // now we need to shift the beginning indices.
        for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_part_t num_parts =
            host_num_partitioning_in_current_dim(current_work_part + kk);
          auto local_new_part_xadj = this->new_part_xadj;
          auto local_mj_current_dim_coords = mj_current_dim_coords;
          auto local_new_coordinate_permutations =
            new_coordinate_permutations;
          Kokkos::parallel_for(
            Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
              0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
            //shift it by previousCount
            local_new_part_xadj(output_part_index+ii) +=
              output_coordinate_end_index;

            // NOTE(review): odd-indexed sub-parts get this dimension's
            // coordinates negated - see MARKER below; presumably produces
            // a boustrophedon/Z-order traversal for task mapping. Confirm.
            if(ii % 2 == 1) {
              mj_lno_t coordinate_end =
                local_new_part_xadj(output_part_index+ii);
              mj_lno_t coordinate_begin =
                local_new_part_xadj(output_part_index);

              for(mj_lno_t task_traverse = coordinate_begin;
                task_traverse < coordinate_end; ++task_traverse) {
                mj_lno_t l = local_new_coordinate_permutations(task_traverse);
                //MARKER: FLIPPED ZORDER BELOW
                local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
              }
            }
          });

          // increase the previous count by current end.
          // (single-element read of device data via a reduce)
          mj_part_t get_single;
          Kokkos::parallel_reduce("Read new_part_xadj",
            Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
            KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
            set_single = local_new_part_xadj(output_part_index + num_parts - 1);
          }, get_single);;

          output_coordinate_end_index = get_single;
          // increase the current out.
          output_part_index += num_parts;
        }
      }
    }

    // end of this partitioning dimension
    // set the current num parts for next dim partitioning
    current_num_parts = output_part_count_in_dimension;

    //swap the coordinate permutations for the next dimension.
    Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
    this->coordinate_permutations = this->new_coordinate_permutations;
    this->new_coordinate_permutations = tmp;

    this->part_xadj = this->new_part_xadj;
    this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
    Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
    this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
  }

  Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);

  // Return output_xadj in CSR format
  // (caller must have allocated num_global_parts + 1 entries)
  output_xadj[0] = 0;
  for(size_t i = 0; i < this->num_global_parts ; ++i) {
    output_xadj[i+1] = host_part_xadj(i);
  }

  delete future_num_part_in_parts;
  delete next_future_num_parts_in_parts;
}
2088 
/* \brief Accessor returning the global bounding box of the domain
 * (computed by compute_global_box()).
 * NOTE(review): the function-name line is elided in this view by the doc
 * extraction - presumably get_global_box; confirm in the full header.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
RCP<typename AlgMJ
  <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
{
  return this->global_box;
}
2101 
2104 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2105  typename mj_part_t, typename mj_node_t>
2106 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2107  mj_node_t>::set_to_keep_part_boxes()
2108 {
2109  this->mj_keep_part_boxes = true;
2110 }
2111 
/* \brief Either the mj array (part_no_array) or num_global_parts should be
 * provided in the input. part_no_array takes
 * precedence if both are provided.
 * Depending on these parameters, total cut/part number,
 * maximum part/cut number along a dimension, estimated number of reduceAlls,
 * and the number of parts before the last dimension is calculated.
 * */
/* NOTE(review): the function-name line is elided in this view by the doc
 * extraction - presumably set_part_specifications(); confirm in the full
 * header. */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
{
  this->total_num_cut = 0; //how many cuts will be totally
  this->total_num_part = 1; //how many parts will be totally
  this->max_num_part_along_dim = 0; // maximum part count along a dimension.
  this->total_dim_num_reduce_all = 0; // estimate on #reduceAlls can be done.
  this->last_dim_num_part = 1; //max no of parts that might occur
  //during the partition before the
  //last partitioning dimension.
  this->max_num_cut_along_dim = 0;
  this->max_num_total_part_along_dim = 0;

  // Case 1: explicit per-dimension part counts were provided.
  if(this->part_no_array.size()) {
    auto local_recursion_depth = this->recursion_depth;

    // NOTE(review): total_num_part is still 1 at this point (set above),
    // so this evaluates to recursion_depth; looks like it was meant to use
    // the product computed just below - confirm against upstream intent.
    this->total_dim_num_reduce_all =
      this->total_num_part * this->recursion_depth;

    // total parts = product of the per-dimension part counts
    this->total_num_part = 1;
    for(int i = 0; i < local_recursion_depth; ++i) {
      this->total_num_part *= this->part_no_array(i);
    }

    // largest single-dimension part count
    mj_part_t track_max = 0;
    for(int i = 0; i < local_recursion_depth; ++i) {
      if(part_no_array(i) > track_max) {
        track_max = this->part_no_array(i);
      };
    }

    // parts existing before the final dimension is partitioned
    this->last_dim_num_part = this->total_num_part /
      this->part_no_array(local_recursion_depth-1);

    this->max_num_part_along_dim = track_max;
    this->num_global_parts = this->total_num_part;
  } else {
    // Case 2: derive per-dimension part counts from num_global_parts.
    mj_part_t future_num_parts = this->num_global_parts;

    // If using nonuniform first level partitioning.
    // initial value max_num_part_along_dim == num_first_level_parts
    if (this->first_level_distribution.size() != 0 &&
      this->num_first_level_parts > 1) {
      this->max_num_part_along_dim = this->num_first_level_parts;
    }

    // we need to calculate the part numbers now, to determine
    // the maximum along the dimensions.
    for(int rd = 0; rd < this->recursion_depth; ++rd) {
      mj_part_t maxNoPartAlongI = 0;
      mj_part_t nfutureNumParts = 0;

      // Nonuniform first level partitioning sets part specificiations for
      // rd == 0 only, given requested num of parts and distribution in parts
      // for the first level.
      if (rd == 0 &&
        this->first_level_distribution.size() != 0 &&
        this->num_first_level_parts > 1) {

        maxNoPartAlongI = this->num_first_level_parts;
        this->max_num_part_along_dim = this->num_first_level_parts;

        mj_part_t sum_first_level_dist = 0;
        mj_part_t max_part = 0;

        // Cumulative sum of distribution of parts and size of largest part
        for (int i = 0; i < this->num_first_level_parts; ++i) {
          sum_first_level_dist += this->first_level_distribution(i);
          if (this->first_level_distribution(i) > max_part)
            max_part = this->first_level_distribution(i);
        }

        // Total parts in largest nonuniform superpart from
        // first level partitioning
        nfutureNumParts =
          this->num_global_parts * max_part / sum_first_level_dist;
      }
      // Standard uniform partitioning this level
      else {
        // aim for the (remaining-depth)-th root, i.e. as square as possible
        maxNoPartAlongI = this->get_part_count(future_num_parts,
          1.0f / (this->recursion_depth - rd));
        if (maxNoPartAlongI > this->max_num_part_along_dim)
          this->max_num_part_along_dim = maxNoPartAlongI;
        nfutureNumParts = future_num_parts / maxNoPartAlongI;
        if (future_num_parts % maxNoPartAlongI) {
          ++nfutureNumParts;  // ceil of the division
        }
      }
      future_num_parts = nfutureNumParts;
    }
    this->total_num_part = this->num_global_parts;

    if(this->divide_to_prime_first) {
      // prime division can produce any part count at the last dimension,
      // so use conservative (worst-case) estimates.
      this->total_dim_num_reduce_all = this->num_global_parts * 2;
      this->last_dim_num_part = this->num_global_parts;
    }
    else {
      //this is the lower bound.
      //estimate reduceAll Count here.
      //we find the upperbound instead.
      size_t p = 1;
      for(int i = 0; i < this->recursion_depth; ++i) {
        this->total_dim_num_reduce_all += p;
        p *= this->max_num_part_along_dim;
      }

      // last_dim_num_part = min(p / max_num_part_along_dim, num_global_parts)
      if(p / this->max_num_part_along_dim > this->num_global_parts) {
        this->last_dim_num_part = this->num_global_parts;
      }
      else {
        this->last_dim_num_part = p / this->max_num_part_along_dim;
      }
    }
  }

  this->total_num_cut = this->total_num_part - 1;
  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
  this->max_num_total_part_along_dim = this->max_num_part_along_dim +
    size_t(this->max_num_cut_along_dim);
  // maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1

  // refine the concurrent part count, if it is given bigger than the maximum
  // possible part count.
  if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
    if(this->mj_problemComm->getRank() == 0) {
      std::cerr << "Warning: Concurrent part count (" <<
        this->max_concurrent_part_calculation <<
        ") has been set bigger than maximum amount that can be used." <<
        " Setting to:" << this->last_dim_num_part << "." << std::endl;
    }
    this->max_concurrent_part_calculation = this->last_dim_num_part;
  }
}
2254 
2255 /* \brief Tries to determine the part number for current dimension,
2256  * by trying to make the partitioning as square as possible.
2257  * \param num_total_future how many more partitionings are required.
2258  * \param root how many more recursion depth is left.
2259  */
2260 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2261  typename mj_part_t, typename mj_node_t>
2262 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2263  get_part_count(mj_part_t num_total_future, double root)
2264 {
2265  double fp = pow(num_total_future, root);
2266  mj_part_t ip = mj_part_t(fp);
2267  if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2268  return ip;
2269  }
2270  else {
2271  return ip + 1;
2272  }
2273 }
2274 
2275 /* \brief Function returns how many parts that will be obtained after this
2276  * dimension partitioning. It sets how many parts each current part will be
2277  * partitioned into in this dimension to device_num_partitioning_in_current_dim
2278  * view, sets how many total future parts each obtained part will be
2279  * partitioned into in next_future_num_parts_in_parts vector. If part boxes are
2280  * kept, then sets initializes the output_part_boxes as its ancestor.
2281  * \param future_num_part_in_parts: input, how many future parts each current
2282  * part will be partitioned into.
2283  * \param next_future_num_parts_in_parts: output, how many future parts each
2284  * obtained part will be partitioned into.
2285  * \param future_num_parts: output, max number of future parts that will be
2286  * obtained from a single
2287  * \param current_num_parts: input, how many parts are there currently.
2288  * \param current_iteration: input, current dimension iteration number.
2289  * \param input_part_boxes: input, if boxes are kept, current boxes.
2290  * \param output_part_boxes: output, if boxes are kept, the initial box
2291  * boundaries for obtained parts.
2292  * \param atomic_part_count DOCWORK: Documentation
2293  */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  update_part_num_arrays(
  std::vector<mj_part_t> *future_num_part_in_parts,
  std::vector<mj_part_t> *next_future_num_parts_in_parts,
  mj_part_t &future_num_parts,
  mj_part_t current_num_parts,
  int current_iteration,
  RCP<mj_partBoxVector_t> input_part_boxes,
  RCP<mj_partBoxVector_t> output_part_boxes,
  mj_part_t atomic_part_count)
{
  // Per-current-part split counts for this dimension; copied into the
  // host/device views at the end of the function.
  std::vector<mj_part_t> num_partitioning_in_current_dim;

  // how many parts that will be obtained after this dimension.
  mj_part_t output_num_parts = 0;
  if(this->part_no_array.size()) {
    // when the partNo array is provided as input,
    // each current partition will be partition to the same number of parts.
    // we dont need to use the future_num_part_in_parts vector in this case.
    mj_part_t current_part_no_array =
      this->part_no_array(current_iteration);

    // part_no_array entries must be positive; a value < 1 is a user error.
    if(current_part_no_array < 1) {
      std::cout << "Current recursive iteration: " << current_iteration <<
        " part_no_array[" << current_iteration << "] is given as:" <<
        current_part_no_array << std::endl;
      std::terminate();
    }
    // A split count of 1 means this dimension is a no-op: part structure
    // is unchanged, so return the current count immediately.
    if(current_part_no_array == 1) {
      return current_num_parts;
    }

    // If using part_no_array, ensure compatibility with num_first_level_parts.
    if (this->first_level_distribution.size() != 0 &&
      current_iteration == 0 &&
      current_part_no_array != this->num_first_level_parts) {
      std::cout << "Current recursive iteration: " << current_iteration
        << " part_no_array[" << current_iteration << "] is given as: " <<
        current_part_no_array << " and contradicts num_first_level_parts: " <<
        this->num_first_level_parts << std::endl;
      std::terminate();
    }

    // Uniform split: every current part is divided the same way.
    for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
      num_partitioning_in_current_dim.push_back(current_part_no_array);
    }

/*
  std::cout << "\n\nme: " << this->myRank << " current_iteration: " <<
  current_iteration << " current_num_parts: " <<
  current_num_parts << "\n\n";

  std::cout << "\n\nnum_partitioning_in_current_dim[0]: " <<
  num_partitioning_in_current_dim[0] << "\n\n";

  std::cout << "\n\nfuture_num_parts: " << future_num_parts
  << " num_partitioning_in_current_dim[0]: " <<
  num_partitioning_in_current_dim[0] << " " <<
  future_num_parts / num_partitioning_in_current_dim[0] << "\n\n";
*/

    // Each obtained part has 1/split-count of the remaining future parts.
    future_num_parts /= num_partitioning_in_current_dim[0];
    output_num_parts = current_num_parts *
      num_partitioning_in_current_dim[0];
    if(this->mj_keep_part_boxes) {
      for(mj_part_t k = 0; k < current_num_parts; ++k) {
        //initialized the output boxes as its ancestor.
        for(mj_part_t j = 0; j <
          num_partitioning_in_current_dim[0]; ++j) {
          output_part_boxes->push_back((*input_part_boxes)[k]);
        }
      }
    }

    // set the how many more parts each part will be divided.
    // this is obvious when partNo array is provided as input.
    // however, fill this so weights will be calculated according to this array.
    for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
      next_future_num_parts_in_parts->push_back(future_num_parts);
    }
  }
  else {
    // if partNo array is not provided as input, future_num_part_in_parts
    // holds how many parts each part should be divided. Initially it holds a
    // single number equal to the total number of global parts.

    // calculate the future_num_parts from beginning,
    // since each part might be divided into different number of parts.
    future_num_parts = 1;

    // cout << "i:" << i << std::endl;
    for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
      // get how many parts a part should be divided.
      mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];

      // get the ideal number of parts that is close to the
      // (recursion_depth - i) root of the future_num_parts_of_part_ii.
      mj_part_t num_partitions_in_current_dim =
        this->get_part_count(future_num_parts_of_part_ii,
          1.0 / (this->recursion_depth - current_iteration)
        );
      // Sanity check against the maximum computed in
      // set_part_specifications; exceeding it would overflow work arrays.
      if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
        std::cerr << "ERROR: maxPartNo calculation is wrong."
          " num_partitions_in_current_dim: "
          << num_partitions_in_current_dim << " this->max_num_part_along_dim: "
          << this->max_num_part_along_dim <<
          " this->recursion_depth: " << this->recursion_depth <<
          " current_iteration:" << current_iteration <<
          " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
          " might need to fix max part no calculation for "
          "largest_prime_first partitioning." <<
          std::endl;
        std::terminate();
      }
      // add this number to vector_num_partitioning_in_current_dim vector.
      // num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
      // mj_part_t largest_prime_factor = num_partitions_in_current_dim;

      // Update part num arrays when on current_iteration == 0 and
      // using nonuniform first level partitioning
      // with requested num parts (num_first_level_parts) and
      // a requested distribution in parts (first_level_distribution).
      if (current_iteration == 0 &&
          this->first_level_distribution.size() != 0 &&
          this->num_first_level_parts > 1) {
        // Only 1 current part to begin and partitions into
        // num_first_level_parts many parts
        num_partitioning_in_current_dim.push_back(this->num_first_level_parts);

        // The output number of parts from first level partitioning
        output_num_parts = this->num_first_level_parts;

        // Remaining parts left to partition for all future levels
        // (recomputed below from the distribution maximum).
        future_num_parts /= this->num_first_level_parts;

        mj_part_t max_part = 0;
        mj_part_t sum_first_level_dist = 0;

        // Cumulative sum of distribution of first level parts
        // and size of largest first level part
        for (int i = 0; i < this->num_first_level_parts; ++i) {
          sum_first_level_dist += this->first_level_distribution(i);

          if (this->first_level_distribution(i) > max_part)
            max_part = this->first_level_distribution(i);
        }

        // Maximum # of remaining parts left to partition for all future levels
        future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;

        // Number of parts remaining left to partition for each future_part
        // The sum must exactly equal global_num_parts
        for (int i = 0; i < this->num_first_level_parts; ++i) {
          next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
            this->num_global_parts / sum_first_level_dist);
        }
      }
      else if (this->divide_to_prime_first) {
        // Add this number to num_partitioning_in_current_dim vector.
        num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);

        mj_part_t largest_prime_factor = num_partitions_in_current_dim;

        //increase the output number of parts.
        output_num_parts += num_partitions_in_current_dim;

        // atomic_part_count groups future parts that must stay together;
        // if it does not evenly divide this part's future count, disable it.
        if (future_num_parts_of_part_ii == atomic_part_count ||
            future_num_parts_of_part_ii % atomic_part_count != 0) {
          atomic_part_count = 1;
        }

        largest_prime_factor =
          this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);

        // We divide to num_partitions_in_current_dim. But we adjust the weights
        // based on largest prime/ if num_partitions_in_current_dim = 2,
        // largest prime = 5 --> we divide to 2 parts with weights 3x and 2x.
        // if the largest prime is less than part count, we use the part count
        // so that we divide uniformly.
        if (largest_prime_factor < num_partitions_in_current_dim) {
          largest_prime_factor = num_partitions_in_current_dim;
        }
        //ideal number of future partitions for each part.
        mj_part_t ideal_num_future_parts_in_part =
          (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
        //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
        mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;

/*
  std::cout << "\ncurrent num part: " << ii
  << " largest_prime_factor: " << largest_prime_factor
  << " To Partition: " << future_num_parts_of_part_ii << "\n\n";
*/

        for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
          //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
          mj_part_t my_ideal_primescale = ideal_prime_scale;
          //left over weighs. Left side is adjusted to be 3x, right side stays as 2x
          if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
            ++my_ideal_primescale;
          }
          //scale with 'x';
          mj_part_t num_future_parts_for_part_iii =
            ideal_num_future_parts_in_part * my_ideal_primescale;

          //if there is a remainder in the part increase the part weight.
          if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
            //if not uniform, add 1 for the extra parts.
            ++num_future_parts_for_part_iii;
          }

          next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);

          //if part boxes are stored, initialize the box of the parts as the ancestor.
          if (this->mj_keep_part_boxes) {
            output_part_boxes->push_back((*input_part_boxes)[ii]);
          }

          //set num future_num_parts to maximum in this part.
          if (num_future_parts_for_part_iii > future_num_parts)
            future_num_parts = num_future_parts_for_part_iii;

        }
      }
      else {
        // Standard (uniform) split of this part.
        // Add this number to num_partitioning_in_current_dim vector.
        num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);

        //increase the output number of parts.
        output_num_parts += num_partitions_in_current_dim;

        // Disable atomic grouping when it does not evenly divide the
        // part's future count (same rule as the prime branch above).
        if((future_num_parts_of_part_ii == atomic_part_count) ||
           (future_num_parts_of_part_ii % atomic_part_count != 0)) {
          atomic_part_count = 1;
        }
        //ideal number of future partitions for each part.
        mj_part_t ideal_num_future_parts_in_part =
          (future_num_parts_of_part_ii / atomic_part_count) /
          num_partitions_in_current_dim;
        for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
          mj_part_t num_future_parts_for_part_iii =
            ideal_num_future_parts_in_part;

          //if there is a remainder in the part increase the part weight.
          if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
            num_partitions_in_current_dim) {
            // if not uniform, add 1 for the extra parts.
            ++num_future_parts_for_part_iii;
          }

          next_future_num_parts_in_parts->push_back(
            num_future_parts_for_part_iii * atomic_part_count);

          // if part boxes are stored, initialize the box of the parts as
          // the ancestor.
          if(this->mj_keep_part_boxes) {
            output_part_boxes->push_back((*input_part_boxes)[ii]);
          }
          //set num future_num_parts to maximum in this part.
          if(num_future_parts_for_part_iii > future_num_parts)
            future_num_parts = num_future_parts_for_part_iii;
        }
      }
    }
  }
  // move temp std::vector to host view
  device_num_partitioning_in_current_dim = Kokkos::View<
    mj_part_t*, device_t>("test", num_partitioning_in_current_dim.size());
  host_num_partitioning_in_current_dim =
    Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
  for(size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
    host_num_partitioning_in_current_dim(n) =
      num_partitioning_in_current_dim[n];
  }
  // setup device equivalent - this data is used on host and device and it's
  // more efficient to just setup array on both sides now rather than copy
  // values as needed later.
  Kokkos::deep_copy(device_num_partitioning_in_current_dim,
    host_num_partitioning_in_current_dim);
  return output_num_parts;
}
2577 
2578 /* \brief Allocates and initializes the work memory that will be used by MJ.
2579  * */
2580 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2581  typename mj_part_t, typename mj_node_t>
2582 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2583  allocate_set_work_memory()
2584 {
2585  // Throughout the partitioning execution,
2586  // instead of the moving the coordinates, hold a permutation array for parts.
2587  // coordinate_permutations holds the current permutation.
2588  this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589  Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
2590  this->num_local_coords);
2591  auto local_coordinate_permutations = coordinate_permutations;
2592  Kokkos::parallel_for(
2593  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594  0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595  local_coordinate_permutations(i) = i;
2596  });
2597 
2598  // new_coordinate_permutations holds the current permutation.
2599  this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600  Kokkos::ViewAllocateWithoutInitializing("num_local_coords"),
2601  this->num_local_coords);
2602 
2603  this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604  Kokkos::ViewAllocateWithoutInitializing("assigned parts"), 0);
2605  if(this->num_local_coords > 0) {
2606  this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607  Kokkos::ViewAllocateWithoutInitializing("assigned part ids"),
2608  this->num_local_coords);
2609  }
2610 
2611  // single partition starts at index-0, and ends at numLocalCoords
2612  // inTotalCounts array holds the end points in coordinate_permutations array
2613  // for each partition. Initially sized 1, and single element is set to
2614  // numLocalCoords.
2615  this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616  Kokkos::ViewAllocateWithoutInitializing("part xadj"), 1);
2617  this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618  host_part_xadj(0) = num_local_coords;
2619  Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2620 
2621  // the ends points of the output, this is allocated later.
2622  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623  Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2624 
2625  // only store this much if cuts are needed to be stored.
2626  this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627  Kokkos::ViewAllocateWithoutInitializing("all cut coordinates"),
2628  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2629 
2630  // how much weight percentage should a MPI put left side of the each cutline
2631  this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632  device_t>(Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2633 
2634  // how much weight percentage should each thread in MPI put left side of
2635  // each outline
2636  this->thread_cut_line_weight_to_put_left =
2637  Kokkos::View<mj_scalar_t*, device_t>(
2638  Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2639 
2640  if(this->distribute_points_on_cut_lines) {
2641  this->process_cut_line_weight_to_put_left =
2642  Kokkos::View<mj_scalar_t *, device_t>(
2643  Kokkos::ViewAllocateWithoutInitializing(
2644  "process_cut_line_weight_to_put_left"),
2645  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646  this->thread_cut_line_weight_to_put_left =
2647  Kokkos::View<mj_scalar_t *, device_t>(
2648  Kokkos::ViewAllocateWithoutInitializing(
2649  "thread_cut_line_weight_to_put_left"),
2650  this->max_num_cut_along_dim);
2651  this->process_rectilinear_cut_weight =
2652  Kokkos::View<mj_scalar_t *, device_t>(
2653  Kokkos::ViewAllocateWithoutInitializing("process_rectilinear_cut_weight"),
2654  this->max_num_cut_along_dim);
2655  this->global_rectilinear_cut_weight =
2656  Kokkos::View<mj_scalar_t *, device_t>(
2657  Kokkos::ViewAllocateWithoutInitializing("global_rectilinear_cut_weight"),
2658  this->max_num_cut_along_dim);
2659  }
2660 
2661  // work array to manipulate coordinate of cutlines in different iterations.
2662  // necessary because previous cut line information is used for determining
2663  // the next cutline information. therefore, cannot update the cut work array
2664  // until all cutlines are determined.
2665  this->cut_coordinates_work_array =
2666  Kokkos::View<mj_scalar_t *, device_t>(
2667  Kokkos::ViewAllocateWithoutInitializing("cut_coordinates_work_array"),
2668  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2669 
2670  // cumulative part weight array.
2671  this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672  Kokkos::ViewAllocateWithoutInitializing("target_part_weights"),
2673  this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2674 
2675  // upper bound coordinate of a cut line
2676  this->cut_upper_bound_coordinates =
2677  Kokkos::View<mj_scalar_t*, device_t>(
2678  Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_coordinates"),
2679  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2680 
2681  // lower bound coordinate of a cut line
2682  this->cut_lower_bound_coordinates =
2683  Kokkos::View<mj_scalar_t*, device_t>(
2684  Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_coordinates"),
2685  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2686 
2687  // lower bound weight of a cut line
2688  this->cut_lower_bound_weights =
2689  Kokkos::View<mj_scalar_t*, device_t>(
2690  Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_weights"),
2691  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2692 
2693  //upper bound weight of a cut line
2694  this->cut_upper_bound_weights =
2695  Kokkos::View<mj_scalar_t*, device_t>(
2696  Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_weights"),
2697  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2698 
2699  // combined array to exchange the min and max coordinate,
2700  // and total weight of part.
2701  this->process_local_min_max_coord_total_weight =
2702  Kokkos::View<mj_scalar_t*, device_t>(
2703  Kokkos::ViewAllocateWithoutInitializing(
2704  "process_local_min_max_coord_total_weight"),
2705  3 * this->max_concurrent_part_calculation);
2706 
2707  // global combined array with the results for min, max and total weight.
2708  this->global_min_max_coord_total_weight =
2709  Kokkos::View<mj_scalar_t*, device_t>(
2710  Kokkos::ViewAllocateWithoutInitializing("global_min_max_coord_total_weight"),
2711  3 * this->max_concurrent_part_calculation);
2712 
2713  // is_cut_line_determined is used to determine if a cutline is
2714  // determined already. If a cut line is already determined, the next
2715  // iterations will skip this cut line.
2716  this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717  Kokkos::ViewAllocateWithoutInitializing("is_cut_line_determined"),
2718  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2719 
2720  // incomplete_cut_count count holds the number of cutlines that have not
2721  // been finalized for each part when concurrentPartCount>1, using this
2722  // information, if incomplete_cut_count[x]==0, then no work is done for
2723  // this part.
2724  this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725  Kokkos::ViewAllocateWithoutInitializing("device_incomplete_cut_count"),
2726  this->max_concurrent_part_calculation);
2727  this->incomplete_cut_count =
2728  Kokkos::create_mirror_view(device_incomplete_cut_count);
2729 
2730  // local part weights of each thread.
2731  this->thread_part_weights = Kokkos::View<double *, device_t>(
2732  Kokkos::ViewAllocateWithoutInitializing("thread_part_weights"),
2733  this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2734 
2735  this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736  Kokkos::ViewAllocateWithoutInitializing("thread_cut_left_closest_point"),
2737  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2738 
2739  // thread_cut_right_closest_point to hold the closest coordinate to a
2740  // cutline from right (for each thread)
2741  this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742  Kokkos::ViewAllocateWithoutInitializing("thread_cut_right_closest_point"),
2743  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2744 
2745  // to store how many points in each part a thread has.
2746  this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747  Kokkos::ViewAllocateWithoutInitializing("thread_point_counts"),
2748  this->max_num_part_along_dim);
2749 
2750  // for faster communication, concatanation of
2751  // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2752  // leftClosest distances sized P-1, since P-1 cut lines
2753  // rightClosest distances size P-1, since P-1 cut lines.
2754  this->total_part_weight_left_right_closests =
2755  Kokkos::View<mj_scalar_t*, device_t>(
2756  Kokkos::ViewAllocateWithoutInitializing(
2757  "total_part_weight_left_right_closests"),
2758  (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759  this->max_concurrent_part_calculation);
2760 
2761  this->global_total_part_weight_left_right_closests =
2762  Kokkos::View<mj_scalar_t*, device_t>(
2763  Kokkos::ViewAllocateWithoutInitializing(
2764  "global_total_part_weight_left_right_closests"),
2765  (this->max_num_total_part_along_dim +
2766  this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2767 
2768  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769  Kokkos::ViewAllocateWithoutInitializing("gids"), num_local_coords);
2770 
2771  this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772  Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
2773  num_local_coords);
2774 
2775  // changes owners back to host - so we don't run them on device
2776  // this improves migration code but means we have to serial init here.
2777  // Note we might allow this to be OpenMP when available even for CUDA.
2778  Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2779 
2780  auto local_current_mj_gnos = current_mj_gnos;
2781  auto local_initial_mj_gnos = initial_mj_gnos;
2782  Kokkos::parallel_for(
2783  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784  (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785  local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2786  });
2787 }
2788 
2789 /* \brief compute the global bounding box
2790  */
2791 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2792  typename mj_part_t, typename mj_node_t>
2793 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794  mj_node_t>::compute_global_box()
2795 {
2796  //local min coords
2797  mj_scalar_t *mins = new mj_scalar_t[this->coord_dim];
2798  //global min coords
2799  mj_scalar_t *gmins = new mj_scalar_t[this->coord_dim];
2800  //local max coords
2801  mj_scalar_t *maxs = new mj_scalar_t[this->coord_dim];
2802  //global max coords
2803  mj_scalar_t *gmaxs = new mj_scalar_t[this->coord_dim];
2804 
2805  auto local_mj_coordinates = this->mj_coordinates;
2806 
2807  // If we are only doing 2 parts then we don't need these values
2808  // for y and z. Init them all to 0 first
2809  for(int i = 0; i < this->coord_dim; ++i) {
2810  mins[i] = 0;
2811  maxs[i] = 0;
2812  }
2813 
2814  for(int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815  Kokkos::parallel_reduce("MinReduce",
2816  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817  (0, this->num_local_coords),
2818  KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819  if(local_mj_coordinates(j,i) < running_min) {
2820  running_min = local_mj_coordinates(j,i);
2821  }
2822  }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823  Kokkos::parallel_reduce("MaxReduce",
2824  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825  (0, this->num_local_coords),
2826  KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827  if(local_mj_coordinates(j,i) > running_max) {
2828  running_max = local_mj_coordinates(j,i);
2829  }
2830  }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2831  }
2832 
2833  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834  this->coord_dim, mins, gmins
2835  );
2836 
2837  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838  this->coord_dim, maxs, gmaxs
2839  );
2840 
2841  //create single box with all areas.
2842  global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2843  //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2844  delete [] mins;
2845  delete [] gmins;
2846  delete [] maxs;
2847  delete [] gmaxs;
2848 }
2849 
2850 /* \brief for part communication we keep track of the box boundaries.
2851  * This is performed when either asked specifically, or when geometric mapping
2852  * is performed afterwards.
2853  * This function initializes a single box with all global min, max coordinates.
2854  * \param initial_partitioning_boxes the input and output vector for boxes.
2855  */
2856 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2857  typename mj_part_t, typename mj_node_t>
2858 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859  mj_node_t>::init_part_boxes(
2860  RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2861 {
2862  mj_partBox_t tmp_box(*global_box);
2863  initial_partitioning_boxes->push_back(tmp_box);
2864 }
2865 
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t,
          typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  mj_get_local_min_max_coord_totW(
  mj_part_t current_work_part,
  mj_part_t current_concurrent_num_parts,
  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
{
  // Local aliases so the lambdas below capture views by value
  // instead of capturing `this`.
  auto local_coordinate_permutations = this->coordinate_permutations;
  auto local_process_local_min_max_coord_total_weight =
    this->process_local_min_max_coord_total_weight;
  auto local_mj_weights = this->mj_weights;

  bool bUniformWeights = mj_uniform_weights(0);

  // Process the concurrent parts [current_work_part,
  // current_work_part + current_concurrent_num_parts).
  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {

    mj_part_t concurrent_current_part = current_work_part + kk;
    // host_part_xadj holds exclusive end offsets of each part in the
    // permutation array; part 0 starts at 0.
    mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
      host_part_xadj(concurrent_current_part - 1);
    mj_lno_t coordinate_end_index =
      host_part_xadj(concurrent_current_part);

    mj_scalar_t my_min_coord = 0;
    mj_scalar_t my_max_coord = 0;
    mj_scalar_t my_total_weight;
    //if the part is empty.
    //set the min and max coordinates as reverse.
    if(coordinate_begin_index >= coordinate_end_index)
    {
      my_min_coord = std::numeric_limits<mj_scalar_t>::max();
      my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
      my_total_weight = 0;
    }
    else {
      // get min
      Kokkos::parallel_reduce("get min",
        Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
        (coordinate_begin_index, coordinate_end_index),
        KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
        int i = local_coordinate_permutations(j);
        if(mj_current_dim_coords(i) < running_min)
          running_min = mj_current_dim_coords(i);
      }, Kokkos::Min<mj_scalar_t>(my_min_coord));
      // get max
      Kokkos::parallel_reduce("get max",
        Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
        (coordinate_begin_index, coordinate_end_index),
        KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
        int i = local_coordinate_permutations(j);
        if(mj_current_dim_coords(i) > running_max)
          running_max = mj_current_dim_coords(i);
      }, Kokkos::Max<mj_scalar_t>(my_max_coord));
      if(bUniformWeights) {
        // With uniform weights the total weight is just the point count.
        my_total_weight = coordinate_end_index - coordinate_begin_index;
      }
      else {
        my_total_weight = 0;
        Kokkos::parallel_reduce("get weight",
          Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
          (coordinate_begin_index, coordinate_end_index),
          KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
          int i = local_coordinate_permutations(j);
          lsum += local_mj_weights(i,0);
        }, my_total_weight);
      }
    }

    // single write
    // Store results on device in the layout [mins | maxes | weights],
    // each segment current_concurrent_num_parts long.
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
      (0, 1), KOKKOS_LAMBDA (int dummy) {
      local_process_local_min_max_coord_total_weight(kk) =
        my_min_coord;
      local_process_local_min_max_coord_total_weight(
        kk + current_concurrent_num_parts) = my_max_coord;
      local_process_local_min_max_coord_total_weight(
        kk + 2*current_concurrent_num_parts) = my_total_weight;
    });
  }
}
2952 
// Reduces this rank's per-part [min coords | max coords | total weights]
// array (three segments, each current_concurrent_num_parts long) into the
// matching global array across all MPI ranks. With a single rank the data
// is simply copied local -> global on device.
//   current_concurrent_num_parts : number of parts processed concurrently
//   local_min_max_total  : input, length 3 * current_concurrent_num_parts
//   global_min_max_total : output, same layout as the input
2965 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2966  typename mj_part_t, typename mj_node_t>
2967 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968  mj_node_t>::mj_get_global_min_max_coord_totW(
2969  mj_part_t current_concurrent_num_parts,
2970  Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971  Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2972  // reduce min for first current_concurrent_num_parts elements, reduce
2973  // max for next concurrentPartCount elements, reduce sum for the last
2974  // concurrentPartCount elements.
2975  if(this->comm->getSize() > 1) {
2976  // We're using explicit host here as Spectrum MPI would fail
2977  // with the prior HostMirror UVMSpace to UVMSpace setup.
2978  auto host_local_min_max_total =
2979  Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980  auto host_global_min_max_total =
2981  Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982  Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
// NOTE(review): the declaration line for 'reductionOp' (embedded line 2983,
// presumably a Teuchos min/max/sum combined reduction op) was lost in this
// extracted view; only its constructor-style argument list survives below.
2984  reductionOp(current_concurrent_num_parts,
2985  current_concurrent_num_parts, current_concurrent_num_parts);
2986  try {
// Single all-reduce over all three segments at once.
2987  reduceAll<int, mj_scalar_t>(
2988  *(this->comm),
2989  reductionOp,
2990  3 * current_concurrent_num_parts,
2991  host_local_min_max_total.data(),
2992  host_global_min_max_total.data());
2993  }
2994  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2995  Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2996  }
2997  else {
// Single process: no MPI reduction needed; copy all 3*N entries on device.
2998  mj_part_t s = 3 * current_concurrent_num_parts;
2999  Kokkos::parallel_for(
3000  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001  (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002  global_min_max_total(i) = local_min_max_total(i);
3003  });
3004  }
3005 }
3006 
3039 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3040  typename mj_part_t, typename mj_node_t>
3041 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3042  mj_get_initial_cut_coords_target_weights(
3043  mj_scalar_t min_coord,
3044  mj_scalar_t max_coord,
3045  mj_part_t num_cuts/*p-1*/ ,
3046  mj_scalar_t global_weight,
3047  /*p - 1 sized, coordinate of each cut line*/
3048  Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3049  /*cumulative weights, at left side of each cut line. p-1 sized*/
3050  Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051  std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
3052  std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053  mj_part_t concurrent_current_part,
3054  mj_part_t obtained_part_index,
3055  mj_part_t num_target_first_level_parts,
3056  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3057 {
3058  mj_scalar_t coord_range = max_coord - min_coord;
3059 
3060  // We decided we could keep some std::vectors around for now. Eventually
3061  // it would be nice to have everything just as views with some being device
3062  // and some host. This particular case needs a bit of work to get setup
3063  // in a cleaner way so not going to mess with it at the moment.
3064 
3065  bool bUniformPartsCheck =
3066  num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3067 
3068  if(!bUniformPartsCheck) {
3069  bool bValidNonUniformTargetWeights =
3070  (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071  if(!bValidNonUniformTargetWeights) {
3072  std::cerr << "MJ does not support non uniform part weights beyond the first partition" << std::endl;
3073  std::terminate();
3074  }
3075  }
3076 
3077  Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078  "device_cumulative", num_cuts);
3079  auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3080 
3081  mj_scalar_t cumulative = 0;
3082 
3083  if(bUniformPartsCheck) {
3084  // How many total future parts the part will be partitioned into.
3085  mj_scalar_t total_future_part_count_in_part =
3086  static_cast<mj_scalar_t>((*future_num_part_in_parts)[concurrent_current_part]);
3087 
3088  // How much each part should weigh in ideal case.
3089  mj_scalar_t unit_part_weight =
3090  global_weight / total_future_part_count_in_part;
3091 
3092  for(mj_part_t i = 0; i < num_cuts; ++i) {
3093  cumulative += unit_part_weight * static_cast<mj_scalar_t>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094  host_cumulative(i) = cumulative;
3095  }
3096  }
3097  else {
3098  // Sum of entries in the first level partition distribution vector
3099  mj_scalar_t sum_target_first_level_dist = 0.0;
3100  for (int i = 0; i < num_target_first_level_parts; ++i) {
3101  sum_target_first_level_dist += target_first_level_dist(i);
3102  }
3103 
3104  for(mj_part_t i = 0; i < num_cuts; ++i) {
3105  cumulative += global_weight * target_first_level_dist(i) /
3106  sum_target_first_level_dist;
3107  host_cumulative(i) = cumulative;
3108  }
3109  }
3110 
3111  Kokkos::deep_copy(device_cumulative, host_cumulative);
3112 
3113  Kokkos::parallel_for("Write num in parts",
3114  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115  (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3116  // set target part weight.
3117  current_target_part_weights(cut) = device_cumulative(cut);
3118  initial_cut_coords(cut) = min_coord +
3119  (coord_range * device_cumulative(cut)) / global_weight;
3120  // set this multiple times but here for device handling
3121  current_target_part_weights(num_cuts) = global_weight;
3122  });
3123 
3124  // round the target part weights.
3125  // Note need to discuss regarding DragonFly commits and determine if we
3126  // would not simply check mj_uniform_weights here.
3127  if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128  Kokkos::parallel_for(
3129  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3130  (0, num_cuts + 1),
3131  KOKKOS_LAMBDA (mj_part_t i) {
3132  current_target_part_weights(i) =
3133  long(current_target_part_weights(i) + 0.5);
3134  });
3135  }
3136 }
3137 
3154 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3155  typename mj_part_t, typename mj_node_t>
3156 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3157  set_initial_coordinate_parts(
3158  mj_scalar_t &max_coordinate,
3159  mj_scalar_t &min_coordinate,
3160  mj_lno_t coordinate_begin_index,
3161  mj_lno_t coordinate_end_index,
3162  Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164  Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165  mj_part_t &partition_count)
3166 {
3167  mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3168 
3169  // if there is single point, or if all points are along a line.
3170  // set initial part to 0 for all.
3171  if(std::abs(coordinate_range) < this->sEpsilon ) {
3172  Kokkos::parallel_for(
3173  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174  (coordinate_begin_index, coordinate_end_index),
3175  KOKKOS_LAMBDA (mj_lno_t ii) {
3176  mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3177  });
3178  }
3179  else {
3180  // otherwise estimate an initial part for each coordinate.
3181  // assuming uniform distribution of points.
3182  mj_scalar_t slice = coordinate_range / partition_count;
3183  Kokkos::parallel_for(
3184  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185  (coordinate_begin_index, coordinate_end_index),
3186  KOKKOS_LAMBDA (mj_lno_t ii) {
3187  mj_lno_t iii = mj_current_coordinate_permutations[ii];
3188  mj_part_t pp =
3189  mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190  if(pp >= partition_count) {
3191  pp = partition_count - 1; // don't want last coord in an invalid part
3192  }
3193  mj_part_ids[iii] = 2 * pp;
3194  });
3195  }
3196 }
3197 
// Performs 1D partitioning for the current group of concurrent parts:
// iteratively refines the cut-line positions along the current dimension
// until every cut satisfies the imbalance tolerance (i.e. until
// total_incomplete_cut_count reaches zero). Each iteration computes local
// part weights / closest points on device and, with multiple ranks,
// combines them across ranks via a Teuchos reduceAll.
3212 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3213  typename mj_part_t, typename mj_node_t>
3214 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216  double used_imbalance_tolerance,
3217  mj_part_t current_work_part,
3218  mj_part_t current_concurrent_num_parts,
3219  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220  mj_part_t total_incomplete_cut_count,
3221  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222  Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3223 {
3224  this->temp_cut_coords = current_cut_coordinates;
3225 
// NOTE(review): the declaration line for 'reductionOp' (embedded line 3226,
// a pointer to Teuchos::MultiJaggedCombinedReductionOp per the allocation
// below) was lost in this extracted view; only the initializer survives.
3227  *reductionOp = NULL;
3228 
3229  bool bSingleProcess = (this->comm->getSize() == 1);
3230 
3231  std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232  if(!bSingleProcess) {
3233  for(size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234  temp[n] = host_num_partitioning_in_current_dim(n);
3235  }
// The reduction op combines part-weight sums and left/right closest-point
// extrema in a single MPI all-reduce; only needed with multiple ranks.
3236  reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
3237  <mj_part_t, mj_scalar_t>(
3238  &temp,
3239  current_work_part,
3240  current_concurrent_num_parts);
3241  }
3242 
// Local copies of class members so device lambdas capture views by value.
3243  auto local_cut_lower_bound_coordinates =
3244  cut_lower_bound_coordinates;
3245  auto local_cut_upper_bound_coordinates =
3246  cut_upper_bound_coordinates;
3247  auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248  auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249  bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250  auto local_process_cut_line_weight_to_put_left =
3251  process_cut_line_weight_to_put_left;
3252  auto local_temp_cut_coords = temp_cut_coords;
3253  auto local_global_total_part_weight_left_right_closests =
3254  global_total_part_weight_left_right_closests;
3255  auto local_cut_coordinates_work_array =
3256  cut_coordinates_work_array;
3257  auto local_part_xadj = part_xadj;
3258  auto local_global_min_max_coord_total_weight =
3259  global_min_max_coord_total_weight;
3260  auto local_target_part_weights =
3261  target_part_weights;
3262  auto local_global_rectilinear_cut_weight =
3263  global_rectilinear_cut_weight;
3264  auto local_process_rectilinear_cut_weight =
3265  process_rectilinear_cut_weight;
3266 
3267  auto local_is_cut_line_determined = this->is_cut_line_determined;
3268  auto local_device_num_partitioning_in_current_dim =
3269  device_num_partitioning_in_current_dim;
3270 
// Single-iteration device kernel: initialize counters plus each cut's
// lower/upper coordinate and weight bounds from the global
// [min | max | total-weight] array (segments of size
// current_concurrent_num_parts).
3271  Kokkos::parallel_for(
3272  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273  KOKKOS_LAMBDA (int dummy) {
3274 
3275  // these need to be initialized
3276  view_rectilinear_cut_count(0) = 0;
3277  view_total_reduction_size(0) = 0;
3278 
3279  // initialize the lower and upper bounds of the cuts.
3280  mj_part_t next = 0;
3281  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282  mj_part_t num_part_in_dim =
3283  local_device_num_partitioning_in_current_dim(current_work_part + i);
3284  mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3285  view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3286 
3287  for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288  local_is_cut_line_determined(next) = false;
3289  // min coordinate
3290  local_cut_lower_bound_coordinates(next) =
3291  local_global_min_max_coord_total_weight(i);
3292  // max coordinate
3293  local_cut_upper_bound_coordinates(next) =
3294  local_global_min_max_coord_total_weight(
3295  i + current_concurrent_num_parts);
3296  // total weight
3297  local_cut_upper_bound_weights(next) =
3298  local_global_min_max_coord_total_weight(
3299  i + 2 * current_concurrent_num_parts);
3300  local_cut_lower_bound_weights(next) = 0;
3301  if(local_distribute_points_on_cut_lines) {
3302  local_process_cut_line_weight_to_put_left(next) = 0;
3303  }
3304  ++next;
3305  }
3306  }
3307  });
3308 
3309  // loop_count allows the kernel to behave differently on the first loop
3310  // and subsequent loops. First loop we do a binary search and subsequent
3311  // loops we simply step towards our target.
3312  int loop_count = 0;
3313  while (total_incomplete_cut_count != 0) {
// Compute this rank's part weights and closest points for every cut.
3314  this->mj_1D_part_get_part_weights(
3315  current_concurrent_num_parts,
3316  current_work_part,
3317  mj_current_dim_coords,
3318  loop_count);
3319  ++loop_count;
3320 
3321  this->mj_combine_rightleft_and_weights(
3322  current_work_part,
3323  current_concurrent_num_parts);
3324 
3325  // now sum up the results of mpi processors.
3326  if(!bSingleProcess) {
3327  // We're using explicit host here as Spectrum MPI would fail
3328  // with the prior HostMirror UVMSpace to UVMSpace setup.
3329  auto host_total_part_weight_left_right_closests =
3330  Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331  total_part_weight_left_right_closests);
3332  auto host_global_total_part_weight_left_right_closests =
3333  Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334  global_total_part_weight_left_right_closests);
3335 
3336  Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337  total_part_weight_left_right_closests);
3338 
// Read the (device-resident) reduction size back to host for reduceAll.
3339  size_t host_view_total_reduction_size;
3340  Kokkos::parallel_reduce("Read single",
3341  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342  KOKKOS_LAMBDA(int dummy, size_t & set_single) {
3343  set_single = view_total_reduction_size(0);
3344  }, host_view_total_reduction_size);
3345 
3346  reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347  host_view_total_reduction_size,
3348  host_total_part_weight_left_right_closests.data(),
3349  host_global_total_part_weight_left_right_closests.data());
3350  Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351  host_global_total_part_weight_left_right_closests);
3352  }
3353  else {
// Single rank: the local array already holds the "global" totals.
3354  local_global_total_part_weight_left_right_closests =
3355  this->total_part_weight_left_right_closests;
3356  }
3357 
3358  // how much cut will be shifted for the next part in the concurrent
3359  // part calculation.
3360  mj_part_t cut_shift = 0;
3361 
3362  // how much the concantaneted array will be shifted for the next part
3363  // in concurrent part calculation.
3364  size_t tlr_shift = 0;
3365 
3366  Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367  save_initial_incomplete_cut_count("save_initial_incomplete_cut_count",
3368  current_concurrent_num_parts);
3369 
// For each concurrent part, slice out its segment of every work array
// (via subviews) and recompute that part's cut coordinates.
3370  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3371 
3372  mj_part_t num_parts =
3373  host_num_partitioning_in_current_dim(current_work_part + kk);
3374 
3375  mj_part_t num_cuts = num_parts - 1;
3376  size_t num_total_part = num_parts + size_t (num_cuts);
3377 
3378  //if the cuts of this cut has already been completed.
3379  //nothing to do for this part.
3380  //just update the shift amount and proceed.
3381  mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3382 
3383  if(kk_incomplete_cut_count == 0) {
3384  cut_shift += num_cuts;
3385  tlr_shift += (num_total_part + 2 * num_cuts);
3386  continue;
3387  }
3388 
3389  Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390  Kokkos::subview(this->total_part_weight_left_right_closests,
3391  std::pair<mj_lno_t, mj_lno_t>(
3392  tlr_shift,
3393  this->total_part_weight_left_right_closests.size()));
3394 
// Global tlr layout per part: [part weights | left closest | right closest].
3395  Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3396  Kokkos::subview(
3397  local_global_total_part_weight_left_right_closests,
3398  std::pair<mj_lno_t, mj_lno_t>(
3399  tlr_shift,
3400  local_global_total_part_weight_left_right_closests.size()));
3401  Kokkos::View<mj_scalar_t *, device_t>
3402  current_global_left_closest_points =
3403  Kokkos::subview(current_global_tlr,
3404  std::pair<mj_lno_t, mj_lno_t>(
3405  num_total_part,
3406  current_global_tlr.size()));
3407  Kokkos::View<mj_scalar_t *, device_t>
3408  current_global_right_closest_points =
3409  Kokkos::subview(current_global_tlr,
3410  std::pair<mj_lno_t, mj_lno_t>(
3411  num_total_part + num_cuts,
3412  current_global_tlr.size()));
3413  Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3414  current_global_tlr;
3415 
3416  Kokkos::View<bool *, device_t> current_cut_line_determined =
3417  Kokkos::subview(this->is_cut_line_determined,
3418  std::pair<mj_lno_t, mj_lno_t>(
3419  cut_shift,
3420  this->is_cut_line_determined.size()));
// Target weights are (num_cuts + 1) per part, hence the extra +kk offset.
3421  Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422  Kokkos::subview(local_target_part_weights,
3423  std::pair<mj_lno_t, mj_lno_t>(
3424  cut_shift + kk,
3425  local_target_part_weights.size()));
3426  Kokkos::View<mj_scalar_t *, device_t>
3427  current_part_cut_line_weight_to_put_left =
3428  Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429  std::pair<mj_lno_t, mj_lno_t>(
3430  cut_shift,
3431  local_process_cut_line_weight_to_put_left.size()));
3432 
3433  save_initial_incomplete_cut_count(kk) =
3434  kk_incomplete_cut_count;
3435 
3436  Kokkos::View<mj_scalar_t *, device_t>
3437  current_cut_lower_bound_weights =
3438  Kokkos::subview(local_cut_lower_bound_weights,
3439  std::pair<mj_lno_t, mj_lno_t>(
3440  cut_shift,
3441  local_cut_lower_bound_weights.size()));
3442  Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443  Kokkos::subview(local_cut_upper_bound_weights,
3444  std::pair<mj_lno_t, mj_lno_t>(
3445  cut_shift,
3446  local_cut_upper_bound_weights.size()));
3447  Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448  Kokkos::subview(local_cut_upper_bound_coordinates,
3449  std::pair<mj_lno_t, mj_lno_t>(
3450  cut_shift,
3451  local_cut_upper_bound_coordinates.size()));
3452  Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453  Kokkos::subview(local_cut_lower_bound_coordinates,
3454  std::pair<mj_lno_t, mj_lno_t>(
3455  cut_shift,
3456  local_cut_lower_bound_coordinates.size()));
3457 
3458  // Now compute the new cut coordinates.
3459  Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460  Kokkos::subview(this->temp_cut_coords,
3461  std::pair<mj_lno_t, mj_lno_t>(
3462  cut_shift, this->temp_cut_coords.size()));
3463  Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464  Kokkos::subview(this->cut_coordinates_work_array,
3465  std::pair<mj_lno_t, mj_lno_t>(
3466  cut_shift, this->cut_coordinates_work_array.size()));
3467 
3468  this->mj_get_new_cut_coordinates(
3469  current_concurrent_num_parts,
3470  kk,
3471  num_cuts,
3472  used_imbalance_tolerance,
3473  current_global_part_weights,
3474  current_local_part_weights,
3475  current_part_target_weights,
3476  current_cut_line_determined,
3477  sub_temp_cut_coords,
3478  current_cut_upper_bounds,
3479  current_cut_lower_bounds,
3480  current_global_left_closest_points,
3481  current_global_right_closest_points,
3482  current_cut_lower_bound_weights,
3483  current_cut_upper_weights,
3484  sub_cut_coordinates_work_array,
3485  current_part_cut_line_weight_to_put_left,
3486  view_rectilinear_cut_count);
3487 
3488  cut_shift += num_cuts;
3489  tlr_shift += (num_total_part + 2 * num_cuts);
3490  } // end of kk loop
3491 
// Decrement the outstanding-cut counter by how many cuts were finished
// this iteration across all concurrent parts.
3492  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493  mj_part_t iteration_complete_cut_count =
3494  save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495  total_incomplete_cut_count -= iteration_complete_cut_count;
3496  }
3497 
// Swap the current and work cut-coordinate arrays element-wise on device
// (the views themselves cannot simply be swapped; other captures alias them).
3498  Kokkos::parallel_for(
3499  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500  (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(int n) {
3501  auto t = local_temp_cut_coords(n);
3502  local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503  local_cut_coordinates_work_array(n) = t;
3504  });
3505  } // end of the while loop
3506 
3507  // Needed only if keep_cuts; otherwise can simply swap array pointers
3508  // cutCoordinates and cutCoordinatesWork.
3509  // (at first iteration, cutCoordinates == cutCoorindates_tmp).
3510  // computed cuts must be in cutCoordinates.
3511  if(current_cut_coordinates != local_temp_cut_coords) {
3512  Kokkos::parallel_for(
3513  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514  (0, 1), KOKKOS_LAMBDA(int dummy) {
3515  mj_part_t next = 0;
3516  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517  mj_part_t num_parts = -1;
3518  num_parts = local_device_num_partitioning_in_current_dim(
3519  current_work_part + i);
3520  mj_part_t num_cuts = num_parts - 1;
3521  for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522  current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3523  }
3524  next += num_cuts;
3525  }
3526  for(int n = 0; n <
3527  static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528  local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3529  }
3530  });
3531  }
3532 
// Safe to delete NULL in the single-process case.
3533  delete reductionOp;
3534 }
3535 
// Thin wrapper around a raw scalar array pointer, used as the value type for
// the team-level Kokkos reductions below; the reducer's init() installs the
// real storage pointer into each copy.
// NOTE(review): the 'struct Zoltan2_MJArrayType {' header (embedded line
// 3537) and the operator= signature line (3548) were lost when this view was
// extracted; the surviving member bodies are kept byte-identical.
3536 template<class scalar_t>
3538  scalar_t * ptr;
3539 
3540  // With new kokkos setup parallel_reduce will call empty constructor and
3541  // we update the ptr in the init method.
3542  KOKKOS_INLINE_FUNCTION
3543  Zoltan2_MJArrayType() : ptr(NULL) {};
3544 
// Wrap an existing array; no ownership is taken.
3545  KOKKOS_INLINE_FUNCTION
3546  Zoltan2_MJArrayType(scalar_t * pSetPtr) : ptr(pSetPtr) {};
3547 
// Shallow pointer copy (assignment operator body; signature line lost).
3549  ptr = zmj.ptr;
3550  return *this;
3551  }
3552 };
3553 
3554 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3555 
// Custom reducer for the non-GPU reduction path: combines per-thread partial
// arrays by summing the leading weight segment and taking extrema over the
// trailing right/left closest-coordinate pairs.
// NOTE(review): several declaration lines (the struct header, the value_type
// typedef, member declarations 'value', 'value_count_rightleft',
// 'value_count_weights', the reference() signature, and one loop-bound line
// in init()) were lost in extraction; surviving bodies are byte-identical.
3556 template<class policy_t, class scalar_t, class part_t>
3558 
3561  scalar_t max_scalar;
3565 
// Stores the destination array and the segment sizes used by join()/init().
3566  KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(
3567  scalar_t mj_max_scalar,
3568  value_type &val,
3569  int mj_value_count_rightleft,
3570  int mj_value_count_weights) :
3571  max_scalar(mj_max_scalar),
3572  value(&val),
3573  value_count_rightleft(mj_value_count_rightleft),
3574  value_count_weights(mj_value_count_weights)
3575  {}
3576 
// reference() body (signature line lost in extraction).
3577  KOKKOS_INLINE_FUNCTION
3579  return *value;
3580  }
3581 
3582  KOKKOS_INLINE_FUNCTION
3583  void join(value_type& dst, const value_type& src) const {
// Weight counters accumulate by summation.
3584  for(int n = 0; n < value_count_weights; ++n) {
3585  dst.ptr[n] += src.ptr[n];
3586  }
3587 
// Closest-point pairs: keep max at offset n, min at offset n+1. The
// first and last pairs of the segment are padding and are skipped.
3588  for(int n = value_count_weights + 2;
3589  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3590  if(src.ptr[n] > dst.ptr[n]) {
3591  dst.ptr[n] = src.ptr[n];
3592  }
3593  if(src.ptr[n+1] < dst.ptr[n+1]) {
3594  dst.ptr[n+1] = src.ptr[n+1];
3595  }
3596  }
3597  }
3598 
// volatile overload retained for the older Kokkos reducer interface.
3599  KOKKOS_INLINE_FUNCTION
3600  void join (volatile value_type& dst, const volatile value_type& src) const {
3601  for(int n = 0; n < value_count_weights; ++n) {
3602  dst.ptr[n] += src.ptr[n];
3603  }
3604 
3605  for(int n = value_count_weights + 2;
3606  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3607  if(src.ptr[n] > dst.ptr[n]) {
3608  dst.ptr[n] = src.ptr[n];
3609  }
3610  if(src.ptr[n+1] < dst.ptr[n+1]) {
3611  dst.ptr[n+1] = src.ptr[n+1];
3612  }
3613  }
3614  }
3615 
// Identity element: zero weights; extrema slots seeded with -/+ max_scalar
// so any real coordinate replaces them. (Loop-bound line 3624 lost.)
3616  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
3617  dst.ptr = value->ptr; // must update ptr
3618 
3619  for(int n = 0; n < value_count_weights; ++n) {
3620  dst.ptr[n] = 0;
3621  }
3622 
3623  for(int n = value_count_weights;
3625  dst.ptr[n] = -max_scalar;
3626  dst.ptr[n+1] = max_scalar;
3627  }
3628  }
3629 };
3630 #endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP
3631 
3632 template<class policy_t, class scalar_t, class part_t, class index_t,
3633  class device_t, class array_t>
3635  typedef typename policy_t::member_type member_type;
3636  typedef Kokkos::View<scalar_t*> scalar_view_t;
3637 
3638 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3639  typedef array_t value_type[];
3640 #endif
3641 
3643  array_t max_scalar;
3644 
3652  Kokkos::View<index_t*, device_t> permutations;
3653  Kokkos::View<scalar_t *, device_t> coordinates;
3654  Kokkos::View<scalar_t**, device_t> weights;
3655  Kokkos::View<part_t*, device_t> parts;
3656  Kokkos::View<scalar_t *, device_t> cut_coordinates;
3657  Kokkos::View<index_t *, device_t> part_xadj;
3659  scalar_t sEpsilon;
3660 
3661 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3662  Kokkos::View<double *, device_t> current_part_weights;
3663  Kokkos::View<scalar_t *, device_t> current_left_closest;
3664  Kokkos::View<scalar_t *, device_t> current_right_closest;
3665 #endif // KOKKOS_ENABLE_CUDA || defined(KOKKOS_ENABLE_HIP)
3666 
3668  int mj_loop_count,
3669  array_t mj_max_scalar,
3670  part_t mj_concurrent_current_part,
3671  part_t mj_num_cuts,
3672  part_t mj_current_work_part,
3673  part_t mj_current_concurrent_num_parts,
3674  part_t mj_left_right_array_size,
3675  part_t mj_weight_array_size,
3676  Kokkos::View<index_t*, device_t> & mj_permutations,
3677  Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3678  Kokkos::View<scalar_t**, device_t> & mj_weights,
3679  Kokkos::View<part_t*, device_t> & mj_parts,
3680  Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3681  Kokkos::View<index_t *, device_t> & mj_part_xadj,
3682  bool mj_uniform_weights0,
3683  scalar_t mj_sEpsilon
3684 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3685  ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3686  Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3687  Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3688 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3689  ) :
3690  loop_count(mj_loop_count),
3691  max_scalar(mj_max_scalar),
3692  concurrent_current_part(mj_concurrent_current_part),
3693  num_cuts(mj_num_cuts),
3694  current_work_part(mj_current_work_part),
3695  current_concurrent_num_parts(mj_current_concurrent_num_parts),
3696  value_count_rightleft(mj_left_right_array_size),
3697  value_count_weights(mj_weight_array_size),
3698  value_count(mj_weight_array_size+mj_left_right_array_size),
3699  permutations(mj_permutations),
3700  coordinates(mj_coordinates),
3701  weights(mj_weights),
3702  parts(mj_parts),
3703  cut_coordinates(mj_cut_coordinates),
3704  part_xadj(mj_part_xadj),
3705  uniform_weights0(mj_uniform_weights0),
3706  sEpsilon(mj_sEpsilon)
3707 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3708  ,current_part_weights(mj_current_part_weights),
3709  current_left_closest(mj_current_left_closest),
3710  current_right_closest(mj_current_right_closest)
3711 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3712  {
3713  }
3714 
3715  size_t team_shmem_size (int team_size) const {
3716 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3717  int result = sizeof(array_t) *
3719 #else
3720  int result = sizeof(array_t) *
3722 #endif
3723 
3724  // pad this to a multiple of 8 or it will run corrupt
3725  int remainder = result % 8;
3726  if(remainder != 0) {
3727  result += 8 - remainder;
3728  }
3729  return result;
3730  }
3731 
3732  KOKKOS_INLINE_FUNCTION
3733 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3734  void operator() (const member_type & teamMember) const {
3735 #else
3736  void operator() (const member_type & teamMember, value_type teamSum) const {
3737 #endif
3738 
3739  index_t all_begin = (concurrent_current_part == 0) ? 0 :
3741  index_t all_end = part_xadj(concurrent_current_part);
3742 
3743  index_t num_working_points = all_end - all_begin;
3744  int num_teams = teamMember.league_size();
3745 
3746  index_t stride = num_working_points / num_teams;
3747  if((num_working_points % num_teams) > 0) {
3748  stride += 1; // make sure we have coverage for the final points
3749  }
3750 
3751  // the last team may have less work than the other teams
3752  // the last team can be empty (begin > end) if num_teams > stride
3753  // which is true for many teams and small numbers of coords (tests)
3754  index_t begin = all_begin + stride * teamMember.league_rank();
3755  index_t end = begin + stride;
3756  if(end > all_end) {
3757  end = all_end;
3758  }
3759 
3760 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3761  size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3763 
3764  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3765  sh_mem_size);
3766 
3767  // init the shared array to 0
3768  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3769  for(int n = 0; n < value_count_weights; ++n) {
3770  shared_ptr[n] = 0;
3771  }
3772  for(int n = value_count_weights;
3774  shared_ptr[n] = -max_scalar;
3775  shared_ptr[n+1] = max_scalar;
3776  }
3777  });
3778  teamMember.team_barrier();
3779 
3780  Kokkos::parallel_for(
3781  Kokkos::TeamThreadRange(teamMember, begin, end),
3782  [=] (index_t ii) {
3783 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3784  // create the team shared data - each thread gets one of the arrays
3785  size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3786  value_count_rightleft) * teamMember.team_size();
3787 
3788  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3789  sh_mem_size);
3790 
3791  // select the array for this thread
3792  Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
3794 
3795  // create reducer which handles the Zoltan2_MJArrayType class
3797  max_scalar, array,
3800 
3801  Kokkos::parallel_reduce(
3802  Kokkos::TeamThreadRange(teamMember, begin, end),
3803  [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3804 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3805 
3806  int i = permutations(ii);
3807  scalar_t coord = coordinates(i);
3808  array_t w = uniform_weights0 ? 1 : (array_t) weights(i,0);
3809 
3810  // now check each part and it's right cut
3811  index_t part = parts(i)/2;
3812 
3813  int upper = num_cuts;
3814  int lower = 0;
3815 
3816  // binary search - find matching part
3817  while(true) {
3818  scalar_t a = (part == 0) ? -max_scalar : cut_coordinates(part-1);
3819  scalar_t b = (part == num_cuts) ? max_scalar : cut_coordinates(part);
3820 
3821  if(coord >= a + sEpsilon && coord <= b - sEpsilon) {
3822 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3823  Kokkos::atomic_add(&shared_ptr[part*2], w);
3824 #else
3825  threadSum.ptr[part*2] += w;
3826 #endif
3827 
3828  parts(i) = part*2;
3829 
3830  // now handle the left/right closest part
3831 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3832  array_t new_value = (array_t) coord;
3833  array_t prev_value = shared_ptr[value_count_weights + part * 2 + 1];
3834  while(new_value < prev_value) {
3835  prev_value = Kokkos::atomic_compare_exchange(
3836  &shared_ptr[value_count_weights + part * 2 + 1],
3837  prev_value, new_value);
3838  }
3839  prev_value = shared_ptr[value_count_weights + part * 2 + 2];
3840  while(new_value > prev_value) {
3841  prev_value = Kokkos::atomic_compare_exchange(
3842  &shared_ptr[value_count_weights + part * 2 + 2],
3843  prev_value, new_value);
3844  }
3845 #else
3846  // note cut to left needs to set right closest and cut to right needs
3847  // to set left closest. It's index +1 and +2 instead of -1 and +0
3848  // because right/left segment is padded with an extra pair at
3849  // begining and end to avoid branching with if checks.
3850  if(coord < threadSum.ptr[value_count_weights + part * 2 + 1]) {
3851  threadSum.ptr[value_count_weights + part * 2 + 1] = coord;
3852  }
3853  if(coord > threadSum.ptr[value_count_weights + part * 2 + 2]) {
3854  threadSum.ptr[value_count_weights + part * 2 + 2] = coord;
3855  }
3856 #endif
3857 
3858  break;
3859  }
3860  else if(part != num_cuts) {
3861  if(coord < b + sEpsilon && coord > b - sEpsilon) {
3862  // Note if on cut we set right/left closest to the cut itself
3863  // but we add +2 because we buffered the area with an extra slot
3864  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3865 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3866  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3867  shared_ptr[value_count_weights + part * 2 + 2] = b;
3868  shared_ptr[value_count_weights + part * 2 + 3] = b;
3869 #else
3870  threadSum.ptr[part*2+1] += w;
3871  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3872  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3873 #endif
3874 
3875  parts(i) = part*2+1;
3876 
3877  // Need to scan up for any other cuts of same coordinate
3878  // This is costly but it's only relevant for the fix4785 test
3879  // which loads a lot of coordinates on the same point, so without
3880  // this our cuts would all just sit at 0.
3881  part_t base_b = part;
3882  scalar_t base_coord = cut_coordinates(base_b);
3883  part += 1;
3884  while(part < num_cuts) {
3885  b = cut_coordinates(part);
3886  scalar_t delta = b - base_coord;
3887  if(delta < 0) delta = -delta;
3888  if(delta < sEpsilon) {
3889  // Note if on cut we set right/left closest to the cut itself
3890  // but we add +2 because we buffered the area with an extra slot
3891  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3892 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3893  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3894  shared_ptr[value_count_weights + part * 2 + 2] = b;
3895  shared_ptr[value_count_weights + part * 2 + 3] = b;
3896 #else
3897  threadSum.ptr[part*2+1] += w;
3898  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3899  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3900 #endif
3901  }
3902  else { break; }
3903  ++part;
3904  }
3905  part = base_b - 1;
3906  while(part >= 0) {
3907  b = cut_coordinates(part);
3908  scalar_t delta = b - base_coord;
3909  if(delta < 0) delta = -delta;
3910  if(delta < sEpsilon) {
3911  // Note if on cut we set right/left closest to the cut itself
3912  // but we add +2 because we buffered the area with an extra slot
3913  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3914 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3915  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3916  shared_ptr[value_count_weights + part * 2 + 2] = b;
3917  shared_ptr[value_count_weights + part * 2 + 3] = b;
3918 #else
3919  threadSum.ptr[part*2+1] += w;
3920  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3921  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3922 #endif
3923  }
3924  else { break; }
3925  --part;
3926  }
3927 
3928  break;
3929  }
3930  }
3931 
3932  if(loop_count != 0) {
3933  // subsequent loops can just step towards target
3934  if(coord < b) {
3935  part -= 1;
3936  }
3937  else {
3938  part += 1;
3939  }
3940  }
3941  else {
3942  // initial loop binary search
3943  if(coord < b) {
3944  if(part == lower + 1) {
3945  part = lower;
3946  }
3947  else {
3948  upper = part - 1;
3949  part -= (part - lower)/2;
3950  }
3951  }
3952  else if(part == upper - 1) {
3953  part = upper;
3954  }
3955  else {
3956  lower = part + 1;
3957  part += (upper - part)/2;
3958  }
3959  }
3960  }
3961 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3962  });
3963 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3964  }, arraySumReducer);
3965 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3966 
3967  teamMember.team_barrier();
3968 
3969  // collect all the team's results
3970  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3971  for(int n = 0; n < value_count_weights; ++n) {
3972 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3973  Kokkos::atomic_add(&current_part_weights(n),
3974  static_cast<double>(shared_ptr[n]));
3975 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3976  teamSum[n] += array.ptr[n];
3977 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3978  }
3979 
3980 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3981  int insert_left = 0;
3982  int insert_right = 0;
3983 #endif
3984 
3985  for(int n = 2 + value_count_weights;
3986  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3987 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3988  scalar_t new_value = shared_ptr[n+1];
3989  scalar_t prev_value = current_right_closest(insert_right);
3990  while(new_value < prev_value) {
3991  prev_value = Kokkos::atomic_compare_exchange(
3992  &current_right_closest(insert_right), prev_value, new_value);
3993  }
3994 
3995  new_value = shared_ptr[n];
3996  prev_value = current_left_closest(insert_left);
3997  while(new_value > prev_value) {
3998  prev_value = Kokkos::atomic_compare_exchange(
3999  &current_left_closest(insert_left), prev_value, new_value);
4000  }
4001 
4002  ++insert_left;
4003  ++insert_right;
4004 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4005  if(array.ptr[n] > teamSum[n]) {
4006  teamSum[n] = array.ptr[n];
4007  }
4008  if(array.ptr[n+1] < teamSum[n+1]) {
4009  teamSum[n+1] = array.ptr[n+1];
4010  }
4011 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4012  }
4013  });
4014 
4015  teamMember.team_barrier();
4016  }
4017 
4018 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4019  KOKKOS_INLINE_FUNCTION
4020  void join(value_type dst, const value_type src) const {
4021  for(int n = 0; n < value_count_weights; ++n) {
4022  dst[n] += src[n];
4023  }
4024 
4025  for(int n = value_count_weights + 2;
4026  n < value_count_weights + value_count_rightleft - 2; n += 2) {
4027  if(src[n] > dst[n]) {
4028  dst[n] = src[n];
4029  }
4030  if(src[n+1] < dst[n+1]) {
4031  dst[n+1] = src[n+1];
4032  }
4033  }
4034  }
4035 
4036  KOKKOS_INLINE_FUNCTION
4037  void join (volatile value_type dst, const volatile value_type src) const {
4038  for(int n = 0; n < value_count_weights; ++n) {
4039  dst[n] += src[n];
4040  }
4041 
4042  for(int n = value_count_weights + 2;
4043  n < value_count_weights + value_count_rightleft - 2; n += 2) {
4044  if(src[n] > dst[n]) {
4045  dst[n] = src[n];
4046  }
4047  if(src[n+1] < dst[n+1]) {
4048  dst[n+1] = src[n+1];
4049  }
4050  }
4051  }
4052 
4053  KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4054  for(int n = 0; n < value_count_weights; ++n) {
4055  dst[n] = 0;
4056  }
4057 
4058  for(int n = value_count_weights;
4060  dst[n] = -max_scalar;
4061  dst[n+1] = max_scalar;
4062  }
4063  }
4064 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4065 };
4066 
// For each concurrent part that still has incomplete cuts, accumulate the
// weight falling into every part/cut slot and the closest coordinate to the
// left and right of every cut (via ReduceWeightsFunctor), then prefix-sum
// the weights on device so slot i holds the total weight to the left of
// cut/part i.
//
// \param current_concurrent_num_parts number of parts processed concurrently
// \param current_work_part first part index of this concurrent batch
// \param mj_current_dim_coords coordinates along the dimension being cut
// \param loop_count 0 on the first rebalance iteration; passed to the
//        functor, which uses it to pick binary search vs. stepping
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
  mj_1D_part_get_part_weights(
  mj_part_t current_concurrent_num_parts,
  mj_part_t current_work_part,
  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
  int loop_count)
{
  auto local_is_cut_line_determined = is_cut_line_determined;
  auto local_thread_part_weights = thread_part_weights;
  auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
  auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;

  // Create some locals so we don't use this inside the kernels
  // which causes problems
  auto local_sEpsilon = this->sEpsilon;
  auto local_assigned_part_ids = this->assigned_part_ids;
  auto local_coordinate_permutations = this->coordinate_permutations;
  auto local_mj_weights = this->mj_weights;
  auto local_part_xadj = this->part_xadj;
  auto local_global_min_max_coord_total_weight =
    this->global_min_max_coord_total_weight;

  typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;

  auto local_device_num_partitioning_in_current_dim =
    device_num_partitioning_in_current_dim;

  // refresh the device copy of the per-part incomplete-cut counters
  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
  auto local_device_incomplete_cut_count = device_incomplete_cut_count;

  mj_part_t total_part_shift = 0;

  mj_part_t concurrent_cut_shifts = 0;
  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
    // this part's cuts start at concurrent_cut_shifts within temp_cut_coords
    Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
      Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
        concurrent_cut_shifts, temp_cut_coords.size()));

    mj_part_t num_parts =
      host_num_partitioning_in_current_dim(current_work_part + kk);
    mj_part_t num_cuts = num_parts - 1;
    mj_part_t total_part_count = num_parts + num_cuts;
    mj_part_t weight_array_length = num_cuts + num_parts;

    // for right/left closest + buffer cut on either side
    mj_part_t right_left_array_length = (num_cuts + 2) * 2;

    // nothing left to refine for this part - just advance the offsets
    if(this->incomplete_cut_count(kk) == 0) {
      total_part_shift += total_part_count;
      concurrent_cut_shifts += num_cuts;
      continue;
    }

    // if not set use 60 - was initial testing amount but somewhat arbitrary
    auto policy_ReduceWeightsFunctor = policy_t(
      mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    int total_array_length =
      weight_array_length + right_left_array_length;
#endif

    // Using float here caused some numerical errors for coord on cut calculations.
    // Probably that can be fixed with proper epsilon adjustment but since cuda
    // doesn't reduce right now the shared memory pressure is no longer relevant.
    // Just use scalar_t to match the original algorithm.
    typedef mj_scalar_t array_t;

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    // host path reduces into a raw scratch array; freed below
    array_t * reduce_array =
      new array_t[static_cast<size_t>(total_array_length)];
#endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP

    // cuts of all preceding concurrent parts in this batch
    int offset_cuts = 0;
    for(int kk2 = 0; kk2 < kk; ++kk2) {
      offset_cuts +=
        host_num_partitioning_in_current_dim(current_work_part + kk2) - 1;
    }
    Kokkos::View<double *, device_t> my_current_part_weights =
      Kokkos::subview(local_thread_part_weights,
        std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
          total_part_shift + total_part_count));
    Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
      Kokkos::subview(local_thread_cut_left_closest_point,
        std::pair<mj_lno_t, mj_lno_t>(
          offset_cuts,
          local_thread_cut_left_closest_point.size()));
    Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
      Kokkos::subview(local_thread_cut_right_closest_point,
        std::pair<mj_lno_t, mj_lno_t>(
          offset_cuts,
          local_thread_cut_right_closest_point.size()));

    array_t max_scalar = std::numeric_limits<array_t>::max();

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
    // initialize values
    // on GPU the functor updates these views atomically instead of using a
    // Kokkos reduction, so they must be seeded with the reduction identity
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
      KOKKOS_LAMBDA (int dummy) {
      for(int n = 0; n < weight_array_length; ++n) {
        my_current_part_weights(n) = 0;
      }
      for(int n = 0; n < num_cuts; ++n) {
        my_current_left_closest(n) = -max_scalar;
        my_current_right_closest(n) = max_scalar;
      }
    });
#endif

    mj_part_t concurrent_current_part =
      current_work_part + kk;

    // NOTE(review): one or more constructor-argument lines appear to have
    // been lost from this call in this copy of the file (the sequence jumps
    // within the argument list); confirm the full argument list against the
    // upstream Zoltan2 source before modifying.
    ReduceWeightsFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
      typename mj_node_t::device_type, array_t>
      teamFunctor(
        loop_count,
        max_scalar,
        num_cuts,
        right_left_array_length,
        weight_array_length,
        coordinate_permutations,
        mj_current_dim_coords,
        mj_weights,
        assigned_part_ids,
        local_temp_cut_coords,
        part_xadj,
        mj_uniform_weights(0), // host and currently only relevant to slot 0
        sEpsilon
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
        ,my_current_part_weights,
        my_current_left_closest,
        my_current_right_closest
#endif
      );

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
    // GPU: functor writes results directly (atomics); no reduction target
    Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
#else
    Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
      teamFunctor, reduce_array);
#endif

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    // host path: copy the reduced scratch array into the device views
    auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);

    for(int i = 0; i < static_cast<int>(total_part_count); ++i) {
      hostArray(i) = reduce_array[i];
    }

    Kokkos::deep_copy(my_current_part_weights, hostArray);

    auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
    auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
    // (cut+1)*2 skips the padded pair at the front of the right/left segment
    for(mj_part_t cut = 0; cut < num_cuts; ++cut) {
      hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
      hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
    }
    Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
    Kokkos::deep_copy(my_current_right_closest, hostRightArray);

    delete [] reduce_array;
#endif

    total_part_shift += total_part_count;
    concurrent_cut_shifts += num_cuts;
  }

  auto local_temp_cut_coords = temp_cut_coords;

  // Convert per-slot weights into prefix sums, one concurrent part per
  // device thread.
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
    (0, current_concurrent_num_parts), KOKKOS_LAMBDA(mj_part_t kk) {
    mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
      current_work_part + kk);
    mj_part_t num_cuts = num_parts - 1;
    mj_part_t total_part_count = num_parts + num_cuts;

    if(local_device_incomplete_cut_count(kk) > 0) {
      // get the prefix sum
      // This is an inefficiency but not sure if it matters much
      size_t offset = 0;
      size_t offset_cuts = 0;
      for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
        auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
          current_work_part + kk2);
        offset += num_parts_kk2 * 2 - 1;
        offset_cuts += num_parts_kk2 - 1;
      }

      for(mj_part_t i = 1; i < total_part_count; ++i) {
        // check for cuts sharing the same position; all cuts sharing a position
        // have the same weight == total weight for all cuts sharing the
        // position. Don't want to accumulate that total weight more than once.
        if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
          std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
            local_temp_cut_coords(offset_cuts + i /2 - 1))
            < local_sEpsilon) {
          // i % 2 = 0 when part i represents the cut coordinate.
          // if it is a cut, and if next cut also has the same coordinate, then
          // dont addup.
          local_thread_part_weights(offset + i)
            = local_thread_part_weights(offset + i-2);
          continue;
        }

        // otherwise do the prefix sum.
        local_thread_part_weights(offset + i) +=
          local_thread_part_weights(offset + i-1);
      }
    }
  });
}
4292 
4300 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4301  typename mj_part_t, typename mj_node_t>
4302 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4303  mj_combine_rightleft_and_weights(
4304  mj_part_t current_work_part,
4305  mj_part_t current_concurrent_num_parts)
4306 {
4307  auto local_thread_part_weights = this->thread_part_weights;
4308  auto local_is_cut_line_determined = this->is_cut_line_determined;
4309  auto local_thread_cut_left_closest_point =
4310  this->thread_cut_left_closest_point;
4311  auto local_thread_cut_right_closest_point =
4312  this->thread_cut_right_closest_point;
4313  auto local_total_part_weight_left_right_closests =
4314  this->total_part_weight_left_right_closests;
4315  auto local_device_num_partitioning_in_current_dim =
4316  device_num_partitioning_in_current_dim;
4317  Kokkos::parallel_for(
4318  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4319  KOKKOS_LAMBDA (int dummy) {
4320 
4321  size_t tlr_array_shift = 0;
4322  mj_part_t cut_shift = 0;
4323  size_t total_part_array_shift = 0;
4324 
4325  // iterate for all concurrent parts to find the left and right closest
4326  // points in the process.
4327  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
4328 
4329  mj_part_t num_parts_in_part =
4330  local_device_num_partitioning_in_current_dim(current_work_part + i);
4331  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4332  size_t num_total_part_in_part =
4333  num_parts_in_part + size_t (num_cuts_in_part);
4334 
4335  // iterate for cuts in a single part.
4336  for(int ii = 0; ii < num_cuts_in_part; ++ii) {
4337  mj_part_t next = tlr_array_shift + ii;
4338  mj_part_t cut_index = cut_shift + ii;
4339 
4340  if(!local_is_cut_line_determined(cut_index)) {
4341  mj_scalar_t left_closest_in_process =
4342  local_thread_cut_left_closest_point(cut_index);
4343  mj_scalar_t right_closest_in_process =
4344  local_thread_cut_right_closest_point(cut_index);
4345 
4346  // store the left and right closes points.
4347  local_total_part_weight_left_right_closests(
4348  num_total_part_in_part + next) = left_closest_in_process;
4349 
4350  local_total_part_weight_left_right_closests(
4351  num_total_part_in_part + num_cuts_in_part + next) =
4352  right_closest_in_process;
4353  }
4354  }
4355 
4356  for(size_t j = 0; j < num_total_part_in_part; ++j) {
4357  mj_part_t cut_ind = j / 2 + cut_shift;
4358 
4359  // need to check j != num_total_part_in_part - 1
4360  // which is same as j/2 != num_cuts_in_part.
4361  // we cannot check it using cut_ind, because of the concurrent part
4362  // concantanetion.
4363  if(j == num_total_part_in_part - 1 ||
4364  !local_is_cut_line_determined(cut_ind)) {
4365  double pwj = local_thread_part_weights(total_part_array_shift + j);
4366  local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4367  }
4368  }
4369 
4370  // set the shift position in the arrays
4371  cut_shift += num_cuts_in_part;
4372  tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4373  total_part_array_shift += num_total_part_in_part;
4374  }
4375  });
4376 }
4377 
4390 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4391  typename mj_part_t, typename mj_node_t>
4392 KOKKOS_INLINE_FUNCTION
4393 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4394  mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4395  mj_scalar_t cut_lower_bound,
4396  mj_scalar_t cut_upper_weight,
4397  mj_scalar_t cut_lower_weight,
4398  mj_scalar_t expected_weight,
4399  mj_scalar_t &new_cut_position,
4400  mj_scalar_t sEpsilon) {
4401 
4402  if(std::abs(cut_upper_bound - cut_lower_bound) < sEpsilon) {
4403  new_cut_position = cut_upper_bound; //or lower bound does not matter.
4404  }
4405 
4406  if(std::abs(cut_upper_weight - cut_lower_weight) < sEpsilon) {
4407  new_cut_position = cut_lower_bound;
4408  }
4409 
4410  mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4411  mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4412  mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4413 
4414  mj_scalar_t required_shift = (my_weight_diff / weight_range);
4415  int scale_constant = 20;
4416  int shiftint= int (required_shift * scale_constant);
4417  if(shiftint == 0) shiftint = 1;
4418  required_shift = mj_scalar_t (shiftint) / scale_constant;
4419  new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4420 }
4421 
4422 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4423 
4424 template<class policy_t, class scalar_t>
4426 
4431 
4432  KOKKOS_INLINE_FUNCTION ArrayReducer(
4433  value_type &val,
4434  int mj_value_count) :
4435  value(&val),
4436  value_count(mj_value_count)
4437  {}
4438 
4439  KOKKOS_INLINE_FUNCTION
4441  return *value;
4442  }
4443 
4444  KOKKOS_INLINE_FUNCTION
4445  void join(value_type& dst, const value_type& src) const {
4446  for(int n = 0; n < value_count; ++n) {
4447  dst.ptr[n] += src.ptr[n];
4448  }
4449  }
4450 
4451  KOKKOS_INLINE_FUNCTION
4452  void join (volatile value_type& dst, const volatile value_type& src) const {
4453  for(int n = 0; n < value_count; ++n) {
4454  dst.ptr[n] += src.ptr[n];
4455  }
4456  }
4457 
4458  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
4459  dst.ptr = value->ptr; // must update ptr
4460  for(int n = 0; n < value_count; ++n) {
4461  dst.ptr[n] = 0;
4462  }
4463  }
4464 };
4465 
4466 #endif
4467 
4468 template<class policy_t, class scalar_t, class part_t, class index_t,
4469  class device_t, class array_t>
4471  typedef typename policy_t::member_type member_type;
4472  typedef Kokkos::View<scalar_t*> scalar_view_t;
4473 
4474 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4475  typedef array_t value_type[];
4476 #endif
4477 
4480  Kokkos::View<index_t*, device_t> permutations;
4481  Kokkos::View<scalar_t *, device_t> coordinates;
4482  Kokkos::View<part_t*, device_t> parts;
4483  Kokkos::View<index_t *, device_t> part_xadj;
4484  Kokkos::View<index_t *, device_t> track_on_cuts;
4485 
4486 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4487  Kokkos::View<int *, device_t> local_point_counts;
4488 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4489 
4491  part_t mj_concurrent_current_part,
4492  part_t mj_weight_array_size,
4493  Kokkos::View<index_t*, device_t> & mj_permutations,
4494  Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4495  Kokkos::View<part_t*, device_t> & mj_parts,
4496  Kokkos::View<index_t *, device_t> & mj_part_xadj,
4497  Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4498 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4499  ,Kokkos::View<int *, device_t> & mj_local_point_counts
4500 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4501  ) :
4502  concurrent_current_part(mj_concurrent_current_part),
4503  value_count(mj_weight_array_size),
4504  permutations(mj_permutations),
4505  coordinates(mj_coordinates),
4506  parts(mj_parts),
4507  part_xadj(mj_part_xadj),
4508  track_on_cuts(mj_track_on_cuts)
4509 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4510  ,local_point_counts(mj_local_point_counts)
4511 #endif
4512  {
4513  }
4514 
4515  size_t team_shmem_size (int team_size) const {
4516 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4517  int result = sizeof(array_t) * (value_count);
4518 #else
4519  int result = sizeof(array_t) * (value_count) * team_size;
4520 #endif
4521 
4522  // pad this to a multiple of 8 or it will run corrupt
4523  int remainder = result % 8;
4524  if(remainder != 0) {
4525  result += 8 - remainder;
4526  }
4527  return result;
4528  }
4529 
4530  KOKKOS_INLINE_FUNCTION
4531 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4532  void operator() (const member_type & teamMember) const {
4533 #else
4534  void operator() (const member_type & teamMember, value_type teamSum) const {
4535 #endif
4536  index_t all_begin = (concurrent_current_part == 0) ? 0 :
4538  index_t all_end = part_xadj(concurrent_current_part);
4539 
4540  index_t num_working_points = all_end - all_begin;
4541  int num_teams = teamMember.league_size();
4542 
4543  index_t stride = num_working_points / num_teams;
4544  if((num_working_points % num_teams) > 0) {
4545  stride += 1; // make sure we have coverage for the final points
4546  }
4547 
4548  index_t begin = all_begin + stride * teamMember.league_rank();
4549  index_t end = begin + stride;
4550  if(end > all_end) {
4551  end = all_end; // the last team may have less work than the other teams
4552  }
4553 
4554  int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4555 
4556  // create the team shared data - each thread gets one of the arrays
4557 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4558  size_t sh_mem_size = sizeof(array_t) * (value_count);
4559 #else
4560  size_t sh_mem_size =
4561  sizeof(array_t) * (value_count) * teamMember.team_size();
4562 #endif
4563 
4564  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4565  sh_mem_size);
4566 
4567 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4568  // init the shared array to 0
4569  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4570  for(int n = 0; n < value_count; ++n) {
4571  shared_ptr[n] = 0;
4572  }
4573  });
4574  teamMember.team_barrier();
4575 
4576  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4577  [=] (index_t ii) {
4578 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4579  // select the array for this thread
4580  Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
4581  (value_count)]);
4582 
4583  // create reducer which handles the Zoltan2_MJArrayType class
4584  ArrayReducer<policy_t, array_t> arrayReducer(array, value_count);
4585 
4586  Kokkos::parallel_reduce(
4587  Kokkos::TeamThreadRange(teamMember, begin, end),
4588  [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4589 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4590 
4591  index_t coordinate_index = permutations(ii);
4592  part_t place = parts(coordinate_index);
4593  part_t part = place / 2;
4594  if(place % 2 == 0) {
4595 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4596  Kokkos::atomic_add(&shared_ptr[part], 1);
4597 #else
4598  threadSum.ptr[part] += 1;
4599 #endif
4600 
4601  parts(coordinate_index) = part;
4602  }
4603  else {
4604  // fill a tracking array so we can process these slower points
4605  // in next cycle
4606  index_t set_index = Kokkos::atomic_fetch_add(
4607  &track_on_cuts(track_on_cuts_insert_index), 1);
4608  track_on_cuts(set_index) = ii;
4609  }
4610 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4611  });
4612 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4613  }, arrayReducer);
4614 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4615 
4616  teamMember.team_barrier();
4617 
4618  // collect all the team's results
4619  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4620  for(int n = 0; n < value_count; ++n) {
4621 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4622  Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4623 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4624  teamSum[n] += array.ptr[n];
4625 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4626  }
4627  });
4628 
4629  teamMember.team_barrier();
4630  }
4631 
4632 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4633 
4634  KOKKOS_INLINE_FUNCTION
4635  void join(value_type dst, const value_type src) const {
4636  for(int n = 0; n < value_count; ++n) {
4637  dst[n] += src[n];
4638  }
4639  }
4640 
4641  KOKKOS_INLINE_FUNCTION
4642  void join (volatile value_type dst, const volatile value_type src) const {
4643  for(int n = 0; n < value_count; ++n) {
4644  dst[n] += src[n];
4645  }
4646  }
4647 
4648  KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4649  for(int n = 0; n < value_count; ++n) {
4650  dst[n] = 0;
4651  }
4652  }
4653 #endif
4654 };
4655 
4671 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4672  typename mj_part_t, typename mj_node_t>
4673 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4674 mj_create_new_partitions(
4675  mj_part_t num_parts,
4676  mj_part_t current_concurrent_work_part,
4677  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4678  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4679  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4680  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4681 {
4682  // Get locals for cuda
4683  auto local_thread_part_weight_work = this->thread_part_weight_work;
4684  auto local_point_counts = this->thread_point_counts;
4685  auto local_distribute_points_on_cut_lines =
4686  this->distribute_points_on_cut_lines;
4687  auto local_thread_cut_line_weight_to_put_left =
4688  this->thread_cut_line_weight_to_put_left;
4689  auto local_sEpsilon = this->sEpsilon;
4690  auto local_coordinate_permutations = this->coordinate_permutations;
4691  auto local_mj_weights = this->mj_weights;
4692  auto local_assigned_part_ids = this->assigned_part_ids;
4693  auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4694 
4695  mj_part_t num_cuts = num_parts - 1;
4696 
4697  Kokkos::parallel_for(
4698  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4699  KOKKOS_LAMBDA(int dummy) {
4700 
4701  if(local_distribute_points_on_cut_lines) {
4702  for(int i = 0; i < num_cuts; ++i) {
4703  mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4704  if(left_weight > local_sEpsilon) {
4705  // the weight of thread ii on cut.
4706  mj_scalar_t thread_ii_weight_on_cut =
4707  local_thread_part_weight_work(i * 2 + 1) -
4708  local_thread_part_weight_work(i * 2);
4709 
4710  if(thread_ii_weight_on_cut < left_weight) {
4711  // if left weight is bigger than threads weight on cut.
4712  local_thread_cut_line_weight_to_put_left(i) =
4713  thread_ii_weight_on_cut;
4714  }
4715  else {
4716  // if thread's weight is bigger than space, then put only a portion.
4717  local_thread_cut_line_weight_to_put_left(i) = left_weight;
4718  }
4719  left_weight -= thread_ii_weight_on_cut;
4720  }
4721  else {
4722  local_thread_cut_line_weight_to_put_left(i) = 0;
4723  }
4724  }
4725 
4726  // this is a special case. If cutlines share the same coordinate,
4727  // their weights are equal. We need to adjust the ratio for that.
4728  for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4729  if(std::abs(current_concurrent_cut_coordinate(i) -
4730  current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4731  local_thread_cut_line_weight_to_put_left(i) -=
4732  local_thread_cut_line_weight_to_put_left(i - 1);
4733  }
4734  local_thread_cut_line_weight_to_put_left(i) =
4735  static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4736  least_signifiance) * significance_mul) /
4737  static_cast<mj_scalar_t>(significance_mul);
4738  }
4739  }
4740 
4741  for(mj_part_t i = 0; i < num_parts; ++i) {
4742  local_point_counts(i) = 0;
4743  }
4744  });
4745 
4746  mj_lno_t coordinate_begin_index =
4747  current_concurrent_work_part == 0 ? 0 :
4748  host_part_xadj(current_concurrent_work_part - 1);
4749  mj_lno_t coordinate_end_index =
4750  host_part_xadj(current_concurrent_work_part);
4751 
4752  mj_lno_t total_on_cut;
4753  Kokkos::parallel_reduce("Get total_on_cut",
4754  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4755  coordinate_begin_index, coordinate_end_index),
4756  KOKKOS_LAMBDA(int ii, mj_lno_t & val) {
4757  mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4758  mj_part_t coordinate_assigned_place =
4759  local_assigned_part_ids(coordinate_index);
4760  if(coordinate_assigned_place % 2 == 1) {
4761  val += 1;
4762  }
4763  }, total_on_cut);
4764 
4765  Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4766  if(total_on_cut > 0) {
4767  track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4768  "track_on_cuts", // would do WithoutInitialization but need last init to 0
4769  total_on_cut + 1); // extra index to use for tracking
4770  }
4771 
4772  // here we need to parallel reduce an array to count coords in each part
4773  // atomically adding, especially for low part count would kill us
4774  // in the original setup we kept arrays allocated for each thread but for
4775  // the cuda version we'd like to avoid allocating N arrays for the number
4776  // of teams/threads which would be complicated based on running openmp or
4777  // cuda.
4778  typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4779 
4780  // if not set use 60 - somewhat arbitrary based on initial performance tests
4781  int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4782 
4783  auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4784  typedef int array_t;
4785 
4786  // just need parts - on the cuts will be handled in a separate serial
4787  // call after this.
4788 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4789  array_t * reduce_array = new array_t[static_cast<size_t>(num_parts)];
4790 #endif
4791 
4792  ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4793  typename mj_node_t::device_type, array_t>teamFunctor(
4794  current_concurrent_work_part,
4795  num_parts,
4796  coordinate_permutations,
4797  mj_current_dim_coords,
4798  assigned_part_ids,
4799  part_xadj,
4800  track_on_cuts
4801 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4802  ,local_point_counts
4803 #endif
4804  );
4805 
4806 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4807  Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4808 #else
4809  Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4810 #endif
4811 
4812 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4813  for(mj_part_t part = 0; part < num_parts; ++part) {
4814  local_point_counts(part) = reduce_array[part];
4815  }
4816  delete [] reduce_array;
4817 #endif
4818 
4819  // the last member is utility used for atomically inserting the values.
4820  // Sorting here avoids potential indeterminancy in the partitioning results
4821  if(track_on_cuts.size() > 0) { // size 0 unused, or size is minimum of 2
4822  auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4823  std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1)); // do not sort last element
4824  Kokkos::sort(track_on_cuts_sort);
4825  }
4826 
4827  bool uniform_weights0 = this->mj_uniform_weights(0);
4828  Kokkos::parallel_for(
4829  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4830  KOKKOS_LAMBDA (int dummy) {
4831 
4832  for(int j = 0; j < total_on_cut; ++j) {
4833  int ii = track_on_cuts(j);
4834  mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4835  mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4836  local_mj_weights(coordinate_index,0);
4837  mj_part_t coordinate_assigned_place =
4838  local_assigned_part_ids(coordinate_index);
4839  mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4840  // if it is on the cut.
4841  if(local_distribute_points_on_cut_lines &&
4842  local_thread_cut_line_weight_to_put_left(
4843  coordinate_assigned_part) > local_sEpsilon) {
4844  // if the rectilinear partitioning is allowed,
4845  // and the thread has still space to put on the left of the cut
4846  // then thread puts the vertex to left.
4847  local_thread_cut_line_weight_to_put_left(
4848  coordinate_assigned_part) -= coordinate_weight;
4849  // if putting the vertex to left increased the weight more
4850  // than expected, and if the next cut is on the same coordinate,
4851  // then we need to adjust how much weight next cut puts to its left as
4852  // well, in order to take care of the imbalance.
4853  if(local_thread_cut_line_weight_to_put_left(
4854  coordinate_assigned_part) < 0 && coordinate_assigned_part <
4855  num_cuts - 1 &&
4856  std::abs(current_concurrent_cut_coordinate(
4857  coordinate_assigned_part+1) -
4858  current_concurrent_cut_coordinate(
4859  coordinate_assigned_part)) < local_sEpsilon)
4860  {
4861  local_thread_cut_line_weight_to_put_left(
4862  coordinate_assigned_part + 1) +=
4863  local_thread_cut_line_weight_to_put_left(
4864  coordinate_assigned_part);
4865  }
4866  ++local_point_counts(coordinate_assigned_part);
4867  local_assigned_part_ids(coordinate_index) =
4868  coordinate_assigned_part;
4869  }
4870  else {
4871  // if there is no more space on the left, put the coordinate to the
4872  // right of the cut.
4873  ++coordinate_assigned_part;
4874  // this while loop is necessary when a line is partitioned into more
4875  // than 2 parts.
4876  while(local_distribute_points_on_cut_lines &&
4877  coordinate_assigned_part < num_cuts)
4878  {
4879  // traverse all the cut lines having the same partitiong
4880  if(std::abs(current_concurrent_cut_coordinate(
4881  coordinate_assigned_part) -
4882  current_concurrent_cut_coordinate(
4883  coordinate_assigned_part - 1)) < local_sEpsilon)
4884  {
4885  // if line has enough space on left, put it there.
4886  if(local_thread_cut_line_weight_to_put_left(
4887  coordinate_assigned_part) > local_sEpsilon &&
4888  local_thread_cut_line_weight_to_put_left(
4889  coordinate_assigned_part) >=
4890  std::abs(local_thread_cut_line_weight_to_put_left(
4891  coordinate_assigned_part) - coordinate_weight))
4892  {
4893  local_thread_cut_line_weight_to_put_left(
4894  coordinate_assigned_part) -= coordinate_weight;
4895  // Again if it put too much on left of the cut,
4896  // update how much the next cut sharing the same coordinate will
4897  // put to its left.
4898  if(local_thread_cut_line_weight_to_put_left(
4899  coordinate_assigned_part) < 0 &&
4900  coordinate_assigned_part < num_cuts - 1 &&
4901  std::abs(current_concurrent_cut_coordinate(
4902  coordinate_assigned_part+1) -
4903  current_concurrent_cut_coordinate(
4904  coordinate_assigned_part)) < local_sEpsilon)
4905  {
4906  local_thread_cut_line_weight_to_put_left(
4907  coordinate_assigned_part + 1) +=
4908  local_thread_cut_line_weight_to_put_left(
4909  coordinate_assigned_part);
4910  }
4911  break;
4912  }
4913  }
4914  else {
4915  break;
4916  }
4917  ++coordinate_assigned_part;
4918  }
4919  local_point_counts(coordinate_assigned_part) += 1;
4920  local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4921  }
4922  }
4923 
4924  for(int j = 0; j < num_parts; ++j) {
4925  out_part_xadj(j) = local_point_counts(j);
4926  local_point_counts(j) = 0;
4927 
4928  if(j != 0) {
4929  out_part_xadj(j) += out_part_xadj(j - 1);
4930  local_point_counts(j) += out_part_xadj(j - 1);
4931  }
4932  }
4933  });
4934 
4935  // here we will determine insert indices for N teams
4936  // then all the teams can fill
4937 
4938 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4939 
4940  // This is the fastest so far - just straight atomic writes for CUDA
4941  // However this is not a deterministic result since it is atomic.
4942  // The final result will be deterministic.
4943  Kokkos::parallel_for(
4944  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4945  coordinate_begin_index, coordinate_end_index),
4946  KOKKOS_LAMBDA (mj_lno_t ii) {
4947  mj_lno_t i = local_coordinate_permutations(ii);
4948  mj_part_t p = local_assigned_part_ids(i);
4949  mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4950  local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4951  });
4952 
4953 #else
4954 
4955 #ifdef KOKKOS_ENABLE_OPENMP
4956  // will return and fix this - revert back to 1 for clear auto testing
4957  const int num_threads = 1; // Kokkos::OpenMP::impl_max_hardware_threads();
4958 #else
4959  const int num_threads = 1;
4960 #endif
4961 
4962  const int num_teams = 1; // cuda is handled above using a different format
4963 
4964  // allow init - we want all 0's first
4965  Kokkos::View<mj_lno_t*, device_t>
4966  point_counter("insert indices", num_teams * num_threads * num_parts);
4967 
4968  // count how many coords per thread
4969  // then we will fill each independently
4970  Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4971  block_policy(num_teams, num_threads);
4972  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4973  member_type member_type;
4974  mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4975  mj_lno_t block_size = range / num_teams + 1;
4976  Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4977  int team = team_member.league_rank();
4978  int team_offset = team * num_threads * num_parts;
4979  mj_lno_t begin = coordinate_begin_index + team * block_size;
4980  mj_lno_t end = begin + block_size;
4981  if(end > coordinate_end_index) {
4982  end = coordinate_end_index;
4983  }
4984 
4985  Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4986  [=] (mj_lno_t ii) {
4987  int thread = team_member.team_rank();
4988  mj_lno_t i = local_coordinate_permutations(ii);
4989  mj_part_t p = local_assigned_part_ids(i);
4990  int index = team_offset + thread * num_parts + p;
4991  ++point_counter(index);
4992  });
4993  });
4994 
4995  // now prefix sum
4996  // we currently have the counts in the slots
4997  // we want the first counter for each part to be 0
4998  // then the rest should be the sum of all the priors
4999  Kokkos::parallel_for(
5000  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5001  KOKKOS_LAMBDA (int dummy) {
5002  int num_sets = point_counter.size() / num_parts;
5003  for(int set = num_sets - 1; set >= 1; set -=1) {
5004  int base = set * num_parts;
5005  for(int part = 0; part < num_parts; ++part) {
5006  point_counter(base + part) = point_counter(base + part - num_parts);
5007  }
5008  }
5009 
5010  for(int part = 0; part < num_parts; ++part) {
5011  point_counter(part) = 0;
5012  }
5013 
5014  for(int set = 1; set < num_sets; ++set) {
5015  int base = set * num_parts;
5016  for(int part = 0; part < num_parts; ++part) {
5017  point_counter(base + part) += point_counter(base + part - num_parts);
5018  }
5019  }
5020  });
5021 
5022  // now permute
5023  Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
5024  int team = team_member.league_rank();
5025  int team_offset = team * num_threads * num_parts;
5026  mj_lno_t begin = coordinate_begin_index + team * block_size;
5027  mj_lno_t end = begin + block_size;
5028  if(end > coordinate_end_index) {
5029  end = coordinate_end_index;
5030  }
5031  Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
5032  [=] (mj_lno_t ii) {
5033  int thread = team_member.team_rank();
5034  mj_lno_t i = local_coordinate_permutations(ii);
5035  mj_part_t p = local_assigned_part_ids(i);
5036  int index = team_offset + thread * num_parts + p;
5037  int set_counter = (point_counter(index)++) + local_point_counts(p);
5038  local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5039  });
5040  });
5041 #endif
5042 }
5043 
5087 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5088  typename mj_part_t, typename mj_node_t>
5089 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5090  mj_node_t>::mj_get_new_cut_coordinates(
5091  mj_part_t current_concurrent_num_parts,
5092  mj_part_t kk,
5093  const mj_part_t &num_cuts,
5094  const double &used_imbalance_tolerance,
5095  Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5096  Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5097  Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5098  Kokkos::View<bool *, device_t> & current_cut_line_determined,
5099  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5100  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5101  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5102  Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5103  Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5104  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5105  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5106  Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5107  Kokkos::View<mj_scalar_t *, device_t> &
5108  current_part_cut_line_weight_to_put_left,
5109  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5110 {
5111  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5112 
5113  auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5114  auto local_sEpsilon = sEpsilon;
5115  auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5116  auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5117  auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5118  auto local_global_min_max_coord_total_weight =
5119  global_min_max_coord_total_weight;
5120 
5121  const auto _sEpsilon = this->sEpsilon;
5122  // Note for a 22 part system I tried removing the outer loop
5123  // and doing each sub loop as a simple parallel_for over num_cuts.
5124  // But that was about twice as slow (10ms) as the current form (5ms)
5125  // so I think the overhead of launching the new global parallel kernels
5126  // is costly. This form is just running one team so effectively using
5127  // a single warp to process the cuts. I expect with a lot of parts this
5128  // might need changing.
5129  Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5130  policy_one_team(1, Kokkos::AUTO());
5131  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5132  member_type member_type;
5133  Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5134 
5135  mj_scalar_t min_coordinate =
5136  local_global_min_max_coord_total_weight(kk);
5137  mj_scalar_t max_coordinate =
5138  local_global_min_max_coord_total_weight(
5140  mj_scalar_t global_total_weight =
5141  local_global_min_max_coord_total_weight(
5142  kk + current_concurrent_num_parts * 2);
5143 
5144  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5145  [=] (mj_part_t i) {
5146  // if left and right closest points are not set yet,
5147  // set it to the cut itself.
5148  if(min_coordinate -
5149  current_global_left_closest_points(i) > local_sEpsilon) {
5150  current_global_left_closest_points(i) =
5151  current_cut_coordinates(i);
5152  }
5153  if(current_global_right_closest_points(i) -
5154  max_coordinate > local_sEpsilon) {
5155  current_global_right_closest_points(i) =
5156  current_cut_coordinates(i);
5157  }
5158  });
5159  team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5160 
5161  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5162  [=] (mj_part_t i) {
5163  using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5164  mj_node_t>;
5165  // seen weight in the part
5166  mj_scalar_t seen_weight_in_part = 0;
5167  // expected weight for part.
5168  mj_scalar_t expected_weight_in_part = 0;
5169  // imbalance for the left and right side of the cut.
5170  double imbalance_on_left = 0, imbalance_on_right = 0;
5171  if(local_distribute_points_on_cut_lines) {
5172  // init the weight on the cut.
5173  local_global_rectilinear_cut_weight(i) = 0;
5174  local_process_rectilinear_cut_weight(i) = 0;
5175  }
5176  bool bContinue = false;
5177  // if already determined at previous iterations,
5178  // then just write the coordinate to new array, and proceed.
5179  if(current_cut_line_determined(i)) {
5180  new_current_cut_coordinates(i) =
5181  current_cut_coordinates(i);
5182  bContinue = true;
5183  }
5184  if(!bContinue) {
5185  //current weight of the part at the left of the cut line.
5186  seen_weight_in_part = current_global_part_weights(i * 2);
5187 
5188  //expected ratio
5189  expected_weight_in_part = current_part_target_weights(i);
5190 
5191  //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
5192  imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5193  expected_weight_in_part);
5194  // rightImbalance = imbalanceOf(globalTotalWeight - seenW,
5195  // globalTotalWeight, 1 - expected);
5196  imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5197  seen_weight_in_part, global_total_weight - expected_weight_in_part);
5198  bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5199  used_imbalance_tolerance < local_sEpsilon ;
5200  bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5201  used_imbalance_tolerance < local_sEpsilon;
5202  //if the cut line reaches to desired imbalance.
5203  if(is_left_imbalance_valid && is_right_imbalance_valid) {
5204  current_cut_line_determined(i) = true;
5205  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5206  new_current_cut_coordinates(i) = current_cut_coordinates(i);
5207  }
5208  else if(imbalance_on_left < 0) {
5209  //if left imbalance < 0 then we need to move the cut to right.
5210  if(local_distribute_points_on_cut_lines) {
5211  // if it is okay to distribute the coordinate on
5212  // the same coordinate to left and right.
5213  // then check if we can reach to the target weight by including the
5214  // coordinates in the part.
5215  if(current_global_part_weights(i * 2 + 1) ==
5216  expected_weight_in_part) {
5217  // if it is we are done.
5218  current_cut_line_determined(i) = true;
5219  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5220 
5221  //then assign everything on the cut to the left of the cut.
5222  new_current_cut_coordinates(i) =
5223  current_cut_coordinates(i);
5224  //for this cut all the weight on cut will be put to left.
5225  current_part_cut_line_weight_to_put_left(i) =
5226  current_local_part_weights(i * 2 + 1) -
5227  current_local_part_weights(i * 2);
5228  bContinue = true;
5229  }
5230  else if(current_global_part_weights(i * 2 + 1) >
5231  expected_weight_in_part) {
5232  // if the weight is larger than the expected weight,
5233  // then we need to distribute some points to left, some to right.
5234  current_cut_line_determined(i) = true;
5235  Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5236 
5237  // increase the num cuts to be determined with rectilinear
5238  // partitioning.
5239  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5240  new_current_cut_coordinates(i) =
5241  current_cut_coordinates(i);
5242  local_process_rectilinear_cut_weight[i] =
5243  current_local_part_weights(i * 2 + 1) -
5244  current_local_part_weights(i * 2);
5245  bContinue = true;
5246  }
5247  }
5248 
5249  if(!bContinue) {
5250 
5251  // we need to move further right,so set lower bound to current line,
5252  // and shift it to the closes point from right.
5253  current_cut_lower_bounds(i) =
5254  current_global_right_closest_points(i);
5255 
5256  //set the lower bound weight to the weight we have seen.
5257  current_cut_lower_bound_weights(i) = seen_weight_in_part;
5258 
5259  // compare the upper bound with what has been found in the
5260  // last iteration.
5261  // we try to make more strict bounds for the cut here.
5262  for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
5263  mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5264  mj_scalar_t line_weight =
5265  current_global_part_weights(ii * 2 + 1);
5266  if(p_weight >= expected_weight_in_part) {
5267  // if a cut on the right has the expected weight, then we found
5268  // our cut position. Set up and low coordiantes to this
5269  // new cut coordinate, but we need one more iteration to
5270  // finalize the cut position, as wee need to update the part ids.
5271  if(p_weight == expected_weight_in_part) {
5272  current_cut_upper_bounds(i) =
5273  current_cut_coordinates(ii);
5274  current_cut_upper_weights(i) = p_weight;
5275  current_cut_lower_bounds(i) =
5276  current_cut_coordinates(ii);
5277  current_cut_lower_bound_weights(i) = p_weight;
5278  } else if(p_weight < current_cut_upper_weights(i)) {
5279  // if a part weight is larger then my expected weight,
5280  // but lower than my upper bound weight, update upper bound.
5281  current_cut_upper_bounds(i) =
5282  current_global_left_closest_points(ii);
5283  current_cut_upper_weights(i) = p_weight;
5284  }
5285  break;
5286  }
5287  // if comes here then pw < ew
5288  // then compare the weight against line weight.
5289  if(line_weight >= expected_weight_in_part) {
5290  // if the line is larger than the expected weight, then we need
5291  // to reach to the balance by distributing coordinates on
5292  // this line.
5293  current_cut_upper_bounds(i) =
5294  current_cut_coordinates(ii);
5295  current_cut_upper_weights(i) = line_weight;
5296  current_cut_lower_bounds(i) =
5297  current_cut_coordinates(ii);
5298  current_cut_lower_bound_weights(i) = p_weight;
5299  break;
5300  }
5301  // if a stricter lower bound is found,
5302  // update the lower bound.
5303  if(p_weight <= expected_weight_in_part && p_weight >=
5304  current_cut_lower_bound_weights(i)) {
5305  current_cut_lower_bounds(i) =
5306  current_global_right_closest_points(ii);
5307  current_cut_lower_bound_weights(i) = p_weight;
5308  }
5309  }
5310 
5311  mj_scalar_t new_cut_position = 0;
5312  algMJ_t::mj_calculate_new_cut_position(
5313  current_cut_upper_bounds(i),
5314  current_cut_lower_bounds(i),
5315  current_cut_upper_weights(i),
5316  current_cut_lower_bound_weights(i),
5317  expected_weight_in_part, new_cut_position,
5318  _sEpsilon);
5319 
5320  // if cut line does not move significantly.
5321  // then finalize the search.
5322  if(std::abs(current_cut_coordinates(i) -
5323  new_cut_position) < local_sEpsilon) {
5324  current_cut_line_determined(i) = true;
5325  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5326 
5327  //set the cut coordinate and proceed.
5328  new_current_cut_coordinates(i) =
5329  current_cut_coordinates(i);
5330  } else {
5331  new_current_cut_coordinates(i) = new_cut_position;
5332  }
5333  } // bContinue
5334  } else {
5335  // need to move the cut line to left.
5336  // set upper bound to current line.
5337  current_cut_upper_bounds(i) =
5338  current_global_left_closest_points(i);
5339  current_cut_upper_weights(i) =
5340  seen_weight_in_part;
5341  // compare the current cut line weights with
5342  // previous upper and lower bounds.
5343  for(int ii = i - 1; ii >= 0; --ii) {
5344  mj_scalar_t p_weight =
5345  current_global_part_weights(ii * 2);
5346  mj_scalar_t line_weight =
5347  current_global_part_weights(ii * 2 + 1);
5348  if(p_weight <= expected_weight_in_part) {
5349  if(p_weight == expected_weight_in_part) {
5350  // if the weight of the part is my expected weight
5351  // then we find the solution.
5352  current_cut_upper_bounds(i) =
5353  current_cut_coordinates(ii);
5354  current_cut_upper_weights(i) = p_weight;
5355  current_cut_lower_bounds(i) =
5356  current_cut_coordinates(ii);
5357  current_cut_lower_bound_weights(i) = p_weight;
5358  }
5359  else if(p_weight > current_cut_lower_bound_weights(i)) {
5360  // if found weight is bigger than the lower bound
5361  // then update the lower bound.
5362  current_cut_lower_bounds(i) =
5363  current_global_right_closest_points(ii);
5364  current_cut_lower_bound_weights(i) = p_weight;
5365 
5366  // at the same time, if weight of line is bigger than the
5367  // expected weight, then update the upper bound as well.
5368  // in this case the balance will be obtained by distributing
5369  // weights on this cut position.
5370  if(line_weight > expected_weight_in_part) {
5371  current_cut_upper_bounds(i) =
5372  current_global_right_closest_points(ii);
5373  current_cut_upper_weights(i) = line_weight;
5374  }
5375  }
5376  break;
5377  }
5378  // if the weight of the cut on the left is still bigger than
5379  // my weight, and also if the weight is smaller than the current
5380  // upper weight, or if the weight is equal to current upper
5381  // weight, but on the left of the upper weight, then update
5382  // upper bound.
5383  if(p_weight >= expected_weight_in_part &&
5384  (p_weight < current_cut_upper_weights(i) ||
5385  (p_weight == current_cut_upper_weights(i) &&
5386  current_cut_upper_bounds(i) >
5387  current_global_left_closest_points(ii)))) {
5388  current_cut_upper_bounds(i) =
5389  current_global_left_closest_points(ii);
5390  current_cut_upper_weights(i) = p_weight;
5391  }
5392  }
5393  mj_scalar_t new_cut_position = 0;
5394  algMJ_t::mj_calculate_new_cut_position(
5395  current_cut_upper_bounds(i),
5396  current_cut_lower_bounds(i),
5397  current_cut_upper_weights(i),
5398  current_cut_lower_bound_weights(i),
5399  expected_weight_in_part,
5400  new_cut_position,
5401  _sEpsilon);
5402 
5403  // if cut line does not move significantly.
5404  if(std::abs(current_cut_coordinates(i) -
5405  new_cut_position) < local_sEpsilon) {
5406  current_cut_line_determined(i) = true;
5407  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5408  //set the cut coordinate and proceed.
5409  new_current_cut_coordinates(i) =
5410  current_cut_coordinates(i);
5411  } else {
5412  new_current_cut_coordinates(i) =
5413  new_cut_position;
5414  }
5415  }
5416  }; // bContinue
5417  });
5418 
5419  team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5420  });
5421 
5422  // view_rectilinear_cut_count
5423  mj_part_t rectilinear_cut_count;
5424  Kokkos::parallel_reduce("Read bDoingWork",
5425  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5426  KOKKOS_LAMBDA(int dummy, int & set_single) {
5427  set_single = view_rectilinear_cut_count(0);
5428  }, rectilinear_cut_count);
5429 
5430  if(rectilinear_cut_count > 0) {
5431  auto host_local_process_rectilinear_cut_weight =
5432  Kokkos::create_mirror_view(Kokkos::HostSpace(),
5433  local_process_rectilinear_cut_weight);
5434  auto host_local_global_rectilinear_cut_weight =
5435  Kokkos::create_mirror_view(Kokkos::HostSpace(),
5436  local_global_rectilinear_cut_weight);
5437  Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5438  local_process_rectilinear_cut_weight);
5439  Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5440  local_global_rectilinear_cut_weight);
5441  Teuchos::scan<int,mj_scalar_t>(
5442  *comm, Teuchos::REDUCE_SUM,
5443  num_cuts,
5444  host_local_process_rectilinear_cut_weight.data(),
5445  host_local_global_rectilinear_cut_weight.data());
5446  Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5447  host_local_process_rectilinear_cut_weight);
5448  Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5449  host_local_global_rectilinear_cut_weight);
5450 
5451  Kokkos::parallel_for("finish up mj_get_new_cut_coordinates",
5452  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5453  KOKKOS_LAMBDA(int dummy) {
5454  for(mj_part_t i = 0; i < num_cuts; ++i) {
5455  // if cut line weight to be distributed.
5456  if(local_global_rectilinear_cut_weight(i) > 0) {
5457  // expected weight to go to left of the cut.
5458  mj_scalar_t expected_part_weight = current_part_target_weights(i);
5459  // the weight that should be put to left of the cut.
5460  mj_scalar_t necessary_weight_on_line_for_left =
5461  expected_part_weight - current_global_part_weights(i * 2);
5462 
5463  // the weight of the cut in the process
5464  mj_scalar_t my_weight_on_line =
5465  local_process_rectilinear_cut_weight(i);
5466 
5467  // the sum of the cut weights upto this process,
5468  // including the weight of this process.
5469  mj_scalar_t weight_on_line_upto_process_inclusive =
5470  local_global_rectilinear_cut_weight(i);
5471  // the space on the left side of the cut after all processes
5472  // before this process (including this process)
5473  // puts their weights on cut to left.
5474  mj_scalar_t space_to_put_left =
5475  necessary_weight_on_line_for_left -
5476  weight_on_line_upto_process_inclusive;
5477  // add my weight to this space to find out how much space
5478  // is left to me.
5479  mj_scalar_t space_left_to_me =
5480  space_to_put_left + my_weight_on_line;
5481 
5482  /*
5483  cout << "expected_part_weight:" << expected_part_weight
5484  << " necessary_weight_on_line_for_left:"
5485  << necessary_weight_on_line_for_left
5486  << " my_weight_on_line" << my_weight_on_line
5487  << " weight_on_line_upto_process_inclusive:"
5488  << weight_on_line_upto_process_inclusive
5489  << " space_to_put_left:" << space_to_put_left
5490  << " space_left_to_me" << space_left_to_me << endl;
5491  */
5492 
5493  if(space_left_to_me < 0) {
5494  // space_left_to_me is negative and i dont need to put
5495  // anything to left.
5496  current_part_cut_line_weight_to_put_left(i) = 0;
5497  }
5498  else if(space_left_to_me >= my_weight_on_line) {
5499  // space left to me is bigger than the weight of the
5500  // processor on cut.
5501  // so put everything to left.
5502  current_part_cut_line_weight_to_put_left(i) =
5503  my_weight_on_line;
5504  // cout << "setting current_part_cut_line_weight_to_put_left
5505  // to my_weight_on_line:" << my_weight_on_line << endl;
5506  }
5507  else {
5508  // put only the weight as much as the space.
5509  current_part_cut_line_weight_to_put_left(i) =
5510  space_left_to_me;
5511  // cout << "setting current_part_cut_line_weight_to_put_left
5512  // to space_left_to_me:" << space_left_to_me << endl;
5513  }
5514  }
5515  }
5516  view_rectilinear_cut_count(0) = 0;
5517  });
5518  }
5519 
5520  Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5521 }
5522 
5532 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5533  typename mj_part_t, typename mj_node_t>
5534 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5535  get_processor_num_points_in_parts(
5536  mj_part_t num_procs,
5537  mj_part_t num_parts,
5538  mj_gno_t *&num_points_in_all_processor_parts)
5539 {
5540  // initially allocation_size is num_parts
5541  size_t allocation_size = num_parts * (num_procs + 1);
5542 
5543  // this will be output
5544  // holds how many each processor has in each part.
5545  // last portion is the sum of all processor points in each part.
5546 
5547  // allocate memory for the local num coordinates in each part.
5548  mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5549  new mj_gno_t[allocation_size];
5550 
5551  // this is the portion of the memory which will be used
5552  // at the summation to obtain total number of processors' points in each part.
5553  mj_gno_t *my_local_points_to_reduce_sum =
5554  num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5555 
5556  // this is the portion of the memory where each stores its local number.
5557  // this information is needed by other processors.
5558  mj_gno_t *my_local_point_counts_in_each_part =
5559  num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5560 
5561  // initialize the array with 0's.
5562  memset(num_local_points_in_each_part_to_reduce_sum, 0,
5563  sizeof(mj_gno_t)*allocation_size);
5564 
5565  auto local_new_part_xadj = this->new_part_xadj;
5566  Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5567  Kokkos::ViewAllocateWithoutInitializing("points per part"), num_parts);
5568  Kokkos::parallel_for("get vals on device",
5569  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5570  (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5571  points_per_part(i) =
5572  local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5573  });
5574  auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5575  Kokkos::deep_copy(host_points_per_part, points_per_part);
5576  for(int i = 0; i < num_parts; ++i) {
5577  my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5578  }
5579 
5580  // copy the local num parts to the last portion of array, so that this portion
5581  // will represent the global num points in each part after the reduction.
5582  memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5583  sizeof(mj_gno_t) * (num_parts) );
5584 
5585  // reduceAll operation.
5586  // the portion that belongs to a processor with index p
5587  // will start from myRank * num_parts.
5588  // the global number of points will be held at the index
5589  try{
5590  reduceAll<int, mj_gno_t>(
5591  *(this->comm),
5592  Teuchos::REDUCE_SUM,
5593  allocation_size,
5594  num_local_points_in_each_part_to_reduce_sum,
5595  num_points_in_all_processor_parts);
5596  }
5597  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
5598 
5599  delete [] num_local_points_in_each_part_to_reduce_sum;
5600 }
5601 
5617 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5618  typename mj_part_t, typename mj_node_t>
5619 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5620  mj_check_to_migrate(
5621  size_t migration_reduce_all_population,
5622  mj_lno_t num_coords_for_last_dim_part,
5623  mj_part_t num_procs,
5624  mj_part_t num_parts,
5625  mj_gno_t *num_points_in_all_processor_parts)
5626 {
5627  // if reduce all count and population in the last dim is too high
5628  if(migration_reduce_all_population > future_reduceall_cutoff) {
5629  return true;
5630  }
5631 
5632  // if the work in a part per processor in the last dim is too low.
5633  if(num_coords_for_last_dim_part < min_work_last_dim) {
5634  return true;
5635  }
5636 
5637  // if migration is to be checked and the imbalance is too high
5638  if(this->check_migrate_avoid_migration_option == 0) {
5639  double global_imbalance = 0;
5640  // global shift to reach the sum of coordiante count in each part.
5641  size_t global_shift = num_procs * num_parts;
5642 
5643  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5644  for(mj_part_t i = 0; i < num_parts; ++i) {
5645  double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5646  / double(num_procs);
5647 
5648  global_imbalance += std::abs(ideal_num -
5649  num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5650  }
5651  }
5652  global_imbalance /= num_parts;
5653  global_imbalance /= num_procs;
5654 
5655  if(global_imbalance <= this->minimum_migration_imbalance) {
5656  return false;
5657  }
5658  else {
5659  return true;
5660  }
5661  }
5662  else {
5663  // if migration is forced
5664  return true;
5665  }
5666 }
5667 
5681 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5682  typename mj_part_t, typename mj_node_t>
5683 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5684  assign_send_destinations(
5685  mj_part_t num_parts,
5686  mj_part_t *part_assignment_proc_begin_indices,
5687  mj_part_t *processor_chains_in_parts,
5688  mj_lno_t *send_count_to_each_proc,
5689  int *coordinate_destinations) {
5690 
5691  auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5692  deep_copy(host_new_part_xadj, this->new_part_xadj);
5693 
5694  auto host_new_coordinate_permutations =
5695  Kokkos::create_mirror_view(this->new_coordinate_permutations);
5696  deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5697 
5698  for(mj_part_t p = 0; p < num_parts; ++p) {
5699  mj_lno_t part_begin = 0;
5700  if(p > 0) part_begin = host_new_part_xadj(p - 1);
5701  mj_lno_t part_end = host_new_part_xadj(p);
5702  // get the first part that current processor will send its part-p.
5703  mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5704  // initialize how many point I sent to this processor.
5705  mj_lno_t num_total_send = 0;
5706  for(mj_lno_t j=part_begin; j < part_end; j++) {
5707  mj_lno_t local_ind = host_new_coordinate_permutations(j);
5708  while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5709  // then get the next processor to send the points in part p.
5710  num_total_send = 0;
5711  // assign new processor to part_assign_begin[p]
5712  part_assignment_proc_begin_indices[p] =
5713  processor_chains_in_parts[proc_to_sent];
5714  // remove the previous processor
5715  processor_chains_in_parts[proc_to_sent] = -1;
5716  // choose the next processor as the next one to send.
5717  proc_to_sent = part_assignment_proc_begin_indices[p];
5718  }
5719  // write the gno index to corresponding position in sendBuf.
5720  coordinate_destinations[local_ind] = proc_to_sent;
5721  ++num_total_send;
5722  }
5723  }
5724 }
5725 
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  mj_assign_proc_to_parts(
  mj_gno_t * num_points_in_all_processor_parts,
  mj_part_t num_parts,
  mj_part_t num_procs,
  mj_lno_t *send_count_to_each_proc,
  std::vector<mj_part_t> &processor_ranks_for_subcomm,
  std::vector<mj_part_t> *next_future_num_parts_in_parts,
  mj_part_t &out_part_index,
  mj_part_t &output_part_numbering_begin_index,
  int * coordinate_destinations) {
  // Case num_procs > num_parts: each part is shared by a group of
  // processors. This routine decides (a) how many and which processors
  // serve each part, (b) which subcommunicator this rank joins, and
  // (c) where each local coordinate must be sent
  // (via assign_send_destinations at the end).

  // The global (summed over all ranks) point count of each part is stored
  // after the per-processor counts in the input array.
  mj_gno_t *global_num_points_in_parts =
    num_points_in_all_processor_parts + num_procs * num_parts;
  mj_part_t *num_procs_assigned_to_each_part = new mj_part_t[num_parts];

  // boolean variable: set once this process finds the part it is assigned to.
  bool did_i_find_my_group = false;

  mj_part_t num_free_procs = num_procs;
  mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;

  double max_imbalance_difference = 0;
  mj_part_t max_differing_part = 0;

  // find how many processors each part requires,
  // proportional to its share of the global coordinates.
  for(mj_part_t i = 0; i < num_parts; i++) {

    // scalar (fractional) portion of the required processors
    double scalar_required_proc = num_procs *
      (double (global_num_points_in_parts[i]) /
      double (this->num_global_coords));

    // round it to closest integer; make sure we have at least one proc.
    mj_part_t required_proc =
      static_cast<mj_part_t> (0.5 + scalar_required_proc);
    if(required_proc == 0) required_proc = 1;

    // if assigning the required num procs would leave too few processors
    // for the remaining parts, then only assign {num_free_procs -
    // (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
    if(num_free_procs -
      required_proc < minimum_num_procs_required_for_rest_of_parts) {
      required_proc = num_free_procs -
        (minimum_num_procs_required_for_rest_of_parts);
    }

    // reduce the free processor count
    num_free_procs -= required_proc;

    // reduce the minimum processor count required for the rest of the
    // parts by 1 (each remaining part needs at least one).
    --minimum_num_procs_required_for_rest_of_parts;

    // part (i) is assigned to (required_proc) processors.
    num_procs_assigned_to_each_part[i] = required_proc;

    // because of the roundings some processors might be left unassigned.
    // we want to assign those processors to the part with most imbalance.
    // find the part with the maximum imbalance here.
    double imbalance_wrt_ideal =
      (scalar_required_proc - required_proc) / required_proc;
    if(imbalance_wrt_ideal > max_imbalance_difference) {
      max_imbalance_difference = imbalance_wrt_ideal;
      max_differing_part = i;
    }
  }

  // assign any leftover processors to the part with maximum imbalance
  // relative to the ideal.
  if(num_free_procs > 0) {
    num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
  }

  // now find which are the best processors (least migration) for each part.

  // part_assignment_proc_begin_indices[i] holds the beginning (head) of the
  // linked list of processors that receive data for part i.
  mj_part_t *part_assignment_proc_begin_indices = new mj_part_t[num_parts];

  // the next processor in the list is found in processor_chains_in_parts,
  // in linked-list manner.
  mj_part_t *processor_chains_in_parts = new mj_part_t [num_procs];
  mj_part_t *processor_part_assignments = new mj_part_t[num_procs];

  // initialize the assignment of each processor.
  // this has a linked list implementation.
  // the head of the processor list assigned to each part is held at
  // part_assignment_proc_begin_indices[part].
  // then the next processor assigned to that part is located at
  // processor_chains_in_parts[part_assignment_proc_begin_indices[part]];
  // the chain continues until the value -1 is reached.
  for(int i = 0; i < num_procs; ++i ) {
    processor_part_assignments[i] = -1;
    processor_chains_in_parts[i] = -1;
  }
  for(int i = 0; i < num_parts; ++i ) {
    part_assignment_proc_begin_indices[i] = -1;
  }

  // std::cout << "Before migration: mig type:" <<
  // this->migration_type << std::endl;
  // Allocate memory for the sorting data structure (signed sort items:
  // val carries the magnitude, signbit carries the sign, to avoid
  // unsigned-negation warnings when mj_gno_t is unsigned).
  uSignedSortItem<mj_part_t, mj_gno_t, char> *
    sort_item_num_part_points_in_procs =
    new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];

  for(mj_part_t i = 0; i < num_parts; ++i) {
    // the algorithm tries to minimize the cost of migration by assigning
    // the processors with the highest number of coordinates on that part.
    // here we might want to implement a maximum weighted bipartite matching
    // algorithm.
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      sort_item_num_part_points_in_procs[ii].id = ii;
      // if processor is not assigned yet,
      // add its num points to the sort data structure.
      if(processor_part_assignments[ii] == -1) {
        sort_item_num_part_points_in_procs[ii].val =
          num_points_in_all_processor_parts[ii * num_parts + i];
        // indicate that the processor has positive weight.
        sort_item_num_part_points_in_procs[ii].signbit = 1;
      }
      else {
        // if processor is already assigned, insert -nLocal - 1 so that it
        // won't be selected again.
        // would be same if we simply set it to -1, but more information with
        // no extra cost (which is used later) is provided.
        // sort_item_num_part_points_in_procs[ii].val =
        // -num_points_in_all_processor_parts[ii * num_parts + i] - 1;

        // UPDATE: Since the above gets a warning when unsigned is used,
        // an extra sign bit was added to the sort item.
        // It is 1 for positives, 0 for negatives.
        sort_item_num_part_points_in_procs[ii].val =
          num_points_in_all_processor_parts[ii * num_parts + i];
        sort_item_num_part_points_in_procs[ii].signbit = 0;
      }
    }

    // sort the processors in the part (ascending; negatives first).
    uqSignsort<mj_part_t, mj_gno_t,char>
      (num_procs, sort_item_num_part_points_in_procs);

    /*
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      std::cout << "ii:" << ii << " " <<
        sort_item_num_part_points_in_procs[ii].id <<
        " " << sort_item_num_part_points_in_procs[ii].val <<
        " " << int(sort_item_num_part_points_in_procs[ii].signbit) <<
        std::endl;
    }
    */

    mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
    mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
    // ceiling of points per receiving processor for this part.
    mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
      ceil(total_num_points_in_part / double (required_proc_count)));

    // start sending to the least heavy of the assigned processors
    // (the assigned processors occupy the top required_proc_count slots).
    mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
    mj_part_t next_proc_to_send_id =
      sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
    mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
      sort_item_num_part_points_in_procs[next_proc_to_send_index].val;

    // find the processors that will be assigned to this part, which are the
    // heaviest non-assigned processors.
    for(mj_part_t ii = num_procs - 1;
      ii >= num_procs - required_proc_count; --ii) {
      mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
      // assign processor to part - i.
      processor_part_assignments[proc_id] = i;
    }

    bool did_change_sign = false;
    // if a processor has a negative count (was already assigned), flip its
    // sign back to positive for subsequent iterations.
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
      // TODO: SEE BUG 6194
      if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
        did_change_sign = true;
        sort_item_num_part_points_in_procs[ii].signbit = 1;
      }
      else {
        break;
      }
    }

    if(did_change_sign) {
      // re-sort only the unassigned (leading) portion of the processors
      // after the sign flip above.
      uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
        sort_item_num_part_points_in_procs);
    }

    /*
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      std::cout << "after resort ii:" << ii << " " <<
        sort_item_num_part_points_in_procs[ii].id <<
        " " << sort_item_num_part_points_in_procs[ii].val <<
        " " << int(sort_item_num_part_points_in_procs[ii].signbit ) <<
        std::endl;
    }
    */

    // check if this processor is one of the procs assigned to this part.
    // if it is, then record the group for the subcommunicator.
    if(!did_i_find_my_group) {
      for(mj_part_t ii = num_procs - 1; ii >=
        num_procs - required_proc_count; --ii) {

        mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;

        // add the proc to the group.
        processor_ranks_for_subcomm.push_back(proc_id_to_assign);

        if(proc_id_to_assign == this->myRank) {
          // if the assigned process is me, then I found my group.
          did_i_find_my_group = true;

          // set the beginning of part i's chain to my rank.
          part_assignment_proc_begin_indices[i] = this->myRank;
          processor_chains_in_parts[this->myRank] = -1;

          // set the send count to myself to the number of points that I
          // already have in part i (those points never leave this rank).
          send_count_to_each_proc[this->myRank] =
            sort_item_num_part_points_in_procs[ii].val;

          // calculate the shift required for the
          // output_part_numbering_begin_index
          for(mj_part_t in = 0; in < i; ++in) {
            output_part_numbering_begin_index +=
              (*next_future_num_parts_in_parts)[in];
          }
          out_part_index = i;
        }
      }

      // if this was not my group,
      // clear the subcommunicator processor array.
      if(!did_i_find_my_group) {
        processor_ranks_for_subcomm.clear();
      }
    }

    // send points of the non-assigned processors to the assigned processors.
    // starts from the heaviest non-assigned processor.
    // TODO we might want to play with this part: it allows more
    // computational imbalance but better communication balance.
    for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
      mj_part_t nonassigned_proc_id =
        sort_item_num_part_points_in_procs[ii].id;
      mj_lno_t num_points_to_sent =
        sort_item_num_part_points_in_procs[ii].val;

      // we set number of points to -to_sent - 1 for the assigned processors.
      // we reverse it here. This should not happen, as we have already
      // reversed them above.
#ifdef MJ_DEBUG
      if(num_points_to_sent < 0) {
        cout << "Migration - processor assignments - for part:" << i
          << "from proc:" << nonassigned_proc_id << " num_points_to_sent:"
          << num_points_to_sent << std::endl;
        std::terminate();
      }
#endif

      switch (migration_type) {
        case 0:
        {
          // migration_type 0: respect the ideal capacity of each receiving
          // processor, splitting a sender's points across receivers if needed.
          while (num_points_to_sent > 0) {
            // if the receiving processor has enough space.
            if(num_points_to_sent <= space_left_in_sent_proc) {
              // reduce the space left in the processor.
              space_left_in_sent_proc -= num_points_to_sent;
              // if my rank is the one that is sending the coordinates.
              if(this->myRank == nonassigned_proc_id) {
                // set my send count to the receiving processor.
                send_count_to_each_proc[next_proc_to_send_id] =
                  num_points_to_sent;
                // save the processor in the list (processor_chains_in_parts
                // and part_assignment_proc_begin_indices)
                // that the processor will send its points in part-i.
                mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
                part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
                processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
              }
              num_points_to_sent = 0;
            }
            else {
              // there might be no space left in the processor.
              if(space_left_in_sent_proc > 0) {
                num_points_to_sent -= space_left_in_sent_proc;

                // send as much as the space left in the processor.
                if(this->myRank == nonassigned_proc_id) {
                  // send as much as the space in this case.
                  send_count_to_each_proc[next_proc_to_send_id] =
                    space_left_in_sent_proc;
                  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
                  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
                  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
                }
              }
              // change the receiving processor
              ++next_proc_to_send_index;

#ifdef MJ_DEBUG
              if(next_part_to_send_index < nprocs - required_proc_count ) {
                cout << "Migration - processor assignments - for part:"
                  << i
                  << " next_part_to_send :" << next_part_to_send_index
                  << " nprocs:" << nprocs
                  << " required_proc_count:" << required_proc_count
                  << " Error: next_part_to_send_index <" <<
                  << " nprocs - required_proc_count" << std::endl;
                std::terminate();
              }
#endif
              // pick up the new receiver's id.
              next_proc_to_send_id =
                sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
              // set the new space in the processor.
              space_left_in_sent_proc = ideal_num_points_in_a_proc -
                sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
            }
          }
        }
        break;
        default:
        {
          // to minimize messages, we want each processor to send its
          // coordinates to only a single destination.
          // we do not respect imbalances here; we send all points to the
          // next processor.
          if(this->myRank == nonassigned_proc_id) {
            // set my send count to the receiving processor.
            send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
            // save the processor in the list (processor_chains_in_parts and
            // part_assignment_proc_begin_indices)
            // that the processor will send its points in part-i.
            mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
            part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
            processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
          }
          num_points_to_sent = 0;
          ++next_proc_to_send_index;

          // if we made it to the heaviest processor we round-robin and
          // go back to the beginning of the assigned processors.
          if(next_proc_to_send_index == num_procs) {
            next_proc_to_send_index = num_procs - required_proc_count;
          }
          // pick up the new receiver's id.
          next_proc_to_send_id =
            sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
          // set the new space in the processor.
          space_left_in_sent_proc = ideal_num_points_in_a_proc -
            sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
        }
      }
    }
  }

  /*
  for(int i = 0; i < num_procs;++i) {
    std::cout << "me:" << this->myRank << " to part:" << i << " sends:" <<
      send_count_to_each_proc[i] << std::endl;
  }
  */

  // translate the per-part processor chains into per-coordinate
  // destination ranks.
  this->assign_send_destinations(
    num_parts,
    part_assignment_proc_begin_indices,
    processor_chains_in_parts,
    send_count_to_each_proc,
    coordinate_destinations);
  delete [] part_assignment_proc_begin_indices;
  delete [] processor_chains_in_parts;
  delete [] processor_part_assignments;
  delete [] sort_item_num_part_points_in_procs;
  delete [] num_procs_assigned_to_each_part;
}
6132 
6148 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6149  typename mj_part_t, typename mj_node_t>
6150 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6151  assign_send_destinations2(
6152  mj_part_t num_parts,
6153  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6154  int *coordinate_destinations,
6155  mj_part_t &output_part_numbering_begin_index,
6156  std::vector<mj_part_t> *next_future_num_parts_in_parts)
6157 {
6158  mj_part_t part_shift_amount = output_part_numbering_begin_index;
6159  mj_part_t previous_processor = -1;
6160 
6161  auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6162  Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6163 
6164  auto local_new_coordinate_permutations =
6165  Kokkos::create_mirror_view(this->new_coordinate_permutations);
6166  Kokkos::deep_copy(local_new_coordinate_permutations,
6167  this->new_coordinate_permutations);
6168 
6169  for(mj_part_t i = 0; i < num_parts; ++i) {
6170  mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6171 
6172  // assigned processors are sorted.
6173  mj_lno_t part_begin_index = 0;
6174 
6175  if(p > 0) {
6176  part_begin_index = local_new_part_xadj(p - 1);
6177  }
6178 
6179  mj_lno_t part_end_index = local_new_part_xadj(p);
6180 
6181  mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6182  if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6183  output_part_numbering_begin_index = part_shift_amount;
6184  }
6185  previous_processor = assigned_proc;
6186  part_shift_amount += (*next_future_num_parts_in_parts)[p];
6187 
6188  for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6189  mj_lno_t localInd = local_new_coordinate_permutations(j);
6190  coordinate_destinations[localInd] = assigned_proc;
6191  }
6192  }
6193 }
6194 
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  mj_assign_parts_to_procs(
  mj_gno_t * num_points_in_all_processor_parts,
  mj_part_t num_parts,
  mj_part_t num_procs,
  mj_lno_t *send_count_to_each_proc,
  std::vector<mj_part_t> *next_future_num_parts_in_parts,
  mj_part_t &out_num_part,
  std::vector<mj_part_t> &out_part_indices,
  mj_part_t &output_part_numbering_begin_index,
  int *coordinate_destinations) {
  // Case num_parts >= num_procs: greedily assign (possibly several) whole
  // parts to each processor, heaviest part first, preferring the processor
  // that has the most of that part's points (to minimize migration volume)
  // among those with the most remaining capacity.

  out_num_part = 0;
  // The global (summed over all ranks) point count of each part is stored
  // after the per-processor counts in the input array.
  mj_gno_t *global_num_points_in_parts =
    num_points_in_all_processor_parts + num_procs * num_parts;
  out_part_indices.clear();

  // to sort the parts that are assigned to the processors.
  // id is the part number, sort value is the assigned processor id.
  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
    new uSortItem<mj_part_t, mj_part_t>[num_parts];
  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
    new uSortItem<mj_part_t, mj_gno_t>[num_procs];

  // calculate the ideal number of coordinates that should be assigned
  // to each processor.
  mj_lno_t work_each =
    mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);

  // to hold the remaining capacity (number of coordinates up to the ideal
  // number) of each proc.
  mj_lno_t *space_in_each_processor = new mj_lno_t[num_procs];

  // initialize remaining capacity in each.
  for(mj_part_t i = 0; i < num_procs; ++i) {
    space_in_each_processor[i] = work_each;
  }

  // we keep track of how many parts each processor is assigned to.
  // because in some weird inputs, it might be possible that some
  // processors are not assigned to any part. Using these variables,
  // we force each processor to have at least one part.
  mj_part_t *num_parts_proc_assigned = new mj_part_t[num_procs];
  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
  int empty_proc_count = num_procs;

  // to sort the parts with decreasing order of their coordinates.
  // ids are the part numbers, sort value is the number of points in each.
  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
    new uSortItem<mj_part_t, mj_gno_t>[num_parts];

  // initially we will sort the parts according to the number of coordinates
  // they have, so that we will start assigning with the part that has the
  // most coordinates.
  for(mj_part_t i = 0; i < num_parts; ++i) {
    sort_item_point_counts_in_parts[i].id = i;
    sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
  }

  // sort parts with increasing order of loads.
  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);

  // assigning parts to the processors:
  // traverse the parts with decreasing order of load.
  // first assign the heaviest part.
  for(mj_part_t j = 0; j < num_parts; ++j) {
    // sorted with increasing order, traverse inversely.
    mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;

    // load of the part
    mj_gno_t load = global_num_points_in_parts[i];

    // assigned processor (-1 = none yet)
    mj_part_t assigned_proc = -1;

    // sort processors with increasing number of points in this part.
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      sort_item_num_points_of_proc_in_part_i[ii].id = ii;

      // if there are still enough parts to fill empty processors, then
      // proceed normally, but if empty processor count is equal to the
      // number of remaining parts, then we force part assignments only
      // to empty processors.
      if(empty_proc_count < num_parts - j ||
        num_parts_proc_assigned[ii] == 0) {
        // how many points does processor ii have in part i?
        sort_item_num_points_of_proc_in_part_i[ii].val =
          num_points_in_all_processor_parts[ii * num_parts + i];
      }
      else {
        // already-loaded processor while empties remain: exclude it by
        // giving it the lowest possible sort value.
        sort_item_num_points_of_proc_in_part_i[ii].val = -1;
      }
    }

    uqsort<mj_part_t, mj_gno_t>(num_procs,
      sort_item_num_points_of_proc_in_part_i);

    // traverse all processors with decreasing load (points in part i),
    // picking the candidate with the most remaining capacity; ties on
    // capacity go to the lower processor rank.
    for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
      mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
      if(assigned_proc == -1 ||
        (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
        assigned_proc = ii;
      }
      else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
        if(ii < assigned_proc) {
          // ties go to lower proc.
          // not necessary for a valid result but allows testing to compare
          // MPI results and have part numbers assigned to the same boxes.
          // We don't break here because we may have more ties still to check.
          // The indeterminate state before this is due to Cuda using
          // atomics to refill the permutation array. So non-cuda runs don't
          // actually need this since they will always have the same pattern.
          assigned_proc = ii;
        }
      }
      else {
        break; // now we can break - we have our proc and no more ties.
      }
    }

    // first part assigned to this processor: one fewer empty processor.
    if(num_parts_proc_assigned[assigned_proc]++ == 0) {
      --empty_proc_count;
    }

    space_in_each_processor[assigned_proc] -= load;
    // to sort later, part-i is assigned to the processor - assignment.
    sort_item_part_to_proc_assignment[j].id = i; //part i

    // assigned to processor - assignment.
    sort_item_part_to_proc_assignment[j].val = assigned_proc;

    // if assigned processor is me, increase the number.
    if(assigned_proc == this->myRank) {
      out_num_part++;//assigned_part_count;
      out_part_indices.push_back(i);
    }

    // increase the send count to that processor by the number of points I
    // have in that part, as everyone sends their coordinates in this part
    // to the processor assigned to this part.
    send_count_to_each_proc[assigned_proc] +=
      num_points_in_all_processor_parts[this->myRank * num_parts + i];
  }

  delete [] num_parts_proc_assigned;
  delete [] sort_item_num_points_of_proc_in_part_i;
  delete [] sort_item_point_counts_in_parts;
  delete [] space_in_each_processor;

  // sort assignments with respect to the assigned processors.
  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);

  // fill sendBuf (per-coordinate destination ranks).
  this->assign_send_destinations2(
    num_parts,
    sort_item_part_to_proc_assignment,
    coordinate_destinations,
    output_part_numbering_begin_index,
    next_future_num_parts_in_parts);

  delete [] sort_item_part_to_proc_assignment;
}
6380 
6381 
6405 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6406  typename mj_part_t, typename mj_node_t>
6407 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6408  mj_migration_part_proc_assignment(
6409  mj_gno_t * num_points_in_all_processor_parts,
6410  mj_part_t num_parts,
6411  mj_part_t num_procs,
6412  mj_lno_t *send_count_to_each_proc,
6413  std::vector<mj_part_t> &processor_ranks_for_subcomm,
6414  std::vector<mj_part_t> *next_future_num_parts_in_parts,
6415  mj_part_t &out_num_part,
6416  std::vector<mj_part_t> &out_part_indices,
6417  mj_part_t &output_part_numbering_begin_index,
6418  int *coordinate_destinations)
6419 {
6420  processor_ranks_for_subcomm.clear();
6421  // if(this->num_local_coords > 0)
6422  if(num_procs > num_parts) {
6423  // if there are more processors than the number of current part
6424  // then processors share the existing parts.
6425  // at the end each processor will have a single part,
6426  // but a part will be shared by a group of processors.
6427  mj_part_t out_part_index = 0;
6428 
6429  this->mj_assign_proc_to_parts(
6430  num_points_in_all_processor_parts,
6431  num_parts,
6432  num_procs,
6433  send_count_to_each_proc,
6434  processor_ranks_for_subcomm,
6435  next_future_num_parts_in_parts,
6436  out_part_index,
6437  output_part_numbering_begin_index,
6438  coordinate_destinations
6439  );
6440 
6441  out_num_part = 1;
6442  out_part_indices.clear();
6443  out_part_indices.push_back(out_part_index);
6444  }
6445  else {
6446 
6447  // there are more parts than the processors.
6448  // therefore a processor will be assigned multiple parts,
6449  // the subcommunicators will only have a single processor.
6450  processor_ranks_for_subcomm.push_back(this->myRank);
6451 
6452  // since there are more parts then procs,
6453  // assign multiple parts to processors.
6454 
6455  this->mj_assign_parts_to_procs(
6456  num_points_in_all_processor_parts,
6457  num_parts,
6458  num_procs,
6459  send_count_to_each_proc,
6460  next_future_num_parts_in_parts,
6461  out_num_part,
6462  out_part_indices,
6463  output_part_numbering_begin_index,
6464  coordinate_destinations);
6465  }
6466 }
6467 
6481 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6482  typename mj_part_t, typename mj_node_t>
6483 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6484  mj_migrate_coords(
6485  mj_part_t num_procs,
6486  mj_lno_t &num_new_local_points,
6487  std::string iteration,
6488  int *coordinate_destinations,
6489  mj_part_t num_parts)
6490 {
6491 
6492 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6493  if(sizeof(mj_lno_t) <= sizeof(int)) {
6494  // Cannot use Zoltan_Comm with local ordinals larger than ints.
6495  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
6496  // may overflow.
6497  ZOLTAN_COMM_OBJ *plan = NULL;
6498  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6499  int num_incoming_gnos = 0;
6500  int message_tag = 7859;
6501 
6502  this->mj_env->timerStart(MACRO_TIMERS,
6503  mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6504  int ierr = Zoltan_Comm_Create(
6505  &plan,
6506  int(this->num_local_coords),
6507  coordinate_destinations,
6508  mpi_comm,
6509  message_tag,
6510  &num_incoming_gnos);
6511 
6512  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6513  this->mj_env->timerStop(MACRO_TIMERS,
6514  mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6515 
6516  this->mj_env->timerStart(MACRO_TIMERS,
6517  mj_timer_base_string + "Migration Z1Migration-" + iteration);
6518 
6519  // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
6520 
6521  // migrate gnos.
6522  {
6523  auto host_current_mj_gnos = Kokkos::create_mirror_view(
6524  Kokkos::HostSpace(), this->current_mj_gnos);
6525  Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6526  Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6527  Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), num_incoming_gnos);
6528  auto host_dst_gnos = Kokkos::create_mirror_view(
6529  Kokkos::HostSpace(), dst_gnos);
6530  message_tag++;
6531  ierr = Zoltan_Comm_Do(
6532  plan,
6533  message_tag,
6534  (char *) host_current_mj_gnos.data(),
6535  sizeof(mj_gno_t),
6536  (char *) host_dst_gnos.data());
6537  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6538  Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6539  this->current_mj_gnos = dst_gnos;
6540  }
6541 
6542  //migrate coordinates
6543  {
6544  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6545  auto host_src_coordinates = Kokkos::create_mirror_view(
6546  Kokkos::HostSpace(), this->mj_coordinates);
6547  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6548  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6549  dst_coordinates(Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
6550  num_incoming_gnos, this->coord_dim);
6551  auto host_dst_coordinates = Kokkos::create_mirror_view(
6552  Kokkos::HostSpace(), dst_coordinates);
6553  for(int i = 0; i < this->coord_dim; ++i) {
6554  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6555  = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6556  Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6557  = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6558  // Note Layout Left means we can do these in contiguous blocks
6559  message_tag++;
6560  ierr = Zoltan_Comm_Do(
6561  plan,
6562  message_tag,
6563  (char *) sub_host_src_coordinates.data(),
6564  sizeof(mj_scalar_t),
6565  (char *) sub_host_dst_coordinates.data());
6566  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6567  }
6568  deep_copy(dst_coordinates, host_dst_coordinates);
6569  this->mj_coordinates = dst_coordinates;
6570  }
6571 
6572  // migrate weights.
6573  {
6574  auto host_src_weights = Kokkos::create_mirror_view(
6575  Kokkos::HostSpace(), this->mj_weights);
6576  Kokkos::deep_copy(host_src_weights, this->mj_weights);
6577  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6578  Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
6579  num_incoming_gnos, this->num_weights_per_coord);
6580  auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6581  for(int i = 0; i < this->num_weights_per_coord; ++i) {
6582  auto sub_host_src_weights
6583  = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6584  auto sub_host_dst_weights
6585  = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6586  ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6587  // Copy because of layout
6588  for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6589  sent_weight[n] = sub_host_src_weights(n);
6590  }
6591  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6592  message_tag++;
6593  ierr = Zoltan_Comm_Do(
6594  plan,
6595  message_tag,
6596  (char *) sent_weight.getRawPtr(),
6597  sizeof(mj_scalar_t),
6598  (char *) received_weight.getRawPtr());
6599  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6600  // Again we copy by index due to layout
6601  for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6602  sub_host_dst_weights(n) = received_weight[n];
6603  }
6604  }
6605  deep_copy(dst_weights, host_dst_weights);
6606  this->mj_weights = dst_weights;
6607  }
6608 
6609  // migrate owners.
6610  {
6611  // Note that owners we kept on Serial
6612  Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6613  Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6614  num_incoming_gnos);
6615  message_tag++;
6616  ierr = Zoltan_Comm_Do(
6617  plan,
6618  message_tag,
6619  (char *) owner_of_coordinate.data(),
6620  sizeof(int),
6621  (char *) dst_owners_of_coordinate.data());
6622  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6623  this->owner_of_coordinate = dst_owners_of_coordinate;
6624  }
6625 
6626  // if num procs is less than num parts,
6627  // we need the part assigment arrays as well, since
6628  // there will be multiple parts in processor.
6629  {
6630  auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6631  Kokkos::HostSpace(), this->assigned_part_ids);
6632  Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6633  Kokkos::View<int *, device_t> dst_assigned_part_ids(
6634  Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
6635  num_incoming_gnos);
6636  auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6637  Kokkos::HostSpace(), dst_assigned_part_ids);
6638  mj_part_t *new_parts = new mj_part_t[num_incoming_gnos];
6639  if(num_procs < num_parts) {
6640  message_tag++;
6641  ierr = Zoltan_Comm_Do(
6642  plan,
6643  message_tag,
6644  (char *) host_src_assigned_part_ids.data(),
6645  sizeof(mj_part_t),
6646  (char *) host_dst_assigned_part_ids.data());
6647  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6648  Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6649  }
6650  // In original code this would just assign to an uninitialized array
6651  // if num_procs < num_parts. We're doing the same here.
6652  this->assigned_part_ids = dst_assigned_part_ids;
6653  }
6654 
6655  ierr = Zoltan_Comm_Destroy(&plan);
6656  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6657  num_new_local_points = num_incoming_gnos;
6658  this->mj_env->timerStop(MACRO_TIMERS,
6659  mj_timer_base_string + "Migration Z1Migration-" + iteration);
6660  }
6661  else
6662 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6663  {
6664  this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6665  "Migration DistributorPlanCreating-" + iteration);
6666 
6667  Tpetra::Distributor distributor(this->comm);
6668  ArrayView<const mj_part_t> destinations( coordinate_destinations,
6669  this->num_local_coords);
6670  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6671  this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6672  "Migration DistributorPlanCreating-" + iteration);
6673  this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6674  "Migration DistributorMigration-" + iteration);
6675 
6676  // note MPI buffers should all be on Kokkos::HostSpace and not
6677  // Kokkos::CudaUVMSpace.
6678 
6679  // migrate gnos.
6680  {
6681  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
6682  auto src_host_current_mj_gnos =
6683  Kokkos::create_mirror_view(Kokkos::HostSpace(), this->current_mj_gnos);
6684  Kokkos::deep_copy(src_host_current_mj_gnos, this->current_mj_gnos);
6685  ArrayView<mj_gno_t> sent_gnos(
6686  src_host_current_mj_gnos.data(), this->num_local_coords);
6687  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6688  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6689  Kokkos::ViewAllocateWithoutInitializing("gids"), num_incoming_gnos);
6690  auto host_current_mj_gnos = Kokkos::create_mirror_view(
6691  this->current_mj_gnos);
6692  memcpy(host_current_mj_gnos.data(),
6693  received_gnos.getRawPtr(), num_incoming_gnos * sizeof(mj_gno_t));
6694  Kokkos::deep_copy(this->current_mj_gnos, host_current_mj_gnos);
6695  }
6696 
6697  // migrate coordinates
6698  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6699  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6700  dst_coordinates("mj_coordinates", num_incoming_gnos, this->coord_dim);
6701  auto host_dst_coordinates = Kokkos::create_mirror_view(dst_coordinates);
6702  auto host_src_coordinates = Kokkos::create_mirror_view(
6703  Kokkos::HostSpace(), this->mj_coordinates);
6704  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6705  for(int i = 0; i < this->coord_dim; ++i) {
6706  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6707  = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6708  auto sub_host_dst_coordinates
6709  = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6710  // Note Layout Left means we can do these in contiguous blocks
6711  // This form was causing problems on cuda 10 pascal nodes, issue #6422
6712  // Doing a manual copy clears the error though it seems this is probably
6713  // just shifting some kind of race condition or UVM issue around. The
6714  // bug can be sensitive to simple changes like adding a printf log.
6715 
6716  // Using this form will segfault on cuda 10 pascal node
6717  //ArrayView<mj_scalar_t> sent_coord(
6718  // sub_host_src_coordinates.data(), this->num_local_coords);
6719 
6720  // Manual copy will clear the error but this is probably just due to
6721  // shifting some kind of race condition.
6722  ArrayRCP<mj_scalar_t> sent_coord(this->num_local_coords);
6723  for(int n = 0; n < this->num_local_coords; ++n) {
6724  sent_coord[n] = sub_host_src_coordinates[n];
6725  }
6726 
6727  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
6728  distributor.doPostsAndWaits<mj_scalar_t>(
6729  sent_coord(), 1, received_coord());
6730  memcpy(sub_host_dst_coordinates.data(),
6731  received_coord.getRawPtr(), num_incoming_gnos * sizeof(mj_scalar_t));
6732  }
6733  deep_copy(dst_coordinates, host_dst_coordinates);
6734  this->mj_coordinates = dst_coordinates;
6735 
6736  // migrate weights.
6737  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6738  "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6739  auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6740  auto host_src_weights = Kokkos::create_mirror_view(
6741  Kokkos::HostSpace(), this->mj_weights);
6742  Kokkos::deep_copy(host_src_weights, this->mj_weights);
6743  for(int i = 0; i < this->num_weights_per_coord; ++i) {
6744  auto sub_host_src_weights
6745  = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6746  auto sub_host_dst_weights
6747  = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6748  ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6749 
6750  // TODO: Layout Right means these are not contiguous
6751  // However we don't have any systems setup with more than 1 weight so
6752  // really I have not tested any of this code with num weights > 1.
6753  // I think this is the right thing to do.
6754  for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6755  sent_weight[n] = sub_host_src_weights(n);
6756  }
6757  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6758  distributor.doPostsAndWaits<mj_scalar_t>(
6759  sent_weight(), 1, received_weight());
6760 
6761  // Again we copy by index due to layout
6762  for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6763  sub_host_dst_weights(n) = received_weight[n];
6764  }
6765  }
6766  Kokkos::deep_copy(dst_weights, host_dst_weights);
6767  this->mj_weights = dst_weights;
6768 
6769  // migrate owners
6770  {
6771  // Note owners we kept on Serial
6772  ArrayView<int> sent_owners(
6773  owner_of_coordinate.data(), this->num_local_coords);
6774  ArrayRCP<int> received_owners(num_incoming_gnos);
6775  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
6776  this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>
6777  ("owner_of_coordinate", num_incoming_gnos);
6778  memcpy(this->owner_of_coordinate.data(),
6779  received_owners.getRawPtr(), num_incoming_gnos * sizeof(int));
6780  }
6781 
6782  // if num procs is less than num parts,
6783  // we need the part assigment arrays as well, since
6784  // there will be multiple parts in processor.
6785  if(num_procs < num_parts) {
6786  auto src_host_assigned_part_ids =
6787  Kokkos::create_mirror_view(Kokkos::HostSpace(), this->assigned_part_ids);
6788  Kokkos::deep_copy(src_host_assigned_part_ids, assigned_part_ids);
6789  ArrayView<mj_part_t> sent_partids(
6790  src_host_assigned_part_ids.data(), this->num_local_coords);
6791  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
6792  distributor.doPostsAndWaits<mj_part_t>(
6793  sent_partids, 1, received_partids());
6794  this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6795  ("assigned_part_ids", num_incoming_gnos);
6796  auto host_assigned_part_ids = Kokkos::create_mirror_view(
6797  this->assigned_part_ids);
6798  memcpy(
6799  host_assigned_part_ids.data(),
6800  received_partids.getRawPtr(),
6801  num_incoming_gnos * sizeof(mj_part_t));
6802  Kokkos::deep_copy(this->assigned_part_ids, host_assigned_part_ids);
6803  }
6804  else {
6805  this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6806  ("assigned_part_ids", num_incoming_gnos);
6807  }
6808  this->mj_env->timerStop(MACRO_TIMERS, "" + mj_timer_base_string +
6809  "Migration DistributorMigration-" + iteration);
6810 
6811  num_new_local_points = num_incoming_gnos;
6812  }
6813 }
6814 
6820 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6821  typename mj_part_t, typename mj_node_t>
6822 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6823  create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6824 {
6825  mj_part_t group_size = processor_ranks_for_subcomm.size();
6826  mj_part_t *ids = new mj_part_t[group_size];
6827  for(mj_part_t i = 0; i < group_size; ++i) {
6828  ids[i] = processor_ranks_for_subcomm[i];
6829  }
6830  ArrayView<const mj_part_t> idView(ids, group_size);
6831  this->comm = this->comm->createSubcommunicator(idView);
6832  delete [] ids;
6833 }
6834 
/*! \brief Rebuilds new_coordinate_permutations / new_part_xadj after a
 * migration, so coordinates are grouped by their (renumbered) local part.
 *
 * \param output_num_parts number of parts this process ends up with.
 * \param num_parts total number of parts before compression; the part ids
 *        stored in assigned_part_ids range over [0, num_parts).
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  fill_permutation_array(
  mj_part_t output_num_parts,
  mj_part_t num_parts)
{
  // if there is single output part, then simply fill the permutation array.
  if(output_num_parts == 1) {
    // identity permutation: every local coordinate belongs to the one part.
    auto local_new_coordinate_permutations = this->new_coordinate_permutations;
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
        (0, this->num_local_coords),
      KOKKOS_LAMBDA(mj_lno_t i) {
      local_new_coordinate_permutations(i) = i;
    });
    // single-element write done on device so the view stays device-resident.
    auto local_new_part_xadj = this->new_part_xadj;
    auto local_num_local_coords = this->num_local_coords;
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
      KOKKOS_LAMBDA(int dummy) {
      local_new_part_xadj(0) = local_num_local_coords;
    });
  }
  else {
    // capture members locally so the device lambda does not capture 'this'.
    auto local_num_local_coords = this->num_local_coords;
    auto local_assigned_part_ids = this->assigned_part_ids;
    auto local_new_part_xadj = this->new_part_xadj;
    auto local_new_coordinate_permutations = this->new_coordinate_permutations;

    // part shift holds the which part number an old part number corresponds to.
    Kokkos::View<mj_part_t*, device_t> part_shifts("part_shifts", num_parts);

    // otherwise we need to count how many points are there in each part.
    // we allocate here as num_parts, because the sent partids are up to
    // num_parts, although there are output_num_parts different parts.
    Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
      "num_points_in_parts", num_parts);

    // single-iteration parallel_for: the whole pass is serial because the
    // counting, prefix-sum and in-place decrements below are order-dependent.
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
      KOKKOS_LAMBDA(int dummy) {

      // histogram of old part ids over the local coordinates.
      for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
        mj_part_t ii = local_assigned_part_ids(i);
        ++num_points_in_parts(ii);
      }

      // write the end points of the parts.
      // non-empty old parts are compressed to consecutive new ids (p);
      // part_shifts maps old id -> compressed id.
      mj_part_t p = 0;
      mj_lno_t prev_index = 0;
      for(mj_part_t i = 0; i < num_parts; ++i) {
        if(num_points_in_parts(i) > 0) {
          local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
          prev_index += num_points_in_parts(i);
          part_shifts(i) = p++;
        }
      }

      // for the rest of the parts write the end index as end point.
      mj_part_t assigned_num_parts = p - 1;
      for(;p < num_parts; ++p) {
        local_new_part_xadj(p) =
          local_new_part_xadj(assigned_num_parts);
      }
      // reuse num_points_in_parts as the running end offsets per new part.
      for(mj_part_t i = 0; i < output_num_parts; ++i) {
        num_points_in_parts(i) = local_new_part_xadj(i);
      }

      // write the permutation array here.
      // get the part of the coordinate i, shift it to obtain the new part number.
      // assign it to the end of the new part numbers pointer.
      // iterating backwards keeps the original relative order within a part.
      for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
        mj_part_t part =
          part_shifts[mj_part_t(local_assigned_part_ids(i))];
        local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
      }
    });
  }
}
6920 
/*! \brief Decides whether to migrate, and if so performs the full
 * migration: part/processor assignment, coordinate movement,
 * subcommunicator creation and permutation rebuild.
 *
 * \param input_num_parts number of parts before migration.
 * \param output_num_parts output: number of parts this process owns after.
 * \param next_future_num_parts_in_parts rewritten in place to hold only
 *        the future-part counts of the parts kept by this process.
 * \param output_part_begin_index output: global numbering offset of the
 *        first part kept here.
 * \param migration_reduce_all_population parameter for the migrate-or-not
 *        heuristic.
 * \param num_coords_for_last_dim_part parameter for the migrate-or-not
 *        heuristic.
 * \param iteration tag string for timer names.
 * \param input_part_boxes,output_part_boxes part bounding boxes; swapped
 *        when mj_keep_part_boxes is set.
 * \return false when no migration was needed, true otherwise.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  mj_perform_migration(
  mj_part_t input_num_parts,
  mj_part_t &output_num_parts,
  std::vector<mj_part_t> *next_future_num_parts_in_parts,
  mj_part_t &output_part_begin_index,
  size_t migration_reduce_all_population,
  mj_lno_t num_coords_for_last_dim_part,
  std::string iteration,
  RCP<mj_partBoxVector_t> &input_part_boxes,
  RCP<mj_partBoxVector_t> &output_part_boxes)
{
  mj_part_t num_procs = this->comm->getSize();
  this->myRank = this->comm->getRank();

  // this array holds how many points each processor has in each part.
  // to access how many points processor i has on part j,
  // num_points_in_all_processor_parts[i * num_parts + j]
  // the extra (num_procs+1)-th row holds the global per-part totals.
  mj_gno_t *num_points_in_all_processor_parts =
    new mj_gno_t[input_num_parts * (num_procs + 1)];

  // get the number of coordinates in each part in each processor.
  this->get_processor_num_points_in_parts(
    num_procs,
    input_num_parts,
    num_points_in_all_processor_parts);

  // check if migration will be performed or not.
  if(!this->mj_check_to_migrate(
    migration_reduce_all_population,
    num_coords_for_last_dim_part,
    num_procs,
    input_num_parts,
    num_points_in_all_processor_parts)) {
    delete [] num_points_in_all_processor_parts;
    return false;
  }

  mj_lno_t *send_count_to_each_proc = NULL;
  int *coordinate_destinations = new int[this->num_local_coords];
  send_count_to_each_proc = new mj_lno_t[num_procs];

  for(int i = 0; i < num_procs; ++i) {
    send_count_to_each_proc[i] = 0;
  }

  std::vector<mj_part_t> processor_ranks_for_subcomm;
  std::vector<mj_part_t> out_part_indices;

  // determine which processors are assigned to which parts
  this->mj_migration_part_proc_assignment(
    num_points_in_all_processor_parts,
    input_num_parts,
    num_procs,
    send_count_to_each_proc,
    processor_ranks_for_subcomm,
    next_future_num_parts_in_parts,
    output_num_parts,
    out_part_indices,
    output_part_begin_index,
    coordinate_destinations);

  delete [] send_count_to_each_proc;
  std::vector <mj_part_t> tmpv;

  std::sort (out_part_indices.begin(), out_part_indices.end());
  mj_part_t outP = out_part_indices.size();
  mj_gno_t new_global_num_points = 0;
  // last row of the table: global number of points per part.
  mj_gno_t *global_num_points_in_parts =
    num_points_in_all_processor_parts + num_procs * input_num_parts;

  if(this->mj_keep_part_boxes) {
    input_part_boxes->clear();
  }

  // now we calculate the new values for next_future_num_parts_in_parts.
  // same for the part boxes.
  for(mj_part_t i = 0; i < outP; ++i) {
    mj_part_t ind = out_part_indices[i];
    new_global_num_points += global_num_points_in_parts[ind];
    tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
    if(this->mj_keep_part_boxes) {
      input_part_boxes->push_back((*output_part_boxes)[ind]);
    }
  }

  // swap the input and output part boxes.
  if(this->mj_keep_part_boxes) {
    RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
    input_part_boxes = output_part_boxes;
    output_part_boxes = tmpPartBoxes;
  }
  // keep only the future-part counts of the parts retained here.
  next_future_num_parts_in_parts->clear();
  for(mj_part_t i = 0; i < outP; ++i) {
    mj_part_t p = tmpv[i];
    next_future_num_parts_in_parts->push_back(p);
  }

  delete [] num_points_in_all_processor_parts;

  mj_lno_t num_new_local_points = 0;
  //perform the actual migration operation here.
  this->mj_migrate_coords(
    num_procs,
    num_new_local_points,
    iteration,
    coordinate_destinations,
    input_num_parts);

  delete [] coordinate_destinations;
  // resize the permutation scratch arrays only when the local count changed.
  if(this->num_local_coords != num_new_local_points) {
    this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
      (Kokkos::ViewAllocateWithoutInitializing("new_coordinate_permutations"),
      num_new_local_points);
    this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
      (Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
      num_new_local_points);
  }
  this->num_local_coords = num_new_local_points;
  this->num_global_coords = new_global_num_points;

  // create subcommunicator.
  this->create_sub_communicator(processor_ranks_for_subcomm);

  processor_ranks_for_subcomm.clear();

  // fill the new permutation arrays.
  this->fill_permutation_array(output_num_parts, input_num_parts);

  return true;
}
7078 
7097 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7098  typename mj_part_t, typename mj_node_t>
7099 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7100  create_consistent_chunks(
7101  mj_part_t num_parts,
7102  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7103  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7104  mj_lno_t coordinate_begin,
7105  mj_lno_t coordinate_end,
7106  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7107  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7108  int coordInd,
7109  bool longest_dim_part,
7110  uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7111 {
7112  // Note that this method is only used by task mapper
7113  // All code in this file has been verified to run with UVM off by running
7114  // mj tests and task mapper tests with UVM off. However for this particular
7115  // method I did not do much for UVM off. I heavily use device to host copies
7116  // and more or less preserve the original logic. Due to the handling of
7117  // arrays it will be a bit of work to convert this to as better form.
7118  // Since it's only relevant to task mapper and I wasn't sure how much priority
7119  // to give it, I put that on hold until further discussion.
7120  mj_part_t no_cuts = num_parts - 1;
7121 
7122  // now if the rectilinear partitioning is allowed we decide how
7123  // much weight each thread should put to left and right.
7124  if(this->distribute_points_on_cut_lines) {
7125  auto local_thread_cut_line_weight_to_put_left =
7126  this->thread_cut_line_weight_to_put_left;
7127  auto local_thread_part_weight_work =
7128  this->thread_part_weight_work;
7129  auto local_sEpsilon = this->sEpsilon;
7130 
7131  Kokkos::parallel_for(
7132  Kokkos::RangePolicy<typename mj_node_t::execution_space,
7133  mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7134  // the left to be put on the left of the cut.
7135  mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7136  if(left_weight > local_sEpsilon) {
7137  // the weight of thread ii on cut.
7138  mj_scalar_t thread_ii_weight_on_cut =
7139  local_thread_part_weight_work(i * 2 + 1) -
7140  local_thread_part_weight_work(i * 2);
7141  if(thread_ii_weight_on_cut < left_weight) {
7142  local_thread_cut_line_weight_to_put_left(i) =
7143  thread_ii_weight_on_cut;
7144  }
7145  else {
7146  local_thread_cut_line_weight_to_put_left(i) = left_weight;
7147  }
7148  }
7149  else {
7150  local_thread_cut_line_weight_to_put_left(i) = 0;
7151  }
7152  });
7153 
7154  if(no_cuts > 0) {
7155  auto local_least_signifiance = least_signifiance;
7156  auto local_significance_mul = significance_mul;
7157  Kokkos::parallel_for(
7158  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7159  (0, 1), KOKKOS_LAMBDA (int dummy) {
7160  // this is a special case. If cutlines share the same coordinate,
7161  // their weights are equal.
7162  // we need to adjust the ratio for that.
7163  for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7164  mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7165  mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7166  mj_scalar_t delta = cut2 - cut1;
7167  mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7168  if(abs_delta < local_sEpsilon) {
7169  local_thread_cut_line_weight_to_put_left(i) -=
7170  local_thread_cut_line_weight_to_put_left(i - 1);
7171  }
7172  local_thread_cut_line_weight_to_put_left(i) =
7173  static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7174  local_least_signifiance) * local_significance_mul) /
7175  static_cast<mj_scalar_t>(local_significance_mul);
7176  }
7177  });
7178  }
7179  }
7180 
7181  auto local_thread_point_counts = this->thread_point_counts;
7182  Kokkos::parallel_for(
7183  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7184  (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7185  local_thread_point_counts(i) = 0;
7186  });
7187 
7188  // for this specific case we dont want to distribute the points along the
7189  // cut position randomly, as we need a specific ordering of them. Instead,
7190  // we put the coordinates into a sort item, where we sort those
7191  // using the coordinates of points on other dimensions and the index.
7192 
7193  // some of the cuts might share the same position.
7194  // in this case, if cut i and cut j share the same position
7195  // cut_map[i] = cut_map[j] = sort item index.
7196  mj_part_t *cut_map = new mj_part_t[no_cuts];
7197 
7198  typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7199  typedef std::vector< multiSItem > multiSVector;
7200  typedef std::vector<multiSVector> multiS2Vector;
7201 
7202  // to keep track of the memory allocated.
7203  std::vector<mj_scalar_t *>allocated_memory;
7204 
7205  // vector for which the coordinates will be sorted.
7206  multiS2Vector sort_vector_points_on_cut;
7207 
7208  // the number of cuts that have different coordinates.
7209  mj_part_t different_cut_count = 1;
7210  cut_map[0] = 0;
7211 
7212  // now we insert 1 sort vector for all cuts on the different
7213  // positins.if multiple cuts are on the same position,
7214  // they share sort vectors.
7215  multiSVector tmpMultiSVector;
7216  sort_vector_points_on_cut.push_back(tmpMultiSVector);
7217 
7218  auto local_current_concurrent_cut_coordinate =
7219  current_concurrent_cut_coordinate;
7220  auto host_current_concurrent_cut_coordinate =
7221  Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7222  Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7223  local_current_concurrent_cut_coordinate);
7224 
7225  for(mj_part_t i = 1; i < no_cuts ; ++i) {
7226  // if cuts share the same cut coordinates
7227  // set the cutmap accordingly.
7228  if(std::abs(host_current_concurrent_cut_coordinate(i) -
7229  host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7230  cut_map[i] = cut_map[i-1];
7231  }
7232  else {
7233  cut_map[i] = different_cut_count++;
7234  multiSVector tmp2MultiSVector;
7235  sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7236  }
7237  }
7238  Kokkos::deep_copy(current_concurrent_cut_coordinate,
7239  host_current_concurrent_cut_coordinate);
7240 
7241  // now the actual part assigment.
7242  auto host_coordinate_permutations =
7243  Kokkos::create_mirror_view(coordinate_permutations);
7244  Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7245 
7246  auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7247  Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7248 
7249  auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7250  Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7251 
7252  auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7253  Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7254 
7255  auto local_coord_dim = this->coord_dim;
7256 
7257  for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7258  mj_lno_t i = host_coordinate_permutations(ii);
7259  mj_part_t pp = host_assigned_part_ids(i);
7260  mj_part_t p = pp / 2;
7261  // if the coordinate is on a cut.
7262  if(pp % 2 == 1 ) {
7263  mj_scalar_t *vals = new mj_scalar_t[local_coord_dim -1];
7264  allocated_memory.push_back(vals);
7265 
7266  // we insert the coordinates to the sort item here.
7267  int val_ind = 0;
7268 
7269  if(longest_dim_part) {
7270  // std::cout << std::endl << std::endl;
7271  for(int dim = local_coord_dim - 2; dim >= 0; --dim) {
7272  // uSignedSortItem<int, mj_scalar_t, char>
7273  // *p_coord_dimension_range_sorted
7274  int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7275  // std::cout << "next_largest_coord_dim: " <<
7276  // next_largest_coord_dim << " ";
7277  // Note refactor in progress
7278  vals[val_ind++] =
7279  host_mj_coordinates(i,next_largest_coord_dim);
7280  }
7281  }
7282  else {
7283  for(int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7284  vals[val_ind++] = host_mj_coordinates(i,dim);
7285  }
7286  for(int dim = 0; dim < coordInd; ++dim) {
7287  vals[val_ind++] = host_mj_coordinates(i,dim);
7288  }
7289  }
7290 
7291  multiSItem tempSortItem(i, local_coord_dim -1, vals);
7292  //insert the point to the sort vector pointed by the cut_map[p].
7293  mj_part_t cmap = cut_map[p];
7294  sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7295  }
7296  else {
7297  //if it is not on the cut, simple sorting.
7298  ++host_thread_point_counts(p);
7299  host_assigned_part_ids(i) = p;
7300  }
7301  }
7302 
7303  // sort all the sort vectors.
7304  for(mj_part_t i = 0; i < different_cut_count; ++i) {
7305  std::sort (sort_vector_points_on_cut[i].begin(),
7306  sort_vector_points_on_cut[i].end());
7307  }
7308 
7309  mj_part_t previous_cut_map = cut_map[0];
7310 
7311  auto host_thread_cut_line_weight_to_put_left =
7312  Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7313  Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7314  thread_cut_line_weight_to_put_left);
7315 
7316  auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7317  Kokkos::deep_copy(host_mj_weights, mj_weights);
7318 
7319  // this is how much previous part owns the weight of the current part.
7320  // when target part weight is 1.6, and the part on the left is given 2,
7321  // the left has an extra 0.4, while the right has missing 0.4 from the
7322  // previous cut.
7323  // This parameter is used to balance this issues.
7324  // in the above example weight_stolen_from_previous_part will be 0.4.
7325  // if the left part target is 2.2 but it is given 2,
7326  // then weight_stolen_from_previous_part will be -0.2.
7327  mj_scalar_t weight_stolen_from_previous_part = 0;
7328  for(mj_part_t p = 0; p < no_cuts; ++p) {
7329  mj_part_t mapped_cut = cut_map[p];
7330 
7331  // if previous cut map is done, and it does not have the same index,
7332  // then assign all points left on that cut to its right.
7333  if(previous_cut_map != mapped_cut) {
7334  mj_lno_t sort_vector_end = (mj_lno_t)
7335  sort_vector_points_on_cut[previous_cut_map].size() - 1;
7336  for(; sort_vector_end >= 0; --sort_vector_end) {
7337  multiSItem t =
7338  sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7339  mj_lno_t i = t.index;
7340  ++host_thread_point_counts(p);
7341  host_assigned_part_ids(i) = p;
7342  }
7343  sort_vector_points_on_cut[previous_cut_map].clear();
7344  }
7345 
7346  // TODO: MD: I dont remember why I have it reverse order here.
7347  mj_lno_t sort_vector_end = (mj_lno_t)
7348  sort_vector_points_on_cut[mapped_cut].size() - 1;
7349  // mj_lno_t sort_vector_begin= 0;
7350  // mj_lno_t sort_vector_size =
7351  // (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
7352 
7353  // TODO commented for reverse order
7354  for(; sort_vector_end >= 0; --sort_vector_end) {
7355  // for(; sort_vector_begin < sort_vector_size; ++sort_vector_begin) {
7356  // TODO COMMENTED FOR REVERSE ORDER
7357  multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7358  //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
7359  mj_lno_t i = t.index;
7360  mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7361  this->mj_weights(i,0);
7362  // part p has enough space for point i, then put it to point i.
7363  if(host_thread_cut_line_weight_to_put_left(p) +
7364  weight_stolen_from_previous_part> this->sEpsilon &&
7365  host_thread_cut_line_weight_to_put_left(p) +
7366  weight_stolen_from_previous_part -
7367  std::abs(host_thread_cut_line_weight_to_put_left(p) +
7368  weight_stolen_from_previous_part - w)> this->sEpsilon)
7369  {
7370  host_thread_cut_line_weight_to_put_left(p) -= w;
7371 
7372  sort_vector_points_on_cut[mapped_cut].pop_back();
7373 
7374  ++host_thread_point_counts(p);
7375  host_assigned_part_ids(i) = p;
7376  // if putting this weight to left overweights the left cut, then
7377  // increase the space for the next cut using
7378  // weight_stolen_from_previous_part.
7379  if(p < no_cuts - 1 &&
7380  host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7381  if(mapped_cut == cut_map[p + 1] ) {
7382  // if the cut before the cut indexed at p was also at the same
7383  // position special case, as we handle the weight differently here.
7384  if(previous_cut_map != mapped_cut) {
7385  weight_stolen_from_previous_part =
7386  host_thread_cut_line_weight_to_put_left(p);
7387  }
7388  else {
7389  // if the cut before the cut indexed at p was also at the same
7390  // position we assign extra weights cumulatively in this case.
7391  weight_stolen_from_previous_part +=
7392  host_thread_cut_line_weight_to_put_left(p);
7393  }
7394  }
7395  else{
7396  weight_stolen_from_previous_part =
7397  -host_thread_cut_line_weight_to_put_left(p);
7398  }
7399  // end assignment for part p
7400  break;
7401  }
7402  } else {
7403  // if part p does not have enough space for this point
7404  // and if there is another cut sharing the same positon,
7405  // again increase the space for the next
7406  if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7407  if(previous_cut_map != mapped_cut) {
7408  weight_stolen_from_previous_part =
7409  host_thread_cut_line_weight_to_put_left(p);
7410  }
7411  else {
7412  weight_stolen_from_previous_part +=
7413  host_thread_cut_line_weight_to_put_left(p);
7414  }
7415  }
7416  else{
7417  weight_stolen_from_previous_part =
7418  -host_thread_cut_line_weight_to_put_left(p);
7419  }
7420  // end assignment for part p
7421  break;
7422  }
7423  }
7424  previous_cut_map = mapped_cut;
7425  }
7426 
7427  // TODO commented for reverse order
7428  // put everything left on the last cut to the last part.
7429  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7430  previous_cut_map].size() - 1;
7431 
7432  // mj_lno_t sort_vector_begin= 0;
7433  // mj_lno_t sort_vector_size = (mj_lno_t)
7434  // sort_vector_points_on_cut[previous_cut_map].size();
7435  // TODO commented for reverse order
7436  for(; sort_vector_end >= 0; --sort_vector_end) {
7437  // TODO commented for reverse order
7438  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7439  // multiSItem t =
7440  // sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
7441  mj_lno_t i = t.index;
7442  ++host_thread_point_counts(no_cuts);
7443  host_assigned_part_ids(i) = no_cuts;
7444  }
7445 
7446  sort_vector_points_on_cut[previous_cut_map].clear();
7447  delete [] cut_map;
7448 
7449  //free the memory allocated for vertex sort items .
7450  mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7451  for(mj_lno_t i = 0; i < vSize; ++i) {
7452  delete [] allocated_memory[i];
7453  }
7454 
7455  auto local_out_part_xadj = out_part_xadj;
7456  auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7457  Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7458 
7459  // creation of part_xadj as in usual case.
7460  for(mj_part_t j = 0; j < num_parts; ++j) {
7461  host_out_part_xadj(j) = host_thread_point_counts(j);
7462  host_thread_point_counts(j) = 0;
7463  }
7464 
7465  // perform prefix sum for num_points in parts.
7466  for(mj_part_t j = 1; j < num_parts; ++j) {
7467  host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7468  }
7469 
7470  // shift the num points in threads thread to obtain the
7471  // beginning index of each thread's private space.
7472  for(mj_part_t j = 1; j < num_parts; ++j) {
7473  host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7474  }
7475 
7476  auto host_new_coordinate_permutations =
7477  Kokkos::create_mirror_view(new_coordinate_permutations);
7478  Kokkos::deep_copy(host_new_coordinate_permutations,
7479  new_coordinate_permutations);
7480 
7481  // now thread gets the coordinate and writes the index of coordinate to
7482  // the permutation array using the part index we calculated.
7483  for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7484  mj_lno_t i = host_coordinate_permutations(ii);
7485  mj_part_t p = host_assigned_part_ids(i);
7486  host_new_coordinate_permutations(coordinate_begin +
7487  host_thread_point_counts(p)++) = i;
7488  }
7489 
7490  Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7491  Kokkos::deep_copy(new_coordinate_permutations,
7492  host_new_coordinate_permutations);
7493  Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7494 }
7495 
// -----------------------------------------------------------------------------
// set_final_parts:
// Writes the final global part id of every local coordinate into
// assigned_part_ids, and — when the data was migrated during partitioning —
// ships both the global ids (current_mj_gnos) and the part ids back to the
// ranks that originally owned each coordinate (owner_of_coordinate).
//
// Parameters (as visible here):
//   current_num_parts        - number of parts produced on this rank's data.
//   output_part_begin_index  - global offset added to the local part index so
//                              that part ids are globally numbered.
//   output_part_boxes        - per-part bounding boxes; tagged with their
//                              global part id when mj_keep_part_boxes is set.
//   is_data_ever_migrated    - true if coordinates were moved between ranks at
//                              any point; triggers the reverse communication.
//
// NOTE(review): the listing below retains doxygen line numbers (7505..7687)
// from the extracted source; they are artifacts, not code.
// -----------------------------------------------------------------------------
7505 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7506  typename mj_part_t, typename mj_node_t>
7507 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7508  set_final_parts(
7509  mj_part_t current_num_parts,
7510  mj_part_t output_part_begin_index,
7511  RCP<mj_partBoxVector_t> &output_part_boxes,
7512  bool is_data_ever_migrated)
7513 {
7514  this->mj_env->timerStart(MACRO_TIMERS,
7515  mj_timer_base_string + "Part_Assignment");
7516 
// Local copies of the member views: KOKKOS_LAMBDA below must not capture
// `this` (capturing the host pointer would be invalid in device code).
7517  auto local_part_xadj = part_xadj;
7518  auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7519  auto local_coordinate_permutations = coordinate_permutations;
7520  auto local_assigned_part_ids = assigned_part_ids;
7521 
// Stamp each kept part box with its final global part id.
7522  if(local_mj_keep_part_boxes) {
7523  for(int i = 0; i < current_num_parts; ++i) {
7524  (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7525  }
7526  }
7527 
// One team per part; each team walks that part's slice of the permutation
// array [part_xadj(i-1), part_xadj(i)) and records the globally-numbered
// part id for every coordinate in the slice.
7528  Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7529  current_num_parts, Kokkos::AUTO());
7530  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7531  member_type member_type;
7532  Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7533  int i = team_member.league_rank();
7534  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7535  local_part_xadj(i-1) : 0, local_part_xadj(i)),
7536  [=] (mj_lno_t ii) {
7537  mj_lno_t k = local_coordinate_permutations(ii);
7538  local_assigned_part_ids(k) = i + output_part_begin_index;
7539  });
7540  });
7541 
// If coordinates were migrated, return the results to the original owners.
// Two communication paths: the legacy Zoltan_Comm plan (only valid while the
// local ordinal fits in an int) and the Tpetra::Distributor fallback.
7542  if(is_data_ever_migrated) {
7543 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7544  if(sizeof(mj_lno_t) <= sizeof(int)) {
7545 
7546  // Cannot use Zoltan_Comm with local ordinals larger than ints.
7547  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
7548  // may overflow.
7549 
7550  // if data is migrated, then send part numbers to the original owners.
7551  ZOLTAN_COMM_OBJ *plan = NULL;
7552  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7553 
7554  int incoming = 0;
// Arbitrary base tag; incremented before each Zoltan_Comm_Do so the two
// exchanges (gnos, then part ids) use distinct tags.
7555  int message_tag = 7856;
7556 
7557  this->mj_env->timerStart(MACRO_TIMERS,
7558  mj_timer_base_string + "Final Z1PlanCreating");
7559 
7560  // setup incoming count
7561  int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
7562  this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7563 
7564  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7565  this->mj_env->timerStop(MACRO_TIMERS,
7566  mj_timer_base_string + "Final Z1PlanCreating" );
7567 
7568  this->mj_env->timerStart(MACRO_TIMERS,
7569  mj_timer_base_string + "Final Z1PlanComm");
7570 
7571  // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
7572 
7573  // migrate gnos to actual owners.
7574  auto host_current_mj_gnos = Kokkos::create_mirror_view(
7575  Kokkos::HostSpace(), this->current_mj_gnos);
7576  deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7577  Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7578  Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), incoming);
7579  auto host_dst_gnos = Kokkos::create_mirror_view(
7580  Kokkos::HostSpace(), dst_gnos);
7581  message_tag++;
7582  ierr = Zoltan_Comm_Do( plan, message_tag,
7583  (char *) host_current_mj_gnos.data(),
7584  sizeof(mj_gno_t), (char *) host_dst_gnos.data());
7585  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7586  Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7587  this->current_mj_gnos = dst_gnos;
7588 
7589  // migrate part ids to actual owners.
7590  auto host_src_part_ids = Kokkos::create_mirror_view(
7591  Kokkos::HostSpace(), this->assigned_part_ids);
7592  deep_copy(host_src_part_ids, this->assigned_part_ids);
7593  Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7594  Kokkos::ViewAllocateWithoutInitializing("dst_part_ids"), incoming);
7595  auto host_dst_part_ids = Kokkos::create_mirror_view(
7596  Kokkos::HostSpace(), dst_part_ids);
7597  message_tag++;
7598  ierr = Zoltan_Comm_Do( plan, message_tag,
7599  (char *) host_src_part_ids.data(),
7600  sizeof(mj_part_t), (char *) host_dst_part_ids.data());
7601  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7602  Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7603  this->assigned_part_ids = dst_part_ids;
7604 
7605  ierr = Zoltan_Comm_Destroy(&plan);
7606  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7607 
// After the reverse exchange this rank again holds exactly the
// coordinates it originally owned.
7608  this->num_local_coords = incoming;
7609 
7610  this->mj_env->timerStop(MACRO_TIMERS,
7611  mj_timer_base_string + "Final Z1PlanComm");
7612  }
7613  else
7614 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7615  {
7616  // setup incoming count
7617  this->mj_env->timerStart(MACRO_TIMERS,
7618  mj_timer_base_string + "Final DistributorPlanCreating");
7619  Tpetra::Distributor distributor(this->mj_problemComm);
7620  ArrayView<const mj_part_t> owners_of_coords(
7621  this->owner_of_coordinate.data(), this->num_local_coords);
7622  mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7623  this->mj_env->timerStop(MACRO_TIMERS,
7624  mj_timer_base_string + "Final DistributorPlanCreating" );
7625 
7626  this->mj_env->timerStart(MACRO_TIMERS,
7627  mj_timer_base_string + "Final DistributorPlanComm");
7628 
7629  // MPI buffers should be Kokkos::HostSpace, not Kokkos::CudaUVMSpace
7630 
7631  // migrate gnos to actual owners.
7632  auto src_host_current_mj_gnos =
7633  Kokkos::create_mirror_view(Kokkos::HostSpace(), this->current_mj_gnos);
7634  Kokkos::deep_copy(src_host_current_mj_gnos, this->current_mj_gnos);
7635  ArrayRCP<mj_gno_t> received_gnos(incoming);
7636  ArrayView<mj_gno_t> sent_gnos(src_host_current_mj_gnos.data(),
7637  this->num_local_coords);
7638  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
// Reallocate the device view at the new local size, then copy the
// received host data into it via a host mirror.
7639  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7640  Kokkos::ViewAllocateWithoutInitializing("current_mj_gnos"), incoming);
7641  auto host_current_mj_gnos = Kokkos::create_mirror_view(
7642  this->current_mj_gnos);
7643  memcpy(host_current_mj_gnos.data(),
7644  received_gnos.getRawPtr(), incoming * sizeof(mj_gno_t));
7645  Kokkos::deep_copy(this->current_mj_gnos, host_current_mj_gnos);
7646 
7647  // migrate part ids to actual owners.
7648  auto src_host_assigned_part_ids =
7649  Kokkos::create_mirror_view(Kokkos::HostSpace(), this->assigned_part_ids);
7650  Kokkos::deep_copy(src_host_assigned_part_ids, this->assigned_part_ids);
7651  ArrayView<mj_part_t> sent_partids(src_host_assigned_part_ids.data(),
7652  this->num_local_coords);
7653  ArrayRCP<mj_part_t> received_partids(incoming);
7654  distributor.doPostsAndWaits<mj_part_t>(
7655  sent_partids, 1, received_partids());
7656  this->assigned_part_ids =
7657  Kokkos::View<mj_part_t*, device_t>(
7658  Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
7659  incoming);
7660  auto host_assigned_part_ids = Kokkos::create_mirror_view(
7661  this->assigned_part_ids);
7662  memcpy( host_assigned_part_ids.data(),
7663  received_partids.getRawPtr(), incoming * sizeof(mj_part_t));
7664  deep_copy(this->assigned_part_ids, host_assigned_part_ids);
7665  this->num_local_coords = incoming;
7666 
7667  this->mj_env->timerStop(MACRO_TIMERS,
7668  mj_timer_base_string + "Final DistributorPlanComm");
7669  }
7670  }
7671 
7672  this->mj_env->timerStop(MACRO_TIMERS,
7673  mj_timer_base_string + "Part_Assignment");
7674 
7675  this->mj_env->timerStart(MACRO_TIMERS,
7676  mj_timer_base_string + "Solution_Part_Assignment");
7677 
7678  // ArrayRCP<mj_part_t> partId;
7679  // partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
7680 
// Global reduction of the per-part bounding boxes across all ranks.
7681  if(this->mj_keep_part_boxes) {
7682  this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7683  }
7684 
7685  this->mj_env->timerStop(MACRO_TIMERS,
7686  mj_timer_base_string + "Solution_Part_Assignment");
7687 }
7688 
// -----------------------------------------------------------------------------
// Simple setter: copies the user-supplied partitioning knobs into the
// corresponding AlgMJ members, one-to-one, with no validation or side effects.
//   distribute_points_on_cut_lines_       - whether points exactly on a cut
//                                           may be split between both sides.
//   max_concurrent_part_calculation_      - how many parts are partitioned
//                                           concurrently in one pass.
//   check_migrate_avoid_migration_option_ - migration-check strategy selector.
//   minimum_migration_imbalance_          - imbalance threshold below which
//                                           migration is skipped.
//   migration_type_                       - migration algorithm selector.
// NOTE(review): the doxygen extraction dropped original lines 7703-7704,
// which presumably carried the `AlgMJ<...>::set_partitioning_parameters(`
// signature — confirm against the repository source.
// -----------------------------------------------------------------------------
7701 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7702  typename mj_part_t, typename mj_node_t>
7705  bool distribute_points_on_cut_lines_,
7706  int max_concurrent_part_calculation_,
7707  int check_migrate_avoid_migration_option_,
7708  double minimum_migration_imbalance_,
7709  int migration_type_)
7710 {
7711  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7712  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7713  this->check_migrate_avoid_migration_option =
7714  check_migrate_avoid_migration_option_;
7715  this->minimum_migration_imbalance = minimum_migration_imbalance_;
7716  this->migration_type = migration_type_;
7717 }
7718 
7746 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7747  typename mj_part_t, typename mj_node_t>
7750  const RCP<const Environment> &env,
7751  RCP<const Comm<int> > &problemComm,
7752  double imbalance_tolerance_,
7753  int num_teams_,
7754  size_t num_global_parts_,
7755  Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7756  int recursion_depth_,
7757  int coord_dim_,
7758  mj_lno_t num_local_coords_,
7759  mj_gno_t num_global_coords_,
7760  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7761  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
7762  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7763  int num_weights_per_coord_,
7764  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7765  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7766  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7767  Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7768  Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7769 {
7770 
7771  // see comment above for Zoltan2_AlgMJ_TrackCallsCounter
7773  this->mj_timer_base_string = "MJ(" + std::to_string(execute_counter) + ") - ";
7774 
7775  this->mj_env = env;
7776  this->mj_problemComm = problemComm;
7777  this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7778  this->mj_env->timerStart(MACRO_TIMERS,
7779  mj_timer_base_string + "Total");
7780  this->mj_env->debug(3, "In MultiJagged Jagged");
7781  this->imbalance_tolerance = imbalance_tolerance_;
7782  this->mj_num_teams = num_teams_;
7783  this->num_global_parts = num_global_parts_;
7784  this->part_no_array = part_no_array_;
7785  this->recursion_depth = recursion_depth_;
7786  this->coord_dim = coord_dim_;
7787  this->num_local_coords = num_local_coords_;
7788  this->num_global_coords = num_global_coords_;
7789  this->mj_coordinates = mj_coordinates_;
7790  this->initial_mj_gnos = initial_mj_gnos_;
7791  this->num_weights_per_coord = num_weights_per_coord_;
7792  this->mj_uniform_weights = mj_uniform_weights_;
7793  this->mj_weights = mj_weights_;
7794  this->mj_uniform_parts = mj_uniform_parts_;
7795 
7796  // this->set_input_data();
7797 
7798  this->set_part_specifications();
7799 
7800  this->mj_env->timerStart(MACRO_TIMERS,
7801  mj_timer_base_string + "Allocate Views");
7802  this->allocate_set_work_memory();
7803  this->mj_env->timerStop(MACRO_TIMERS,
7804  mj_timer_base_string + "Allocate Views");
7805 
7806  // We duplicate the comm as we create subcommunicators during migration.
7807  // We keep the problemComm as it is, while comm changes after each migration.
7808  this->comm = this->mj_problemComm->duplicate();
7809 
7810 #ifdef print_debug
7811  if(comm->getRank() == 0) {
7812  std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
7813  std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
7814  std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
7815  }
7816 #endif
7817 
7818  // initially there is a single partition
7819  mj_part_t current_num_parts = 1;
7820  Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7821  this->all_cut_coordinates;
7822  this->mj_env->timerStart(MACRO_TIMERS,
7823  mj_timer_base_string + "Problem_Partitioning");
7824  mj_part_t output_part_begin_index = 0;
7825  mj_part_t future_num_parts = this->total_num_part;
7826  bool is_data_ever_migrated = false;
7827 
7828  std::vector<mj_part_t> *future_num_part_in_parts =
7829  new std::vector<mj_part_t> ();
7830  std::vector<mj_part_t> *next_future_num_parts_in_parts =
7831  new std::vector<mj_part_t> ();
7832 
7833  next_future_num_parts_in_parts->push_back(this->num_global_parts);
7834 
7835  RCP<mj_partBoxVector_t> input_part_boxes;
7836  RCP<mj_partBoxVector_t> output_part_boxes;
7837 
7838  if(this->mj_keep_part_boxes) {
7839  input_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7840  output_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7841  compute_global_box();
7842  this->init_part_boxes(output_part_boxes);
7843  }
7844 
7845  auto local_part_xadj = this->part_xadj;
7846 
7847  // Need a device counter - how best to allocate?
7848  // Putting this allocation in the loops is very costly so moved out here.
7849  Kokkos::View<mj_part_t*, device_t>
7850  view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
7851  Kokkos::View<size_t*, device_t>
7852  view_total_reduction_size("view_total_reduction_size", 1);
7853 
7854  for(int i = 0; i < this->recursion_depth; ++i) {
7855 
7856  // convert i to string to be used for debugging purposes.
7857  std::string istring = std::to_string(i);
7858 
7859  // next_future_num_parts_in_parts will be as the size of outnumParts,
7860  // and this will hold how many more parts that each output part
7861  // should be divided. this array will also be used to determine the weight
7862  // ratios of the parts. swap the arrays to use iteratively.
7863  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7864  future_num_part_in_parts = next_future_num_parts_in_parts;
7865  next_future_num_parts_in_parts = tmpPartVect;
7866 
7867  // clear next_future_num_parts_in_parts array as
7868  // getPartitionArrays expects it to be empty.
7869  next_future_num_parts_in_parts->clear();
7870  if(this->mj_keep_part_boxes) {
7871  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7872  input_part_boxes = output_part_boxes;
7873  output_part_boxes = tmpPartBoxes;
7874  output_part_boxes->clear();
7875  }
7876 
7877  // returns the total no. of output parts for this dimension partitioning.
7878  mj_part_t output_part_count_in_dimension =
7879  this->update_part_num_arrays(
7880  future_num_part_in_parts,
7881  next_future_num_parts_in_parts,
7882  future_num_parts,
7883  current_num_parts,
7884  i,
7885  input_part_boxes,
7886  output_part_boxes, 1);
7887 
7888  // if the number of obtained parts equal to current number of parts,
7889  // skip this dimension. For example, this happens when 1 is given in the
7890  // input part array is given. P=4,5,1,2
7891  if(output_part_count_in_dimension == current_num_parts) {
7892  //still need to swap the input output arrays.
7893  tmpPartVect= future_num_part_in_parts;
7894  future_num_part_in_parts = next_future_num_parts_in_parts;
7895  next_future_num_parts_in_parts = tmpPartVect;
7896 
7897  if(this->mj_keep_part_boxes) {
7898  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7899  input_part_boxes = output_part_boxes;
7900  output_part_boxes = tmpPartBoxes;
7901  }
7902  continue;
7903  }
7904 
7905  // get the coordinate axis along which the partitioning will be done.
7906  int coordInd = i % this->coord_dim;
7907 
7908  Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7909  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7910 
7911  this->mj_env->timerStart(MACRO_TIMERS,
7912  mj_timer_base_string + "Problem_Partitioning_" + istring);
7913 
7914  // alloc Memory to point the indices
7915  // of the parts in the permutation array.
7916  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7917  "new part xadj", output_part_count_in_dimension);
7918 
7919  // the index where in the new_part_xadj will be written.
7920  mj_part_t output_part_index = 0;
7921 
7922  // whatever is written to output_part_index will be added with
7923  // output_coordinate_end_index so that the points will be shifted.
7924  mj_part_t output_coordinate_end_index = 0;
7925 
7926  mj_part_t current_work_part = 0;
7927  mj_part_t current_concurrent_num_parts =
7928  std::min(current_num_parts - current_work_part,
7929  this->max_concurrent_part_calculation);
7930 
7931  mj_part_t obtained_part_index = 0;
7932 
7933  auto host_process_local_min_max_coord_total_weight =
7934  Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7935  auto host_global_min_max_coord_total_weight =
7936  Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7937 
7938  // run for all available parts.
7939  for(; current_work_part < current_num_parts;
7941 
7943  std::min(current_num_parts - current_work_part,
7944  this->max_concurrent_part_calculation);
7945 
7946  int bDoingWork_int; // Can't reduce on bool so use int
7947  auto local_device_num_partitioning_in_current_dim =
7948  device_num_partitioning_in_current_dim;
7949  Kokkos::parallel_reduce("Read bDoingWork",
7950  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7951  KOKKOS_LAMBDA(int dummy, int & set_single) {
7952  set_single = 0;
7953  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7954  if(local_device_num_partitioning_in_current_dim(
7955  current_work_part + kk) != 1) {
7956  set_single = 1;
7957  break;
7958  }
7959  }
7960  }, bDoingWork_int);
7961  bool bDoingWork = (bDoingWork_int != 0) ? true : false;
7962 
7963  this->mj_get_local_min_max_coord_totW(
7966  mj_current_dim_coords);
7967 
7968  // 1D partitioning
7969  if(bDoingWork) {
7970  // obtain global Min max of the part.
7971  this->mj_get_global_min_max_coord_totW(
7973  this->process_local_min_max_coord_total_weight,
7974  this->global_min_max_coord_total_weight);
7975 
7976  // represents the total number of cutlines
7977  // whose coordinate should be determined.
7978  mj_part_t total_incomplete_cut_count = 0;
7979 
7980  // Compute weight ratios for parts & cuts:
7981  // e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
7982  // part0 cut0 part1 cut1 part2 cut2 part3
7983  mj_part_t concurrent_part_cut_shift = 0;
7984  mj_part_t concurrent_part_part_shift = 0;
7985 
7986  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7987 
7988  Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7989  global_min_max_coord_total_weight);
7990 
7991  mj_scalar_t min_coordinate =
7992  host_global_min_max_coord_total_weight(kk);
7993  mj_scalar_t max_coordinate =
7994  host_global_min_max_coord_total_weight(
7996 
7997  mj_scalar_t global_total_weight =
7998  host_global_min_max_coord_total_weight(
7999  kk + 2 * current_concurrent_num_parts);
8000 
8001  mj_part_t concurrent_current_part_index = current_work_part + kk;
8002 
8003  mj_part_t partition_count = host_num_partitioning_in_current_dim(
8004  concurrent_current_part_index);
8005 
8006  Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
8007  Kokkos::subview(current_cut_coordinates,
8008  std::pair<mj_lno_t, mj_lno_t>(
8009  concurrent_part_cut_shift, current_cut_coordinates.size()));
8010  Kokkos::View<mj_scalar_t *, device_t>
8011  current_target_part_weights =
8012  Kokkos::subview(target_part_weights,
8013  std::pair<mj_lno_t, mj_lno_t>(
8014  concurrent_part_part_shift, target_part_weights.size()));
8015 
8016  // shift the usedCutCoordinate array as noCuts.
8017  concurrent_part_cut_shift += partition_count - 1;
8018  // shift the partRatio array as noParts.
8019  concurrent_part_part_shift += partition_count;
8020 
8021  // calculate only if part is not empty,
8022  // and part will be further partitioned.
8023  if(partition_count > 1 && min_coordinate <= max_coordinate) {
8024 
8025  // increase num_cuts_do_be_determined by the number of cuts of the
8026  // current part's cut line number.
8027  total_incomplete_cut_count += partition_count - 1;
8028 
8029  this->incomplete_cut_count(kk) = partition_count - 1;
8030 
8031  // get the target weights of the parts
8032  this->mj_get_initial_cut_coords_target_weights(
8033  min_coordinate,
8034  max_coordinate,
8035  partition_count - 1,
8036  global_total_weight,
8037  usedCutCoordinate,
8038  current_target_part_weights,
8039  future_num_part_in_parts,
8040  next_future_num_parts_in_parts,
8041  concurrent_current_part_index,
8042  obtained_part_index);
8043 
8044  mj_lno_t coordinate_end_index =
8045  host_part_xadj(concurrent_current_part_index);
8046  mj_lno_t coordinate_begin_index =
8047  concurrent_current_part_index==0 ? 0 :
8048  host_part_xadj(concurrent_current_part_index - 1);
8049 
8050  this->set_initial_coordinate_parts(
8051  max_coordinate,
8052  min_coordinate,
8053  coordinate_begin_index, coordinate_end_index,
8054  this->coordinate_permutations,
8055  mj_current_dim_coords,
8056  this->assigned_part_ids,
8057  partition_count);
8058  }
8059  else {
8060  // e.g., if have fewer coordinates than parts, don't need to do
8061  // next dim.
8062  this->incomplete_cut_count(kk) = 0;
8063  }
8064 
8065  obtained_part_index += partition_count;
8066  }
8067 
8068  // used imbalance, it is always 0, as it is difficult to
8069  // estimate a range.
8070  double used_imbalance = 0;
8071  // Determine cut lines for all concurrent parts parts here.
8072  this->mj_env->timerStart(MACRO_TIMERS,
8073  mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8074 
8075  this->mj_1D_part(
8076  mj_current_dim_coords,
8077  used_imbalance,
8080  current_cut_coordinates,
8081  total_incomplete_cut_count,
8082  view_rectilinear_cut_count,
8083  view_total_reduction_size);
8084 
8085  this->mj_env->timerStop(MACRO_TIMERS,
8086  mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8087  }
8088 
8089  // create new part chunks
8090  {
8091  mj_part_t output_array_shift = 0;
8092  mj_part_t cut_shift = 0;
8093  size_t tlr_shift = 0;
8094  size_t partweight_array_shift = 0;
8095  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8096 
8097  mj_part_t current_concurrent_work_part = current_work_part + kk;
8098 
8099  mj_part_t num_parts = host_num_partitioning_in_current_dim(
8100  current_concurrent_work_part);
8101 
8102  // if the part is empty, skip the part.
8103  int coordinateA_bigger_than_coordinateB =
8104  host_global_min_max_coord_total_weight(kk) >
8105  host_global_min_max_coord_total_weight(
8107 
8108  if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8109  // we still need to write the begin and end point of the empty part.
8110  // simply set it zero, the array indices will be shifted later
8111  auto local_new_part_xadj = this->new_part_xadj;
8112  Kokkos::parallel_for(
8113  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8114  (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8115  local_new_part_xadj(
8116  output_part_index + output_array_shift + jj) = 0;
8117  });
8118 
8119  cut_shift += num_parts - 1;
8120  tlr_shift += (4 *(num_parts - 1) + 1);
8121  output_array_shift += num_parts;
8122  partweight_array_shift += (2 * (num_parts - 1) + 1);
8123  continue;
8124  }
8125 
8126  Kokkos::View<mj_scalar_t *, device_t>
8127  current_concurrent_cut_coordinate =
8128  Kokkos::subview(current_cut_coordinates,
8129  std::pair<mj_lno_t, mj_lno_t>(
8130  cut_shift,
8131  current_cut_coordinates.size()));
8132  Kokkos::View<mj_scalar_t *, device_t>
8133  used_local_cut_line_weight_to_left =
8134  Kokkos::subview(process_cut_line_weight_to_put_left,
8135  std::pair<mj_lno_t, mj_lno_t>(
8136  cut_shift,
8137  process_cut_line_weight_to_put_left.size()));
8138 
8139  this->thread_part_weight_work =
8140  Kokkos::subview(
8141  this->thread_part_weights,
8142  std::pair<mj_lno_t, mj_lno_t>(
8143  partweight_array_shift,
8144  this->thread_part_weights.extent(0)));
8145 
8146  if(num_parts > 1) {
8147  if(this->mj_keep_part_boxes) {
8148  // if part boxes are to be stored update the boundaries.
8149  for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8150  mj_scalar_t temp_get_val;
8151  Kokkos::parallel_reduce("Read single",
8152  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8153  KOKKOS_LAMBDA(int dummy, mj_scalar_t & set_single) {
8154  set_single = current_concurrent_cut_coordinate(j);
8155  }, temp_get_val);
8156  (*output_part_boxes)
8157  [output_array_shift + output_part_index + j].
8158  updateMinMax(temp_get_val, 1 /*update max*/, coordInd);
8159  (*output_part_boxes)
8160  [output_array_shift + output_part_index + j + 1].
8161  updateMinMax(temp_get_val, 0 /*update max*/, coordInd);
8162  }
8163  }
8164 
8165  // Rewrite the indices based on the computed cuts.
8166  Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8167  Kokkos::subview(this->new_part_xadj,
8168  std::pair<mj_lno_t, mj_lno_t>(
8169  output_part_index + output_array_shift,
8170  this->new_part_xadj.size()));
8171 
8172  this->mj_create_new_partitions(
8173  num_parts,
8174  current_concurrent_work_part,
8175  mj_current_dim_coords,
8176  current_concurrent_cut_coordinate,
8177  used_local_cut_line_weight_to_left,
8178  sub_new_part_xadj);
8179  }
8180  else {
8181 
8182  mj_lno_t coordinate_end = host_part_xadj(
8183  current_concurrent_work_part);
8184  mj_lno_t coordinate_begin =
8185  current_concurrent_work_part==0 ? 0 : host_part_xadj(
8186  current_concurrent_work_part - 1);
8187 
8188  // if this part is partitioned into 1 then just copy
8189  // the old values.
8190  mj_lno_t part_size = coordinate_end - coordinate_begin;
8191 
8192  // Awkward here to set one value - need some broader
8193  // refactoring to improve this one.
8194  auto local_new_part_xadj = this->new_part_xadj;
8195  Kokkos::parallel_for(
8196  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8197  (0, 1), KOKKOS_LAMBDA (int dummy) {
8198  local_new_part_xadj(
8199  output_part_index + output_array_shift) = part_size;
8200  });
8201 
8202  auto subview_new_coordinate_permutations =
8203  Kokkos::subview(this->new_coordinate_permutations,
8204  std::pair<mj_lno_t, mj_lno_t>(
8205  coordinate_begin,
8206  coordinate_begin + part_size));
8207  auto subview_coordinate_permutations =
8208  Kokkos::subview(this->coordinate_permutations,
8209  std::pair<mj_lno_t, mj_lno_t>(
8210  coordinate_begin,
8211  coordinate_begin + part_size));
8212  Kokkos::deep_copy(subview_new_coordinate_permutations,
8213  subview_coordinate_permutations);
8214  }
8215  cut_shift += num_parts - 1;
8216  output_array_shift += num_parts;
8217  partweight_array_shift += (2 * (num_parts - 1) + 1);
8218  }
8219 
8220  // shift cut coordinates so that all cut coordinates are stored.
8221  // no shift now because we dont keep the cuts.
8222  // current_cut_coordinates += cut_shift;
8223  // mj_create_new_partitions from coordinates partitioned the parts
8224  // and write the indices as if there were a single part.
8225  // now we need to shift the beginning indices.
8226  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8227  mj_part_t num_parts =
8228  host_num_partitioning_in_current_dim(current_work_part + kk);
8229 
8230  // These two kernels are a bit awkward but need broader redesign to
8231  // avoid this situation.
8232  auto local_new_part_xadj = this->new_part_xadj;
8233  Kokkos::parallel_for(
8234  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8235  (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8236  local_new_part_xadj(output_part_index+ii) +=
8237  output_coordinate_end_index;
8238  });
8239 
8240  // increase the previous count by current end.
8241  mj_part_t temp_get;
8242  Kokkos::parallel_reduce("Read single",
8243  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8244  KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
8245  set_single =
8246  local_new_part_xadj(output_part_index + num_parts - 1);
8247  }, temp_get);
8248  output_coordinate_end_index = temp_get;
8249  //increase the current out.
8250  output_part_index += num_parts;
8251  }
8252  }
8253  }
8254 
8255  // end of this partitioning dimension
8256  int current_world_size = this->comm->getSize();
8257  long migration_reduce_all_population =
8258  this->total_dim_num_reduce_all * current_world_size;
8259  bool is_migrated_in_current_dimension = false;
8260 
8261  // we migrate if there are more partitionings to be done after this step
8262  // and if the migration is not forced to be avoided.
8263  // and the operation is not sequential.
8264  if(future_num_parts > 1 &&
8265  this->check_migrate_avoid_migration_option >= 0 &&
8266  current_world_size > 1) {
8267  this->mj_env->timerStart(MACRO_TIMERS,
8268  mj_timer_base_string + "Problem_Migration-" + istring);
8269  mj_part_t num_parts = output_part_count_in_dimension;
8270 
8271  if(this->mj_perform_migration(
8272  num_parts,
8273  current_num_parts, //output
8274  next_future_num_parts_in_parts, //output
8275  output_part_begin_index,
8276  migration_reduce_all_population,
8277  this->num_global_coords / (future_num_parts * current_num_parts),
8278  istring,
8279  input_part_boxes, output_part_boxes) )
8280  {
8281  is_migrated_in_current_dimension = true;
8282  is_data_ever_migrated = true;
8283  this->mj_env->timerStop(MACRO_TIMERS,
8284  mj_timer_base_string + "Problem_Migration-" + istring);
8285  // since data is migrated, we reduce the number of reduceAll
8286  // operations for the last part.
8287  this->total_dim_num_reduce_all /= num_parts;
8288  }
8289  else {
8290  is_migrated_in_current_dimension = false;
8291  this->mj_env->timerStop(MACRO_TIMERS,
8292  mj_timer_base_string + "Problem_Migration-" + istring);
8293  }
8294  }
8295 
8296  // swap the coordinate permutations for the next dimension.
8297  Kokkos::View<mj_lno_t*, device_t> tmp =
8298  this->coordinate_permutations;
8299  this->coordinate_permutations =
8300  this->new_coordinate_permutations;
8301 
8302  this->new_coordinate_permutations = tmp;
8303  if(!is_migrated_in_current_dimension) {
8304  this->total_dim_num_reduce_all -= current_num_parts;
8305  current_num_parts = output_part_count_in_dimension;
8306  }
8307 
8308  {
8309  this->part_xadj = this->new_part_xadj;
8310  local_part_xadj = this->new_part_xadj;
8311  this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8312  Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
8313 
8314  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
8315  this->mj_env->timerStop(MACRO_TIMERS,
8316  mj_timer_base_string + "Problem_Partitioning_" + istring);
8317  }
8318  }
8319 
8320  // Partitioning is done
8321  delete future_num_part_in_parts;
8322  delete next_future_num_parts_in_parts;
8323  this->mj_env->timerStop(MACRO_TIMERS,
8324  mj_timer_base_string + "Problem_Partitioning");
8326 
8327  //get the final parts of each initial coordinate
8328  //the results will be written to
8329  //this->assigned_part_ids for gnos given in this->current_mj_gnos
8330  this->set_final_parts(
8331  current_num_parts,
8332  output_part_begin_index,
8333  output_part_boxes,
8334  is_data_ever_migrated);
8335 
8336  result_assigned_part_ids_ = this->assigned_part_ids;
8337  result_mj_gnos_ = this->current_mj_gnos;
8338  this->mj_env->timerStop(MACRO_TIMERS,
8339  mj_timer_base_string + "Total");
8340  this->mj_env->debug(3, "Out of MultiJagged");
8341 }
8342 
8343 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8344  typename mj_part_t, typename mj_node_t>
8345 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8346  mj_partBoxVector_t>
8349 {
8350  if(this->mj_keep_part_boxes) {
8351  return this->kept_boxes;
8352  }
8353  else {
8354  throw std::logic_error("Error: part boxes are not stored.");
8355  }
8356 }
8357 
8358 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8359  typename mj_part_t, typename mj_node_t>
8360 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8361  mj_partBoxVector_t>
8363  compute_global_box_boundaries(RCP<mj_partBoxVector_t> &localPartBoxes) const
8364 {
8365  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
8366  mj_part_t ntasks = this->num_global_parts;
8367  int dim = (*localPartBoxes)[0].getDim();
8368  coord_t *localPartBoundaries = new coord_t[ntasks * 2 *dim];
8369 
8370  memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8371 
8372  coord_t *globalPartBoundaries = new coord_t[ntasks * 2 *dim];
8373  memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8374 
8375  coord_t *localPartMins = localPartBoundaries;
8376  coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8377 
8378  coord_t *globalPartMins = globalPartBoundaries;
8379  coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8380 
8381  mj_part_t boxCount = localPartBoxes->size();
8382  for(mj_part_t i = 0; i < boxCount; ++i) {
8383  mj_part_t pId = (*localPartBoxes)[i].getpId();
8384 
8385  // cout << "me:" << comm->getRank() << " has:" << pId << endl;
8386 
8387  coord_t *lmins = (*localPartBoxes)[i].getlmins();
8388  coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8389 
8390  for(int j = 0; j < dim; ++j) {
8391  localPartMins[dim * pId + j] = lmins[j];
8392  localPartMaxs[dim * pId + j] = lmaxs[j];
8393 
8394  /*
8395  std::cout << "me:" << comm->getRank() <<
8396  " dim * pId + j:"<< dim * pId + j <<
8397  " localMin:" << localPartMins[dim * pId + j] <<
8398  " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
8399  */
8400  }
8401  }
8402 
8403  Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 *dim);
8404 
8405  reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8406  ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8407 
8408  RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
8409  for(mj_part_t i = 0; i < ntasks; ++i) {
8411  globalPartMins + dim * i,
8412  globalPartMaxs + dim * i);
8413 
8414  /*
8415  for(int j = 0; j < dim; ++j) {
8416  std::cout << "me:" << comm->getRank() <<
8417  " dim * pId + j:"<< dim * i + j <<
8418  " globalMin:" << globalPartMins[dim * i + j] <<
8419  " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
8420  }
8421  */
8422 
8423  pB->push_back(tpb);
8424  }
8425  delete []localPartBoundaries;
8426  delete []globalPartBoundaries;
8427  //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
8428  return pB;
8429 }
8430 
8433 template <typename Adapter>
8434 class Zoltan2_AlgMJ : public Algorithm<Adapter>
8435 {
8436 
8437 private:
8438 
8439 #ifndef DOXYGEN_SHOULD_SKIP_THIS
8440  typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
8441 
8442  // For coordinates and weights, MJ needs floats or doubles
8443  // But Adapter can provide other scalars, e.g., ints.
8444  // So have separate scalar_t for MJ and adapter.
8445  typedef typename Adapter::scalar_t adapter_scalar_t;
8446 
8447  // Provide a default type for mj_scalar_t;
8448  typedef float default_mj_scalar_t;
8449 
8450  // If Adapter provided float or double scalar_t, use it (prevents copies).
8451  // Otherwise, use the default type of mj_scalar_t;
8452  typedef typename
8453  std::conditional<
8454  (std::is_same<adapter_scalar_t, float>::value ||
8455  std::is_same<adapter_scalar_t, double>::value),
8456  adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8457 
8458  typedef typename Adapter::gno_t mj_gno_t;
8459  typedef typename Adapter::lno_t mj_lno_t;
8460  typedef typename Adapter::part_t mj_part_t;
8461  typedef typename Adapter::node_t mj_node_t;
8462  typedef coordinateModelPartBox mj_partBox_t;
8463  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8464  typedef typename mj_node_t::device_type device_t;
8465 #endif
8466 
8468 
8469  RCP<const Environment> mj_env; // the environment object
8470  RCP<const Comm<int> > mj_problemComm; // initial comm object
8471  RCP<const coordinateModel_t> mj_coords; // coordinate adapter
8472 
8473  // PARAMETERS
8474  double imbalance_tolerance; // input imbalance tolerance.
8475 
8476  int num_teams; // how many teams to run main loop with
8477 
8478  size_t num_global_parts; // the targeted number of parts
8479 
8480  // input part array specifying num part to divide along each dim.
8481  Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8482 
8483  // the number of steps that partitioning will be solved in.
8484  int recursion_depth;
8485 
8486  int coord_dim; // coordinate dimension.
8487  mj_lno_t num_local_coords; //number of local coords.
8488  mj_gno_t num_global_coords; //number of global coords.
8489 
8490  // initial global ids of the coordinates.
8491  Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8492 
8493  // two dimension coordinate array.
8494  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8495  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8496  mj_coordinates;
8497 
8498  int num_weights_per_coord; // number of weights per coordinate
8499 
8500  // if the target parts are uniform.
8501  Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8502 
8503  // two dimensional weight array.
8504  Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8505 
8506  // if the target parts are uniform
8507  Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8508 
8509  // Nonuniform first level partitioning
8510  // Currently used for Dragonfly task mapping by partitioning Dragonfly RCA
8511  // machine coordinates and application coordinates.
8512  // An optimization that completely partitions the most important machine
8513  // dimension first (i.e. the Dragonfly group coordinate, or RCA's x
8514  // coordinate). The standard MJ alg follows after the nonuniform first level
8515  // partitioning.
8516  // If used, number of parts for the first level partitioning
8517  mj_part_t num_first_level_parts;
8518 
8519  // If used, the distribution of parts for the nonuniform
8520  // first level partitioning
8521  Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8522 
8523  // if partitioning can distribute points on same coordiante to
8524  // different parts.
8525  bool distribute_points_on_cut_lines;
8526 
8527  // how many parts we can calculate concurrently.
8528  mj_part_t max_concurrent_part_calculation;
8529 
8530  // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
8531  int check_migrate_avoid_migration_option;
8532 
8533  // when doing the migration, 0 will aim for perfect load-imbalance,
8534  int migration_type;
8535 
8536  // 1 for minimized messages
8537 
8538  // when MJ decides whether to migrate, the minimum imbalance for migration.
8539  double minimum_migration_imbalance;
8540  bool mj_keep_part_boxes; //if the boxes need to be kept.
8541 
8542  // if this is set, then recursion depth is adjusted to its maximum value.
8543  bool mj_run_as_rcb;
8544  int mj_premigration_option;
8545  int min_coord_per_rank_for_premigration;
8546 
8547  // communication graph xadj
8548  ArrayRCP<mj_part_t> comXAdj_;
8549 
8550  // communication graph adj.
8551  ArrayRCP<mj_part_t> comAdj_;
8552 
8553  void copy(
8554  const RCP<PartitioningSolution<Adapter> >&solution);
8555 
8556  void set_input_parameters(const Teuchos::ParameterList &p);
8557 
8558  RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
8559 
8560  bool mj_premigrate_to_subset(
8561  int used_num_ranks,
8562  int migration_selection_option,
8563  RCP<const Environment> mj_env_,
8564  RCP<const Comm<int> > mj_problemComm_,
8565  int coord_dim_,
8566  mj_lno_t num_local_coords_,
8567  mj_gno_t num_global_coords_, size_t num_global_parts_,
8568  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8569  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8570  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8571  mj_coordinates_,
8572  int num_weights_per_coord_,
8573  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8574  //results
8575  RCP<const Comm<int> > &result_problemComm_,
8576  mj_lno_t & result_num_local_coords_,
8577  Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8578  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8579  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8580  result_mj_coordinates_,
8581  Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8582  int * &result_actual_owner_rank_);
8583 
8584 public:
8585 
8586  Zoltan2_AlgMJ(const RCP<const Environment> &env,
8587  RCP<const Comm<int> > &problemComm,
8588  const RCP<const coordinateModel_t> &coords) :
8589  mj_partitioner(),
8590  mj_env(env),
8591  mj_problemComm(problemComm),
8592  mj_coords(coords),
8593  imbalance_tolerance(0),
8594  num_teams(0),
8595  num_global_parts(1),
8596  recursion_depth(0),
8597  coord_dim(0),
8598  num_local_coords(0),
8599  num_global_coords(0),
8600  num_weights_per_coord(0),
8601  num_first_level_parts(1),
8602  distribute_points_on_cut_lines(true),
8603  max_concurrent_part_calculation(1),
8604  check_migrate_avoid_migration_option(0),
8605  migration_type(0),
8606  minimum_migration_imbalance(0.30),
8607  mj_keep_part_boxes(false),
8608  mj_run_as_rcb(false),
8609  mj_premigration_option(0),
8610  min_coord_per_rank_for_premigration(32000),
8611  comXAdj_(),
8612  comAdj_()
8613  {
8614  }
8615 
8617  {
8618  }
8619 
8622  static void getValidParameters(ParameterList & pl)
8623  {
8624  const bool bUnsorted = true; // this clarifies the flag is for unsrorted
8625  RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8626  Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
8627  pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
8628  "algorithm. As many as the dimension count.", mj_parts_Validator);
8629 
8630  pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
8631  "coordinates will be calculated concurently.",
8633 
8634  pl.set("mj_minimum_migration_imbalance", 1.1,
8635  "mj_minimum_migration_imbalance, the minimum imbalance of the "
8636  "processors to avoid migration",
8638 
8639  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8640  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8641  pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
8642  "depending on the imbalance, 1 for forcing migration, 2 for "
8643  "avoiding migration", mj_migration_option_validator);
8644 
8645  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8646  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8647  pl.set("mj_migration_type", 0,
8648  "Migration type, 0 for migration to minimize the imbalance "
8649  "1 for migration to minimize messages exchanged the migration.",
8650  mj_migration_option_validator);
8651 
8652  // bool parameter
8653  pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
8654  "geometric partitioning.", Environment::getBoolValidator());
8655 
8656  // bool parameter
8657  pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
8659 
8660  pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
8661  "greater than 0.", Environment::getAnyIntValidator());
8662 
8663  RCP<Teuchos::EnhancedNumberValidator<int>>
8664  mj_num_teams_validator =
8665  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(
8666  0, Teuchos::EnhancedNumberTraits<int>::max()) );
8667  pl.set("mj_num_teams", 0,
8668  "How many teams for the main kernel loop"
8669  , mj_num_teams_validator);
8670 
8671  RCP<Teuchos::EnhancedNumberValidator<int>>
8672  mj_premigration_option_validator =
8673  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8674 
8675  pl.set("mj_premigration_option", 0,
8676  "Whether to do premigration or not. 0 for no migration "
8677  "x > 0 for migration to consecutive processors, "
8678  "the subset will be 0,x,2x,3x,...subset ranks."
8679  , mj_premigration_option_validator);
8680 
8681  pl.set("mj_premigration_coordinate_count", 32000, "How many coordinate to "
8682  "assign each rank in multijagged after premigration"
8684  }
8685 
8691  void partition(const RCP<PartitioningSolution<Adapter> > &solution);
8692 
8693  mj_partBoxVector_t &getPartBoxesView() const
8694  {
8695  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8696  return *pBoxes;
8697  }
8698 
8699  mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
8700 
8701  void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8702  size_t &nPartsFound, mj_part_t **partsFound) const;
8703 
8706  void getCommunicationGraph(
8707  const PartitioningSolution<Adapter> *solution,
8708  ArrayRCP<mj_part_t> &comXAdj,
8709  ArrayRCP<mj_part_t> &comAdj);
8710 
8711  void set_up_partitioning_data( // public for CUDA
8712  const RCP<PartitioningSolution<Adapter> >&solution);
8713 
8714  private:
8715  std::string timer_base_string; // used for making timers
8716 
8717  // After loading views from coordinate adapter we may need to copy them
8718  // if mj type is different, but otherwise we just want to assign the view.
8719  // So purpose of this code is to make that assign only happen when the types
8720  // match. The empty case would otherwise not compile.
8721  // If they don't match the internal code handles allocating the new view
8722  // and copying the elements. See the test Zoltan2_mj_int_coordinates.
8723  template<class dst_t, class src_t> // version for same types
8724  typename std::enable_if<std::is_same<typename dst_t::value_type,
8725  typename src_t::value_type>::value>::type
8726  assign_if_same(dst_t & dst, const src_t & src) {
8727  dst = src;
8728  }
8729  template<class dst_t, class src_t> // version for different types
8730  typename std::enable_if<!std::is_same<typename dst_t::value_type,
8731  typename src_t::value_type>::value>::type
8732  assign_if_same(dst_t & dst, const src_t & src) {
8733  // do nothing - handled manually
8734  }
8735 };
8736 
/*! \brief Gathers all coordinates onto a consecutive subset of
 *  used_num_ranks ranks before the main MJ partitioning runs.
 *
 *  Ranks are split into used_num_ranks contiguous groups; every rank
 *  sends its gnos, coordinates, weights, and its own rank id to the
 *  first rank of its group via a Tpetra::Distributor. The receivers
 *  form the subcommunicator on which MJ will actually run.
 *
 *  \param used_num_ranks number of ranks that will hold data afterwards.
 *  \param migration_selection_option selection strategy (currently unused
 *    in this body — TODO confirm against callers).
 *  \param result_* outputs: subset comm, local count, migrated gnos,
 *    coordinates, weights, and (heap-allocated) original owner ranks;
 *    caller owns result_actual_owner_rank_ (delete []).
 *  \return true iff this rank is one of the receivers (is in the subset).
 */
template <typename Adapter>
bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
  int used_num_ranks,
  int migration_selection_option,
  RCP<const Environment> mj_env_,
  RCP<const Comm<int> > mj_problemComm_,
  int coord_dim_,
  mj_lno_t num_local_coords_,
  mj_gno_t num_global_coords_, size_t num_global_parts_,
  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
  int num_weights_per_coord_,
  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
  //results
  RCP<const Comm<int> > & result_problemComm_,
  mj_lno_t &result_num_local_coords_,
  Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
    result_mj_coordinates_,
  Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
  int * &result_actual_owner_rank_)
{
  mj_env_->timerStart(MACRO_TIMERS,
    timer_base_string + "PreMigration DistributorPlanCreating");

  int myRank = mj_problemComm_->getRank();
  int worldSize = mj_problemComm_->getSize();

  // Base group size; remainders are spread over the first groups below.
  mj_part_t groupsize = worldSize / used_num_ranks;

  // group_begins[i] is the first world rank of group i;
  // group_begins[used_num_ranks] acts as the end sentinel.
  std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);

  mj_part_t i_am_sending_to = 0;
  bool am_i_a_receiver = false;

  for(int i = 0; i < used_num_ranks; ++i) {
    group_begins[i+ 1] = group_begins[i] + groupsize;
    // First (worldSize % used_num_ranks) groups take one extra rank.
    if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
    // NOTE(review): i == used_num_ranks can never hold inside this loop
    // (i < used_num_ranks), so this assignment is dead code; the sentinel
    // already equals worldSize from the arithmetic above.
    if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
    // Every rank in group i sends to the group's first rank.
    if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
      i_am_sending_to = group_begins[i];
    }
    // Group leaders are the receivers that stay in the subset.
    if(myRank == group_begins[i]) {
      am_i_a_receiver = true;
    }
  }

  // Subcommunicator over the group-leader ranks only.
  ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
  result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);

  Tpetra::Distributor distributor(mj_problemComm_);

  // All local coordinates go to the same destination rank.
  std::vector<mj_part_t>
    coordinate_destinations(num_local_coords_, i_am_sending_to);

  ArrayView<const mj_part_t>
    destinations(&(coordinate_destinations[0]), num_local_coords_);
  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
  result_num_local_coords_ = num_incoming_gnos;
  mj_env_->timerStop(MACRO_TIMERS,
    timer_base_string + "PreMigration DistributorPlanCreating");

  mj_env_->timerStart(MACRO_TIMERS,
    timer_base_string + "PreMigration DistributorMigration");

  // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace

  // migrate gnos.
  {
    ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
    // Stage the device gnos in a host buffer for the MPI exchange.
    Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_initial_mj_gnos(
      Kokkos::ViewAllocateWithoutInitializing("host_initial_mj_gnos"),
      initial_mj_gnos_.size()); // initial_mj_gnos_ is const mj_gno_t *
    Kokkos::deep_copy(host_initial_mj_gnos, initial_mj_gnos_);
    ArrayView<const mj_gno_t> sent_gnos(host_initial_mj_gnos.data(),
      num_local_coords_);
    distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
    // Copy received gnos back out to a new device view.
    result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
      Kokkos::ViewAllocateWithoutInitializing("result_initial_mj_gnos_"),
      num_incoming_gnos);
    auto host_result_initial_mj_gnos_ = Kokkos::create_mirror_view(
      result_initial_mj_gnos_);
    memcpy(host_result_initial_mj_gnos_.data(),
      received_gnos.getRawPtr(), num_incoming_gnos * sizeof(mj_gno_t));
    Kokkos::deep_copy(result_initial_mj_gnos_, host_result_initial_mj_gnos_);
  }

  // migrate coordinates
  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
    Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
    num_incoming_gnos, this->coord_dim);
  auto host_dst_coordinates = Kokkos::create_mirror_view(
    dst_coordinates);
  auto host_src_coordinates =
    Kokkos::create_mirror_view(Kokkos::HostSpace(), this->mj_coordinates);
  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
  // Exchange one dimension at a time.
  for(int i = 0; i < this->coord_dim; ++i) {
    auto sub_host_src_coordinates
      = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
    auto sub_host_dst_coordinates
      = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
    // Note Layout Left means we can do these in contiguous blocks
    ArrayView<mj_scalar_t> sent_coord(
      sub_host_src_coordinates.data(), this->num_local_coords);
    ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
    distributor.doPostsAndWaits<mj_scalar_t>(
      sent_coord, 1, received_coord());
    memcpy(sub_host_dst_coordinates.data(),
      received_coord.getRawPtr(), num_incoming_gnos * sizeof(mj_scalar_t));
  }
  deep_copy(dst_coordinates, host_dst_coordinates);
  result_mj_coordinates_ = dst_coordinates;

  // migrate weights.
  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
    Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
    num_incoming_gnos, this->num_weights_per_coord);
  auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
  auto host_src_weights = Kokkos::create_mirror_view(this->mj_weights);
  Kokkos::deep_copy(host_src_weights, this->mj_weights);
  for(int i = 0; i < this->num_weights_per_coord; ++i) {
    auto sub_host_src_weights
      = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
    auto sub_host_dst_weights
      = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
    ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);

    // Layout Right means these are not contiguous
    // However we don't have any systems setup with more than 1 weight so
    // really I have not tested any of this code with num weights > 1.
    // I think this is the right thing to do. Note that there are other
    // places in the code which don't handle the possibility of more weights.
    // So evaluating all that and adding tests would be another project.
    for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
      sent_weight[n] = sub_host_src_weights(n);
    }
    ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
    distributor.doPostsAndWaits<mj_scalar_t>(
      sent_weight(), 1, received_weight());

    // Again we copy by index due to layout
    for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
      sub_host_dst_weights(n) = received_weight[n];
    }
  }
  Kokkos::deep_copy(dst_weights, host_dst_weights);
  result_mj_weights_ = dst_weights;

  // migrate the owners of the coordinates
  {
    // Each coordinate remembers its original owner rank so results can be
    // sent back after partitioning. Caller frees result_actual_owner_rank_.
    std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
    ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
    ArrayRCP<int> received_owners(num_incoming_gnos);
    distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
    result_actual_owner_rank_ = new int[num_incoming_gnos];
    memcpy(
      result_actual_owner_rank_,
      received_owners.getRawPtr(),
      num_incoming_gnos * sizeof(int));
  }

  mj_env_->timerStop(MACRO_TIMERS,
    timer_base_string + "PreMigration DistributorMigration");
  return am_i_a_receiver;
}
8905 
8913 template <typename Adapter>
8915  const RCP<PartitioningSolution<Adapter> > &solution)
8916 {
8917  // purpose of this code is to validate node and UVM status for the tests
8918  // std::cout << "Memory Space: " << mj_node_t::memory_space::name() << " "
8919  // << "Execution Space: " << mj_node_t::execution_space::name()
8920  // << std::endl;
8921 
8922  int execute_counter =
8924  timer_base_string = "partition(" + std::to_string(execute_counter) + ") - ";
8925 
8926  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "all");
8927  {
8928  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "setup");
8929 
8930  this->set_up_partitioning_data(solution);
8931 
8932  this->set_input_parameters(this->mj_env->getParameters());
8933  if(this->mj_keep_part_boxes) {
8934  this->mj_partitioner.set_to_keep_part_boxes();
8935  }
8936 
8937  this->mj_partitioner.set_partitioning_parameters(
8938  this->distribute_points_on_cut_lines,
8939  this->max_concurrent_part_calculation,
8940  this->check_migrate_avoid_migration_option,
8941  this->minimum_migration_imbalance, this->migration_type);
8942 
8943  RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8944  mj_lno_t result_num_local_coords = this->num_local_coords;
8945  Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8946  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8947  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8948  result_mj_coordinates = this->mj_coordinates;
8949  Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8950  this->mj_weights;
8951  int *result_actual_owner_rank = NULL;
8952 
8953  Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8954  this->initial_mj_gnos;
8955 
8956  // TODO: MD 08/2017: Further discussion is required.
8957  // MueLu calls MJ when it has very few coordinates per processors,
8958  // such as 10. For example, it begins with 1K processor with 1K coordinate
8959  // in each. Then with coarsening this reduces to 10 coordinate per procesor.
8960  // It calls MJ to repartition these to 10 coordinates.
8961  // MJ runs with 1K processor, 10 coordinate in each, and partitions to
8962  // 10 parts. As expected strong scaling is problem here, because
8963  // computation is almost 0, and communication cost of MJ linearly increases.
8964  // Premigration option gathers the coordinates to 10 parts before MJ starts
8965  // therefore MJ will run with a smalller subset of the problem.
8966  // Below, I am migrating the coordinates if mj_premigration_option is set,
8967  // and the result parts are less than the current part count, and the
8968  // average number of local coordinates is less than some threshold.
8969  // For example, premigration may not help if 1000 processors are
8970  // partitioning data to 10, but each of them already have 1M coordinate.
8971  // In that case, we premigration would not help.
8972  int current_world_size = this->mj_problemComm->getSize();
8973  mj_lno_t threshold_num_local_coords =
8974  this->min_coord_per_rank_for_premigration;
8975  bool is_pre_migrated = false;
8976  bool am_i_in_subset = true;
8977 
8978  // Note that we need to add testing for migration and should also cover the
8979  // zoltan case when ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION is defined.
8980  // Currently did a minimal test of this code by running mjTest with
8981  // PM=1, TB=0 then run again with C=3 instead of C=4 (numProcs is 4).
8982  if(mj_premigration_option > 0 &&
8983  size_t (current_world_size) > this->num_global_parts &&
8984  this->num_global_coords < mj_gno_t (
8985  current_world_size * threshold_num_local_coords))
8986  {
8987  if(this->mj_keep_part_boxes) {
8988  throw std::logic_error("Multijagged: mj_keep_part_boxes and "
8989  "mj_premigration_option are not supported together yet.");
8990  }
8991 
8992  is_pre_migrated =true;
8993  int migration_selection_option = mj_premigration_option;
8994  if(migration_selection_option * this->num_global_parts >
8995  (size_t) (current_world_size)) {
8996  migration_selection_option =
8997  current_world_size / this->num_global_parts;
8998  }
8999 
9000  int used_num_ranks = int (this->num_global_coords /
9001  float (threshold_num_local_coords) + 0.5);
9002 
9003  if(used_num_ranks == 0) {
9004  used_num_ranks = 1;
9005  }
9006 
9007  am_i_in_subset = this->mj_premigrate_to_subset(
9008  used_num_ranks,
9009  migration_selection_option,
9010  this->mj_env,
9011  this->mj_problemComm,
9012  this->coord_dim,
9013  this->num_local_coords,
9014  this->num_global_coords,
9015  this->num_global_parts,
9016  this->initial_mj_gnos,
9017  this->mj_coordinates,
9018  this->num_weights_per_coord,
9019  this->mj_weights,
9020  //results
9021  result_problemComm,
9022  result_num_local_coords,
9023  result_initial_mj_gnos,
9024  result_mj_coordinates,
9025  result_mj_weights,
9026  result_actual_owner_rank);
9027 
9028  result_initial_mj_gnos_ = result_initial_mj_gnos;
9029  }
9030 
9031  Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9032  Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9033 
9034  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "setup");
9035 
9036  if(am_i_in_subset) {
9037  this->mj_partitioner.multi_jagged_part(
9038  this->mj_env,
9039  result_problemComm, //this->mj_problemComm,
9040  this->imbalance_tolerance,
9041  this->num_teams,
9042  this->num_global_parts,
9043  this->part_no_array,
9044  this->recursion_depth,
9045  this->coord_dim,
9046  result_num_local_coords, //this->num_local_coords,
9047  this->num_global_coords,
9048  result_initial_mj_gnos_,
9049  result_mj_coordinates,
9050  this->num_weights_per_coord,
9051  this->mj_uniform_weights,
9052  result_mj_weights,
9053  this->mj_uniform_parts,
9054  result_assigned_part_ids,
9055  result_mj_gnos
9056  );
9057  }
9058 
9059  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "cleanup");
9060 
9061  // Reorder results so that they match the order of the input
9062  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9063  localGidToLid.reserve(result_num_local_coords);
9064  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9065  Kokkos::ViewAllocateWithoutInitializing("host_result_initial_mj_gnos"),
9066  result_initial_mj_gnos_.size());
9067  Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9068  for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9069  localGidToLid[host_result_initial_mj_gnos(i)] = i;
9070  }
9071 
9072  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
9073  0, result_num_local_coords, true);
9074  auto host_result_assigned_part_ids =
9075  Kokkos::create_mirror_view(result_assigned_part_ids);
9076  Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9077  auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9078  Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9079  for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9080  mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9081  partId[origLID] = host_result_assigned_part_ids(i);
9082  }
9083 
9084  // Now the results are reordered, but if premigration occurred,
9085  // then we need to send these ids to actual owners again.
9086  if(is_pre_migrated) {
9087  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9088  "PostMigration DistributorPlanCreating");
9089  Tpetra::Distributor distributor(this->mj_problemComm);
9090  ArrayView<const mj_part_t> actual_owner_destinations(
9091  result_actual_owner_rank , result_num_local_coords);
9092  mj_lno_t num_incoming_gnos = distributor.createFromSends(
9093  actual_owner_destinations);
9094  if(num_incoming_gnos != this->num_local_coords) {
9095  throw std::logic_error("Zoltan2 - Multijagged Post Migration - "
9096  "num incoming is not equal to num local coords");
9097  }
9098 
9099  mj_env->timerStop(MACRO_TIMERS, timer_base_string +
9100  "PostMigration DistributorPlanCreating");
9101  mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9102  "PostMigration DistributorMigration");
9103  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
9104  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
9105  {
9106  ArrayView<const mj_gno_t> sent_gnos(host_result_initial_mj_gnos.data(),
9107  result_num_local_coords);
9108  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
9109  }
9110 
9111  {
9112  ArrayView<mj_part_t> sent_partnos(partId());
9113  distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1,
9114  received_partids());
9115  }
9116 
9117  partId = arcp(new mj_part_t[this->num_local_coords],
9118  0, this->num_local_coords, true);
9119 
9120  {
9121  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9122  localGidToLid2.reserve(this->num_local_coords);
9123  auto host_initial_mj_gnos =
9124  Kokkos::create_mirror_view(this->initial_mj_gnos);
9125  Kokkos::deep_copy(host_initial_mj_gnos,
9126  this->initial_mj_gnos);
9127  for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9128  localGidToLid2[host_initial_mj_gnos(i)] = i;
9129  }
9130 
9131  for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9132  mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9133  partId[origLID] = received_partids[i];
9134  }
9135  }
9136 
9137  {
9138  delete [] result_actual_owner_rank;
9139  }
9140  mj_env->timerStop(MACRO_TIMERS,
9141  timer_base_string + "PostMigration DistributorMigration");
9142  }
9143  solution->setParts(partId);
9144  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "cleanup");
9145  }
9146 
9147  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "all");
9148 }
9149 
9150 /* \brief Sets the partitioning data for multijagged algorithm.
9151  * */
9152 template <typename Adapter>
9154  const RCP<PartitioningSolution<Adapter> > &solution
9155 )
9156 {
9157  this->coord_dim = this->mj_coords->getCoordinateDim();
9158  this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
9159  this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
9160  this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
9161  int criteria_dim = (this->num_weights_per_coord ?
9162  this->num_weights_per_coord : 1);
9163  // From the Solution we get part information.
9164  // If the part sizes for a given criteria are not uniform,
9165  // then they are values that sum to 1.0.
9166  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9167  // allocate only two dimensional pointer.
9168  // raw pointer addresess will be obtained from multivector.
9169  this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9170  "uniform parts", criteria_dim);
9171  this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9172  "uniform weights", criteria_dim);
9173 
9174  Kokkos::View<const mj_gno_t *, device_t> gnos;
9175  Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9176  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9177  Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9178  this->mj_coords->getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
9179  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9180  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9181  Kokkos::View<mj_scalar_t **, device_t> wgts;
9182 
9183  // Now we must get the data from the adapter.
9184  // If the types match we point to the view but if not, we must copy.
9185  if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9186  // we can just point the views but we must specialize because this code
9187  // only compiles in this case - for is_same false assign does nothing.
9188  assign_if_same(xyz, xyz_adapter);
9189  assign_if_same(wgts, wgts_adapter);
9190  }
9191  else {
9192  // we only allocate a new view if we are going to copy
9193  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9194  xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9195  (Kokkos::ViewAllocateWithoutInitializing(
9196  "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9197  wgts = Kokkos::View<mj_scalar_t **, device_t>(
9198  Kokkos::ViewAllocateWithoutInitializing("wgts"),
9199  wgts_adapter.extent(0), wgts_adapter.extent(1));
9200 
9201  typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9202  Kokkos::parallel_for(
9203  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9204  (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (int i) {
9205  for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9206  xyz(i, n) = static_cast<mj_scalar_t>(xyz_adapter(i, n));
9207  }
9208  });
9209  Kokkos::parallel_for(
9210  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9211  (0, wgts.extent(0)), KOKKOS_LAMBDA (int i) {
9212  for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9213  wgts(i, n) = static_cast<mj_scalar_t>(wgts_adapter(i, n));
9214  }
9215  });
9216  }
9217 
9218  // obtain global ids.
9219  this->initial_mj_gnos = gnos;
9220  // extract coordinates from multivector.
9221  this->mj_coordinates = xyz;
9222  // if no weights are provided set uniform weight.
9223 
9224  if(this->num_weights_per_coord == 0) {
9225  this->mj_uniform_weights(0) = true;
9226  Kokkos::resize(this->mj_weights, 0, 0);
9227  }
9228  else{
9229  this->mj_weights = wgts;
9230  for(int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9231  this->mj_uniform_weights(wdim) = false;
9232  }
9233  }
9234 
9235  for(int wdim = 0; wdim < criteria_dim; ++wdim) {
9236  if(solution->criteriaHasUniformPartSizes(wdim)) {
9237  this->mj_uniform_parts(wdim) = true;
9238  }
9239  else {
9240  printf("Error: MJ does not support non uniform target part weights\n");
9241  std::terminate();
9242  }
9243  }
9244 }
9245 
9246 /* \brief Sets the partitioning parameters for multijagged algorithm.
9247  * \param pl: is the parameter list provided to zoltan2 call
9248  * */
9249 template <typename Adapter>
9251  const Teuchos::ParameterList &pl)
9252 {
9253  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
9254  if(pe) {
9255  double tol;
9256  tol = pe->getValue(&tol);
9257  this->imbalance_tolerance = tol - 1.0;
9258  }
9259 
9260  // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
9261  if(this->imbalance_tolerance <= 0) {
9262  this->imbalance_tolerance= 10e-4;
9263  }
9264 
9265  // if an input partitioning array is provided.
9266  Kokkos::resize(this->part_no_array, 0);
9267 
9268  // the length of the input partitioning array.
9269  this->recursion_depth = 0;
9270 
9271  if(pl.getPtr<int>("mj_num_teams")) {
9272  this->num_teams = pl.get<int>("mj_num_teams");
9273  }
9274 
9275  if(pl.getPtr<Array <mj_part_t> >("mj_parts")) {
9276  auto mj_parts = pl.get<Array <mj_part_t> >("mj_parts");
9277  int mj_parts_size = static_cast<int>(mj_parts.size());
9278 
9279  // build the view we'll have data on and copy values from host
9280  this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9281  "part_no_array", mj_parts_size);
9282  for(int i = 0; i < mj_parts_size; ++i) {
9283  this->part_no_array(i) = mj_parts.getRawPtr()[i];
9284  }
9285 
9286  this->recursion_depth = mj_parts_size - 1;
9287  this->mj_env->debug(2, "mj_parts provided by user");
9288  }
9289 
9290  // get mj specific parameters.
9291  this->distribute_points_on_cut_lines = true;
9292  this->max_concurrent_part_calculation = 1;
9293 
9294  this->mj_run_as_rcb = false;
9295  this->mj_premigration_option = 0;
9296  this->min_coord_per_rank_for_premigration = 32000;
9297 
9298  int mj_user_recursion_depth = -1;
9299  this->mj_keep_part_boxes = false;
9300  this->check_migrate_avoid_migration_option = 0;
9301  this->migration_type = 0;
9302  this->minimum_migration_imbalance = 0.35;
9303 
9304  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
9305  if(pe) {
9306  double imb;
9307  imb = pe->getValue(&imb);
9308  this->minimum_migration_imbalance = imb - 1.0;
9309  }
9310 
9311  pe = pl.getEntryPtr("mj_migration_option");
9312  if(pe) {
9313  this->check_migrate_avoid_migration_option =
9314  pe->getValue(&this->check_migrate_avoid_migration_option);
9315  } else {
9316  this->check_migrate_avoid_migration_option = 0;
9317  }
9318  if(this->check_migrate_avoid_migration_option > 1) {
9319  this->check_migrate_avoid_migration_option = -1;
9320  }
9321 
9323  pe = pl.getEntryPtr("mj_migration_type");
9324  if(pe) {
9325  this->migration_type = pe->getValue(&this->migration_type);
9326  } else {
9327  this->migration_type = 0;
9328  }
9329 
9330  //std::cout << " this->migration_type:" << this->migration_type << std::endl;
9332 
9333  pe = pl.getEntryPtr("mj_concurrent_part_count");
9334  if(pe) {
9335  this->max_concurrent_part_calculation =
9336  pe->getValue(&this->max_concurrent_part_calculation);
9337  } else {
9338  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
9339  }
9340 
9341  pe = pl.getEntryPtr("mj_keep_part_boxes");
9342  if(pe) {
9343  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9344  } else {
9345  this->mj_keep_part_boxes = false; // Set to invalid value
9346  }
9347 
9348  // For now, need keep_part_boxes to do pointAssign and boxAssign.
9349  // pe = pl.getEntryPtr("keep_cuts");
9350  // if(pe) {
9351  // int tmp = pe->getValue(&tmp);
9352  // if(tmp) this->mj_keep_part_boxes = true;
9353  // }
9354 
9355  //need to keep part boxes if mapping type is geometric.
9356  if(this->mj_keep_part_boxes == false) {
9357  pe = pl.getEntryPtr("mapping_type");
9358  if(pe) {
9359  int mapping_type = -1;
9360  mapping_type = pe->getValue(&mapping_type);
9361  if(mapping_type == 0) {
9362  mj_keep_part_boxes = true;
9363  }
9364  }
9365  }
9366 
9367  // need to keep part boxes if mapping type is geometric.
9368  pe = pl.getEntryPtr("mj_enable_rcb");
9369  if(pe) {
9370  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9371  } else {
9372  this->mj_run_as_rcb = false; // Set to invalid value
9373  }
9374 
9375  pe = pl.getEntryPtr("mj_premigration_option");
9376  if(pe) {
9377  mj_premigration_option = pe->getValue(&mj_premigration_option);
9378  } else {
9379  mj_premigration_option = 0;
9380  }
9381 
9382  pe = pl.getEntryPtr("mj_premigration_coordinate_count");
9383  if(pe) {
9384  min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9385  } else {
9386  min_coord_per_rank_for_premigration = 32000;
9387  }
9388 
9389  pe = pl.getEntryPtr("mj_recursion_depth");
9390  if(pe) {
9391  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9392  } else {
9393  mj_user_recursion_depth = -1; // Set to invalid value
9394  }
9395 
9396  bool val = false;
9397  pe = pl.getEntryPtr("rectilinear");
9398  if(pe) {
9399  val = pe->getValue(&val);
9400  }
9401  if(val) {
9402  this->distribute_points_on_cut_lines = false;
9403  } else {
9404  this->distribute_points_on_cut_lines = true;
9405  }
9406 
9407  if(this->mj_run_as_rcb) {
9408  mj_user_recursion_depth =
9409  (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9410  }
9411  if(this->recursion_depth < 1) {
9412  if(mj_user_recursion_depth > 0) {
9413  this->recursion_depth = mj_user_recursion_depth;
9414  }
9415  else {
9416  this->recursion_depth = this->coord_dim;
9417  }
9418  }
9419 }
9420 
9422 template <typename Adapter>
9424  int dim,
9425  adapter_scalar_t *lower,
9426  adapter_scalar_t *upper,
9427  size_t &nPartsFound,
9428  typename Adapter::part_t **partsFound) const
9429 {
9430  // TODO: Implement with cuts rather than boxes to reduce algorithmic
9431  // TODO: complexity. Or at least do a search through the boxes, using
9432  // TODO: p x q x r x ... if possible.
9433 
9434  nPartsFound = 0;
9435  *partsFound = NULL;
9436 
9437  if(this->mj_keep_part_boxes) {
9438 
9439  // Get vector of part boxes
9440  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9441 
9442  size_t nBoxes = (*partBoxes).size();
9443  if(nBoxes == 0) {
9444  throw std::logic_error("no part boxes exist");
9445  }
9446 
9447  // Determine whether the box overlaps the globalBox at all
9448  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9449 
9450  if(globalBox->boxesOverlap(dim, lower, upper)) {
9451 
9452  std::vector<typename Adapter::part_t> partlist;
9453 
9454  // box overlaps the global box; find specific overlapping boxes
9455  for(size_t i = 0; i < nBoxes; i++) {
9456  try {
9457  if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9458  nPartsFound++;
9459  partlist.push_back((*partBoxes)[i].getpId());
9460  /*
9461  std::cout << "Given box (";
9462  for(int j = 0; j < dim; j++)
9463  std::cout << lower[j] << " ";
9464  std::cout << ") x (";
9465  for(int j = 0; j < dim; j++)
9466  std::cout << upper[j] << " ";
9467  std::cout << ") overlaps PartBox "
9468  << (*partBoxes)[i].getpId() << " (";
9469  for(int j = 0; j < dim; j++)
9470  std::cout << (*partBoxes)[i].getlmins()[j] << " ";
9471  std::cout << ") x (";
9472  for(int j = 0; j < dim; j++)
9473  std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
9474  std::cout << ")" << std::endl;
9475  */
9476  }
9477  }
9479  }
9480  if(nPartsFound) {
9481  *partsFound = new mj_part_t[nPartsFound];
9482  for(size_t i = 0; i < nPartsFound; i++)
9483  (*partsFound)[i] = partlist[i];
9484  }
9485  }
9486  else {
9487  // Box does not overlap the domain at all. Find the closest part
9488  // Not sure how to perform this operation for MJ without having the
9489  // cuts. With the RCB cuts, the concept of a part extending to
9490  // infinity was natural. With the boxes, it is much more difficult.
9491  // TODO: For now, return information indicating NO OVERLAP.
9492  }
9493  }
9494  else {
9495  throw std::logic_error("need to use keep_cuts parameter for boxAssign");
9496  }
9497 }
9498 
9500 template <typename Adapter>
9502  int dim,
9503  adapter_scalar_t *point) const
9504 {
9505  // TODO: Implement with cuts rather than boxes to reduce algorithmic
9506  // TODO: complexity. Or at least do a search through the boxes, using
9507  // TODO: p x q x r x ... if possible.
9508 
9509  if(this->mj_keep_part_boxes) {
9510  typename Adapter::part_t foundPart = -1;
9511 
9512  // Get vector of part boxes
9513  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9514 
9515  size_t nBoxes = (*partBoxes).size();
9516  if(nBoxes == 0) {
9517  throw std::logic_error("no part boxes exist");
9518  }
9519 
9520  // Determine whether the point is within the global domain
9521  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9522 
9523  if(globalBox->pointInBox(dim, point)) {
9524 
9525  // point is in the global domain; determine in which part it is.
9526  size_t i;
9527  for(i = 0; i < nBoxes; i++) {
9528  try {
9529  if((*partBoxes)[i].pointInBox(dim, point)) {
9530  foundPart = (*partBoxes)[i].getpId();
9531  // std::cout << "Point (";
9532  // for(int j = 0; j < dim; j++) std::cout << point[j] << " ";
9533  // std::cout << ") found in box " << i << " part " << foundPart
9534  // << std::endl;
9535  // (*partBoxes)[i].print();
9536  break;
9537  }
9538  }
9540  }
9541 
9542  if(i == nBoxes) {
9543  // This error should never occur
9544  std::ostringstream oss;
9545  oss << "Point (";
9546  for(int j = 0; j < dim; j++) oss << point[j] << " ";
9547  oss << ") not found in domain";
9548  throw std::logic_error(oss.str());
9549  }
9550  }
9551 
9552  else {
9553  // Point is outside the global domain.
9554  // Determine to which part it is closest.
9555  // TODO: with cuts, would not need this special case
9556 
9557  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
9558  size_t closestBox = 0;
9559  coord_t minDistance = std::numeric_limits<coord_t>::max();
9560  coord_t *centroid = new coord_t[dim];
9561  for(size_t i = 0; i < nBoxes; i++) {
9562  (*partBoxes)[i].computeCentroid(centroid);
9563  coord_t sum = 0.;
9564  coord_t diff;
9565  for(int j = 0; j < dim; j++) {
9566  diff = centroid[j] - point[j];
9567  sum += diff * diff;
9568  }
9569  if(sum < minDistance) {
9570  minDistance = sum;
9571  closestBox = i;
9572  }
9573  }
9574  foundPart = (*partBoxes)[closestBox].getpId();
9575  delete [] centroid;
9576  }
9577 
9578  return foundPart;
9579  }
9580  else {
9581  throw std::logic_error("need to use keep_cuts parameter for pointAssign");
9582  }
9583 }
9584 
9585 template <typename Adapter>
9587  const PartitioningSolution<Adapter> *solution,
9588  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
9589  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
9590 {
9591  if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9592  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9593  mj_part_t ntasks = (*pBoxes).size();
9594  int dim = (*pBoxes)[0].getDim();
9595  GridHash grid(pBoxes, ntasks, dim);
9596  grid.getAdjArrays(comXAdj_, comAdj_);
9597  }
9598  comAdj = comAdj_;
9599  comXAdj = comXAdj_;
9600 }
9601 
9602 template <typename Adapter>
9603 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9605 {
9606  return this->mj_partitioner.get_kept_boxes();
9607 }
9608 } // namespace Zoltan2
9609 
9610 #endif
Zoltan2_MJArrayType< scalar_t > value_type
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
Kokkos::View< index_t *, device_t > part_xadj
GridHash Class, Hashing Class for part boxes.
Created by mbenlioglu on Aug 31, 2020.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
Time an algorithm (or other entity) as a whole.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
Kokkos::View< index_t *, device_t > track_on_cuts
Defines Parameter related enumerators, declares functions.
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Kokkos::View< scalar_t *, device_t > coordinates
Sort items for quick sort function.
KOKKOS_INLINE_FUNCTION value_type & reference() const
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals...
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
Kokkos::View< index_t *, device_t > permutations
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:18
KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, const volatile value_type src) const
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
Kokkos::View< scalar_t **, device_t > weights
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
size_t team_shmem_size(int team_size) const
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
A ParameterList validator for integer range lists.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
SparseMatrixAdapter_t::part_t part_t
Multi Jagged coordinate partitioning algorithm.
#define epsilon
Definition: nd.cpp:82
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
Kokkos::View< index_t *, device_t > permutations
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
Kokkos::View< part_t *, device_t > parts
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
size_t team_shmem_size(int team_size) const
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:17
KOKKOS_INLINE_FUNCTION value_type & reference() const
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, const volatile value_type src) const
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
Kokkos::View< part_t *, device_t > parts
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Define IntegerRangeList validator.
Defines the CoordinateModel classes.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
Kokkos::View< scalar_t * > scalar_view_t
Tpetra::global_size_t global_size_t
Zoltan2_MJArrayType< scalar_t > value_type
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Kokkos::View< scalar_t *, device_t > coordinates
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Kokkos::View< index_t *, device_t > part_xadj
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals. DOCWORK: Document input params.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
Contains Teuchos reduction operators for the Multi-jagged algorithm.
Zoltan2_MJArrayType< scalar_t > & operator=(const volatile Zoltan2_MJArrayType< scalar_t > &zmj)
Multi Jagged coordinate partitioning algorithm.
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...