2 #ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_ 3 #define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_ 13 #include "Teuchos_ArrayViewDecl.hpp" 16 #include "Teuchos_ReductionOp.hpp" 24 #include "Teuchos_Comm.hpp" 25 #ifdef HAVE_ZOLTAN2_MPI 26 #include "Teuchos_DefaultMpiComm.hpp" 27 #endif // HAVE_ZOLTAN2_MPI 28 #include <Teuchos_DefaultSerialComm.hpp> 39 template <
typename Ordinal,
typename T>
55 T inoutBuffer[])
const {
57 for (Ordinal i = 0; i < count; i++) {
58 if (inBuffer[0] - inoutBuffer[0] < -
epsilon) {
59 inoutBuffer[0] = inBuffer[0];
60 inoutBuffer[1] = inBuffer[1];
61 }
else if(inBuffer[0] - inoutBuffer[0] <
epsilon &&
62 inBuffer[1] - inoutBuffer[1] <
epsilon) {
63 inoutBuffer[0] = inBuffer[0];
64 inoutBuffer[1] = inBuffer[1];
75 template <
typename it>
77 return (x == 1 ? x : x * z2Fact<it>(x - 1));
80 template <
typename gno_t,
typename part_t>
88 template <
typename IT>
98 fact[k] = fact[k - 1] * k;
101 for (k = 0; k < n; ++k)
103 perm[k] = i / fact[n - 1 - k];
104 i = i % fact[n - 1 - k];
109 for (k = n - 1; k > 0; --k)
110 for (j = k - 1; j >= 0; --j)
111 if (perm[j] <= perm[k])
117 template <
typename part_t>
121 std::vector<int> grid_dims) {
122 int dim = grid_dims.size();
123 int neighborCount = 2 * dim;
124 task_comm_xadj =
new part_t[taskCount + 1];
125 task_comm_adj =
new part_t[taskCount * neighborCount];
128 task_comm_xadj[0] = 0;
129 for (
part_t i = 0; i < taskCount; ++i) {
131 for (
int j = 0; j < dim; ++j) {
132 part_t lNeighbor = i - prevDimMul;
133 part_t rNeighbor = i + prevDimMul;
134 prevDimMul *= grid_dims[j];
135 if (lNeighbor >= 0 &&
136 lNeighbor/ prevDimMul == i / prevDimMul &&
137 lNeighbor < taskCount) {
138 task_comm_adj[neighBorIndex++] = lNeighbor;
140 if (rNeighbor >= 0 &&
141 rNeighbor/ prevDimMul == i / prevDimMul &&
142 rNeighbor < taskCount) {
143 task_comm_adj[neighBorIndex++] = rNeighbor;
146 task_comm_xadj[i + 1] = neighBorIndex;
151 template <
typename Adapter,
typename scalar_t,
typename part_t>
154 const Teuchos::Comm<int> *comm,
160 scalar_t **partCenters) {
166 ArrayView<const gno_t> gnos;
167 ArrayView<input_t> xyz;
168 ArrayView<input_t> wgts;
177 memset(point_counts, 0,
sizeof(
gno_t) * ntasks);
180 gno_t *global_point_counts =
new gno_t[ntasks];
182 scalar_t **multiJagged_coordinates =
new scalar_t*[coordDim];
184 ArrayRCP<ArrayRCP<const scalar_t>> ar = arcp(
new ArrayRCP<const scalar_t>[coordDim], 0, coordDim);
185 for (
int dim=0; dim < coordDim; dim++){
186 xyz[dim].getInputArray(ar[dim]);
188 multiJagged_coordinates[dim] = (scalar_t *)ar[dim].getRawPtr();
189 memset(partCenters[dim], 0,
sizeof(scalar_t) * ntasks);
197 for (
lno_t i=0; i < numLocalCoords; i++) {
200 for(
int j = 0; j < coordDim; ++j) {
201 scalar_t c = multiJagged_coordinates[j][i];
202 partCenters[j][p] += c;
208 reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
209 ntasks, point_counts, global_point_counts);
211 for(
int j = 0; j < coordDim; ++j) {
212 for (
part_t i=0; i < ntasks; ++i) {
213 if (global_point_counts[i] > 0)
214 partCenters[j][i] /= global_point_counts[i];
218 scalar_t *tmpCoords =
new scalar_t[ntasks];
219 for(
int j = 0; j < coordDim; ++j) {
220 reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
221 ntasks, partCenters[j], tmpCoords);
223 scalar_t *tmp = partCenters[j];
224 partCenters[j] = tmpCoords;
230 delete [] point_counts;
231 delete [] global_point_counts;
234 delete [] multiJagged_coordinates;
238 template <
typename Adapter,
typename scalar_t,
typename part_t>
241 const Teuchos::Comm<int> *comm,
246 ArrayRCP<part_t> &g_part_xadj,
247 ArrayRCP<part_t> &g_part_adj,
248 ArrayRCP<scalar_t> &g_part_ew) {
252 typedef typename Adapter::scalar_t t_scalar_t;
253 typedef typename Adapter::offset_t t_offset_t;
275 ArrayView<const t_gno_t> Ids;
276 ArrayView<t_input_t> v_wghts;
280 ArrayView<const t_gno_t> edgeIds;
281 ArrayView<const t_offset_t> offsets;
282 ArrayView<t_input_t> e_wgts;
285 std::vector<t_scalar_t> edge_weights;
288 if (numWeightPerEdge > 0) {
289 edge_weights = std::vector<t_scalar_t>(localNumEdges);
290 for (t_lno_t i = 0; i < localNumEdges; ++i) {
291 edge_weights[i] = e_wgts[0][i];
297 std::vector<part_t> e_parts(localNumEdges);
298 #ifdef HAVE_ZOLTAN2_MPI 299 if (comm->getSize() > 1) {
300 const bool bUseLocalIDs =
false;
303 const RCP<const Comm<int> > rcp_comm(comm,
false);
304 directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
305 directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
307 directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
320 for (t_lno_t i = 0; i < localNumEdges; ++i) {
321 t_gno_t ei = edgeIds[i];
327 std::vector<t_lno_t> part_begins(np, -1);
328 std::vector<t_lno_t> part_nexts(localNumVertices, -1);
332 for (t_lno_t i = 0; i < localNumVertices; ++i) {
334 part_nexts[i] = part_begins[ap];
339 g_part_xadj = ArrayRCP<part_t>(np + 1);
340 g_part_adj = ArrayRCP<part_t>(localNumEdges);
341 g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);
344 std::vector<part_t> part_neighbors(np);
345 std::vector<t_scalar_t> part_neighbor_weights(np, 0);
346 std::vector<t_scalar_t> part_neighbor_weights_ordered(np);
349 for (t_lno_t i = 0; i < np; ++i) {
350 part_t num_neighbor_parts = 0;
351 t_lno_t v = part_begins[i];
355 for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
360 if (numWeightPerEdge > 0) {
361 ew = edge_weights[j];
368 if (part_neighbor_weights[ep] < 0.00001) {
369 part_neighbors[num_neighbor_parts++] = ep;
372 part_neighbor_weights[ep] += ew;
380 for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
381 part_t neighbor_part = part_neighbors[j];
382 g_part_adj[nindex] = neighbor_part;
383 g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
384 part_neighbor_weights[neighbor_part] = 0;
386 g_part_xadj[i + 1] = nindex;
390 #ifdef HAVE_ZOLTAN2_MPI 391 if(comm->getSize() > 1) {
397 part_info() : weight(0) {
400 const part_info & operator+=(
const part_info & src) {
401 this->weight += src.weight;
405 bool operator>(
const part_info & src) {
406 return (destination_part > src.destination_part);
409 bool operator==(
const part_info & src) {
410 return (destination_part == src.destination_part);
417 bool bUseLocalIDs =
false;
418 const int debug_level = 0;
421 const RCP<const Comm<int> > rcp_comm(comm,
false);
422 directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
423 std::vector<part_t> part_data;
424 std::vector<std::vector<part_info>> user_data;
429 std::vector<t_lno_t> part_begins(np, -1);
430 std::vector<t_lno_t> part_nexts(localNumVertices, -1);
434 for (t_lno_t i = 0; i < localNumVertices; ++i) {
436 part_nexts[i] = part_begins[ap];
440 std::vector<part_t> part_neighbors(np);
441 std::vector<t_scalar_t> part_neighbor_weights(np, 0);
442 std::vector<t_scalar_t> part_neighbor_weights_ordered(np);
445 for (t_lno_t i = 0; i < np; ++i) {
446 part_t num_neighbor_parts = 0;
447 t_lno_t v = part_begins[i];
452 for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
457 if (numWeightPerEdge > 0) {
458 ew = edge_weights[j];
462 if (part_neighbor_weights[ep] < 0.00001) {
463 part_neighbors[num_neighbor_parts++] = ep;
466 part_neighbor_weights[ep] += ew;
473 for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
474 part_t neighbor_part = part_neighbors[j];
475 part_neighbor_weights_ordered[j] =
476 part_neighbor_weights[neighbor_part];
477 part_neighbor_weights[neighbor_part] = 0;
481 if (num_neighbor_parts > 0) {
482 part_data.push_back(i);
483 std::vector<part_info> new_user_data(num_neighbor_parts);
484 for(
int q = 0; q < num_neighbor_parts; ++q) {
485 part_info & info = new_user_data[q];
486 info.weight = part_neighbor_weights_ordered[q];
487 info.destination_part = part_neighbors[q];
490 user_data.push_back(new_user_data);
496 std::vector<part_t> part_indices(np);
498 for (
part_t i = 0; i < np; ++i) part_indices[i] = i;
501 directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
502 NULL, directory_t::Update_Mode::AggregateAdd);
506 std::vector<std::vector<part_info>> find_user_data(part_indices.size());
507 directory.find(find_user_data.size(), &part_indices[0], NULL,
508 &find_user_data[0], NULL, NULL,
false);
518 int get_total_length = 0;
519 for (
size_t n = 0; n < find_user_data.size(); ++n) {
520 get_total_length += find_user_data[n].size();
524 g_part_xadj = ArrayRCP<part_t>(np + 1);
525 g_part_adj = ArrayRCP<part_t>(get_total_length);
526 g_part_ew = ArrayRCP<t_scalar_t>(get_total_length);
529 int track_insert_index = 0;
530 for(
size_t n = 0; n < find_user_data.size(); ++n) {
531 g_part_xadj[n] = track_insert_index;
532 const std::vector<part_info> & user_data_vector = find_user_data[n];
533 for(
size_t q = 0; q < user_data_vector.size(); ++q) {
534 const part_info & info = user_data_vector[q];
535 g_part_adj[track_insert_index] = info.destination_part;
536 g_part_ew[track_insert_index] = info.weight;
537 ++track_insert_index;
540 g_part_xadj[np] = get_total_length;
542 #endif // HAVE_ZOLTAN2_MPI 548 template <
class IT,
class WT>
559 delete [] this->indices;
560 delete [] this->values;
564 this->heapSize = heapsize_;
565 this->indices =
new IT[heapsize_];
566 this->values =
new WT[heapsize_];
571 WT maxVal = this->values[0];
576 if (distance >= maxVal)
579 this->values[0] = distance;
580 this->indices[0] = index;
587 IT child_index1 = 2 * index_on_heap + 1;
588 IT child_index2 = 2 * index_on_heap + 2;
591 if(child_index1 < this->heapSize && child_index2 < this->heapSize) {
593 if (this->values[child_index1] < this->values[child_index2]) {
594 biggerIndex = child_index2;
597 biggerIndex = child_index1;
600 else if(child_index1 < this->heapSize) {
601 biggerIndex = child_index1;
604 else if(child_index2 < this->heapSize) {
605 biggerIndex = child_index2;
607 if (biggerIndex >= 0 &&
608 this->values[biggerIndex] > this->values[index_on_heap]) {
609 WT tmpVal = this->values[biggerIndex];
610 this->values[biggerIndex] = this->values[index_on_heap];
611 this->values[index_on_heap] = tmpVal;
613 IT tmpIndex = this->indices[biggerIndex];
614 this->indices[biggerIndex] = this->indices[index_on_heap];
615 this->indices[index_on_heap] = tmpIndex;
621 WT MAXVAL = std::numeric_limits<WT>::max();
623 for(IT i = 0; i < this->heapSize; ++i) {
624 this->values[i] = MAXVAL;
625 this->indices[i] = -1;
633 for(IT j = 0; j < this->heapSize; ++j) {
634 nc += this->values[j];
646 for(
int i = 0; i < dimension; ++i) {
649 for(IT j = 0; j < this->heapSize; ++j) {
650 IT k = this->indices[j];
658 nc /= this->heapSize;
659 moved = (std::abs(center[i] - nc) > this->epsilon || moved );
666 for (IT i = 0; i < this->heapSize; ++i) {
667 permutation[i] = this->indices[i];
674 template <
class IT,
class WT>
689 this->dimension = dimension_;
690 this->center =
new WT[dimension_];
707 for (
int i = 0; i < this->dimension; ++i) {
708 WT d = (
center[i] - elementCoords[i][index]);
711 distance = pow(distance, WT(1.0 / this->dimension));
712 closestPoints.
addPoint(index, distance);
732 template <
class IT,
class WT>
739 IT required_elements;
747 delete [] maxCoordinates;
748 delete [] minCoordinates;
757 IT required_elements_):
759 numElements(numElements_),
760 elementCoords(elementCoords_),
761 numClusters((1 << dim_) + 1),
762 required_elements(required_elements_) {
766 for (
int i = 0; i < numClusters; ++i) {
767 this->clusters[i].
setParams(this->dim, this->required_elements);
770 this->maxCoordinates =
new WT[this->dim];
771 this->minCoordinates =
new WT[this->dim];
774 for (
int j = 0; j < dim; ++j) {
775 this->minCoordinates[j] = this->elementCoords[j][0];
776 this->maxCoordinates[j] = this->elementCoords[j][0];
778 for(IT i = 1; i < numElements; ++i) {
779 WT t = this->elementCoords[j][i];
780 if(t > this->maxCoordinates[j]){
781 this->maxCoordinates[j] = t;
784 if (t < minCoordinates[j]) {
785 this->minCoordinates[j] = t;
792 for (
int j = 0; j < dim; ++j) {
793 int mod = (1 << (j+1));
794 for (
int i = 0; i < numClusters - 1; ++i) {
797 if ( (i % mod) < mod / 2) {
798 c = this->maxCoordinates[j];
803 c = this->minCoordinates[j];
806 this->clusters[i].
center[j] = c;
811 for (
int j = 0; j < dim; ++j) {
812 this->clusters[numClusters - 1].
center[j] =
813 (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
830 for (
int it = 0; it < 10; ++it) {
833 for (IT j = 0; j < this->numClusters; ++j) {
837 for (IT i = 0; i < this->numElements; ++i) {
840 for (IT j = 0; j < this->numClusters; ++j) {
844 this->clusters[j].
getDistance(i,this->elementCoords);
850 for (IT j = 0; j < this->numClusters; ++j) {
852 (this->clusters[j].
getNewCenters(this->elementCoords) || moved );
871 for (IT j = 1; j < this->numClusters; ++j) {
878 if (minTmpDistance < minDistance) {
879 minDistance = minTmpDistance;
890 #define MINOF(a,b) (((a)<(b))?(a):(b)) 899 template <
typename T>
903 #ifdef HAVE_ZOLTAN2_OMP 904 #pragma omp parallel for 906 for (
size_t i = 0; i < arrSize; ++i) {
913 #ifdef HAVE_ZOLTAN2_OMP 914 #pragma omp parallel for 916 for (
size_t i = 0; i < arrSize; ++i) {
926 template <
typename part_t,
typename pcoord_t,
typename node_t>
957 part_t *task_communication_xadj,
958 part_t *task_communication_adj,
959 pcoord_t *task_communication_edge_weight) {
961 double totalCost = 0;
965 int assigned_proc = task_to_proc[task];
967 part_t task_adj_begin = task_communication_xadj[task];
968 part_t task_adj_end = task_communication_xadj[task + 1];
970 commCount += task_adj_end - task_adj_begin;
972 for (
part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2) {
974 part_t neighborTask = task_communication_adj[task2];
975 int neighborProc = task_to_proc[neighborTask];
978 if (task_communication_edge_weight == NULL) {
979 totalCost += distance ;
982 totalCost += distance * task_communication_edge_weight[task2];
988 this->commCost = totalCost;
1008 const RCP<const Environment> &env,
1009 ArrayRCP <part_t> &proc_to_task_xadj,
1010 ArrayRCP <part_t> &proc_to_task_adj,
1011 ArrayRCP <part_t> &task_to_proc,
1012 const Teuchos::RCP <
const Teuchos::Comm<int> > comm_
1020 template <
typename pcoord_t,
typename tcoord_t,
typename part_t,
typename node_t>
1076 pcoord_t **pcoords_,
1078 tcoord_t **tcoords_,
1081 int *machine_extent_,
1082 bool *machine_extent_wrap_around_,
1098 this->partArraySize = psize;
1102 this->kokkos_partNoArray = pNo;
1115 part_t minCoordDim =
MINOF(this->task_coord_dim, this->proc_coord_dim);
1117 minCoordDim, nprocs,
1118 this->proc_coords, ntasks);
1123 for(
int i = ntasks; i < nprocs; ++i) {
1124 proc_permutation[i] = -1;
1151 lo = _u_umpa_seed % q;
1152 hi = _u_umpa_seed / q;
1153 test = (a * lo) - (r * hi);
1155 _u_umpa_seed = test;
1157 _u_umpa_seed = test + m;
1159 d = (double) ((
double) _u_umpa_seed / (double) m);
1161 return (
part_t) (d*(double)l);
1165 pcoord_t distance = 0;
1179 this->machine->
getHopCount(procId1, procId2, distance);
1188 int &_u_umpa_seed,
part_t rndm) {
1200 for (i = 0; i < n; i += 16) {
1207 ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
1208 ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 1], a[u + 1], tmp);
1209 ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 2], a[u + 2], tmp);
1210 ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 3], a[u + 3], tmp);
1216 for (i = 1; i < end; i++) {
1238 const RCP<const Environment> &env,
1239 ArrayRCP <part_t> &rcp_proc_to_task_xadj,
1240 ArrayRCP <part_t> &rcp_proc_to_task_adj,
1241 ArrayRCP <part_t> &rcp_task_to_proc,
1242 const Teuchos::RCP <
const Teuchos::Comm<int> > comm_
1245 rcp_proc_to_task_xadj = ArrayRCP <part_t>(this->
no_procs + 1);
1246 rcp_proc_to_task_adj = ArrayRCP <part_t>(this->
no_tasks);
1247 rcp_task_to_proc = ArrayRCP <part_t>(this->
no_tasks);
1250 part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();
1253 part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();
1256 part_t *task_to_proc = rcp_task_to_proc.getRawPtr();
1259 fillContinousArray<part_t> (proc_to_task_xadj, this->
no_procs + 1, &invalid);
1286 recursion_depth = log(
float(this->no_procs)) / log(2.0) + 1;
1295 if (this->task_coord_dim <= 8)
1299 taskPerm = z2Fact<int>(8);
1302 if (this->proc_coord_dim <= 8)
1306 procPerm = z2Fact<int>(8);
1310 int permutations = taskPerm * procPerm;
1314 permutations += taskPerm;
1318 permutations += procPerm;
1336 if (this->no_procs > this->
no_tasks) {
1344 fillContinousArray<part_t>(proc_adjList,this->
no_procs, NULL);
1348 int myPermutation = myRank % permutations;
1349 bool task_partition_along_longest_dim =
false;
1350 bool proc_partition_along_longest_dim =
false;
1356 if (myPermutation == 0) {
1357 task_partition_along_longest_dim =
true;
1358 proc_partition_along_longest_dim =
true;
1362 if (myPermutation < taskPerm) {
1363 proc_partition_along_longest_dim =
true;
1365 myTaskPerm = myPermutation;
1368 myPermutation -= taskPerm;
1369 if (myPermutation < procPerm) {
1370 task_partition_along_longest_dim =
true;
1372 myProcPerm = myPermutation;
1375 myPermutation -= procPerm;
1377 myProcPerm = myPermutation % procPerm;
1379 myTaskPerm = myPermutation / procPerm;
1404 int *permutation =
new int[(this->proc_coord_dim > this->
task_coord_dim)
1405 ? this->proc_coord_dim : this->task_coord_dim];
1408 if (this->proc_coord_dim <= 8)
1409 ithPermutation<int>(this->
proc_coord_dim, myProcPerm, permutation);
1411 ithPermutation<int>(8, myProcPerm, permutation);
1479 part_t num_group_count = 1;
1480 part_t *group_count = NULL;
1483 num_group_count =
machine->getNumUniqueGroups();
1485 if (num_group_count > 1) {
1486 group_count =
new part_t[num_group_count];
1487 memset(group_count, 0,
sizeof(
part_t) * num_group_count);
1489 machine->getGroupCount(group_count);
1493 env->timerStart(
MACRO_TIMERS,
"Mapping - Proc Partitioning");
1497 typedef typename node_t::device_type device_t;
1499 Kokkos::View<pcoord_t**, Kokkos::LayoutLeft, device_t>
1500 kokkos_pcoords(
"pcoords", this->no_procs, procdim);
1501 auto host_kokkos_pcoords = Kokkos::create_mirror_view(kokkos_pcoords);
1502 for(
int i = 0; i < procdim; ++i) {
1503 for(
int j = 0; j < this->
no_procs; ++j) {
1504 host_kokkos_pcoords(j,i) = pcoords[i][j];
1507 Kokkos::deep_copy(kokkos_pcoords, host_kokkos_pcoords);
1509 Kokkos::View<part_t*, device_t> initial_selected_coords_output_permutation_pcoords(
1510 "initial_selected_coords_output_permutation_pcoords", this->no_procs);
1511 typename Kokkos::View<part_t*, device_t>::HostMirror
1512 host_initial_selected_coords_output_permutation_pcoords =
1513 Kokkos::create_mirror_view(initial_selected_coords_output_permutation_pcoords);
1514 for(
int n = 0; n < this->
no_procs; ++n) {
1515 host_initial_selected_coords_output_permutation_pcoords(n) =
1518 Kokkos::deep_copy(initial_selected_coords_output_permutation_pcoords,
1519 host_initial_selected_coords_output_permutation_pcoords);
1522 Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_group_count(
1523 "kokkos_group_count", group_count ? num_group_count : 0);
1525 for(
int n = 0; n < num_group_count; ++n) {
1526 kokkos_group_count(n) = group_count[n];
1538 initial_selected_coords_output_permutation_pcoords,
1542 proc_partition_along_longest_dim,
1546 kokkos_group_count);
1547 env->timerStop(
MACRO_TIMERS,
"Mapping - Proc Partitioning");
1552 Kokkos::deep_copy(host_initial_selected_coords_output_permutation_pcoords,
1553 initial_selected_coords_output_permutation_pcoords);
1554 for(
int n = 0; n < this->
no_procs; ++n) {
1556 host_initial_selected_coords_output_permutation_pcoords(n);
1563 fillContinousArray<part_t>(task_adjList,this->
no_tasks, NULL);
1566 if (this->task_coord_dim <= 8)
1567 ithPermutation<int>(this->
task_coord_dim, myTaskPerm, permutation);
1569 ithPermutation<int>(8, myTaskPerm, permutation);
1574 tcoords[i] = this->task_coords[permutation[i]];
1578 Kokkos::View<tcoord_t**, Kokkos::LayoutLeft, device_t>
1579 kokkos_tcoords(
"tcoords", this->no_tasks, this->task_coord_dim);
1580 auto host_kokkos_tcoords = Kokkos::create_mirror_view(kokkos_tcoords);
1582 for(
int j = 0; j < this->
no_tasks; ++j) {
1583 host_kokkos_tcoords(j,i) = tcoords[i][j];
1586 Kokkos::deep_copy(kokkos_tcoords, host_kokkos_tcoords);
1588 env->timerStart(
MACRO_TIMERS,
"Mapping - Task Partitioning");
1590 Kokkos::View<part_t*, device_t> initial_selected_coords_output_permutation_tcoords(
1591 "initial_selected_coords_output_permutation_tcoords", this->no_tasks);
1592 typename Kokkos::View<part_t*, device_t>::HostMirror
1593 host_initial_selected_coords_output_permutation_tcoords =
1594 Kokkos::create_mirror_view(initial_selected_coords_output_permutation_tcoords);
1595 for(
int n = 0; n < this->
no_tasks; ++n) {
1596 host_initial_selected_coords_output_permutation_tcoords(n) =
1599 Kokkos::deep_copy(initial_selected_coords_output_permutation_tcoords,
1600 host_initial_selected_coords_output_permutation_tcoords);
1608 this->task_coord_dim,
1611 initial_selected_coords_output_permutation_tcoords,
1615 task_partition_along_longest_dim,
1619 kokkos_group_count);
1620 env->timerStop(
MACRO_TIMERS,
"Mapping - Task Partitioning");
1622 Kokkos::deep_copy(host_initial_selected_coords_output_permutation_tcoords,
1623 initial_selected_coords_output_permutation_tcoords);
1624 for(
int n = 0; n < this->
no_tasks; ++n) {
1626 host_initial_selected_coords_output_permutation_tcoords(n);
1635 delete [] permutation;
1639 for(
part_t i = 0; i < num_parts; ++i) {
1641 part_t proc_index_begin = proc_xadj[i];
1642 part_t task_begin_index = task_xadj[i];
1643 part_t proc_index_end = proc_xadj[i + 1];
1644 part_t task_end_index = task_xadj[i + 1];
1647 if(proc_index_end - proc_index_begin != 1) {
1648 std::cerr <<
"Error at partitioning of processors" << std::endl;
1649 std::cerr <<
"PART:" << i <<
" is assigned to " 1650 << proc_index_end - proc_index_begin <<
" processors." 1654 part_t assigned_proc = proc_adjList[proc_index_begin];
1655 proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
1663 part_t tmp = proc_to_task_xadj[i];
1664 proc_to_task_xadj[i] = sum;
1666 proc_to_task_xadj_work[i] = sum;
1668 proc_to_task_xadj[this->
no_procs] = sum;
1670 for(
part_t i = 0; i < num_parts; ++i){
1672 part_t proc_index_begin = proc_xadj[i];
1673 part_t task_begin_index = task_xadj[i];
1674 part_t task_end_index = task_xadj[i + 1];
1676 part_t assigned_proc = proc_adjList[proc_index_begin];
1678 for (
part_t j = task_begin_index; j < task_end_index; ++j) {
1679 part_t taskId = task_adjList[j];
1681 task_to_proc[taskId] = assigned_proc;
1683 proc_to_task_adj [--proc_to_task_xadj_work[assigned_proc]] = taskId;
1740 delete [] proc_to_task_xadj_work;
1741 delete [] task_xadj;
1742 delete [] task_adjList;
1743 delete [] proc_xadj;
1744 delete [] proc_adjList;
1749 template <
typename Adapter,
typename part_t>
1753 #ifndef DOXYGEN_SHOULD_SKIP_THIS 1755 typedef typename Adapter::scalar_t pcoord_t;
1756 typedef typename Adapter::scalar_t tcoord_t;
1757 typedef typename Adapter::scalar_t
scalar_t;
1760 #if defined(KOKKOS_ENABLE_CUDA) 1761 using node_t = Kokkos::Compat::KokkosDeviceWrapperNode<
1762 Kokkos::Cuda, Kokkos::CudaSpace>;
1763 #elif defined(KOKKOS_ENABLE_HIP) 1764 using node_t = Kokkos::Compat::KokkosDeviceWrapperNode<
1765 Kokkos::Experimental::HIP, Kokkos::Experimental::HIPSpace>;
1767 using node_t =
typename Adapter::node_t;
1803 const Teuchos::RCP<
const Teuchos::Comm<int> > comm_) {
1805 if (this->proc_task_comm) {
1810 this->proc_to_task_xadj,
1812 this->proc_to_task_adj,
1819 std::cerr <<
"communicationModel is not specified in the Mapper" 1839 taskPerm = z2Fact<int>(taskDim);
1842 taskPerm = z2Fact<int>(8);
1846 procPerm = z2Fact<int>(procDim);
1849 procPerm = z2Fact<int>(8);
1852 int idealGroupSize = taskPerm * procPerm;
1855 idealGroupSize += taskPerm + procPerm + 1;
1857 int myRank = this->
comm->getRank();
1858 int commSize = this->
comm->getSize();
1860 int myGroupIndex = myRank / idealGroupSize;
1862 int prevGroupBegin = (myGroupIndex - 1)* idealGroupSize;
1863 if (prevGroupBegin < 0) prevGroupBegin = 0;
1864 int myGroupBegin = myGroupIndex * idealGroupSize;
1865 int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
1866 int nextGroupEnd = (myGroupIndex + 2)* idealGroupSize;
1868 if (myGroupEnd > commSize) {
1869 myGroupBegin = prevGroupBegin;
1870 myGroupEnd = commSize;
1872 if (nextGroupEnd > commSize) {
1873 myGroupEnd = commSize;
1875 int myGroupSize = myGroupEnd - myGroupBegin;
1878 for (
int i = 0; i < myGroupSize; ++i) {
1879 myGroup[i] = myGroupBegin + i;
1884 ArrayView<const part_t> myGroupView(myGroup, myGroupSize);
1886 RCP<Comm<int> > subComm =
1887 this->
comm->createSubcommunicator(myGroupView);
1903 double localCost[2], globalCost[2];
1905 localCost[0] = myCost;
1906 localCost[1] = double(subComm->getRank());
1908 globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
1910 reduceAll<int, double>(*subComm, reduceBest,
1911 2, localCost, globalCost);
1913 int sender = int(globalCost[1]);
1930 broadcast(*subComm, sender, this->ntasks,
1931 this->task_to_proc.getRawPtr());
1932 broadcast(*subComm, sender, this->nprocs,
1933 this->proc_to_task_xadj.getRawPtr());
1934 broadcast(*subComm, sender, this->ntasks,
1935 this->proc_to_task_adj.getRawPtr());
1940 std::ofstream gnuPlotCode(
"gnuPlot.plot", std::ofstream::out);
1944 std::string ss =
"";
1947 std::string procFile = Teuchos::toString<int>(i) +
"_mapping.txt";
1949 gnuPlotCode <<
"plot \"" << procFile <<
"\"\n";
1952 gnuPlotCode <<
"replot \"" << procFile <<
"\"\n";
1955 std::ofstream inpFile(procFile.c_str(), std::ofstream::out);
1957 std::string gnuPlotArrow =
"set arrow from ";
1958 for (
int j = 0; j < mindim; ++j) {
1959 if (j == mindim - 1) {
1969 proc_coords[j][i]) +
",";
1972 gnuPlotArrow +=
" to ";
1974 inpFile << std::endl;
1977 for (
int k = 0; k < a.size(); ++k) {
1980 std::string gnuPlotArrow2 = gnuPlotArrow;
1981 for (
int z = 0; z < mindim; ++z) {
1982 if (z == mindim - 1) {
1994 task_coords[z][j]) +
",";
1997 ss += gnuPlotArrow2 +
"\n";
1998 inpFile << std::endl;
2003 gnuPlotCode <<
"\nreplot\n pause -1 \n";
2004 gnuPlotCode.close();
2009 std::string rankStr = Teuchos::toString<int>(myRank);
2010 std::string gnuPlots =
"gnuPlot", extentionS =
".plot";
2011 std::string outF = gnuPlots + rankStr+ extentionS;
2012 std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);
2015 *tmpproc_task_comm =
2017 pcoord_t, tcoord_t,
part_t, node_t> * > (
2022 int mindim = tmpproc_task_comm->proc_coord_dim;
2024 std::cerr <<
"Mapping Write is only good for 3 dim" << std::endl;
2027 std::string ss =
"";
2028 std::string procs =
"";
2030 std::set < std::tuple<int,int,int,int,int,int> > my_arrows;
2031 for (
part_t origin_rank = 0; origin_rank < this->
nprocs; ++origin_rank) {
2033 if (a.size() == 0) {
2037 std::string gnuPlotArrow =
"set arrow from ";
2038 for (
int j = 0; j < mindim; ++j) {
2039 if (j == mindim - 1) {
2041 Teuchos::toString<float>(tmpproc_task_comm->
2042 proc_coords[j][origin_rank]);
2044 Teuchos::toString<float>(tmpproc_task_comm->
2045 proc_coords[j][origin_rank]);
2050 Teuchos::toString<float>(tmpproc_task_comm->
2051 proc_coords[j][origin_rank]) +
",";
2053 Teuchos::toString<float>(tmpproc_task_comm->
2054 proc_coords[j][origin_rank])+
" ";
2059 gnuPlotArrow +=
" to ";
2062 for (
int k = 0; k < a.size(); ++k) {
2063 int origin_task = a[k];
2069 bool differentnode =
false;
2073 for (
int j = 0; j < mindim; ++j) {
2074 if (
int(tmpproc_task_comm->proc_coords[j][origin_rank]) !=
2075 int(tmpproc_task_comm->proc_coords[j][neighbor_rank])) {
2076 differentnode =
true;
break;
2079 std::tuple<int,int,int, int, int, int> foo(
2080 int(tmpproc_task_comm->proc_coords[0][origin_rank]),
2081 int(tmpproc_task_comm->proc_coords[1][origin_rank]),
2082 int(tmpproc_task_comm->proc_coords[2][origin_rank]),
2083 int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
2084 int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
2085 int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));
2088 if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
2089 my_arrows.insert(foo);
2091 std::string gnuPlotArrow2 =
"";
2092 for (
int j = 0; j < mindim; ++j) {
2093 if (j == mindim - 1) {
2095 Teuchos::toString<float>(tmpproc_task_comm->
2096 proc_coords[j][neighbor_rank]);
2100 Teuchos::toString<float>(tmpproc_task_comm->
2101 proc_coords[j][neighbor_rank]) +
",";
2104 ss += gnuPlotArrow + gnuPlotArrow2 +
" nohead\n";
2110 std::ofstream procFile(
"procPlot.plot", std::ofstream::out);
2111 procFile << procs <<
"\n";
2116 gnuPlotCode <<
"plot \"procPlot.plot\" with points pointsize 3\n";
2118 gnuPlotCode <<
"splot \"procPlot.plot\" with points pointsize 3\n";
2121 gnuPlotCode << ss <<
"\nreplot\n pause -1 \n";
2122 gnuPlotCode.close();
2129 const Teuchos::Comm<int> *comm_,
2132 tcoord_t **partCenters) {
2134 std::string file =
"gggnuPlot";
2135 std::string exten =
".plot";
2136 std::ofstream mm(
"2d.txt");
2137 file += Teuchos::toString<int>(comm_->getRank()) + exten;
2138 std::ofstream ff(file.c_str());
2140 std::vector<Zoltan2::coordinateModelPartBox <tcoord_t, part_t> >
2146 outPartBoxes[i].writeGnuPlot(ff, mm);
2148 if (coordDim == 2) {
2149 ff <<
"plot \"2d.txt\"" << std::endl;
2153 ff <<
"splot \"2d.txt\"" << std::endl;
2158 ff <<
"set style arrow 5 nohead size screen 0.03,15,135 ls 1" 2165 for (
part_t p = pb; p < pe; ++p) {
2169 std::string arrowline =
"set arrow from ";
2170 for (
int j = 0; j < coordDim - 1; ++j) {
2172 Teuchos::toString<tcoord_t>(partCenters[j][n]) +
",";
2175 Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) +
2178 for (
int j = 0; j < coordDim - 1; ++j) {
2180 Teuchos::toString<tcoord_t>(partCenters[j][i]) +
",";
2183 Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) +
2191 ff <<
"replot\n pause -1" << std::endl;
2199 part_t* &proc_to_task_adj_) {
2200 proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
2201 proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
2222 if(this->isOwnerofModel) {
2228 const lno_t num_local_coords,
2229 const part_t *local_coord_parts,
2230 const ArrayRCP<part_t> task_to_proc_) {
2233 for (
lno_t i = 0; i < num_local_coords; ++i) {
2234 part_t local_coord_part = local_coord_parts[i];
2235 part_t rank_index = task_to_proc_[local_coord_part];
2256 const Teuchos::RCP <
const Teuchos::Comm<int> > comm_,
2259 const Teuchos::RCP <const Adapter> input_adapter_,
2262 const Teuchos::RCP <const Environment> envConst,
2263 bool is_input_adapter_distributed =
true,
2264 int num_ranks_per_node = 1,
2265 bool divide_to_prime_first =
false,
2266 bool reduce_best_mapping =
true):
2281 RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2282 RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2284 RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2285 RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2286 if (!is_input_adapter_distributed) {
2287 ia_comm = Teuchos::createSerialComm<int>();
2290 RCP<const Environment> envConst_ = envConst;
2292 RCP<const ctm_base_adapter_t> baseInputAdapter_(
2293 rcp(dynamic_cast<const ctm_base_adapter_t *>(
2294 input_adapter_.getRawPtr()),
false));
2302 baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2312 baseInputAdapter_, envConst_, ia_comm,
2316 if (!machine_->hasMachineCoordinates()) {
2317 throw std::runtime_error(
"Existing machine does not provide " 2318 "coordinates for coordinate task mapping");
2322 int procDim = machine_->getMachineDim();
2323 this->nprocs = machine_->getNumRanks();
2326 pcoord_t **procCoordinates = NULL;
2327 if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2328 throw std::runtime_error(
"Existing machine does not implement " 2329 "getAllMachineCoordinatesView");
2336 std::vector<int> machine_extent_vec(procDim);
2338 int *machine_extent = &(machine_extent_vec[0]);
2339 bool *machine_extent_wrap_around =
new bool[procDim];
2340 for (
int i = 0; i < procDim; ++i)
2341 machine_extent_wrap_around[i] =
false;
2343 bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2349 if (machine_->getMachineExtent(machine_extent) &&
2356 machine_extent_wrap_around,
2363 int coordDim = coordinateModel_->getCoordinateDim();
2374 tcoord_t **partCenters =
new tcoord_t*[coordDim];
2375 for (
int i = 0; i < coordDim; ++i) {
2376 partCenters[i] =
new tcoord_t[this->
ntasks];
2379 typedef typename Adapter::scalar_t t_scalar_t;
2382 envConst->timerStart(
MACRO_TIMERS,
"Mapping - Solution Center");
2385 getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2386 envConst.getRawPtr(),
2387 ia_comm.getRawPtr(),
2388 coordinateModel_.getRawPtr(),
2396 envConst->timerStop(
MACRO_TIMERS,
"Mapping - Solution Center");
2399 if (graph_model_.getRawPtr() != NULL) {
2400 getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2401 envConst.getRawPtr(),
2402 ia_comm.getRawPtr(),
2403 graph_model_.getRawPtr(),
2416 pcoord_t, tcoord_t,
part_t, node_t>(
2424 machine_extent_wrap_around,
2425 machine_.getRawPtr());
2427 int myRank = comm_->getRank();
2431 envConst->timerStart(
MACRO_TIMERS,
"Mapping - Processor Task map");
2433 envConst->timerStop(
MACRO_TIMERS,
"Mapping - Processor Task map");
2435 envConst->timerStart(
MACRO_TIMERS,
"Mapping - Communication Graph");
2442 envConst->timerStop(
MACRO_TIMERS,
"Mapping - Communication Graph");
2444 if (comm_->getRank() == 0) {
2447 std::cout <<
" TotalComm:" 2450 for (
part_t i = 1; i <= taskCommCount; ++i) {
2456 std::cout <<
" maxNeighbor:" << maxN << std::endl;
2459 this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2462 envConst->timerStart(
MACRO_TIMERS,
"Mapping - Communication Cost");
2477 envConst->timerStop(
MACRO_TIMERS,
"Mapping - Communication Cost");
2484 coordinateModel_->getLocalNumCoordinates(),
2507 delete [] machine_extent_wrap_around;
2509 if (machine_->getMachineExtent(machine_extent) &&
2511 for (
int i = 0; i < procDim; ++i) {
2512 delete [] procCoordinates[i];
2514 delete [] procCoordinates;
2517 for (
int i = 0; i < coordDim; ++i) {
2518 delete [] partCenters[i];
2520 delete [] partCenters;
2543 const Teuchos::RCP <
const Teuchos::Comm<int> > comm_,
2546 const Teuchos::RCP <const Adapter> input_adapter_,
2548 const part_t *result_parts,
2549 const Teuchos::RCP <const Environment> envConst,
2550 bool is_input_adapter_distributed =
true,
2551 int num_ranks_per_node = 1,
2552 bool divide_to_prime_first =
false,
2553 bool reduce_best_mapping =
true):
2555 num_parts_, result_parts, envConst),
2568 RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2569 RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2571 RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2572 RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2573 if (!is_input_adapter_distributed) {
2574 ia_comm = Teuchos::createSerialComm<int>();
2576 RCP<const Environment> envConst_ = envConst;
2578 RCP<const ctm_base_adapter_t> baseInputAdapter_(
2579 rcp(dynamic_cast<const ctm_base_adapter_t *>(
2580 input_adapter_.getRawPtr()),
false));
2588 baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2598 baseInputAdapter_, envConst_, ia_comm,
2602 if (!machine_->hasMachineCoordinates()) {
2603 throw std::runtime_error(
"Existing machine does not provide " 2604 "coordinates for coordinate task mapping.");
2608 int procDim = machine_->getMachineDim();
2609 this->nprocs = machine_->getNumRanks();
2612 pcoord_t **procCoordinates = NULL;
2613 if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2614 throw std::runtime_error(
"Existing machine does not implement " 2615 "getAllMachineCoordinatesView");
2623 std::vector<int> machine_extent_vec(procDim);
2625 int *machine_extent = &(machine_extent_vec[0]);
2626 bool *machine_extent_wrap_around =
new bool[procDim];
2627 bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2633 if (machine_->getMachineExtent(machine_extent) &&
2639 machine_extent_wrap_around,
2646 int coordDim = coordinateModel_->getCoordinateDim();
2651 this->ntasks = num_parts_;
2655 tcoord_t **partCenters =
new tcoord_t*[coordDim];
2656 for (
int i = 0; i < coordDim; ++i) {
2657 partCenters[i] =
new tcoord_t[this->
ntasks];
2660 typedef typename Adapter::scalar_t t_scalar_t;
2663 envConst->timerStart(
MACRO_TIMERS,
"Mapping - Solution Center");
2666 getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2667 envConst.getRawPtr(),
2668 ia_comm.getRawPtr(),
2669 coordinateModel_.getRawPtr(),
2677 envConst->timerStop(
MACRO_TIMERS,
"Mapping - Solution Center");
2681 if (graph_model_.getRawPtr() != NULL) {
2682 getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2683 envConst.getRawPtr(),
2684 ia_comm.getRawPtr(),
2685 graph_model_.getRawPtr(),
2698 "CoordinateCommunicationModel Create");
2701 pcoord_t, tcoord_t,
part_t, node_t>(
2709 machine_extent_wrap_around,
2710 machine_.getRawPtr());
2713 "CoordinateCommunicationModel Create");
2719 int myRank = comm_->getRank();
2722 envConst->timerStart(
MACRO_TIMERS,
"Mapping - Processor Task map");
2724 envConst->timerStop(
MACRO_TIMERS,
"Mapping - Processor Task map");
2727 envConst->timerStart(
MACRO_TIMERS,
"Mapping - Communication Graph");
2734 envConst->timerStop(
MACRO_TIMERS,
"Mapping - Communication Graph");
2736 if (comm_->getRank() == 0) {
2739 std::cout <<
" TotalComm:" 2742 for (
part_t i = 1; i <= taskCommCount; ++i) {
2748 std::cout <<
" maxNeighbor:" << maxN << std::endl;
2751 this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2754 envConst->timerStart(
MACRO_TIMERS,
"Mapping - Communication Cost");
2769 envConst->timerStop(
MACRO_TIMERS,
"Mapping - Communication Cost");
2777 coordinateModel_->getLocalNumCoordinates(),
2801 delete [] machine_extent_wrap_around;
2803 if (machine_->getMachineExtent(machine_extent) &&
2805 for (
int i = 0; i < procDim; ++i) {
2806 delete [] procCoordinates[i];
2808 delete [] procCoordinates;
2811 for (
int i = 0; i < coordDim; ++i) {
2812 delete [] partCenters[i];
2814 delete [] partCenters;
2884 const Teuchos::Comm<int> *problemComm,
2887 pcoord_t **machine_coords,
2890 tcoord_t **task_coords,
2891 ArrayRCP<part_t>task_comm_xadj,
2892 ArrayRCP<part_t>task_comm_adj,
2893 pcoord_t *task_communication_edge_weight_,
2894 int recursion_depth,
2895 Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
2896 const part_t *machine_dimensions,
2897 int num_ranks_per_node = 1,
2898 bool divide_to_prime_first =
false,
2899 bool reduce_best_mapping =
true):
2912 pcoord_t ** virtual_machine_coordinates = machine_coords;
2913 bool *wrap_arounds =
new bool [proc_dim];
2914 for (
int i = 0; i < proc_dim; ++i) wrap_arounds[i] =
true;
2916 if (machine_dimensions) {
2917 virtual_machine_coordinates =
2926 this->nprocs = num_processors;
2928 int coordDim = task_dim;
2929 this->ntasks = num_tasks;
2932 tcoord_t **partCenters = task_coords;
2935 this->proc_task_comm =
2938 virtual_machine_coordinates,
2951 int myRank = problemComm->getRank();
2966 task_communication_edge_weight_
2987 delete [] wrap_arounds;
2989 if (machine_dimensions) {
2990 for (
int i = 0; i < proc_dim; ++i) {
2991 delete [] virtual_machine_coordinates[i];
2993 delete [] virtual_machine_coordinates;
2996 if (problemComm->getRank() == 0)
3026 const part_t *machine_dimensions,
3027 bool *machine_extent_wrap_around,
3029 pcoord_t **mCoords) {
3031 pcoord_t **result_machine_coords = NULL;
3032 result_machine_coords =
new pcoord_t*[machine_dim];
3034 for (
int i = 0; i < machine_dim; ++i) {
3035 result_machine_coords[i] =
new pcoord_t [numProcs];
3038 for (
int i = 0; i < machine_dim; ++i) {
3039 part_t numMachinesAlongDim = machine_dimensions[i];
3041 part_t *machineCounts =
new part_t[numMachinesAlongDim];
3042 memset(machineCounts, 0,
sizeof(
part_t) * numMachinesAlongDim);
3044 int *filledCoordinates =
new int[numMachinesAlongDim];
3046 pcoord_t *coords = mCoords[i];
3048 for (
part_t j = 0; j < numProcs; ++j) {
3050 ++machineCounts[mc];
3053 part_t filledCoordinateCount = 0;
3054 for (
part_t j = 0; j < numMachinesAlongDim; ++j) {
3055 if (machineCounts[j] > 0) {
3056 filledCoordinates[filledCoordinateCount++] = j;
3060 part_t firstProcCoord = filledCoordinates[0];
3061 part_t firstProcCount = machineCounts[firstProcCoord];
3063 part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
3064 part_t lastProcCount = machineCounts[lastProcCoord];
3067 numMachinesAlongDim - lastProcCoord + firstProcCoord;
3068 part_t firstLastGapProc = lastProcCount + firstProcCount;
3070 part_t leftSideProcCoord = firstProcCoord;
3071 part_t leftSideProcCount = firstProcCount;
3073 part_t biggestGapProc = numProcs;
3075 part_t shiftBorderCoordinate = -1;
3076 for (
part_t j = 1; j < filledCoordinateCount; ++j) {
3077 part_t rightSideProcCoord= filledCoordinates[j];
3078 part_t rightSideProcCount = machineCounts[rightSideProcCoord];
3080 part_t gap = rightSideProcCoord - leftSideProcCoord;
3081 part_t gapProc = rightSideProcCount + leftSideProcCount;
3087 if (gap > biggestGap ||
3088 (gap == biggestGap && biggestGapProc > gapProc)) {
3089 shiftBorderCoordinate = rightSideProcCoord;
3090 biggestGapProc = gapProc;
3093 leftSideProcCoord = rightSideProcCoord;
3094 leftSideProcCount = rightSideProcCount;
3098 if (!(biggestGap > firstLastGap ||
3099 (biggestGap == firstLastGap &&
3100 biggestGapProc < firstLastGapProc))) {
3101 shiftBorderCoordinate = -1;
3104 for (
part_t j = 0; j < numProcs; ++j) {
3106 if (machine_extent_wrap_around[i] &&
3107 coords[j] < shiftBorderCoordinate) {
3108 result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
3112 result_machine_coords[i][j] = coords[j];
3115 delete [] machineCounts;
3116 delete [] filledCoordinates;
3119 return result_machine_coords;
3133 procs = this->task_to_proc.getRawPtr() + taskId;
3141 return this->task_to_proc[taskId];
3154 part_t task_begin = this->proc_to_task_xadj[procId];
3155 part_t taskend = this->proc_to_task_xadj[procId + 1];
3157 parts = this->proc_to_task_adj.getRawPtr() + task_begin;
3158 numParts = taskend - task_begin;
3162 part_t task_begin = this->proc_to_task_xadj[procId];
3163 part_t taskend = this->proc_to_task_xadj[procId + 1];
3174 if (taskend - task_begin > 0) {
3175 ArrayView <part_t> assignedParts(
3176 this->proc_to_task_adj.getRawPtr() + task_begin,
3177 taskend - task_begin);
3179 return assignedParts;
3182 ArrayView <part_t> assignedParts;
3184 return assignedParts;
3268 template <
typename part_t,
typename pcoord_t,
typename tcoord_t>
3270 RCP<
const Teuchos::Comm<int> > problemComm,
3273 pcoord_t **machine_coords,
3276 tcoord_t **task_coords,
3281 pcoord_t *task_communication_edge_weight_,
3284 int recursion_depth,
3285 Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
3286 const part_t *machine_dimensions,
3287 int num_ranks_per_node = 1,
3288 bool divide_to_prime_first =
false) {
3295 typedef Tpetra::MultiVector<tcoord_t, part_t, part_t>
tMVector_t;
3298 task_comm_xadj, 0, num_tasks + 1,
false);
3301 if (task_comm_xadj) {
3302 Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(
3303 task_comm_adj, 0, task_comm_xadj[num_tasks],
false);
3311 problemComm.getRawPtr(),
3323 task_communication_edge_weight_,
3328 divide_to_prime_first);
3331 part_t* proc_to_task_xadj_;
3332 part_t* proc_to_task_adj_;
3334 ctm->
getProcTask(proc_to_task_xadj_, proc_to_task_adj_);
3336 for (
part_t i = 0; i <= num_processors; ++i) {
3340 for (
part_t i = 0; i < num_tasks; ++i) {
3348 template <
typename proc_coord_t,
typename v_lno_t>
3350 const int machine_coord_dim,
3351 const int num_ranks,
3352 proc_coord_t **machine_coords,
3353 const v_lno_t num_tasks,
3356 const int *task_to_rank) {
3358 std::string rankStr = Teuchos::toString<int>(myRank);
3359 std::string gnuPlots =
"gnuPlot", extentionS =
".plot";
3360 std::string outF = gnuPlots + rankStr+ extentionS;
3361 std::ofstream gnuPlotCode( outF.c_str(), std::ofstream::out);
3363 if (machine_coord_dim != 3) {
3364 std::cerr <<
"Mapping Write is only good for 3 dim" << std::endl;
3367 std::string ss =
"";
3368 std::string procs =
"";
3370 std::set<std::tuple<int, int, int, int, int, int> > my_arrows;
3372 for (v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task) {
3373 int origin_rank = task_to_rank[origin_task];
3374 std::string gnuPlotArrow =
"set arrow from ";
3376 for (
int j = 0; j < machine_coord_dim; ++j) {
3377 if (j == machine_coord_dim - 1) {
3379 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3381 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3386 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3389 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3395 gnuPlotArrow +=
" to ";
3403 bool differentnode =
false;
3404 int neighbor_rank = task_to_rank[neighbor_task];
3406 for (
int j = 0; j < machine_coord_dim; ++j) {
3407 if (
int(machine_coords[j][origin_rank]) !=
3408 int(machine_coords[j][neighbor_rank])) {
3409 differentnode =
true;
break;
3413 std::tuple<int,int,int, int, int, int> foo(
3414 (
int)(machine_coords[0][origin_rank]),
3415 (
int)(machine_coords[1][origin_rank]),
3416 (
int)(machine_coords[2][origin_rank]),
3417 (
int)(machine_coords[0][neighbor_rank]),
3418 (
int)(machine_coords[1][neighbor_rank]),
3419 (
int)(machine_coords[2][neighbor_rank]));
3421 if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
3422 my_arrows.insert(foo);
3424 std::string gnuPlotArrow2 =
"";
3425 for (
int j = 0; j < machine_coord_dim; ++j) {
3426 if (j == machine_coord_dim - 1) {
3428 Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
3432 Teuchos::toString<float>(machine_coords[j][neighbor_rank])
3436 ss += gnuPlotArrow + gnuPlotArrow2 +
" nohead\n";
3441 std::ofstream procFile(
"procPlot.plot", std::ofstream::out);
3442 procFile << procs <<
"\n";
3446 if (machine_coord_dim == 2) {
3447 gnuPlotCode <<
"plot \"procPlot.plot\" with points pointsize 3\n";
3450 gnuPlotCode <<
"splot \"procPlot.plot\" with points pointsize 3\n";
3453 gnuPlotCode << ss <<
"\nreplot\n pause -1\npause -1";
3454 gnuPlotCode.close();
void setParams(int dimension_, int heapsize)
CommunicationModel Base Class that performs mapping between the coordinate partitioning result...
Zoltan2::BaseAdapter< userTypes_t > base_adapter_t
void getBestMapping()
finds the lowest cost mapping and broadcasts solution to everyone.
void setPartArraySize(int psize)
void ithPermutation(const IT n, IT i, IT *perm)
Created by mbenlioglu on Aug 31, 2020.
CoordinateTaskMapper(const Teuchos::RCP< const Teuchos::Comm< int > > comm_, const Teuchos::RCP< const MachineRepresentation< pcoord_t, part_t > > machine_, const Teuchos::RCP< const Adapter > input_adapter_, const part_t num_parts_, const part_t *result_parts, const Teuchos::RCP< const Environment > envConst, bool is_input_adapter_distributed=true, int num_ranks_per_node=1, bool divide_to_prime_first=false, bool reduce_best_mapping=true)
Constructor. Instead of Solution we have two parameters, numparts.
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
Time an algorithm (or other entity) as a whole.
WT getDistance(IT index, WT **elementCoords)
virtual size_t getLocalNumberOfParts() const
Returns the number of parts to be assigned to this process.
bool getHopCount(int rank1, int rank2, pcoord_t &hops) const
return the hop count between rank1 and rank2
void getCoarsenedPartGraph(const Environment *envConst, const Teuchos::Comm< int > *comm, const Zoltan2::GraphModel< typename Adapter::base_adapter_t > *graph, part_t np, const part_t *parts, ArrayRCP< part_t > &g_part_xadj, ArrayRCP< part_t > &g_part_adj, ArrayRCP< scalar_t > &g_part_ew)
void visualize_mapping(int myRank, const int machine_coord_dim, const int num_ranks, proc_coord_t **machine_coords, const v_lno_t num_tasks, const v_lno_t *task_communication_xadj, const v_lno_t *task_communication_adj, const int *task_to_rank)
PartitionMapping maps a solution or an input distribution to ranks.
ArrayRCP< part_t > local_task_to_rank
void getSolutionCenterCoordinates(const Environment *envConst, const Teuchos::Comm< int > *comm, const Zoltan2::CoordinateModel< typename Adapter::base_adapter_t > *coords, const part_t *parts, int coordDim, part_t ntasks, scalar_t **partCenters)
size_t getLocalNumVertices() const
Returns the number vertices on this process.
size_t getEdgeList(ArrayView< const gno_t > &edgeIds, ArrayView< const offset_t > &offsets, ArrayView< input_t > &wgts) const
Sets pointers to this process' edge (neighbor) global Ids, including off-process edges.
bool getNewCenters(WT **coords)
void calculateCommunicationCost(part_t *task_to_proc, part_t *task_communication_xadj, part_t *task_communication_adj, pcoord_t *task_communication_edge_weight)
void timerStop(TimerType tt, const char *timerName) const
Stop a named timer.
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
map_t::global_ordinal_type gno_t
virtual std::vector< coordinateModelPartBox > & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
size_t getLocalNumEdges() const
Returns the number of edges on this process. In global or subset graphs, includes off-process edges...
virtual ~CoordinateTaskMapper()
const part_t * solution_parts
ArrayView< part_t > getAssignedTasksForProc(part_t procId)
bool * machine_extent_wrap_around
part_t getAssignedProcForTask(part_t taskId)
getAssignedProcForTask function, returns the assigned processor id for the given task ...
void getMinDistanceCluster(IT *procPermutation)
void doMapping(int myRank, const Teuchos::RCP< const Teuchos::Comm< int > > comm_)
doMapping function, calls getMapping function of communicationModel object.
virtual void getPartsForProc(int procId, part_t &numParts, part_t *&parts) const
getPartsForProc function, returns the tasks assigned to the given processor along with the number of tasks.
Defines the XpetraMultiVectorAdapter.
const MachineRepresentation< pcoord_t, part_t > * machine
SparseMatrixAdapter_t::part_t part_t
Multi Jagged coordinate partitioning algorithm.
virtual void getProcsForPart(part_t taskId, part_t &numProcs, part_t *&procs) const
getProcsForPart function, returns the processors assigned to the given task along with the number of processors.
void create_local_task_to_rank(const lno_t num_local_coords, const part_t *local_coord_parts, const ArrayRCP< part_t > task_to_proc_)
Kokkos::View< part_t *, Kokkos::HostSpace > kokkos_partNoArray
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
double getCommunicationCostMetric()
virtual void map(const RCP< MappingSolution< Adapter > > &mappingsoln)
Mapping method.
int getNumWeightsPerEdge() const
Returns the number (0 or greater) of weights per edge.
CoordinateTaskMapper(const Environment *env_const_, const Teuchos::Comm< int > *problemComm, int proc_dim, int num_processors, pcoord_t **machine_coords, int task_dim, part_t num_tasks, tcoord_t **task_coords, ArrayRCP< part_t >task_comm_xadj, ArrayRCP< part_t >task_comm_adj, pcoord_t *task_communication_edge_weight_, int recursion_depth, Kokkos::View< part_t *, Kokkos::HostSpace > part_no_array, const part_t *machine_dimensions, int num_ranks_per_node=1, bool divide_to_prime_first=false, bool reduce_best_mapping=true)
Constructor The mapping constructor which will also perform the mapping operation. The result mapping can be obtained by –getAssignedProcForTask function: which returns the assigned processor id for the given task –getPartsForProc: which returns the assigned tasks with the number of tasks.
Contains the Multi-jagged algorithm.
Adapter::scalar_t scalar_t
PartitionMapping maps a solution or an input distribution to ranks.
void getProcTask(part_t *&proc_to_task_xadj_, part_t *&proc_to_task_adj_)
void update_visit_order(part_t *visitOrder, part_t n, int &_u_umpa_seed, part_t rndm)
void getClosestSubset(part_t *proc_permutation, part_t nprocs, part_t ntasks) const
Function is called whenever nprocs > no_task. Function returns only the subset of processors that are...
A PartitioningSolution is a solution to a partitioning problem.
CoordinateCommunicationModel< pcoord_t, tcoord_t, part_t, node_t > * proc_task_comm
size_t getCoordinates(ArrayView< const gno_t > &Ids, ArrayView< input_t > &xyz, ArrayView< input_t > &wgts) const
Returns the coordinate ids, values and optional weights.
void copyCoordinates(IT *permutation)
void getGridCommunicationGraph(part_t taskCount, part_t *&task_comm_xadj, part_t *&task_comm_adj, std::vector< int > grid_dims)
Zoltan2_ReduceBestMapping Class, reduces the minimum cost mapping, breaking ties with the minimum proc id...
ArrayRCP< part_t > task_communication_adj
BaseAdapterType
An enum to identify general types of adapters.
void setPartArray(Kokkos::View< part_t *, Kokkos::HostSpace > pNo)
The StridedData class manages lists of weights or coordinates.
CoordinateCommunicationModel()
map_t::local_ordinal_type lno_t
ArrayRCP< part_t > proc_to_task_adj
virtual void getMapping(int myRank, const RCP< const Environment > &env, ArrayRCP< part_t > &proc_to_task_xadj, ArrayRCP< part_t > &proc_to_task_adj, ArrayRCP< part_t > &task_to_proc, const Teuchos::RCP< const Teuchos::Comm< int > > comm_) const =0
Function is called whenever nprocs > no_task. Function returns only the subset of processors that are...
The user parameters, debug, timing and memory profiling output objects, and error checking methods...
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
virtual void getMapping(int myRank, const RCP< const Environment > &env, ArrayRCP< part_t > &rcp_proc_to_task_xadj, ArrayRCP< part_t > &rcp_proc_to_task_adj, ArrayRCP< part_t > &rcp_task_to_proc, const Teuchos::RCP< const Teuchos::Comm< int > > comm_) const
Function is called whenever nprocs > no_task. Function returns only the subset of processors that are...
void addPoint(IT index, WT distance)
bool getNewCenters(WT *center, WT **coords, int dimension)
MachineRepresentation Class Base class for representing machine coordinates, networks, etc.
CommunicationModel(part_t no_procs_, part_t no_tasks_)
virtual double getProcDistance(int procId1, int procId2) const
size_t getVertexList(ArrayView< const gno_t > &Ids, ArrayView< input_t > &wgts) const
Sets pointers to this process' vertex Ids and their weights.
static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
CoordinateTaskMapper(const Teuchos::RCP< const Teuchos::Comm< int > > comm_, const Teuchos::RCP< const MachineRepresentation< pcoord_t, part_t > > machine_, const Teuchos::RCP< const Adapter > input_adapter_, const Teuchos::RCP< const Zoltan2::PartitioningSolution< Adapter > > soln_, const Teuchos::RCP< const Environment > envConst, bool is_input_adapter_distributed=true, int num_ranks_per_node=1, bool divide_to_prime_first=false, bool reduce_best_mapping=true)
Constructor. When this constructor is called, in order to calculate the communication metric...
virtual ~CoordinateCommunicationModel()
CoordinateModelInput Class that performs mapping between the coordinate partitioning result and mpi r...
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS's idx_t...
size_t getActualGlobalNumberOfParts() const
Returns the actual global number of parts provided in setParts().
pcoord_t ** shiftMachineCoordinates(int machine_dim, const part_t *machine_dimensions, bool *machine_extent_wrap_around, part_t numProcs, pcoord_t **mCoords)
Using the machine dimensions provided, create virtual machine coordinates by assigning the largest ga...
Defines the MappingSolution class.
const Teuchos::RCP< const Teuchos::Comm< int > > comm
CoordinateCommunicationModel(int pcoord_dim_, pcoord_t **pcoords_, int tcoord_dim_, tcoord_t **tcoords_, part_t no_procs_, part_t no_tasks_, int *machine_extent_, bool *machine_extent_wrap_around_, const MachineRepresentation< pcoord_t, part_t > *machine_=NULL)
Class Constructor:
bool divide_to_prime_first
virtual ~CommunicationModel()
ArrayRCP< part_t > task_communication_xadj
const part_t * getPartListView() const
Returns the part list corresponding to the global ID list.
GraphModel defines the interface required for graph models.
void coordinateTaskMapperInterface(RCP< const Teuchos::Comm< int > > problemComm, int proc_dim, int num_processors, pcoord_t **machine_coords, int task_dim, part_t num_tasks, tcoord_t **task_coords, part_t *task_comm_xadj, part_t *task_comm_adj, pcoord_t *task_communication_edge_weight_, part_t *proc_to_task_xadj, part_t *proc_to_task_adj, int recursion_depth, Kokkos::View< part_t *, Kokkos::HostSpace > part_no_array, const part_t *machine_dimensions, int num_ranks_per_node=1, bool divide_to_prime_first=false)
Constructor The interface function that calls CoordinateTaskMapper which will also perform the mappin...
Gathering definitions used in software development.
Zoltan2_ReduceBestMapping()
Default Constructor.
size_t getTargetGlobalNumberOfParts() const
Returns the global number of parts desired in the solution.
void timerStart(TimerType tt, const char *timerName) const
Start a named timer.
KmeansHeap Class, max heap, but holds the minimum values.
Defines the GraphModel interface.
void fillContinousArray(T *arr, size_t arrSize, T *val)
fillContinousArray function
void push_down(IT index_on_heap)
ArrayRCP< scalar_t > task_communication_edge_weight
RCP< Comm< int > > create_subCommunicator()
creates and returns the subcommunicator for the processor group.
KMeansAlgorithm Class that performs clustering of the coordinates, and returns the closest set of coo...
const Teuchos::RCP< const Environment > env
ArrayRCP< part_t > task_to_proc
KMeansAlgorithm(int dim_, IT numElements_, WT **elementCoords_, IT required_elements_)
KMeansAlgorithm Constructor.
Tpetra::MultiVector< zscalar_t, zlno_t, zgno_t, znode_t > tMVector_t
void writeMapping2(int myRank)
void copyCoordinates(IT *permutation)
virtual double getProcDistance(int procId1, int procId2) const =0
ArrayRCP< part_t > proc_to_task_xadj
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void setHeapsize(IT heapsize_)