Tpetra parallel linear algebra  Version of the Day
Tpetra_Distributor.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_DISTRIBUTOR_HPP
41 #define TPETRA_DISTRIBUTOR_HPP
42 
43 #include "Tpetra_Util.hpp"
44 #include "Teuchos_as.hpp"
45 #include "Teuchos_Describable.hpp"
46 #include "Teuchos_ParameterListAcceptorDefaultBase.hpp"
47 #include "Teuchos_VerboseObject.hpp"
49 
50 #include "KokkosCompat_View.hpp"
51 #include "Kokkos_Core.hpp"
52 #include "Kokkos_TeuchosCommAdapters.hpp"
53 #include <memory>
54 #include <sstream>
55 #include <type_traits>
56 
57 namespace Tpetra {
58 
59  namespace Details {
65  DISTRIBUTOR_ISEND, // Use MPI_Isend (Teuchos::isend)
66  DISTRIBUTOR_RSEND, // Use MPI_Rsend (Teuchos::readySend)
67  DISTRIBUTOR_SEND, // Use MPI_Send (Teuchos::send)
68  DISTRIBUTOR_SSEND // Use MPI_Ssend (Teuchos::ssend)
69  };
70 
75  std::string
77 
83  DISTRIBUTOR_NOT_INITIALIZED, // Not initialized yet
84  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS, // By createFromSends
85  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS, // By createFromRecvs
86  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS, // By createFromSendsAndRecvs
87  DISTRIBUTOR_INITIALIZED_BY_REVERSE, // By createReverseDistributor
88  DISTRIBUTOR_INITIALIZED_BY_COPY, // By copy constructor
89  };
90 
95  std::string
97 
98  } // namespace Details
99 
106  Teuchos::Array<std::string> distributorSendTypes ();
107 
175  class Distributor :
176  public Teuchos::Describable,
177  public Teuchos::ParameterListAcceptorDefaultBase {
178  public:
180 
181 
190  explicit Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm);
191 
203  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
204  const Teuchos::RCP<Teuchos::FancyOStream>& out);
205 
219  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
220  const Teuchos::RCP<Teuchos::ParameterList>& plist);
221 
238  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
239  const Teuchos::RCP<Teuchos::FancyOStream>& out,
240  const Teuchos::RCP<Teuchos::ParameterList>& plist);
241 
243  Distributor (const Distributor& distributor);
244 
249  virtual ~Distributor () = default;
250 
256  void swap (Distributor& rhs);
257 
259 
261 
266  void setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist);
267 
272  Teuchos::RCP<const Teuchos::ParameterList> getValidParameters () const;
273 
275 
277 
297  size_t createFromSends (const Teuchos::ArrayView<const int>& exportProcIDs);
298 
332  template <class Ordinal>
333  void
334  createFromRecvs (const Teuchos::ArrayView<const Ordinal>& remoteIDs,
335  const Teuchos::ArrayView<const int>& remoteProcIDs,
336  Teuchos::Array<Ordinal>& exportIDs,
337  Teuchos::Array<int>& exportProcIDs);
338 
346  void
347  createFromSendsAndRecvs (const Teuchos::ArrayView<const int>& exportProcIDs,
348  const Teuchos::ArrayView<const int>& remoteProcIDs);
349 
351 
353 
357  size_t getNumReceives() const;
358 
362  size_t getNumSends() const;
363 
365  bool hasSelfMessage() const;
366 
368  size_t getMaxSendLength() const;
369 
371  size_t getTotalReceiveLength() const;
372 
377  Teuchos::ArrayView<const int> getProcsFrom() const;
378 
383  Teuchos::ArrayView<const int> getProcsTo() const;
384 
392  Teuchos::ArrayView<const size_t> getLengthsFrom() const;
393 
401  Teuchos::ArrayView<const size_t> getLengthsTo() const;
402 
408  return howInitialized_;
409  }
410 
412 
414 
425  Teuchos::RCP<Distributor> getReverse(bool create=true) const;
426 
428 
430 
451  template <class Packet>
452  void
453  doPostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
454  size_t numPackets,
455  const Teuchos::ArrayView<Packet> &imports);
456 
478  template <class Packet>
479  void
480  doPostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
481  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
482  const Teuchos::ArrayView<Packet> &imports,
483  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
484 
509  template <class Packet>
510  void
511  doPosts (const Teuchos::ArrayRCP<const Packet> &exports,
512  size_t numPackets,
513  const Teuchos::ArrayRCP<Packet> &imports);
514 
533  template <class Packet>
534  void
535  doPosts (const Teuchos::ArrayRCP<const Packet> &exports,
536  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
537  const Teuchos::ArrayRCP<Packet> &imports,
538  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
539 
546  void doWaits ();
547 
552  template <class Packet>
553  void
554  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
555  size_t numPackets,
556  const Teuchos::ArrayView<Packet> &imports);
557 
562  template <class Packet>
563  void
564  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
565  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
566  const Teuchos::ArrayView<Packet> &imports,
567  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
568 
573  template <class Packet>
574  void
575  doReversePosts (const Teuchos::ArrayRCP<const Packet> &exports,
576  size_t numPackets,
577  const Teuchos::ArrayRCP<Packet> &imports);
578 
583  template <class Packet>
584  void
585  doReversePosts (const Teuchos::ArrayRCP<const Packet> &exports,
586  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
587  const Teuchos::ArrayRCP<Packet> &imports,
588  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
589 
596  void doReverseWaits ();
597 
618  template <class ExpView, class ImpView>
619  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
621  const ExpView &exports,
622  size_t numPackets,
623  const ImpView &imports);
624 
646  template <class ExpView, class ImpView>
647  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
648  doPostsAndWaits (const ExpView &exports,
649  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
650  const ImpView &imports,
651  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
652 
677  template <class ExpView, class ImpView>
678  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
679  doPosts (const ExpView &exports,
680  size_t numPackets,
681  const ImpView &imports);
682 
701  template <class ExpView, class ImpView>
702  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
703  doPosts (const ExpView &exports,
704  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
705  const ImpView &imports,
706  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
707 
712  template <class ExpView, class ImpView>
713  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
714  doReversePostsAndWaits (const ExpView &exports,
715  size_t numPackets,
716  const ImpView &imports);
717 
722  template <class ExpView, class ImpView>
723  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
724  doReversePostsAndWaits (const ExpView &exports,
725  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
726  const ImpView &imports,
727  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
728 
733  template <class ExpView, class ImpView>
734  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
735  doReversePosts (const ExpView &exports,
736  size_t numPackets,
737  const ImpView &imports);
738 
743  template <class ExpView, class ImpView>
744  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
745  doReversePosts (const ExpView &exports,
746  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
747  const ImpView &imports,
748  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
749 
753  void getLastDoStatistics(size_t & bytes_sent, size_t & bytes_recvd) const{
754  bytes_sent = lastRoundBytesSend_;
755  bytes_recvd = lastRoundBytesRecv_;
756  }
757 
759 
761 
763  std::string description() const;
764 
786  void
787  describe (Teuchos::FancyOStream& out,
788  const Teuchos::EVerbosityLevel verbLevel =
789  Teuchos::Describable::verbLevel_default) const;
791 
792  private:
794  Teuchos::RCP<const Teuchos::Comm<int> > comm_;
795 
797  Details::EDistributorHowInitialized howInitialized_;
798 
800 
801 
804 
806  bool barrierBetween_;
807 
809  static bool getVerbose();
810 
815  std::unique_ptr<std::string>
816  createPrefix(const char methodName[]) const;
817 
819  bool verbose_ = getVerbose();
821 
825  bool selfMessage_;
826 
836  size_t numSends_;
837 
842  Teuchos::Array<int> procsTo_;
843 
852  Teuchos::Array<size_t> startsTo_;
853 
859  Teuchos::Array<size_t> lengthsTo_;
860 
864  size_t maxSendLength_;
865 
881  Teuchos::Array<size_t> indicesTo_;
882 
892  size_t numReceives_;
893 
900  size_t totalReceiveLength_;
901 
907  Teuchos::Array<size_t> lengthsFrom_;
908 
914  Teuchos::Array<int> procsFrom_;
915 
921  Teuchos::Array<size_t> startsFrom_;
922 
928  Teuchos::Array<size_t> indicesFrom_;
929 
936  Teuchos::Array<Teuchos::RCP<Teuchos::CommRequest<int> > > requests_;
937 
942  mutable Teuchos::RCP<Distributor> reverseDistributor_;
943 
945  size_t lastRoundBytesSend_;
946 
948  size_t lastRoundBytesRecv_;
949 
950 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
951  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_;
952  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_;
953  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_;
954  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_;
955  Teuchos::RCP<Teuchos::Time> timer_doWaits_;
956  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_recvs_;
957  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_recvs_;
958  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_barrier_;
959  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_barrier_;
960  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_sends_;
961  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_sends_;
962  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_sends_slow_;
963  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_sends_slow_;
964  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_sends_fast_;
965  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_sends_fast_;
966  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_recvs_;
967  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_recvs_;
968  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_barrier_;
969  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_barrier_;
970  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_sends_;
971  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_sends_;
972  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_sends_slow_;
973  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_sends_slow_;
974  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_sends_fast_;
975  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_sends_fast_;
976 
978  void makeTimers ();
979 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
980 
992  bool useDistinctTags_;
993 
998  int getTag (const int pathTag) const;
999 
1010  void computeReceives ();
1011 
1024  template <class Ordinal>
1025  void computeSends (const Teuchos::ArrayView<const Ordinal> &remoteGIDs,
1026  const Teuchos::ArrayView<const int> &remoteProcIDs,
1027  Teuchos::Array<Ordinal> &exportGIDs,
1028  Teuchos::Array<int> &exportProcIDs);
1029 
1031  void createReverseDistributor() const;
1032 
1033 
1038  std::string
1039  localDescribeToString (const Teuchos::EVerbosityLevel vl) const;
1040  }; // class Distributor
1041 
1042 
1043  template <class Packet>
1045  doPostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1046  size_t numPackets,
1047  const Teuchos::ArrayView<Packet>& imports)
1048  {
1049  using Teuchos::arcp;
1050  using Teuchos::ArrayRCP;
1051  typedef typename ArrayRCP<const Packet>::size_type size_type;
1052 
1053  TEUCHOS_TEST_FOR_EXCEPTION(
1054  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1055  "doPostsAndWaits(3 args): There are " << requests_.size () <<
1056  " outstanding nonblocking messages pending. It is incorrect to call "
1057  "this method with posts outstanding.");
1058 
1059  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1060  // requiring that the memory location is persisting (as is
1061  // necessary for nonblocking receives). However, it need only
1062  // persist until doWaits() completes, so it is safe for us to use
1063  // a nonpersisting reference in this case. The use of a
1064  // nonpersisting reference is purely a performance optimization.
1065 
1066  //const Packet* exportsPtr = exports.getRawPtr();
1067  //ArrayRCP<const Packet> exportsArcp (exportsPtr, static_cast<size_type> (0),
1068  // exports.size(), false);
1069  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1070  static_cast<size_type> (0),
1071  exports.size(), false);
1072 
1073  // For some reason, neither of the options below (that use arcp)
1074  // compile for Packet=std::complex<double> with GCC 4.5.1. The
1075  // issue only arises with the exports array. This is why we
1076  // construct a separate nonowning ArrayRCP.
1077 
1078  // doPosts (arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1079  // numPackets,
1080  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1081  // doPosts (arcp<const Packet> (exportsPtr, 0, exports.size(), false),
1082  // numPackets,
1083  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1084  doPosts (exportsArcp,
1085  numPackets,
1086  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1087  doWaits ();
1088 
1089  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1090  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1091  }
1092 
1093  template <class Packet>
1095  doPostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1096  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1097  const Teuchos::ArrayView<Packet> &imports,
1098  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1099  {
1100  using Teuchos::arcp;
1101  using Teuchos::ArrayRCP;
1102 
1103  TEUCHOS_TEST_FOR_EXCEPTION(
1104  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1105  "doPostsAndWaits: There are " << requests_.size () << " outstanding "
1106  "nonblocking messages pending. It is incorrect to call doPostsAndWaits "
1107  "with posts outstanding.");
1108 
1109  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1110  // requiring that the memory location is persisting (as is
1111  // necessary for nonblocking receives). However, it need only
1112  // persist until doWaits() completes, so it is safe for us to use
1113  // a nonpersisting reference in this case.
1114 
1115  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1116  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1117  // with some versions of GCC. The issue only arises with the
1118  // exports array. This is why we construct a separate nonowning
1119  // ArrayRCP.
1120  typedef typename ArrayRCP<const Packet>::size_type size_type;
1121  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1122  static_cast<size_type> (0),
1123  exports.size (), false);
1124  // mfh 04 Apr 2012: This is the offending code. This statement
1125  // would normally be in place of "exportsArcp" in the
1126  // doPosts() call below.
1127  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1128  doPosts (exportsArcp,
1129  numExportPacketsPerLID,
1130  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1131  numImportPacketsPerLID);
1132  doWaits ();
1133 
1134  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1135  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1136  }
1137 
1138 
1139  template <class Packet>
1141  doPosts (const Teuchos::ArrayRCP<const Packet>& exports,
1142  size_t numPackets,
1143  const Teuchos::ArrayRCP<Packet>& imports)
1144  {
1145  using Teuchos::Array;
1146  using Teuchos::ArrayRCP;
1147  using Teuchos::ArrayView;
1148  using Teuchos::as;
1149  using Teuchos::FancyOStream;
1150  using Teuchos::includesVerbLevel;
1151  using Teuchos::ireceive;
1152  using Teuchos::isend;
1153  using Teuchos::readySend;
1154  using Teuchos::send;
1155  using Teuchos::ssend;
1156  using Teuchos::TypeNameTraits;
1157  using Teuchos::typeName;
1158  using std::endl;
1159  using size_type = Array<size_t>::size_type;
1160 
1161 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1162  Teuchos::TimeMonitor timeMon (*timer_doPosts3TA_);
1163 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1164 
1165  const bool debug = Details::Behavior::debug("Distributor");
1166  const int myRank = comm_->getRank ();
1167  // Run-time configurable parameters that come from the input
1168  // ParameterList set by setParameterList().
1169  const Details::EDistributorSendType sendType = sendType_;
1170  const bool doBarrier = barrierBetween_;
1171 
1172  std::unique_ptr<std::string> prefix;
1173  if (verbose_) {
1174  prefix = createPrefix("doPosts(3-arg, ArrayRCP)");
1175  std::ostringstream os;
1176  os << *prefix << "Start" << endl;
1177  std::cerr << os.str();
1178  }
1179 
1180  TEUCHOS_TEST_FOR_EXCEPTION(
1181  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
1182  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): Ready-send "
1183  "version requires a barrier between posting receives and posting ready "
1184  "sends. This should have been checked before. "
1185  "Please report this bug to the Tpetra developers.");
1186 
1187  size_t selfReceiveOffset = 0;
1188 
1189  // mfh 30 Mar 2016: See Github Issue #227 to see why we need to
1190  // check whether we're doing reverse mode before checking the
1191  // length of the imports array.
1192  if (howInitialized_ != Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE) {
1193  // Each message has the same number of packets.
1194  //
1195  // FIXME (mfh 18 Jul 2014): Relaxing this test from strict
1196  // inequality to a less-than seems to have fixed Bug 6170. It's
1197  // OK for the 'imports' array to be longer than it needs to be;
1198  // I'm just curious why it would be.
1199  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
1200  TEUCHOS_TEST_FOR_EXCEPTION
1201  (static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1202  std::invalid_argument,
1203  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1204  "The 'imports' array must have enough entries to hold the expected number "
1205  "of import packets. imports.size() = " << imports.size () << " < "
1206  "totalNumImportPackets = " << totalNumImportPackets << ".");
1207  }
1208 
1209  // MPI tag for nonblocking receives and blocking sends in this
1210  // method. Some processes might take the "fast" path
1211  // (indicesTo_.empty()) and others might take the "slow" path for
1212  // the same doPosts() call, so the path tag must be the same for
1213  // both.
1214  const int pathTag = 0;
1215  const int tag = this->getTag (pathTag);
1216 
1217  if (debug) {
1218  TEUCHOS_TEST_FOR_EXCEPTION
1219  (requests_.size () != 0,
1220  std::logic_error,
1221  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): Process "
1222  << myRank << ": requests_.size() = " << requests_.size () << " != 0.");
1223  }
1224 
1225  // Distributor uses requests_.size() as the number of outstanding
1226  // nonblocking message requests, so we resize to zero to maintain
1227  // this invariant.
1228  //
1229  // numReceives_ does _not_ include the self message, if there is
1230  // one. Here, we do actually send a message to ourselves, so we
1231  // include any self message in the "actual" number of receives to
1232  // post.
1233  //
1234  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1235  // doesn't (re)allocate its array of requests. That happens in
1236  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1237  // demand), or Resize_().
1238  const size_type actualNumReceives = as<size_type> (numReceives_) +
1239  as<size_type> (selfMessage_ ? 1 : 0);
1240  requests_.resize (0);
1241 
1242  if (verbose_) {
1243  std::ostringstream os;
1244  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1245  << ": Post receives" << endl;
1246  std::cerr << os.str();
1247  }
1248 
1249  // Post the nonblocking receives. It's common MPI wisdom to post
1250  // receives before sends. In MPI terms, this means favoring
1251  // adding to the "posted queue" (of receive requests) over adding
1252  // to the "unexpected queue" (of arrived messages not yet matched
1253  // with a receive).
1254  {
1255 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1256  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3TA_recvs_);
1257 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1258 
1259  size_t curBufOffset = 0;
1260  for (size_type i = 0; i < actualNumReceives; ++i) {
1261  const size_t curBufLen = lengthsFrom_[i] * numPackets;
1262  if (procsFrom_[i] != myRank) {
1263  if (verbose_) {
1264  std::ostringstream os;
1265  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1266  << ": Post irecv: {source: " << procsFrom_[i]
1267  << ", tag: " << tag << "}" << endl;
1268  std::cerr << os.str();
1269  }
1270  // If my process is receiving these packet(s) from another
1271  // process (not a self-receive):
1272  //
1273  // 1. Set up the persisting view (recvBuf) of the imports
1274  // array, given the offset and size (total number of
1275  // packets from process procsFrom_[i]).
1276  // 2. Start the Irecv and save the resulting request.
1277  TEUCHOS_TEST_FOR_EXCEPTION(
1278  curBufOffset + curBufLen > static_cast<size_t> (imports.size ()),
1279  std::logic_error,
1280  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1281  "Exceeded size of 'imports' array in packing loop on Process " <<
1282  myRank << ". imports.size() = " << imports.size () << " < "
1283  "curBufOffset(" << curBufOffset << ") + curBufLen(" << curBufLen
1284  << ").");
1285  ArrayRCP<Packet> recvBuf =
1286  imports.persistingView (curBufOffset, curBufLen);
1287  requests_.push_back (ireceive<int, Packet> (recvBuf, procsFrom_[i],
1288  tag, *comm_));
1289  }
1290  else { // Receiving from myself
1291  selfReceiveOffset = curBufOffset; // Remember the self-recv offset
1292  }
1293  curBufOffset += curBufLen;
1294  }
1295  }
1296 
1297  if (doBarrier) {
1298 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1299  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3TA_barrier_);
1300 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1301 
1302  if (verbose_) {
1303  std::ostringstream os;
1304  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1305  << ": Barrier" << endl;
1306  std::cerr << os.str();
1307  }
1308  // If we are using ready sends (MPI_Rsend) below, we need to do
1309  // a barrier before we post the ready sends. This is because a
1310  // ready send requires that its matching receive has already
1311  // been posted before the send has been posted. The only way to
1312  // guarantee that in this case is to use a barrier.
1313  comm_->barrier ();
1314  }
1315 
1316 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1317  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3TA_sends_);
1318 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1319 
1320  // setup scan through procsTo_ list starting with higher numbered procs
1321  // (should help balance message traffic)
1322  //
1323  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
1324  // It doesn't depend on the input at all.
1325  size_t numBlocks = numSends_ + selfMessage_;
1326  size_t procIndex = 0;
1327  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myRank)) {
1328  ++procIndex;
1329  }
1330  if (procIndex == numBlocks) {
1331  procIndex = 0;
1332  }
1333 
1334  size_t selfNum = 0;
1335  size_t selfIndex = 0;
1336 
1337  if (verbose_) {
1338  std::ostringstream os;
1339  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1340  << ": Post sends" << endl;
1341  std::cerr << os.str();
1342  }
1343 
1344  if (indicesTo_.empty ()) {
1345 
1346 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1347  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts3TA_sends_fast_);
1348 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1349 
1350  // Data are already blocked (laid out) by process, so we don't
1351  // need a separate send buffer (besides the exports array).
1352  for (size_t i = 0; i < numBlocks; ++i) {
1353  size_t p = i + procIndex;
1354  if (p > (numBlocks - 1)) {
1355  p -= numBlocks;
1356  }
1357 
1358  if (procsTo_[p] != myRank) {
1359  if (verbose_) {
1360  std::ostringstream os;
1361  os << *prefix << ": Post send: {target: "
1362  << procsTo_[p] << ", tag: " << tag << "}" << endl;
1363  std::cerr << os.str();
1364  }
1365 
1366  ArrayView<const Packet> tmpSend =
1367  exports.view (startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
1368 
1369  if (sendType == Details::DISTRIBUTOR_SEND) {
1370  send<int, Packet> (tmpSend.getRawPtr (),
1371  as<int> (tmpSend.size ()),
1372  procsTo_[p], tag, *comm_);
1373  }
1374  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1375  ArrayRCP<const Packet> tmpSendBuf =
1376  exports.persistingView (startsTo_[p] * numPackets,
1377  lengthsTo_[p] * numPackets);
1378  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1379  tag, *comm_));
1380  }
1381  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1382  readySend<int, Packet> (tmpSend.getRawPtr (),
1383  as<int> (tmpSend.size ()),
1384  procsTo_[p], tag, *comm_);
1385  }
1386  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1387  ssend<int, Packet> (tmpSend.getRawPtr (),
1388  as<int> (tmpSend.size ()),
1389  procsTo_[p], tag, *comm_);
1390  } else {
1391  TEUCHOS_TEST_FOR_EXCEPTION(
1392  true, std::logic_error,
1393  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1394  "Invalid send type. We should never get here. "
1395  "Please report this bug to the Tpetra developers.");
1396  }
1397  }
1398  else { // "Sending" the message to myself
1399  selfNum = p;
1400  }
1401  }
1402 
1403  if (selfMessage_) {
1404  if (verbose_) {
1405  std::ostringstream os;
1406  os << *prefix << "Fast: Self-send" << endl;
1407  std::cerr << os.str();
1408  }
1409  // This is how we "send a message to ourself": we copy from
1410  // the export buffer to the import buffer. That saves
1411  // Teuchos::Comm implementations other than MpiComm (in
1412  // particular, SerialComm) the trouble of implementing self
1413  // messages correctly. (To do this right, SerialComm would
1414  // need internal buffer space for messages, keyed on the
1415  // message's tag.)
1416  std::copy (exports.begin()+startsTo_[selfNum]*numPackets,
1417  exports.begin()+startsTo_[selfNum]*numPackets+lengthsTo_[selfNum]*numPackets,
1418  imports.begin()+selfReceiveOffset);
1419  }
1420  }
1421  else { // data are not blocked by proc, use send buffer
1422 
1423 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1424  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts3TA_sends_slow_);
1425 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1426 
1427  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
1428  // sends), because the buffer is only long enough for one send.
1429  ArrayRCP<Packet> sendArray (maxSendLength_ * numPackets); // send buffer
1430 
1431  TEUCHOS_TEST_FOR_EXCEPTION(
1432  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
1433  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1434  "The \"send buffer\" code path doesn't currently work with "
1435  "nonblocking sends.");
1436 
1437  for (size_t i = 0; i < numBlocks; ++i) {
1438  size_t p = i + procIndex;
1439  if (p > (numBlocks - 1)) {
1440  p -= numBlocks;
1441  }
1442 
1443  if (procsTo_[p] != myRank) {
1444  if (verbose_) {
1445  std::ostringstream os;
1446  os << *prefix << "Slow: Post send: "
1447  "{target: " << procsTo_[p] << ", tag: " << tag << "}" << endl;
1448  std::cerr << os.str();
1449  }
1450 
1451  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1452  size_t sendArrayOffset = 0;
1453  size_t j = startsTo_[p];
1454  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1455  srcBegin = exports.begin() + indicesTo_[j]*numPackets;
1456  srcEnd = srcBegin + numPackets;
1457  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1458  sendArrayOffset += numPackets;
1459  }
1460  ArrayView<const Packet> tmpSend =
1461  sendArray.view (0, lengthsTo_[p]*numPackets);
1462 
1463  if (sendType == Details::DISTRIBUTOR_SEND) {
1464  send<int, Packet> (tmpSend.getRawPtr (),
1465  as<int> (tmpSend.size ()),
1466  procsTo_[p], tag, *comm_);
1467  }
1468  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1469  ArrayRCP<const Packet> tmpSendBuf =
1470  sendArray.persistingView (0, lengthsTo_[p] * numPackets);
1471  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1472  tag, *comm_));
1473  }
1474  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1475  readySend<int, Packet> (tmpSend.getRawPtr (),
1476  as<int> (tmpSend.size ()),
1477  procsTo_[p], tag, *comm_);
1478  }
1479  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1480  ssend<int, Packet> (tmpSend.getRawPtr (),
1481  as<int> (tmpSend.size ()),
1482  procsTo_[p], tag, *comm_);
1483  }
1484  else {
1485  TEUCHOS_TEST_FOR_EXCEPTION(
1486  true, std::logic_error,
1487  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1488  "Invalid send type. We should never get here. "
1489  "Please report this bug to the Tpetra developers.");
1490  }
1491  }
1492  else { // "Sending" the message to myself
1493  selfNum = p;
1494  selfIndex = startsTo_[p];
1495  }
1496  }
1497 
1498  if (selfMessage_) {
1499  if (verbose_) {
1500  std::ostringstream os;
1501  os << *prefix << "Slow: Self-send" << endl;
1502  std::cerr << os.str();
1503  }
1504  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1505  std::copy (exports.begin()+indicesTo_[selfIndex]*numPackets,
1506  exports.begin()+indicesTo_[selfIndex]*numPackets + numPackets,
1507  imports.begin() + selfReceiveOffset);
1508  ++selfIndex;
1509  selfReceiveOffset += numPackets;
1510  }
1511  }
1512  }
1513 
1514  if (verbose_) {
1515  std::ostringstream os;
1516  os << *prefix << "Done!" << endl;
1517  std::cerr << os.str();
1518  }
1519  }
1520 
1521  template <class Packet>
1523  doPosts (const Teuchos::ArrayRCP<const Packet>& exports,
1524  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1525  const Teuchos::ArrayRCP<Packet>& imports,
1526  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1527  {
1528  using Teuchos::Array;
1529  using Teuchos::ArrayRCP;
1530  using Teuchos::ArrayView;
1531  using Teuchos::as;
1532  using Teuchos::ireceive;
1533  using Teuchos::isend;
1534  using Teuchos::readySend;
1535  using Teuchos::send;
1536  using Teuchos::ssend;
1537  using Teuchos::TypeNameTraits;
1538  using std::endl;
1539  typedef Array<size_t>::size_type size_type;
1540 
1541  std::unique_ptr<std::string> prefix;
1542  if (verbose_) {
1543  prefix = createPrefix("doPosts(4-arg, Teuchos)");
1544  std::ostringstream os;
1545  os << *prefix << "Start" << endl;
1546  std::cerr << os.str();
1547  }
1548 
1549 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1550  Teuchos::TimeMonitor timeMon (*timer_doPosts4TA_);
1551 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1552 
1553  // Run-time configurable parameters that come from the input
1554  // ParameterList set by setParameterList().
1555  const Details::EDistributorSendType sendType = sendType_;
1556  const bool doBarrier = barrierBetween_;
1557 
1558 // #ifdef HAVE_TEUCHOS_DEBUG
1559 // // Prepare for verbose output, if applicable.
1560 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
1561 // Teuchos::RCP<Teuchos::FancyOStream> out = this->getOStream ();
1562 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
1563 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
1564 
1565 // if (doPrint) {
1566 // // Only need one process to print out parameters.
1567 // *out << "Distributor::doPosts (4 args)" << endl;
1568 // }
1569 // // Add one tab level. We declare this outside the doPrint scopes
1570 // // so that the tab persists until the end of this method.
1571 // if (doPrint) {
1572 // *out << "Parameters:" << endl;
1573 // {
1574 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
1575 // << endl << "barrierBetween: " << doBarrier << endl;
1576 // }
1577 // }
1578 // #endif // HAVE_TEUCHOS_DEBUG
1579 
1580  TEUCHOS_TEST_FOR_EXCEPTION(
1581  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
1582  std::logic_error,
1583  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): Ready-send "
1584  "version requires a barrier between posting receives and posting ready "
1585  "ends. This should have been checked before. "
1586  "Please report this bug to the Tpetra developers.");
1587 
1588  const int myProcID = comm_->getRank ();
1589  size_t selfReceiveOffset = 0;
1590 
1591 #ifdef HAVE_TEUCHOS_DEBUG
1592  // Different messages may have different numbers of packets.
1593  size_t totalNumImportPackets = 0;
1594  for (size_t ii = 0; ii < static_cast<size_t> (numImportPacketsPerLID.size ()); ++ii) {
1595  totalNumImportPackets += numImportPacketsPerLID[ii];
1596  }
1597  TEUCHOS_TEST_FOR_EXCEPTION(
1598  static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1599  std::runtime_error,
1600  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): The 'imports' "
1601  "array must have enough entries to hold the expected number of import "
1602  "packets. imports.size() = " << imports.size() << " < "
1603  "totalNumImportPackets = " << totalNumImportPackets << ".");
1604 #endif // HAVE_TEUCHOS_DEBUG
1605 
1606  // MPI tag for nonblocking receives and blocking sends in this
1607  // method. Some processes might take the "fast" path
1608  // (indicesTo_.empty()) and others might take the "slow" path for
1609  // the same doPosts() call, so the path tag must be the same for
1610  // both.
1611  const int pathTag = 1;
1612  const int tag = this->getTag (pathTag);
1613 
1614 #ifdef HAVE_TEUCHOS_DEBUG
1615  TEUCHOS_TEST_FOR_EXCEPTION
1616  (requests_.size () != 0,
1617  std::logic_error,
1618  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): Process "
1619  << myProcID << ": requests_.size() = " << requests_.size ()
1620  << " != 0.");
1621 #endif // HAVE_TEUCHOS_DEBUG
1622  if (verbose_) {
1623  std::ostringstream os;
1624  os << *prefix << (indicesTo_.empty () ? "fast" : "slow")
1625  << " path" << endl;
1626  std::cerr << os.str();
1627  }
1628 
1629  // Distributor uses requests_.size() as the number of outstanding
1630  // nonblocking message requests, so we resize to zero to maintain
1631  // this invariant.
1632  //
1633  // numReceives_ does _not_ include the self message, if there is
1634  // one. Here, we do actually send a message to ourselves, so we
1635  // include any self message in the "actual" number of receives to
1636  // post.
1637  //
1638  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1639  // doesn't (re)allocate its array of requests. That happens in
1640  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1641  // demand), or Resize_().
1642  const size_type actualNumReceives = as<size_type> (numReceives_) +
1643  as<size_type> (selfMessage_ ? 1 : 0);
1644  requests_.resize (0);
1645 
1646  // Post the nonblocking receives. It's common MPI wisdom to post
1647  // receives before sends. In MPI terms, this means favoring
1648  // adding to the "posted queue" (of receive requests) over adding
1649  // to the "unexpected queue" (of arrived messages not yet matched
1650  // with a receive).
1651  {
1652 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1653  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4TA_recvs_);
1654 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1655 
1656  size_t curBufferOffset = 0;
1657  size_t curLIDoffset = 0;
1658  for (size_type i = 0; i < actualNumReceives; ++i) {
1659  size_t totalPacketsFrom_i = 0;
1660  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
1661  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
1662  }
1663  curLIDoffset += lengthsFrom_[i];
1664  if (procsFrom_[i] != myProcID && totalPacketsFrom_i) {
1665  // If my process is receiving these packet(s) from another
1666  // process (not a self-receive), and if there is at least
1667  // one packet to receive:
1668  //
1669  // 1. Set up the persisting view (recvBuf) into the imports
1670  // array, given the offset and size (total number of
1671  // packets from process procsFrom_[i]).
1672  // 2. Start the Irecv and save the resulting request.
1673  ArrayRCP<Packet> recvBuf =
1674  imports.persistingView (curBufferOffset, totalPacketsFrom_i);
1675  requests_.push_back (ireceive<int, Packet> (recvBuf, procsFrom_[i],
1676  tag, *comm_));
1677  }
1678  else { // Receiving these packet(s) from myself
1679  selfReceiveOffset = curBufferOffset; // Remember the offset
1680  }
1681  curBufferOffset += totalPacketsFrom_i;
1682  }
1683  }
1684 
1685  if (doBarrier) {
1686 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1687  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4TA_barrier_);
1688 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1689  // If we are using ready sends (MPI_Rsend) below, we need to do
1690  // a barrier before we post the ready sends. This is because a
1691  // ready send requires that its matching receive has already
1692  // been posted before the send has been posted. The only way to
1693  // guarantee that in this case is to use a barrier.
1694  comm_->barrier ();
1695  }
1696 
1697 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1698  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4TA_sends_);
1699 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1700 
1701  // setup arrays containing starting-offsets into exports for each send,
1702  // and num-packets-to-send for each send.
1703  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
1704  size_t maxNumPackets = 0;
1705  size_t curPKToffset = 0;
1706  for (size_t pp=0; pp<numSends_; ++pp) {
1707  sendPacketOffsets[pp] = curPKToffset;
1708  size_t numPackets = 0;
1709  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
1710  numPackets += numExportPacketsPerLID[j];
1711  }
1712  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
1713  packetsPerSend[pp] = numPackets;
1714  curPKToffset += numPackets;
1715  }
1716 
1717  // setup scan through procsTo_ list starting with higher numbered procs
1718  // (should help balance message traffic)
1719  size_t numBlocks = numSends_+ selfMessage_;
1720  size_t procIndex = 0;
1721  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myProcID)) {
1722  ++procIndex;
1723  }
1724  if (procIndex == numBlocks) {
1725  procIndex = 0;
1726  }
1727 
1728  size_t selfNum = 0;
1729  size_t selfIndex = 0;
1730 
1731  if (indicesTo_.empty()) {
1732 
1733 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1734  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4TA_sends_fast_);
1735 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1736 
1737  if (verbose_) {
1738  std::ostringstream os;
1739  os << *prefix << "fast path: posting sends" << endl;
1740  std::cerr << os.str();
1741  }
1742 
1743  // Data are already blocked (laid out) by process, so we don't
1744  // need a separate send buffer (besides the exports array).
1745  for (size_t i = 0; i < numBlocks; ++i) {
1746  size_t p = i + procIndex;
1747  if (p > (numBlocks - 1)) {
1748  p -= numBlocks;
1749  }
1750 
1751  if (procsTo_[p] != myProcID && packetsPerSend[p] > 0) {
1752  ArrayView<const Packet> tmpSend =
1753  exports.view (sendPacketOffsets[p], packetsPerSend[p]);
1754 
1755  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
1756  send<int, Packet> (tmpSend.getRawPtr (),
1757  as<int> (tmpSend.size ()),
1758  procsTo_[p], tag, *comm_);
1759  }
1760  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1761  readySend<int, Packet> (tmpSend.getRawPtr (),
1762  as<int> (tmpSend.size ()),
1763  procsTo_[p], tag, *comm_);
1764  }
1765  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1766  ArrayRCP<const Packet> tmpSendBuf =
1767  exports.persistingView (sendPacketOffsets[p], packetsPerSend[p]);
1768  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1769  tag, *comm_));
1770  }
1771  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1772  ssend<int, Packet> (tmpSend.getRawPtr (),
1773  as<int> (tmpSend.size ()),
1774  procsTo_[p], tag, *comm_);
1775  }
1776  else {
1777  TEUCHOS_TEST_FOR_EXCEPTION(
1778  true, std::logic_error,
1779  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): "
1780  "Invalid send type. We should never get here. Please report "
1781  "this bug to the Tpetra developers.");
1782  }
1783  }
1784  else { // "Sending" the message to myself
1785  selfNum = p;
1786  }
1787  }
1788 
1789  if (selfMessage_) {
1790  std::copy (exports.begin()+sendPacketOffsets[selfNum],
1791  exports.begin()+sendPacketOffsets[selfNum]+packetsPerSend[selfNum],
1792  imports.begin()+selfReceiveOffset);
1793  }
1794  if (verbose_) {
1795  std::ostringstream os;
1796  os << *prefix << "fast path: done" << endl;
1797  std::cerr << os.str();
1798  }
1799  }
1800  else { // data are not blocked by proc, use send buffer
1801 
1802 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1803  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4TA_sends_slow_);
1804 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1805 
1806  if (verbose_) {
1807  std::ostringstream os;
1808  os << *prefix << "slow path: posting sends" << endl;
1809  std::cerr << os.str();
1810  }
1811 
1812  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
1813  ArrayRCP<Packet> sendArray (maxNumPackets); // send buffer
1814 
1815  TEUCHOS_TEST_FOR_EXCEPTION(
1816  sendType == Details::DISTRIBUTOR_ISEND,
1817  std::logic_error,
1818  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): "
1819  "The \"send buffer\" code path may not necessarily work with nonblocking sends.");
1820 
1821  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
1822  size_t ioffset = 0;
1823  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
1824  indicesOffsets[j] = ioffset;
1825  ioffset += numExportPacketsPerLID[j];
1826  }
1827 
1828  for (size_t i = 0; i < numBlocks; ++i) {
1829  size_t p = i + procIndex;
1830  if (p > (numBlocks - 1)) {
1831  p -= numBlocks;
1832  }
1833 
1834  if (procsTo_[p] != myProcID) {
1835  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1836  size_t sendArrayOffset = 0;
1837  size_t j = startsTo_[p];
1838  size_t numPacketsTo_p = 0;
1839  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1840  srcBegin = exports.begin() + indicesOffsets[j];
1841  srcEnd = srcBegin + numExportPacketsPerLID[j];
1842  numPacketsTo_p += numExportPacketsPerLID[j];
1843  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1844  sendArrayOffset += numExportPacketsPerLID[j];
1845  }
1846  if (numPacketsTo_p > 0) {
1847  ArrayView<const Packet> tmpSend =
1848  sendArray.view (0, numPacketsTo_p);
1849 
1850  if (sendType == Details::DISTRIBUTOR_RSEND) {
1851  readySend<int, Packet> (tmpSend.getRawPtr (),
1852  as<int> (tmpSend.size ()),
1853  procsTo_[p], tag, *comm_);
1854  }
1855  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1856  ArrayRCP<const Packet> tmpSendBuf =
1857  sendArray.persistingView (0, numPacketsTo_p);
1858  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1859  tag, *comm_));
1860  }
1861  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1862  ssend<int, Packet> (tmpSend.getRawPtr (),
1863  as<int> (tmpSend.size ()),
1864  procsTo_[p], tag, *comm_);
1865  }
1866  else { // if (sendType == Details::DISTRIBUTOR_SSEND)
1867  send<int, Packet> (tmpSend.getRawPtr (),
1868  as<int> (tmpSend.size ()),
1869  procsTo_[p], tag, *comm_);
1870  }
1871  }
1872  }
1873  else { // "Sending" the message to myself
1874  selfNum = p;
1875  selfIndex = startsTo_[p];
1876  }
1877  }
1878 
1879  if (selfMessage_) {
1880  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1881  std::copy (exports.begin()+indicesOffsets[selfIndex],
1882  exports.begin()+indicesOffsets[selfIndex]+numExportPacketsPerLID[selfIndex],
1883  imports.begin() + selfReceiveOffset);
1884  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
1885  ++selfIndex;
1886  }
1887  }
1888  if (verbose_) {
1889  std::ostringstream os;
1890  os << *prefix << "slow path: done" << endl;
1891  std::cerr << os.str();
1892  }
1893  }
1894  }
1895 
1896  template <class Packet>
1898  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1899  size_t numPackets,
1900  const Teuchos::ArrayView<Packet>& imports)
1901  {
1902  using Teuchos::arcp;
1903  using Teuchos::ArrayRCP;
1904  using Teuchos::as;
1905 
1906  // doReversePosts() takes exports and imports as ArrayRCPs,
1907  // requiring that the memory locations are persisting. However,
1908  // they need only persist within the scope of that routine, so it
1909  // is safe for us to use nonpersisting references in this case.
1910 
1911  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1912  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1913  // with some versions of GCC. The issue only arises with the
1914  // exports array. This is why we construct a separate nonowning
1915  // ArrayRCP.
1916  typedef typename ArrayRCP<const Packet>::size_type size_type;
1917  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0),
1918  exports.size(), false);
1919  // mfh 04 Apr 2012: This is the offending code. This statement
1920  // would normally be in place of "exportsArcp" in the
1921  // doReversePosts() call below.
1922  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false)
1923  doReversePosts (exportsArcp,
1924  numPackets,
1925  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1926  doReverseWaits ();
1927 
1928  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1929  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1930  }
1931 
1932  template <class Packet>
1934  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1935  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1936  const Teuchos::ArrayView<Packet> &imports,
1937  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1938  {
1939  using Teuchos::as;
1940  using Teuchos::arcp;
1941  using Teuchos::ArrayRCP;
1942 
1943  TEUCHOS_TEST_FOR_EXCEPTION(
1944  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1945  "doReversePostsAndWaits(4 args): There are " << requests_.size ()
1946  << " outstanding nonblocking messages pending. It is incorrect to call "
1947  "this method with posts outstanding.");
1948 
1949  // doReversePosts() accepts the exports and imports arrays as
1950  // ArrayRCPs, requiring that the memory location is persisting (as
1951  // is necessary for nonblocking receives). However, it need only
1952  // persist until doReverseWaits() completes, so it is safe for us
1953  // to use a nonpersisting reference in this case. The use of a
1954  // nonpersisting reference is purely a performance optimization.
1955 
1956  // mfh 02 Apr 2012: For some reason, calling arcp<const Packet>
1957  // for Packet=std::complex<double> fails to compile with some
1958  // versions of GCC. The issue only arises with the exports array.
1959  // This is why we construct a separate nonowning ArrayRCP.
1960  typedef typename ArrayRCP<const Packet>::size_type size_type;
1961  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), as<size_type> (0),
1962  exports.size (), false);
1963  doReversePosts (exportsArcp,
1964  numExportPacketsPerLID,
1965  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1966  numImportPacketsPerLID);
1967  doReverseWaits ();
1968 
1969  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1970  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1971  }
1972 
1973  template <class Packet>
1975  doReversePosts (const Teuchos::ArrayRCP<const Packet>& exports,
1976  size_t numPackets,
1977  const Teuchos::ArrayRCP<Packet>& imports)
1978  {
1979  // FIXME (mfh 29 Mar 2012) WHY?
1980  TEUCHOS_TEST_FOR_EXCEPTION(
1981  ! indicesTo_.empty (), std::runtime_error,
1982  "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
1983  "communication when original data are blocked by process.");
1984  if (reverseDistributor_.is_null ()) {
1985  createReverseDistributor ();
1986  }
1987  reverseDistributor_->doPosts (exports, numPackets, imports);
1988  }
1989 
1990  template <class Packet>
1992  doReversePosts (const Teuchos::ArrayRCP<const Packet>& exports,
1993  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1994  const Teuchos::ArrayRCP<Packet>& imports,
1995  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1996  {
1997  // FIXME (mfh 29 Mar 2012) WHY?
1998  TEUCHOS_TEST_FOR_EXCEPTION(
1999  ! indicesTo_.empty (), std::runtime_error,
2000  "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
2001  "communication when original data are blocked by process.");
2002  if (reverseDistributor_.is_null ()) {
2003  createReverseDistributor ();
2004  }
2005  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
2006  imports, numImportPacketsPerLID);
2007  }
2008 
2009  template <class ExpView, class ImpView>
2010  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2012  doPostsAndWaits (const ExpView& exports,
2013  size_t numPackets,
2014  const ImpView& imports)
2015  {
2016  using Teuchos::RCP;
2017  using Teuchos::rcp;
2018  using std::endl;
2019 
2020  std::unique_ptr<std::string> prefix;
2021  if (verbose_) {
2022  prefix = createPrefix("doPostsAndWaits(3-arg, Kokkos)");
2023  std::ostringstream os;
2024  os << *prefix << "sendType: "
2025  << DistributorSendTypeEnumToString(sendType_)
2026  << ", barrierBetween: "
2027  << (barrierBetween_ ? "true" : "false") << endl;
2028  std::cerr << os.str();
2029  }
2030 
2031  TEUCHOS_TEST_FOR_EXCEPTION(
2032  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
2033  "doPostsAndWaits(3 args): There are " << requests_.size () <<
2034  " outstanding nonblocking messages pending. It is incorrect to call "
2035  "this method with posts outstanding.");
2036 
2037  if (verbose_) {
2038  std::ostringstream os;
2039  os << *prefix << "Call doPosts" << endl;
2040  std::cerr << os.str();
2041  }
2042  doPosts (exports, numPackets, imports);
2043  if (verbose_) {
2044  std::ostringstream os;
2045  os << *prefix << "Call doWaits" << endl;
2046  std::cerr << os.str();
2047  }
2048  doWaits ();
2049  if (verbose_) {
2050  std::ostringstream os;
2051  os << *prefix << "Done" << endl;
2052  std::cerr << os.str();
2053  }
2054  }
2055 
2056  template <class ExpView, class ImpView>
2057  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2059  doPostsAndWaits(const ExpView& exports,
2060  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2061  const ImpView& imports,
2062  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2063  {
2064  using std::endl;
2065  const char rawPrefix[] = "doPostsAndWaits(4-arg, Kokkos)";
2066 
2067  std::unique_ptr<std::string> prefix;
2068  if (verbose_) {
2069  prefix = createPrefix(rawPrefix);
2070  std::ostringstream os;
2071  os << *prefix << "Start" << endl;
2072  std::cerr << os.str();
2073  }
2074  TEUCHOS_TEST_FOR_EXCEPTION
2075  (requests_.size() != 0, std::runtime_error,
2076  "Tpetra::Distributor::" << rawPrefix << ": There is/are "
2077  << requests_.size() << " outstanding nonblocking message(s) "
2078  "pending. It is incorrect to call this method with posts "
2079  "outstanding.");
2080  doPosts(exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
2081  doWaits();
2082  if (verbose_) {
2083  std::ostringstream os;
2084  os << *prefix << "Done" << endl;
2085  std::cerr << os.str();
2086  }
2087  }
2088 
2089 
2090  template <class ExpView, class ImpView>
2091  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2093  doPosts (const ExpView &exports,
2094  size_t numPackets,
2095  const ImpView &imports)
2096  {
2097  using Teuchos::Array;
2098  using Teuchos::as;
2099  using Teuchos::FancyOStream;
2100  using Teuchos::includesVerbLevel;
2101  using Teuchos::ireceive;
2102  using Teuchos::isend;
2103  using Teuchos::readySend;
2104  using Teuchos::send;
2105  using Teuchos::ssend;
2106  using Teuchos::TypeNameTraits;
2107  using Teuchos::typeName;
2108  using std::endl;
2109  using Kokkos::Compat::create_const_view;
2110  using Kokkos::Compat::create_view;
2111  using Kokkos::Compat::subview_offset;
2112  using Kokkos::Compat::deep_copy_offset;
2113  typedef Array<size_t>::size_type size_type;
2114  typedef ExpView exports_view_type;
2115  typedef ImpView imports_view_type;
2116 
2117 #ifdef KOKKOS_ENABLE_CUDA
2118  static_assert
2119  (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
2120  ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
2121  "Please do not use Tpetra::Distributor with UVM allocations. "
2122  "See Trilinos GitHub issue #1088.");
2123 #endif // KOKKOS_ENABLE_CUDA
2124 
2125 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2126  Teuchos::TimeMonitor timeMon (*timer_doPosts3KV_);
2127 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2128 
2129  const int myRank = comm_->getRank ();
2130  // Run-time configurable parameters that come from the input
2131  // ParameterList set by setParameterList().
2132  const Details::EDistributorSendType sendType = sendType_;
2133  const bool doBarrier = barrierBetween_;
2134 
2135  std::unique_ptr<std::string> prefix;
2136  if (verbose_) {
2137  prefix = createPrefix("doPosts(3-arg, Kokkos)");
2138  std::ostringstream os;
2139  os << *prefix << "Start" << endl;
2140  std::cerr << os.str();
2141  }
2142 
2143  TEUCHOS_TEST_FOR_EXCEPTION(
2144  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
2145  std::logic_error,
2146  "Tpetra::Distributor::doPosts(3 args, Kokkos): Ready-send version "
2147  "requires a barrier between posting receives and posting ready sends. "
2148  "This should have been checked before. "
2149  "Please report this bug to the Tpetra developers.");
2150 
2151  size_t selfReceiveOffset = 0;
2152 
2153  // mfh 30 Mar 2016: See Github Issue #227 to see why we need to
2154  // check whether we're doing reverse mode before checking the
2155  // length of the imports array.
2156  if (false /* howInitialized_ != Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE */) {
2157  // Each message has the same number of packets.
2158  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
2159 
2160  if (verbose_) {
2161  std::ostringstream os;
2162  os << *prefix << "totalNumImportPackets = " <<
2163  totalNumImportPackets << " = " << totalReceiveLength_ << " * " <<
2164  numPackets << "; imports.extent(0) = " << imports.extent (0)
2165  << endl;
2166  std::cerr << os.str();
2167  }
2168 
2169 #ifdef HAVE_TPETRA_DEBUG
2170  // mfh 31 Mar 2016: Extra special all-reduce check to help diagnose #227.
2171  {
2172  const size_t importBufSize = static_cast<size_t> (imports.extent (0));
2173  const int lclBad = (importBufSize < totalNumImportPackets) ? 1 : 0;
2174  int gblBad = 0;
2175  using Teuchos::reduceAll;
2176  using Teuchos::REDUCE_MAX;
2177  using Teuchos::outArg;
2178  reduceAll (*comm_, REDUCE_MAX, lclBad, outArg (gblBad));
2179  TEUCHOS_TEST_FOR_EXCEPTION
2180  (gblBad != 0,
2181  std::runtime_error,
2182  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2183  "On one or more MPI processes, the 'imports' array "
2184  "does not have enough entries to hold the expected number of "
2185  "import packets. ");
2186  }
2187 #else
2188  TEUCHOS_TEST_FOR_EXCEPTION
2189  (static_cast<size_t> (imports.extent (0)) < totalNumImportPackets,
2190  std::runtime_error,
2191  "Tpetra::Distributor::doPosts(3 args, Kokkos): The 'imports' "
2192  "array must have enough entries to hold the expected number of import "
2193  "packets. imports.extent(0) = " << imports.extent (0) << " < "
2194  "totalNumImportPackets = " << totalNumImportPackets << " = "
2195  "totalReceiveLength_ (" << totalReceiveLength_ << ") * numPackets ("
2196  << numPackets << ").");
2197 #endif // HAVE_TPETRA_DEBUG
2198  }
2199 
2200  // MPI tag for nonblocking receives and blocking sends in this
2201  // method. Some processes might take the "fast" path
2202  // (indicesTo_.empty()) and others might take the "slow" path for
2203  // the same doPosts() call, so the path tag must be the same for
2204  // both.
2205  const int pathTag = 0;
2206  const int tag = this->getTag (pathTag);
2207 
2208 #ifdef HAVE_TPETRA_DEBUG
2209  TEUCHOS_TEST_FOR_EXCEPTION
2210  (requests_.size () != 0,
2211  std::logic_error,
2212  "Tpetra::Distributor::doPosts(3 args, Kokkos): Process "
2213  << myRank << ": requests_.size() = " << requests_.size () << " != 0.");
2214 #endif // HAVE_TPETRA_DEBUG
2215 
2216  // Distributor uses requests_.size() as the number of outstanding
2217  // nonblocking message requests, so we resize to zero to maintain
2218  // this invariant.
2219  //
2220  // numReceives_ does _not_ include the self message, if there is
2221  // one. Here, we do actually send a message to ourselves, so we
2222  // include any self message in the "actual" number of receives to
2223  // post.
2224  //
2225  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2226  // doesn't (re)allocate its array of requests. That happens in
2227  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2228  // demand), or Resize_().
2229  const size_type actualNumReceives = as<size_type> (numReceives_) +
2230  as<size_type> (selfMessage_ ? 1 : 0);
2231  requests_.resize (0);
2232 
2233  if (verbose_) {
2234  std::ostringstream os;
2235  os << *prefix << (indicesTo_.empty() ? "fast" : "slow")
2236  << " path: post receives" << endl;
2237  std::cerr << os.str();
2238  }
2239 
2240  // Post the nonblocking receives. It's common MPI wisdom to post
2241  // receives before sends. In MPI terms, this means favoring
2242  // adding to the "posted queue" (of receive requests) over adding
2243  // to the "unexpected queue" (of arrived messages not yet matched
2244  // with a receive).
2245  {
2246 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2247  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3KV_recvs_);
2248 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2249 
2250  size_t curBufferOffset = 0;
2251  for (size_type i = 0; i < actualNumReceives; ++i) {
2252  const size_t curBufLen = lengthsFrom_[i] * numPackets;
2253  if (procsFrom_[i] != myRank) {
2254  if (verbose_) {
2255  std::ostringstream os;
2256  os << *prefix
2257  << (indicesTo_.empty() ? "fast" : "slow") << " path: "
2258  << "post irecv: {source: " << procsFrom_[i]
2259  << ", tag: " << tag << "}" << endl;
2260  std::cerr << os.str();
2261  }
2262  // If my process is receiving these packet(s) from another
2263  // process (not a self-receive):
2264  //
2265  // 1. Set up the persisting view (recvBuf) of the imports
2266  // array, given the offset and size (total number of
2267  // packets from process procsFrom_[i]).
2268  // 2. Start the Irecv and save the resulting request.
2269  TEUCHOS_TEST_FOR_EXCEPTION(
2270  curBufferOffset + curBufLen > static_cast<size_t> (imports.size ()),
2271  std::logic_error, "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2272  "Exceeded size of 'imports' array in packing loop on Process " <<
2273  myRank << ". imports.size() = " << imports.size () << " < "
2274  "curBufferOffset(" << curBufferOffset << ") + curBufLen(" <<
2275  curBufLen << ").");
2276  imports_view_type recvBuf =
2277  subview_offset (imports, curBufferOffset, curBufLen);
2278  requests_.push_back (ireceive<int> (recvBuf, procsFrom_[i],
2279  tag, *comm_));
2280  }
2281  else { // Receiving from myself
2282  selfReceiveOffset = curBufferOffset; // Remember the self-recv offset
2283  }
2284  curBufferOffset += curBufLen;
2285  }
2286  }
2287 
2288  if (doBarrier) {
2289 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2290  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3KV_barrier_);
2291 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2292 
2293  if (verbose_) {
2294  std::ostringstream os;
2295  os << *prefix << (indicesTo_.empty() ? "fast" : "slow")
2296  << " path: barrier" << endl;
2297  std::cerr << os.str();
2298  }
2299  // If we are using ready sends (MPI_Rsend) below, we need to do
2300  // a barrier before we post the ready sends. This is because a
2301  // ready send requires that its matching receive has already
2302  // been posted before the send has been posted. The only way to
2303  // guarantee that in this case is to use a barrier.
2304  comm_->barrier ();
2305  }
2306 
2307 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2308  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3KV_sends_);
2309 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2310 
2311  // setup scan through procsTo_ list starting with higher numbered procs
2312  // (should help balance message traffic)
2313  //
2314  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
2315  // It doesn't depend on the input at all.
2316  size_t numBlocks = numSends_ + selfMessage_;
2317  size_t procIndex = 0;
2318  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myRank)) {
2319  ++procIndex;
2320  }
2321  if (procIndex == numBlocks) {
2322  procIndex = 0;
2323  }
2324 
2325  size_t selfNum = 0;
2326  size_t selfIndex = 0;
2327 
2328  if (verbose_) {
2329  std::ostringstream os;
2330  os << *prefix << (indicesTo_.empty() ? "fast" : "slow")
2331  << " path: post sends" << endl;
2332  std::cerr << os.str();
2333  }
2334 
2335  if (indicesTo_.empty()) {
2336 
2337 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2338  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts3KV_sends_fast_);
2339 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2340 
2341  if (verbose_) {
2342  std::ostringstream os;
2343  os << *prefix << "fast path: posting sends" << endl;
2344  std::cerr << os.str();
2345  }
2346 
2347  // Data are already blocked (laid out) by process, so we don't
2348  // need a separate send buffer (besides the exports array).
2349  for (size_t i = 0; i < numBlocks; ++i) {
2350  size_t p = i + procIndex;
2351  if (p > (numBlocks - 1)) {
2352  p -= numBlocks;
2353  }
2354 
2355  if (procsTo_[p] != myRank) {
2356  if (verbose_) {
2357  std::ostringstream os;
2358  os << *prefix << "fast path: post send: {target: "
2359  << procsTo_[p] << ", tag: " << tag << "}" << endl;
2360  std::cerr << os.str();
2361  }
2362  exports_view_type tmpSend = subview_offset(
2363  exports, startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
2364 
2365  if (sendType == Details::DISTRIBUTOR_SEND) {
2366  send<int> (tmpSend,
2367  as<int> (tmpSend.size ()),
2368  procsTo_[p], tag, *comm_);
2369  }
2370  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2371  exports_view_type tmpSendBuf =
2372  subview_offset (exports, startsTo_[p] * numPackets,
2373  lengthsTo_[p] * numPackets);
2374  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2375  tag, *comm_));
2376  }
2377  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2378  readySend<int> (tmpSend,
2379  as<int> (tmpSend.size ()),
2380  procsTo_[p], tag, *comm_);
2381  }
2382  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2383  ssend<int> (tmpSend,
2384  as<int> (tmpSend.size ()),
2385  procsTo_[p], tag, *comm_);
2386  } else {
2387  TEUCHOS_TEST_FOR_EXCEPTION(
2388  true,
2389  std::logic_error,
2390  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2391  "Invalid send type. We should never get here. "
2392  "Please report this bug to the Tpetra developers.");
2393  }
2394  }
2395  else { // "Sending" the message to myself
2396  selfNum = p;
2397  }
2398  }
2399 
2400  if (selfMessage_) {
2401  if (verbose_) {
2402  std::ostringstream os;
2403  os << *prefix << "fast path: self-send" << endl;
2404  std::cerr << os.str();
2405  }
2406  // This is how we "send a message to ourself": we copy from
2407  // the export buffer to the import buffer. That saves
2408  // Teuchos::Comm implementations other than MpiComm (in
2409  // particular, SerialComm) the trouble of implementing self
2410  // messages correctly. (To do this right, SerialComm would
2411  // need internal buffer space for messages, keyed on the
2412  // message's tag.)
2413  deep_copy_offset(imports, exports, selfReceiveOffset,
2414  startsTo_[selfNum]*numPackets,
2415  lengthsTo_[selfNum]*numPackets);
2416  }
2417  if (verbose_) {
2418  std::ostringstream os;
2419  os << *prefix << "fast path: done" << endl;
2420  std::cerr << os.str();
2421  }
2422  }
2423  else { // data are not blocked by proc, use send buffer
2424 
2425 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2426  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts3KV_sends_slow_);
2427 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2428 
2429  if (verbose_) {
2430  std::ostringstream os;
2431  os << *prefix << "slow path: posting sends" << endl;
2432  std::cerr << os.str();
2433  }
2434  typedef typename ExpView::non_const_value_type Packet;
2435  typedef typename ExpView::array_layout Layout;
2436  typedef typename ExpView::device_type Device;
2437  typedef typename ExpView::memory_traits Mem;
2438  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray",
2439  maxSendLength_ * numPackets);
2440 
2441  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
2442  // sends), because the buffer is only long enough for one send.
2443  TEUCHOS_TEST_FOR_EXCEPTION(
2444  sendType == Details::DISTRIBUTOR_ISEND,
2445  std::logic_error,
2446  "Tpetra::Distributor::doPosts(3 args, Kokkos): The \"send buffer\" code path "
2447  "doesn't currently work with nonblocking sends.");
2448 
2449  for (size_t i = 0; i < numBlocks; ++i) {
2450  size_t p = i + procIndex;
2451  if (p > (numBlocks - 1)) {
2452  p -= numBlocks;
2453  }
2454 
2455  if (procsTo_[p] != myRank) {
2456  if (verbose_) {
2457  std::ostringstream os;
2458  os << *prefix << "slow path: post send: {target: "
2459  << procsTo_[p] << ", tag: " << tag << "}" << endl;
2460  std::cerr << os.str();
2461  }
2462 
2463  size_t sendArrayOffset = 0;
2464  size_t j = startsTo_[p];
2465  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2466  deep_copy_offset(sendArray, exports, sendArrayOffset,
2467  indicesTo_[j]*numPackets, numPackets);
2468  sendArrayOffset += numPackets;
2469  }
2470  ImpView tmpSend =
2471  subview_offset(sendArray, size_t(0), lengthsTo_[p]*numPackets);
2472 
2473  if (sendType == Details::DISTRIBUTOR_SEND) {
2474  send<int> (tmpSend,
2475  as<int> (tmpSend.size ()),
2476  procsTo_[p], tag, *comm_);
2477  }
2478  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2479  exports_view_type tmpSendBuf =
2480  subview_offset (sendArray, size_t(0), lengthsTo_[p] * numPackets);
2481  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2482  tag, *comm_));
2483  }
2484  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2485  readySend<int> (tmpSend,
2486  as<int> (tmpSend.size ()),
2487  procsTo_[p], tag, *comm_);
2488  }
2489  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2490  ssend<int> (tmpSend,
2491  as<int> (tmpSend.size ()),
2492  procsTo_[p], tag, *comm_);
2493  }
2494  else {
2495  TEUCHOS_TEST_FOR_EXCEPTION(
2496  true,
2497  std::logic_error,
2498  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2499  "Invalid send type. We should never get here. "
2500  "Please report this bug to the Tpetra developers.");
2501  }
2502  }
2503  else { // "Sending" the message to myself
2504  selfNum = p;
2505  selfIndex = startsTo_[p];
2506  }
2507  }
2508 
2509  if (selfMessage_) {
2510  if (verbose_) {
2511  std::ostringstream os;
2512  os << *prefix << "slow path: self-send" << endl;
2513  std::cerr << os.str();
2514  }
2515  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2516  deep_copy_offset(imports, exports, selfReceiveOffset,
2517  indicesTo_[selfIndex]*numPackets, numPackets);
2518  ++selfIndex;
2519  selfReceiveOffset += numPackets;
2520  }
2521  }
2522  if (verbose_) {
2523  std::ostringstream os;
2524  os << *prefix << "slow path: done" << endl;
2525  std::cerr << os.str();
2526  }
2527  }
2528 
2529  if (verbose_) {
2530  std::ostringstream os;
2531  os << *prefix << "Done" << endl;
2532  std::cerr << os.str();
2533  }
2534  }
2535 
2536  template <class ExpView, class ImpView>
2537  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2539  doPosts (const ExpView &exports,
2540  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2541  const ImpView &imports,
2542  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2543  {
2544  using Teuchos::Array;
2545  using Teuchos::as;
2546  using Teuchos::ireceive;
2547  using Teuchos::isend;
2548  using Teuchos::readySend;
2549  using Teuchos::send;
2550  using Teuchos::ssend;
2551  using Teuchos::TypeNameTraits;
2552  using std::endl;
2553  using Kokkos::Compat::create_const_view;
2554  using Kokkos::Compat::create_view;
2555  using Kokkos::Compat::subview_offset;
2556  using Kokkos::Compat::deep_copy_offset;
2557  typedef Array<size_t>::size_type size_type;
2558  typedef ExpView exports_view_type;
2559  typedef ImpView imports_view_type;
2560 
2561 #ifdef KOKKOS_ENABLE_CUDA
2562  static_assert (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
2563  ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
2564  "Please do not use Tpetra::Distributor with UVM "
2565  "allocations. See GitHub issue #1088.");
2566 #endif // KOKKOS_ENABLE_CUDA
2567 
2568  std::unique_ptr<std::string> prefix;
2569  if (verbose_) {
2570  prefix = createPrefix("doPosts(4-arg, Kokkos)");
2571  std::ostringstream os;
2572  os << *prefix << "Start" << endl;
2573  std::cerr << os.str();
2574  }
2575 
2576 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2577  Teuchos::TimeMonitor timeMon (*timer_doPosts4KV_);
2578 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2579 
2580  // Run-time configurable parameters that come from the input
2581  // ParameterList set by setParameterList().
2582  const Details::EDistributorSendType sendType = sendType_;
2583  const bool doBarrier = barrierBetween_;
2584 
2585 // #ifdef HAVE_TEUCHOS_DEBUG
2586 // // Prepare for verbose output, if applicable.
2587 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
2588 // RCP<Teuchos::FancyOStream> out = this->getOStream ();
2589 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
2590 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
2591 
2592 // if (doPrint) {
2593 // // Only need one process to print out parameters.
2594 // *out << "Distributor::doPosts (4 args)" << endl;
2595 // }
2596 // // Add one tab level. We declare this outside the doPrint scopes
2597 // // so that the tab persists until the end of this method.
2598 // if (doPrint) {
2599 // *out << "Parameters:" << endl;
2600 // {
2601 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
2602 // << endl << "barrierBetween: " << doBarrier << endl;
2603 // }
2604 // }
2605 // #endif // HAVE_TEUCHOS_DEBUG
2606 
2607  TEUCHOS_TEST_FOR_EXCEPTION(
2608  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
2609  std::logic_error, "Tpetra::Distributor::doPosts(4 args, Kokkos): Ready-send "
2610  "version requires a barrier between posting receives and posting ready "
2611  "sends. This should have been checked before. "
2612  "Please report this bug to the Tpetra developers.");
2613 
2614  const int myProcID = comm_->getRank ();
2615  size_t selfReceiveOffset = 0;
2616 
2617 #ifdef HAVE_TEUCHOS_DEBUG
2618  // Different messages may have different numbers of packets.
2619  size_t totalNumImportPackets = 0;
2620  for (size_type ii = 0; ii < numImportPacketsPerLID.size (); ++ii) {
2621  totalNumImportPackets += numImportPacketsPerLID[ii];
2622  }
2623  TEUCHOS_TEST_FOR_EXCEPTION(
2624  imports.extent (0) < totalNumImportPackets, std::runtime_error,
2625  "Tpetra::Distributor::doPosts(4 args, Kokkos): The 'imports' array must have "
2626  "enough entries to hold the expected number of import packets. "
2627  "imports.extent(0) = " << imports.extent (0) << " < "
2628  "totalNumImportPackets = " << totalNumImportPackets << ".");
2629 #endif // HAVE_TEUCHOS_DEBUG
2630 
2631  // MPI tag for nonblocking receives and blocking sends in this
2632  // method. Some processes might take the "fast" path
2633  // (indicesTo_.empty()) and others might take the "slow" path for
2634  // the same doPosts() call, so the path tag must be the same for
2635  // both.
2636  const int pathTag = 1;
2637  const int tag = this->getTag (pathTag);
2638 
2639 #ifdef HAVE_TEUCHOS_DEBUG
2640  TEUCHOS_TEST_FOR_EXCEPTION
2641  (requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
2642  "doPosts(4 args, Kokkos): Process " << myProcID << ": requests_.size () = "
2643  << requests_.size () << " != 0.");
2644 #endif // HAVE_TEUCHOS_DEBUG
2645  if (verbose_) {
2646  std::ostringstream os;
2647  os << *prefix << (indicesTo_.empty() ? "fast" : "slow")
2648  << " path, tag=" << tag << endl;
2649  std::cerr << os.str();
2650  }
2651  // Distributor uses requests_.size() as the number of outstanding
2652  // nonblocking message requests, so we resize to zero to maintain
2653  // this invariant.
2654  //
2655  // numReceives_ does _not_ include the self message, if there is
2656  // one. Here, we do actually send a message to ourselves, so we
2657  // include any self message in the "actual" number of receives to
2658  // post.
2659  //
2660  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2661  // doesn't (re)allocate its array of requests. That happens in
2662  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2663  // demand), or Resize_().
2664  const size_type actualNumReceives = as<size_type> (numReceives_) +
2665  as<size_type> (selfMessage_ ? 1 : 0);
2666  requests_.resize (0);
2667 
2668  // Post the nonblocking receives. It's common MPI wisdom to post
2669  // receives before sends. In MPI terms, this means favoring
2670  // adding to the "posted queue" (of receive requests) over adding
2671  // to the "unexpected queue" (of arrived messages not yet matched
2672  // with a receive).
2673  {
2674 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2675  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4KV_recvs_);
2676 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2677 
2678  size_t curBufferOffset = 0;
2679  size_t curLIDoffset = 0;
2680  for (size_type i = 0; i < actualNumReceives; ++i) {
2681  size_t totalPacketsFrom_i = 0;
2682  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
2683  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
2684  }
2685  curLIDoffset += lengthsFrom_[i];
2686  if (procsFrom_[i] != myProcID && totalPacketsFrom_i) {
2687  // If my process is receiving these packet(s) from another
2688  // process (not a self-receive), and if there is at least
2689  // one packet to receive:
2690  //
2691  // 1. Set up the persisting view (recvBuf) into the imports
2692  // array, given the offset and size (total number of
2693  // packets from process procsFrom_[i]).
2694  // 2. Start the Irecv and save the resulting request.
2695  imports_view_type recvBuf =
2696  subview_offset (imports, curBufferOffset, totalPacketsFrom_i);
2697  requests_.push_back (ireceive<int> (recvBuf, procsFrom_[i],
2698  tag, *comm_));
2699  }
2700  else { // Receiving these packet(s) from myself
2701  selfReceiveOffset = curBufferOffset; // Remember the offset
2702  }
2703  curBufferOffset += totalPacketsFrom_i;
2704  }
2705  }
2706 
2707  if (doBarrier) {
2708 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2709  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4KV_barrier_);
2710 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2711  // If we are using ready sends (MPI_Rsend) below, we need to do
2712  // a barrier before we post the ready sends. This is because a
2713  // ready send requires that its matching receive has already
2714  // been posted before the send has been posted. The only way to
2715  // guarantee that in this case is to use a barrier.
2716  comm_->barrier ();
2717  }
2718 
2719 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2720  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4KV_sends_);
2721 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2722 
2723  // setup arrays containing starting-offsets into exports for each send,
2724  // and num-packets-to-send for each send.
2725  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
2726  size_t maxNumPackets = 0;
2727  size_t curPKToffset = 0;
2728  for (size_t pp=0; pp<numSends_; ++pp) {
2729  sendPacketOffsets[pp] = curPKToffset;
2730  size_t numPackets = 0;
2731  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
2732  numPackets += numExportPacketsPerLID[j];
2733  }
2734  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
2735  packetsPerSend[pp] = numPackets;
2736  curPKToffset += numPackets;
2737  }
2738 
2739  // setup scan through procsTo_ list starting with higher numbered procs
2740  // (should help balance message traffic)
2741  size_t numBlocks = numSends_+ selfMessage_;
2742  size_t procIndex = 0;
2743  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myProcID)) {
2744  ++procIndex;
2745  }
2746  if (procIndex == numBlocks) {
2747  procIndex = 0;
2748  }
2749 
2750  size_t selfNum = 0;
2751  size_t selfIndex = 0;
2752  if (indicesTo_.empty()) {
2753 
2754 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2755  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4KV_sends_fast_);
2756 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2757 
2758  if (verbose_) {
2759  std::ostringstream os;
2760  os << *prefix << "fast path: posting sends" << endl;
2761  std::cerr << os.str();
2762  }
2763 
2764  // Data are already blocked (laid out) by process, so we don't
2765  // need a separate send buffer (besides the exports array).
2766  for (size_t i = 0; i < numBlocks; ++i) {
2767  size_t p = i + procIndex;
2768  if (p > (numBlocks - 1)) {
2769  p -= numBlocks;
2770  }
2771 
2772  if (procsTo_[p] != myProcID && packetsPerSend[p] > 0) {
2773  exports_view_type tmpSend =
2774  subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]);
2775 
2776  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
2777  send<int> (tmpSend,
2778  as<int> (tmpSend.size ()),
2779  procsTo_[p], tag, *comm_);
2780  }
2781  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2782  readySend<int> (tmpSend,
2783  as<int> (tmpSend.size ()),
2784  procsTo_[p], tag, *comm_);
2785  }
2786  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2787  exports_view_type tmpSendBuf =
2788  subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]);
2789  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2790  tag, *comm_));
2791  }
2792  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2793  ssend<int> (tmpSend,
2794  as<int> (tmpSend.size ()),
2795  procsTo_[p], tag, *comm_);
2796  }
2797  else {
2798  TEUCHOS_TEST_FOR_EXCEPTION(
2799  true, std::logic_error,
2800  "Tpetra::Distributor::doPosts(4 args, Kokkos): "
2801  "Invalid send type. We should never get here. "
2802  "Please report this bug to the Tpetra developers.");
2803  }
2804  }
2805  else { // "Sending" the message to myself
2806  selfNum = p;
2807  }
2808  }
2809 
2810  if (selfMessage_) {
2811  deep_copy_offset(imports, exports, selfReceiveOffset,
2812  sendPacketOffsets[selfNum], packetsPerSend[selfNum]);
2813  }
2814  if (verbose_) {
2815  std::ostringstream os;
2816  os << *prefix << "fast path: done" << endl;
2817  std::cerr << os.str();
2818  }
2819  }
2820  else { // data are not blocked by proc, use send buffer
2821 
2822 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2823  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4KV_sends_slow_);
2824 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2825 
2826  if (verbose_) {
2827  std::ostringstream os;
2828  os << *prefix << "slow path: posting sends" << endl;
2829  std::cerr << os.str();
2830  }
2831  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
2832  typedef typename ExpView::non_const_value_type Packet;
2833  typedef typename ExpView::array_layout Layout;
2834  typedef typename ExpView::device_type Device;
2835  typedef typename ExpView::memory_traits Mem;
2836  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray", maxNumPackets); // send buffer
2837 
2838  TEUCHOS_TEST_FOR_EXCEPTION(
2839  sendType == Details::DISTRIBUTOR_ISEND,
2840  std::logic_error,
2841  "Tpetra::Distributor::doPosts(4-arg, Kokkos): "
2842  "The \"send buffer\" code path may not necessarily work with nonblocking sends.");
2843 
2844  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
2845  size_t ioffset = 0;
2846  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
2847  indicesOffsets[j] = ioffset;
2848  ioffset += numExportPacketsPerLID[j];
2849  }
2850 
2851  for (size_t i = 0; i < numBlocks; ++i) {
2852  size_t p = i + procIndex;
2853  if (p > (numBlocks - 1)) {
2854  p -= numBlocks;
2855  }
2856 
2857  if (procsTo_[p] != myProcID) {
2858  size_t sendArrayOffset = 0;
2859  size_t j = startsTo_[p];
2860  size_t numPacketsTo_p = 0;
2861  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2862  numPacketsTo_p += numExportPacketsPerLID[j];
2863  deep_copy_offset(sendArray, exports, sendArrayOffset,
2864  indicesOffsets[j], numExportPacketsPerLID[j]);
2865  sendArrayOffset += numExportPacketsPerLID[j];
2866  }
2867  if (numPacketsTo_p > 0) {
2868  ImpView tmpSend =
2869  subview_offset(sendArray, size_t(0), numPacketsTo_p);
2870 
2871  if (sendType == Details::DISTRIBUTOR_RSEND) {
2872  readySend<int> (tmpSend,
2873  as<int> (tmpSend.size ()),
2874  procsTo_[p], tag, *comm_);
2875  }
2876  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2877  exports_view_type tmpSendBuf =
2878  subview_offset (sendArray, size_t(0), numPacketsTo_p);
2879  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2880  tag, *comm_));
2881  }
2882  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2883  ssend<int> (tmpSend,
2884  as<int> (tmpSend.size ()),
2885  procsTo_[p], tag, *comm_);
2886  }
2887  else { // if (sendType == Details::DISTRIBUTOR_SSEND)
2888  send<int> (tmpSend,
2889  as<int> (tmpSend.size ()),
2890  procsTo_[p], tag, *comm_);
2891  }
2892  }
2893  }
2894  else { // "Sending" the message to myself
2895  selfNum = p;
2896  selfIndex = startsTo_[p];
2897  }
2898  }
2899 
2900  if (selfMessage_) {
2901  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2902  deep_copy_offset(imports, exports, selfReceiveOffset,
2903  indicesOffsets[selfIndex],
2904  numExportPacketsPerLID[selfIndex]);
2905  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
2906  ++selfIndex;
2907  }
2908  }
2909  if (verbose_) {
2910  std::ostringstream os;
2911  os << *prefix << "slow path: done" << endl;
2912  std::cerr << os.str();
2913  }
2914  }
2915  }
2916 
2917  template <class ExpView, class ImpView>
2918  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2920  doReversePostsAndWaits (const ExpView& exports,
2921  size_t numPackets,
2922  const ImpView& imports)
2923  {
2924  doReversePosts (exports, numPackets, imports);
2925  doReverseWaits ();
2926  }
2927 
2928  template <class ExpView, class ImpView>
2929  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2931  doReversePostsAndWaits (const ExpView& exports,
2932  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2933  const ImpView& imports,
2934  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2935  {
2936  TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
2937  "Tpetra::Distributor::doReversePostsAndWaits(4 args): There are "
2938  << requests_.size() << " outstanding nonblocking messages pending. It "
2939  "is incorrect to call this method with posts outstanding.");
2940 
2941  doReversePosts (exports, numExportPacketsPerLID, imports,
2942  numImportPacketsPerLID);
2943  doReverseWaits ();
2944  }
2945 
2946  template <class ExpView, class ImpView>
2947  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2949  doReversePosts (const ExpView &exports,
2950  size_t numPackets,
2951  const ImpView &imports)
2952  {
2953  // FIXME (mfh 29 Mar 2012) WHY?
2954  TEUCHOS_TEST_FOR_EXCEPTION(
2955  ! indicesTo_.empty (), std::runtime_error,
2956  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2957  "reverse communication when original data are blocked by process.");
2958  if (reverseDistributor_.is_null ()) {
2959  createReverseDistributor ();
2960  }
2961  reverseDistributor_->doPosts (exports, numPackets, imports);
2962  }
2963 
2964  template <class ExpView, class ImpView>
2965  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2967  doReversePosts (const ExpView &exports,
2968  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2969  const ImpView &imports,
2970  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2971  {
2972  // FIXME (mfh 29 Mar 2012) WHY?
2973  TEUCHOS_TEST_FOR_EXCEPTION(
2974  ! indicesTo_.empty (), std::runtime_error,
2975  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2976  "reverse communication when original data are blocked by process.");
2977  if (reverseDistributor_.is_null ()) {
2978  createReverseDistributor ();
2979  }
2980  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
2981  imports, numImportPacketsPerLID);
2982  }
2983 
2984  template <class OrdinalType>
2985  void Distributor::
2986  computeSends(const Teuchos::ArrayView<const OrdinalType>& importGIDs,
2987  const Teuchos::ArrayView<const int>& importProcIDs,
2988  Teuchos::Array<OrdinalType>& exportGIDs,
2989  Teuchos::Array<int>& exportProcIDs)
2990  {
2991  // NOTE (mfh 19 Apr 2012): There was a note on this code saying:
2992  // "assumes that size_t >= Ordinal". The code certainly does
2993  // assume that sizeof(size_t) >= sizeof(OrdinalType) as well as
2994  // sizeof(size_t) >= sizeof(int). This is because it casts the
2995  // OrdinalType elements of importGIDs (along with their
2996  // corresponding process IDs, as int) to size_t, and does a
2997  // doPostsAndWaits<size_t>() to send the packed data.
2998  using Teuchos::Array;
2999  using Teuchos::ArrayView;
3000  using std::endl;
3001  using size_type = typename ArrayView<const OrdinalType>::size_type;
3002  const char errPrefix[] = "Tpetra::Distributor::computeSends: ";
3003  const char suffix[] =
3004  " Please report this bug to the Tpetra developers.";
3005 
3006  const int myRank = comm_->getRank ();
3007  std::unique_ptr<std::string> prefix;
3008  if (verbose_) {
3009  prefix = createPrefix("computeSends");
3010  std::ostringstream os;
3011  os << *prefix << "Start" << endl;
3012  std::cerr << os.str();
3013  }
3014 
3015  TEUCHOS_TEST_FOR_EXCEPTION
3016  (importGIDs.size () != importProcIDs.size (),
3017  std::invalid_argument, errPrefix << "On Process " << myRank
3018  << ": importProcIDs.size()=" << importProcIDs.size()
3019  << " != importGIDs.size()=" << importGIDs.size() << ".");
3020 
3021  const size_type numImports = importProcIDs.size();
3022  Array<size_t> importObjs(2*numImports);
3023  // Pack pairs (importGIDs[i], my process ID) to send into importObjs.
3024  for (size_type i = 0; i < numImports; ++i) {
3025  importObjs[2*i] = static_cast<size_t>(importGIDs[i]);
3026  importObjs[2*i+1] = static_cast<size_t>(myRank);
3027  }
3028  //
3029  // Use a temporary Distributor to send the (importGIDs[i], myRank)
3030  // pairs to importProcIDs[i].
3031  //
3032  Distributor tempPlan(comm_);
3033  if (verbose_) {
3034  std::ostringstream os;
3035  os << *prefix << "Call tempPlan.createFromSends" << endl;
3036  std::cerr << os.str();
3037  }
3038  // mfh 20 Mar 2014: An extra-cautious cast from unsigned to
3039  // signed, in order to forestall any possible causes for Bug 6069.
3040  const size_t numExportsAsSizeT =
3041  tempPlan.createFromSends(importProcIDs);
3042  const size_type numExports =
3043  static_cast<size_type>(numExportsAsSizeT);
3044  TEUCHOS_TEST_FOR_EXCEPTION
3045  (numExports < 0, std::logic_error, errPrefix <<
3046  "tempPlan.createFromSends() returned numExports="
3047  << numExportsAsSizeT << " as a size_t, which overflows to "
3048  << numExports << " when cast to " <<
3049  Teuchos::TypeNameTraits<size_type>::name () << "." << suffix);
3050  TEUCHOS_TEST_FOR_EXCEPTION
3051  (size_type(tempPlan.getTotalReceiveLength()) != numExports,
3052  std::logic_error, errPrefix << "tempPlan.getTotalReceiveLength()="
3053  << tempPlan.getTotalReceiveLength () << " != numExports="
3054  << numExports << "." << suffix);
3055 
3056  if (numExports > 0) {
3057  exportGIDs.resize(numExports);
3058  exportProcIDs.resize(numExports);
3059  }
3060 
3061  // exportObjs: Packed receive buffer. (exportObjs[2*i],
3062  // exportObjs[2*i+1]) will give the (GID, PID) pair for export i,
3063  // after tempPlan.doPostsAndWaits(...) finishes below.
3064  //
3065  // FIXME (mfh 19 Mar 2014) This only works if OrdinalType fits in
3066  // size_t. This issue might come up, for example, on a 32-bit
3067  // machine using 64-bit global indices. I will add a check here
3068  // for that case.
3069  static_assert(sizeof(size_t) >= sizeof(OrdinalType),
3070  "Tpetra::Distributor::computeSends: "
3071  "sizeof(size_t) < sizeof(OrdinalType).");
3072 
3073  TEUCHOS_TEST_FOR_EXCEPTION
3074  (tempPlan.getTotalReceiveLength () < size_t(numExports),
3075  std::logic_error,
3076  errPrefix << "tempPlan.getTotalReceiveLength()="
3077  << tempPlan.getTotalReceiveLength() << " < numExports="
3078  << numExports << "." << suffix);
3079 
3080  Array<size_t> exportObjs (tempPlan.getTotalReceiveLength () * 2);
3081  if (verbose_) {
3082  std::ostringstream os;
3083  os << *prefix << "Call tempPlan.doPostsAndWaits" << endl;
3084  std::cerr << os.str();
3085  }
3086  tempPlan.doPostsAndWaits<size_t> (importObjs (), 2, exportObjs ());
3087 
3088  // Unpack received (GID, PID) pairs into exportIDs resp. exportProcIDs.
3089  for (size_type i = 0; i < numExports; ++i) {
3090  exportGIDs[i] = static_cast<OrdinalType> (exportObjs[2*i]);
3091  exportProcIDs[i] = static_cast<int> (exportObjs[2*i+1]);
3092  }
3093 
3094  if (verbose_) {
3095  std::ostringstream os;
3096  os << *prefix << "Done" << endl;
3097  std::cerr << os.str();
3098  }
3099  }
3100 
3101  template <class OrdinalType>
3102  void Distributor::
3103  createFromRecvs (const Teuchos::ArrayView<const OrdinalType> &remoteGIDs,
3104  const Teuchos::ArrayView<const int> &remoteProcIDs,
3105  Teuchos::Array<OrdinalType> &exportGIDs,
3106  Teuchos::Array<int> &exportProcIDs)
3107  {
3108  using std::endl;
3109  const char errPrefix[] = "Tpetra::Distributor::createFromRecvs: ";
3110  const int myRank = comm_->getRank();
3111 
3112  std::unique_ptr<std::string> prefix;
3113  if (verbose_) {
3114  prefix = createPrefix("createFromRecvs");
3115  std::ostringstream os;
3116  os << *prefix << "Start" << endl;
3117  std::cerr << os.str();
3118  }
3119 
3120  const bool debug = Details::Behavior::debug("Distributor");
3121  if (debug) {
3122  using Teuchos::outArg;
3123  using Teuchos::REDUCE_MAX;
3124  using Teuchos::reduceAll;
3125  // In debug mode, first test locally, then do an all-reduce to
3126  // make sure that all processes passed.
3127  const int errProc =
3128  (remoteGIDs.size () != remoteProcIDs.size ()) ? myRank : -1;
3129  int maxErrProc = -1;
3130  reduceAll(*comm_, REDUCE_MAX, errProc, outArg(maxErrProc));
3131  TEUCHOS_TEST_FOR_EXCEPTION
3132  (maxErrProc != -1, std::runtime_error, errPrefix << "Lists "
3133  "of remote IDs and remote process IDs must have the same "
3134  "size on all participating processes. Maximum process ID "
3135  "with error: " << maxErrProc << ".");
3136  }
3137  else { // in non-debug mode, just test locally
3138  // NOTE (mfh 13 Feb 2020) This needs to throw std::runtime_error
3139  // in order to make an existing Distributor unit test pass.
3140  TEUCHOS_TEST_FOR_EXCEPTION
3141  (remoteGIDs.size() != remoteProcIDs.size(), std::runtime_error,
3142  errPrefix << "On Process " << myRank << ": "
3143  "remoteGIDs.size()=" << remoteGIDs.size() <<
3144  " != remoteProcIDs.size()=" << remoteProcIDs.size() << ".");
3145  }
3146 
3147  computeSends(remoteGIDs, remoteProcIDs, exportGIDs, exportProcIDs);
3148 
3149  const size_t numProcsSendingToMe = createFromSends(exportProcIDs ());
3150 
3151  if (verbose_) {
3152  // NOTE (mfh 20 Mar 2014) If remoteProcIDs could contain
3153  // duplicates, then its length might not be the right check here,
3154  // even if we account for selfMessage_. selfMessage_ is set in
3155  // createFromSends.
3156  std::ostringstream os;
3157  os << *prefix << "numProcsSendingToMe: "
3158  << numProcsSendingToMe << ", remoteProcIDs.size(): "
3159  << remoteProcIDs.size () << ", selfMessage_: "
3160  << (selfMessage_ ? "true" : "false") << "" << endl;
3161  std::cerr << os.str();
3162  }
3163 
3164  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
3165 
3166  if (verbose_) {
3167  std::ostringstream os;
3168  os << *prefix << "Done" << endl;
3169  std::cerr << os.str();
3170  }
3171  }
3172 
3173 } // namespace Tpetra
3174 
3175 #endif // TPETRA_DISTRIBUTOR_HPP
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Stand-alone utility functions and macros.
static bool debug()
Whether Tpetra is in debug mode.
Sets up and executes a communication plan for a Tpetra DistObject.
void doPostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the (forward) communication plan.
void doReversePostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the reverse communication plan.
size_t getMaxSendLength() const
Maximum number of values this process will send to another single process.
Teuchos::RCP< Distributor > getReverse(bool create=true) const
A reverse communication plan Distributor.
void createFromRecvs(const Teuchos::ArrayView< const Ordinal > &remoteIDs, const Teuchos::ArrayView< const int > &remoteProcIDs, Teuchos::Array< Ordinal > &exportIDs, Teuchos::Array< int > &exportProcIDs)
Set up Distributor using list of process ranks from which to receive.
Teuchos::ArrayView< const int > getProcsTo() const
Ranks of the processes to which this process will send values.
void doReversePosts(const Teuchos::ArrayRCP< const Packet > &exports, size_t numPackets, const Teuchos::ArrayRCP< Packet > &imports)
Post the data for a reverse plan, but do not execute the waits yet.
size_t getNumReceives() const
The number of processes from which we will receive data.
void doPosts(const Teuchos::ArrayRCP< const Packet > &exports, size_t numPackets, const Teuchos::ArrayRCP< Packet > &imports)
Post the data for a forward plan, but do not execute the waits yet.
void setParameterList(const Teuchos::RCP< Teuchos::ParameterList > &plist)
Set Distributor parameters.
size_t getTotalReceiveLength() const
Total number of values this process will receive from other processes.
virtual ~Distributor()=default
Destructor (virtual for memory safety).
bool hasSelfMessage() const
Whether the calling process will send or receive messages to itself.
void swap(Distributor &rhs)
Swap the contents of rhs with those of *this.
Teuchos::ArrayView< const size_t > getLengthsTo() const
Number of values this process will send to each process.
Teuchos::ArrayView< const int > getProcsFrom() const
Ranks of the processes sending values to this process.
Distributor(const Teuchos::RCP< const Teuchos::Comm< int > > &comm)
Construct using the specified communicator and default parameters.
std::string description() const
Return a one-line description of this object.
size_t createFromSends(const Teuchos::ArrayView< const int > &exportProcIDs)
Set up Distributor using list of process ranks to which this process will send.
void createFromSendsAndRecvs(const Teuchos::ArrayView< const int > &exportProcIDs, const Teuchos::ArrayView< const int > &remoteProcIDs)
Set up Distributor using list of process ranks to which to send, and list of process ranks from which...
Teuchos::RCP< const Teuchos::ParameterList > getValidParameters() const
List of valid Distributor parameters.
Teuchos::ArrayView< const size_t > getLengthsFrom() const
Number of values this process will receive from each process.
Details::EDistributorHowInitialized howInitialized() const
Return an enum indicating whether and how a Distributor was initialized.
size_t getNumSends() const
The number of processes to which we will send data.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Describe this object in a human-readable way to the given output stream.
void getLastDoStatistics(size_t &bytes_sent, size_t &bytes_recvd) const
Information on the last call to do/doReverse.
Implementation details of Tpetra.
std::string DistributorSendTypeEnumToString(EDistributorSendType sendType)
Convert an EDistributorSendType enum value to a string.
EDistributorSendType
The type of MPI send that Distributor should use.
EDistributorHowInitialized
Enum indicating how and whether a Distributor was initialized.
std::string DistributorHowInitializedEnumToString(EDistributorHowInitialized how)
Convert an EDistributorHowInitialized enum value to a string.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Teuchos::Array< std::string > distributorSendTypes()
Valid values for Distributor's "Send type" parameter.