Intrepid2
Intrepid2_IntegrationToolsDef.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ************************************************************************
3 //
4 // Intrepid2 Package
5 // Copyright (2007) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Kyungjoo Kim (kyukim@sandia.gov),
38 // Mauro Perego (mperego@sandia.gov), or
39 // Nate Roberts (nvrober@sandia.gov)
40 //
41 // ************************************************************************
42 // @HEADER
43 
49 #ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
50 #define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
51 
54 
55 #include "Teuchos_TimeMonitor.hpp"
56 
57 namespace Intrepid2 {
58 
59  namespace Impl
60  {
64  template<class Scalar, class DeviceType, int integralViewRank>
66  {
67  using ExecutionSpace = typename DeviceType::execution_space;
68  using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
69  using TeamMember = typename TeamPolicy::member_type;
70 
71  using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
72  IntegralViewType integralView_;
73  TensorData<Scalar,DeviceType> leftComponent_;
74  Data<Scalar,DeviceType> composedTransform_;
75  TensorData<Scalar,DeviceType> rightComponent_;
76  TensorData<Scalar,DeviceType> cellMeasures_;
77  int a_offset_;
78  int b_offset_;
79  int leftComponentSpan_; // leftComponentSpan tracks the dimensions spanned by the left component
80  int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
81  int numTensorComponents_;
82  int leftFieldOrdinalOffset_;
83  int rightFieldOrdinalOffset_;
84  bool forceNonSpecialized_; // if true, don't use the specialized (more manual) implementation(s) for particular component counts. Primary use case is for testing.
85 
86  size_t fad_size_output_ = 0; // 0 if not a fad type
87 
88  Kokkos::Array<int, 7> offsetsForComponentOrdinal_;
89 
90  // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
91  // (this also makes it easier to reorder loops, etc., for further optimizations)
92  Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
93  Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
94  Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
95 
96  Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_; // total number of enumeration indices with arguments prior to the startingComponent fixed
97  Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;
98 
99  int maxFieldsLeft_;
100  int maxFieldsRight_;
101  int maxPointCount_;
102  public:
104  TensorData<Scalar,DeviceType> leftComponent,
105  Data<Scalar,DeviceType> composedTransform,
106  TensorData<Scalar,DeviceType> rightComponent,
107  TensorData<Scalar,DeviceType> cellMeasures,
108  int a_offset,
109  int b_offset,
110  int leftFieldOrdinalOffset,
111  int rightFieldOrdinalOffset,
112  bool forceNonSpecialized)
113  :
114  integralView_(integralData.template getUnderlyingView<integralViewRank>()),
115  leftComponent_(leftComponent),
116  composedTransform_(composedTransform),
117  rightComponent_(rightComponent),
118  cellMeasures_(cellMeasures),
119  a_offset_(a_offset),
120  b_offset_(b_offset),
121  leftComponentSpan_(leftComponent.extent_int(2)),
122  rightComponentSpan_(rightComponent.extent_int(2)),
123  numTensorComponents_(leftComponent.numTensorComponents()),
124  leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
125  rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
126  forceNonSpecialized_(forceNonSpecialized)
127  {
128  INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");
129 
130  // set up bounds containers
131  const int FIELD_DIM = 0;
132  const int POINT_DIM = 1;
133  maxFieldsLeft_ = 0;
134  maxFieldsRight_ = 0;
135  maxPointCount_ = 0;
136  for (int r=0; r<numTensorComponents_; r++)
137  {
138  leftFieldBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
139  maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
140  rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
141  maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
142  pointBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
143  maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
144  }
145 
146  // set up relative enumeration spans: total number of enumeration indices with arguments prior to the startingComponent fixed. These are for *truncated* iterators; hence the -2 rather than -1 for the first startingComponent value.
147  int leftRelativeEnumerationSpan = 1;
148  int rightRelativeEnumerationSpan = 1;
149  for (int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
150  {
151  leftRelativeEnumerationSpan *= leftFieldBounds_[startingComponent];
152  rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
153  leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
154  rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
155  }
156 
157  // prepare for allocation of temporary storage
158  // note: tempStorage goes "backward", starting from the final component, which needs just one entry
159 
160  const bool allocateFadStorage = !std::is_pod<Scalar>::value;
161  if (allocateFadStorage)
162  {
163  fad_size_output_ = dimension_scalar(integralView_);
164  }
165 
166  const int R = numTensorComponents_ - 1;
167 
168  int num_ij = 1; // this counts how many entries there are corresponding to components from r to R-1.
169  int allocationSoFar = 0;
170  offsetsForComponentOrdinal_[R] = allocationSoFar;
171  allocationSoFar++; // we store one entry corresponding to R, the final component
172 
173  for (int r=R-1; r>0; r--) // last component is innermost in the for loops (requires least storage)
174  {
175  const int leftFields = leftComponent.getTensorComponent(r).extent_int(0);
176  const int rightFields = rightComponent.getTensorComponent(r).extent_int(0);
177 
178  num_ij *= leftFields * rightFields;
179 
180  offsetsForComponentOrdinal_[r] = allocationSoFar;
181  allocationSoFar += num_ij;
182  }
183  offsetsForComponentOrdinal_[0] = allocationSoFar; // first component stores directly to final integralView.
184  }
185 
186  template<size_t maxComponents, size_t numComponents = maxComponents>
187  KOKKOS_INLINE_FUNCTION
188  int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
189  const Kokkos::Array<int,maxComponents> &bounds) const
190  {
191  if (numComponents == 0)
192  {
193  return -1;
194  }
195  else
196  {
197  int r = static_cast<int>(numComponents - 1);
198  while (arguments[r] + 1 >= bounds[r])
199  {
200  arguments[r] = 0; // reset
201  r--;
202  if (r < 0) break;
203  }
204  if (r >= 0) ++arguments[r];
205  return r;
206  }
207  }
208 
210  KOKKOS_INLINE_FUNCTION
211  int incrementArgument( Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
212  const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
213  const int &numComponents) const
214  {
215  if (numComponents == 0) return -1;
216  int r = static_cast<int>(numComponents - 1);
217  while (arguments[r] + 1 >= bounds[r])
218  {
219  arguments[r] = 0; // reset
220  r--;
221  if (r < 0) break;
222  }
223  if (r >= 0) ++arguments[r];
224  return r;
225  }
226 
227  template<size_t maxComponents, size_t numComponents = maxComponents>
228  KOKKOS_INLINE_FUNCTION
229  int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
230  const Kokkos::Array<int,maxComponents> &bounds) const
231  {
232  if (numComponents == 0)
233  {
234  return -1;
235  }
236  else
237  {
238  int r = static_cast<int>(numComponents - 1);
239  while (arguments[r] + 1 >= bounds[r])
240  {
241  r--;
242  if (r < 0) break;
243  }
244  return r;
245  }
246  }
247 
249  KOKKOS_INLINE_FUNCTION
250  int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
251  const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
252  const int &numComponents) const
253  {
254  if (numComponents == 0) return -1;
255  int r = numComponents - 1;
256  while (arguments[r] + 1 >= bounds[r])
257  {
258  r--;
259  if (r < 0) break;
260  }
261  return r;
262  }
263 
264  template<size_t maxComponents, size_t numComponents = maxComponents>
265  KOKKOS_INLINE_FUNCTION
266  int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
267  const Kokkos::Array<int,maxComponents> &bounds,
268  const int startIndex) const
269  {
270  // the following mirrors what is done in TensorData
271  if (numComponents == 0)
272  {
273  return 0;
274  }
275  else
276  {
277  int enumerationIndex = 0;
278  for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
279  {
280  enumerationIndex += arguments[r];
281  enumerationIndex *= bounds[r-1];
282  }
283  enumerationIndex += arguments[startIndex];
284  return enumerationIndex;
285  }
286  }
287 
289 
290  // nvcc refuses to compile the below with error, "explicit specialization is not allowed in the current scope". Clang is OK with it. We just do a non-templated version below.
291 // template<size_t numTensorComponents>
292 // KOKKOS_INLINE_FUNCTION
293 // void runSpecialized( const TeamMember & teamMember ) const;
294 
295 // template<>
296 // KOKKOS_INLINE_FUNCTION
297 // void runSpecialized<3>( const TeamMember & teamMember ) const
298  KOKKOS_INLINE_FUNCTION
299  void runSpecialized3( const TeamMember & teamMember ) const
300  {
301  constexpr int numTensorComponents = 3;
302 
303  Kokkos::Array<int,numTensorComponents> pointBounds;
304  Kokkos::Array<int,numTensorComponents> leftFieldBounds;
305  Kokkos::Array<int,numTensorComponents> rightFieldBounds;
306  for (unsigned r=0; r<numTensorComponents; r++)
307  {
308  pointBounds[r] = pointBounds_[r];
309  leftFieldBounds[r] = leftFieldBounds_[r];
310  rightFieldBounds[r] = rightFieldBounds_[r];
311  }
312 
313  const int cellDataOrdinal = teamMember.league_rank();
314  const int numThreads = teamMember.team_size(); // num threads
315  const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread
316 
317  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView; // for caching partial integration values
318  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
319  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z; // cache the field values for faster access
320  if (fad_size_output_ > 0) {
321  scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
322  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
323  leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0], fad_size_output_);
324  rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
325  leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1], fad_size_output_);
326  rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
327  leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2], fad_size_output_);
328  rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
329  }
330  else {
331  scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
332  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
333  leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0]);
334  rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
335  leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1]);
336  rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
337  leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2]);
338  rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
339  }
340 
341 // int approximateFlopCount = 0;
342 // int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;
343 
344  constexpr int R = numTensorComponents - 1;
345 
346  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
347  {
348  const int a = a_offset_ + a_component;
349  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
350  {
351  const int b = b_offset_ + b_component;
352 
353  const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.getTensorComponent(R);
354  const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);
355 
356  const int numLeftFieldsFinal = leftFinalComponent.extent_int(0); // shape (F,P[,D])
357  const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])
358 
359  const Data<Scalar,DeviceType> & leftTensorComponent_x = leftComponent_.getTensorComponent(0);
360  const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
361  const Data<Scalar,DeviceType> & leftTensorComponent_y = leftComponent_.getTensorComponent(1);
362  const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
363  const Data<Scalar,DeviceType> & leftTensorComponent_z = leftComponent_.getTensorComponent(2);
364  const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);
365 
366  const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
367  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (const int& fieldOrdinalPointOrdinal) {
368  const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
369  const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
370  if ((fieldOrdinal < leftTensorComponent_x.extent_int(0)) && (pointOrdinal < leftTensorComponent_x.extent_int(1)))
371  {
372  const int leftRank = leftTensorComponent_x.rank();
373  leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
374  }
375  if ((fieldOrdinal < leftTensorComponent_y.extent_int(0)) && (pointOrdinal < leftTensorComponent_y.extent_int(1)))
376  {
377  const int leftRank = leftTensorComponent_y.rank();
378  leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
379  }
380  if ((fieldOrdinal < leftTensorComponent_z.extent_int(0)) && (pointOrdinal < leftTensorComponent_z.extent_int(1)))
381  {
382  const int leftRank = leftTensorComponent_z.rank();
383  leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
384  }
385  if ((fieldOrdinal < rightTensorComponent_x.extent_int(0)) && (pointOrdinal < rightTensorComponent_x.extent_int(1)))
386  {
387  const int rightRank = rightTensorComponent_x.rank();
388  rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
389  }
390  if ((fieldOrdinal < rightTensorComponent_y.extent_int(0)) && (pointOrdinal < rightTensorComponent_y.extent_int(1)))
391  {
392  const int rightRank = rightTensorComponent_y.rank();
393  rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
394  }
395  if ((fieldOrdinal < rightTensorComponent_z.extent_int(0)) && (pointOrdinal < rightTensorComponent_z.extent_int(1)))
396  {
397  const int rightRank = rightTensorComponent_z.rank();
398  rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
399  }
400  });
401 
402  if (composedTransform_.underlyingMatchesLogical())
403  {
404  const auto & composedTransformView = composedTransform_.getUnderlyingView4();
405  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
406  pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
407  });
408  }
409  else
410  {
411  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
412  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
413  });
414  }
415 
416  // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
417 
418  // synchronize threads
419  teamMember.team_barrier();
420 
421  // Setting scratchView to 0 here is not necessary from an algorithmic point of view, but *might* help with performance (due to a first-touch policy)
422  const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
423  for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
424  {
425  scratchView(i) = 0.0;
426  }
427 
428  // TODO: consider adding an innerLoopRange that is sized to be the maximum of the size of the inner loops we'd like to parallelize over. (note that this means we do the work in the outer loop redundantly that many times…)
429  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
430  const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
431  const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;
432 
433  // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
434  // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
435  Kokkos::Array<int,numTensorComponents> leftFieldArguments;
436  Kokkos::Array<int,numTensorComponents> rightFieldArguments;
437  rightFieldArguments[R] = jz;
438  leftFieldArguments[R] = iz;
439 
440  Kokkos::Array<int,numTensorComponents> pointArguments;
441  for (int i=0; i<numTensorComponents; i++)
442  {
443  pointArguments[i] = 0;
444  }
445 
446  for (int lx=0; lx<pointBounds[0]; lx++)
447  {
448  pointArguments[0] = lx;
449 
450  // clear Gy scratch:
451  // in scratch, Gz (1 entry) comes first, then Gy entries.
452  const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
453  const int Gy_end_index = scratchOffsetForThread + offsetsForComponentOrdinal_[0];
454 
455  for (int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
456  {
457  scratchView(Gy_index) = 0;
458  }
459 
460  for (int ly=0; ly<pointBounds[1]; ly++)
461  {
462  pointArguments[1] = ly;
463 
464  Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
465  *Gz = 0;
466 
467  for (int lz=0; lz < pointBounds[2]; lz++)
468  {
469  const Scalar & leftValue = leftFields_z(iz,lz);
470  const Scalar & rightValue = rightFields_z(jz,lz);
471 
472  pointArguments[2] = lz;
473  int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
474 
475  *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
476 
477  // approximateFlopCount += 3; // 2 multiplies, 1 sum
478  } // lz
479 
480  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
481  {
482  leftFieldArguments[1] = iy;
483  const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
484 
485  const Scalar & leftValue = leftFields_y(iy,ly);
486 
487  for (int jy=0; jy<rightFieldBounds_[1]; jy++)
488  {
489  rightFieldArguments[1] = jy;
490 
491  const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
492  const Scalar & rightValue = rightFields_y(jy,ly);
493 
494  const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
495  const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
496 
497  const int Gz_index = 0;
498  const Scalar & Gz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);
499 
500  Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
501 
502  Gy += leftValue * Gz * rightValue;
503  // approximateFlopCount += 3; // 2 multiplies, 1 sum
504  }
505  }
506  } // ly
507  for (int ix=0; ix<leftFieldBounds_[0]; ix++)
508  {
509  leftFieldArguments[0] = ix;
510  const Scalar & leftValue = leftFields_x(ix,lx);
511 
512  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
513  {
514  leftFieldArguments[1] = iy;
515 
516  const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
517 
518  for (int jx=0; jx<rightFieldBounds_[0]; jx++)
519  {
520  rightFieldArguments[0] = jx;
521  const Scalar & rightValue = rightFields_x(jx,lx);
522 
523  for (int jy=0; jy<rightFieldBounds_[1]; jy++)
524  {
525  rightFieldArguments[1] = jy;
526  const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
527 
528  const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
529 
530  const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
531  Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
532 
533  // compute enumeration indices to get field indices into output view
534  int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
535  int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
536  const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
537  const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
538 
539  if (integralViewRank == 3)
540  {
541 // if ((leftFieldIndex==0) && (rightFieldIndex==2))
542 // {
543 // using std::cout;
544 // using std::endl;
545 // cout << "***** Contribution to (0,0,2) *****\n";
546 // cout << "lx = " << lx << endl;
547 // cout << "ix = " << ix << endl;
548 // cout << "iy = " << iy << endl;
549 // cout << "jx = " << jx << endl;
550 // cout << "jy = " << jy << endl;
551 // cout << "iz = " << iz << endl;
552 // cout << "jz = " << jz << endl;
553 // cout << " leftValue = " << leftValue << endl;
554 // cout << "rightValue = " << rightValue << endl;
555 // cout << "Gy = " << Gy << endl;
556 //
557 // cout << endl;
558 // }
559 
560  // shape (C,F1,F2)
561  integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
562  }
563  else
564  {
565  // shape (F1,F2)
566  integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
567  }
568  // approximateFlopCount += 3; // 2 multiplies, 1 sum
569  } // jy
570  } // ix
571  } // iy
572  } // ix
573  } // lx
574  }); // TeamThreadRange parallel_for - (iz,jz) loop
575  }
576  }
577 // std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
578  }
579 
580  template<size_t numTensorComponents>
581  KOKKOS_INLINE_FUNCTION
582  void run( const TeamMember & teamMember ) const
583  {
584  Kokkos::Array<int,numTensorComponents> pointBounds;
585  Kokkos::Array<int,numTensorComponents> leftFieldBounds;
586  Kokkos::Array<int,numTensorComponents> rightFieldBounds;
587  for (unsigned r=0; r<numTensorComponents; r++)
588  {
589  pointBounds[r] = pointBounds_[r];
590  leftFieldBounds[r] = leftFieldBounds_[r];
591  rightFieldBounds[r] = rightFieldBounds_[r];
592  }
593 
594  const int cellDataOrdinal = teamMember.league_rank();
595  const int numThreads = teamMember.team_size(); // num threads
596  const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread
597  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView; // for caching partial integration values
598  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
599  Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values for faster access
600  if (fad_size_output_ > 0) {
601  scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
602  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
603  leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_, fad_size_output_);
604  rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
605  }
606  else {
607  scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
608  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
609  leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_);
610  rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
611  }
612 
613 // int approximateFlopCount = 0;
614 // int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;
615 
616  constexpr int R = numTensorComponents - 1;
617 
618  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
619  {
620  const int a = a_offset_ + a_component;
621  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
622  {
623  const int b = b_offset_ + b_component;
624 
625  const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.getTensorComponent(R);
626  const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);
627 
628  const int numLeftFieldsFinal = leftFinalComponent.extent_int(0); // shape (F,P[,D])
629  const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])
630 
631  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (const int& r) {
632  const Data<Scalar,DeviceType> & leftTensorComponent = leftComponent_.getTensorComponent(r);
633  const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
634  const int leftFieldCount = leftTensorComponent.extent_int(0);
635  const int pointCount = leftTensorComponent.extent_int(1);
636  const int leftRank = leftTensorComponent.rank();
637  const int rightFieldCount = rightTensorComponent.extent_int(0);
638  const int rightRank = rightTensorComponent.rank();
639  for (int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
640  {
641  // slightly weird logic here in effort to avoid branch divergence
642  const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
643  for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
644  {
645  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
646  leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
647  }
648  }
649  for (int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
650  {
651  // slightly weird logic here in effort to avoid branch divergence
652  const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
653  for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
654  {
655  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
656  rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
657  }
658  }
659  });
660 
661  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
662  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
663  });
664  // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
665 
666  // synchronize threads
667  teamMember.team_barrier();
668 
669  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
670  const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
671  const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
672  const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;
673 
674  // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
675  // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
676  Kokkos::Array<int,numTensorComponents> leftFieldArguments;
677  Kokkos::Array<int,numTensorComponents> rightFieldArguments;
678  rightFieldArguments[R] = j_R;
679  leftFieldArguments[R] = i_R;
680 
681  //TODO: I believe that this can be moved outside the thread parallel_for
682  for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
683  {
684  scratchView(i) = 0.0;
685  }
686  Kokkos::Array<int,numTensorComponents> pointArguments;
687  for (unsigned i=0; i<numTensorComponents; i++)
688  {
689  pointArguments[i] = 0;
690  }
691 
692  int r = R;
693  while (r >= 0)
694  {
695  // integrate in final component dimension; this is where we need the M weight, as well as the weighted measure
696  const int pointBounds_R = pointBounds[R];
697  int & pointArgument_R = pointArguments[R];
698  for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
699  {
700  Scalar * G_R;
701  if (R != 0)
702  {
703  G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
704  }
705  else
706  {
707  const int leftFieldIndex = i_R + leftFieldOrdinalOffset_;
708  const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;
709 
710  if (integralViewRank == 3)
711  {
712  // shape (C,F1,F2)
713  G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
714  }
715  else
716  {
717  // shape (F1,F2)
718  G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
719  }
720  }
721 
722  const int & pointIndex_R = pointArguments[R];
723 
724  const Scalar & leftValue = leftFields(R,i_R,pointIndex_R);
725  const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);
726 
727  int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
728 
729  *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
730 
731 // approximateFlopCount += 3; // 2 multiplies, 1 sum
732  } // pointArgument_R
733 
734  const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents_-1 means that we won't touch the [R] argument, which is treated in for loop above
735  const int r_min = (r_next >= 0) ? r_next : 0;
736 
737  for (int s=R-1; s>=r_min; s--)
738  {
739  const int & pointIndex_s = pointArguments[s];
740 
741  // want to cover all the multi-indices from s to R-1
742  for (int s1=s; s1<R; s1++)
743  {
744  leftFieldArguments[s1] = 0;
745  }
746 
747  // i_s, j_s are the indices into the "current" tensor component; these are references, so they actually vary as the arguments are incremented.
748  const int & i_s = leftFieldArguments[s];
749  const int & j_s = rightFieldArguments[s];
750 
751  int sLeft = s; // hereafter, sLeft is the return from the left field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indexes from rank s through R-1 (inclusive).
752  const int & rightEnumerationSpan_s = rightFieldRelativeEnumerationSpans_[s];
753  const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];
754 
755  while (sLeft >= s)
756  {
757  const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);
758 
759  // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
760  const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);
761 
762  const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);
763 
764  for (int s1=s; s1<R; s1++)
765  {
766  rightFieldArguments[s1] = 0;
767  }
768  int sRight = s; // hereafter, sRight is the return from the leftFieldTensorIterator: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indexes from rank s through R-1 (inclusive).
769  while (sRight >= s)
770  {
771  const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);
772 
773  // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
774  const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);
775 
776  const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);
777 
778  const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;
779 
780  Scalar* G_s;
781 
782  // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage
783  // (above, we have set the leftEnumerationIndex_sp, rightEnumerationIndex_sp to be 0 in this case, so G_sp_index will then also be 0)
784  const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;
785 
786  const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);
787 
788 
789  if (s != 0)
790  {
791  G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
792  }
793  else
794  {
795  // compute enumeration indices
796  int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
797  int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
798  const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
799  const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
800 
801  if (integralViewRank == 3)
802  {
803  // shape (C,F1,F2)
804  G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
805  }
806  else
807  {
808  // shape (F1,F2)
809  G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
810  }
811  }
812 
813  *G_s += leftValue * G_sp * rightValue;
814 
815 // approximateFlopCount += 3; // 2 multiplies, 1 sum
816 
817  // increment rightField
818  sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
819  }
820 
821  // increment leftField
822  sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
823  }
824  }
825 
826  // clear tempStorage for r_next+1 to R
827  if (r_min+1 <= R)
828  {
829  const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
830  for (int i=scratchOffsetForThread; i<endIndex; i++)
831  {
832  scratchView(i) = 0.0;
833  }
834 // auto tempStorageSubview = Kokkos::subview(scratchView, Kokkos::pair<int,int>{0,offsetsForComponentOrdinal_[r_min]});
835 // Kokkos::deep_copy(tempStorageSubview, 0.0);
836  }
837 
838  // proceed to next point
839  r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents_-1 means that we won't touch the [R] argument, which is treated in the G_R integration for loop above
840  }
841  }); // TeamThreadRange parallel_for
842  }
843  }
844 // std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
845  }
846 
847  KOKKOS_INLINE_FUNCTION
848  void operator()( const TeamMember & teamMember ) const
849  {
850  switch (numTensorComponents_)
851  {
852  case 1: run<1>(teamMember); break;
853  case 2: run<2>(teamMember); break;
854  case 3:
855  if (forceNonSpecialized_)
856  run<3>(teamMember);
857  else
858  runSpecialized3(teamMember);
859  break;
860  case 4: run<4>(teamMember); break;
861  case 5: run<5>(teamMember); break;
862  case 6: run<6>(teamMember); break;
863  case 7: run<7>(teamMember); break;
864 #ifdef INTREPID2_HAVE_DEBUG
865  default:
866  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
867 #endif
868  }
869  }
870 
873  {
874  // compute flop count on a single cell, then multiply by the number of cells
875  int flopCount = 0;
876 
877  const int R = numTensorComponents_ - 1;
878 
879  // we cache the value of M_ab * cellMeasure at each point.
880  // access to cellMeasures involves cellMeasures.numTensorComponents() - 1 multiplies, so total is the below:
881  const int flopsPerPoint_ab = cellMeasures_.numTensorComponents(); // the access involves multiplying all the components together
882 
883  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
884  {
885  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
886  {
887  const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.getTensorComponent(R);
888  const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);
889 
890  const int numLeftFieldsFinal = leftFinalComponent.extent_int(0); // shape (F,P[,D])
891  const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])
892 
893  flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);
894 
895  int flopsPer_i_R_j_R = 0;
896 
897  // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
898  // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
899  Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
900  leftFieldArguments[R] = 0;
901 
902  Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
903  for (int i=0; i<numTensorComponents_; i++)
904  {
905  pointArguments[i] = 0;
906  }
907 
908  // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
909  // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
910  Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
911  rightFieldArguments[R] = 0;
912 
913  int r = R;
914  while (r >= 0)
915  {
916  // integrate in final component dimension
917  for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
918  {
919  flopsPer_i_R_j_R += 4;
920  }
921  // TODO: figure out why the below is not the same thing as the above -- the below overcounts, somehow
922 // if (0 < pointBounds_[R])
923 // {
924 // flopsPer_i_R_j_R += pointBounds_[R] * 4;
925 // }
926 
927  const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
928  const int r_min = (r_next >= 0) ? r_next : 0;
929 
930  for (int s=R-1; s>=r_min; s--)
931  {
932  // want to cover all the multi-indices from s to R-1: for each we have 2 multiplies and one add (3 flops)
933  int numLeftIterations = leftFieldRelativeEnumerationSpans_[s];
934  int numRightIterations = rightFieldRelativeEnumerationSpans_[s];
935 
936  flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
937  }
938 
939  // proceed to next point
940  r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
941  }
942 
943  flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
944  }
945  }
946 // std::cout << "flop count per cell: " << flopCount << std::endl;
947  return flopCount;
948  }
949 
951  int teamSize(const int &maxTeamSizeFromKokkos) const
952  {
953  const int R = numTensorComponents_ - 1;
954  const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
955  return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
956  }
957 
959  size_t team_shmem_size (int team_size) const
960  {
961  // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current-cell's field values
962  size_t shmem_size = 0;
963 
964  if (fad_size_output_ > 0)
965  {
966  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
967  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1), fad_size_output_);
968  shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_, fad_size_output_);
969  shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
970  }
971  else
972  {
973  shmem_size += Kokkos::View<Scalar*, DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
974  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1));
975  shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_);
976  shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
977  }
978 
979  return shmem_size;
980  }
981  };
982 
992  template<class Scalar, class DeviceType, int integralViewRank>
994  {
995  using ExecutionSpace = typename DeviceType::execution_space;
996  using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
997  using TeamMember = typename TeamPolicy::member_type;
998 
999  using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
1000  IntegralViewType integralView_;
1001  TensorData<Scalar,DeviceType> leftComponent_;
1002  Data<Scalar,DeviceType> composedTransform_;
1003  TensorData<Scalar,DeviceType> rightComponent_;
1004  TensorData<Scalar,DeviceType> cellMeasures_;
1005  int a_offset_;
1006  int b_offset_;
1007  int leftComponentSpan_; // leftComponentSpan tracks the dimensions spanned by the left component
1008  int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
1009  int numTensorComponents_;
1010  int leftFieldOrdinalOffset_;
1011  int rightFieldOrdinalOffset_;
1012 
1013  size_t fad_size_output_ = 0; // 0 if not a fad type
1014 
1015  // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
1016  // (this also makes it easier to reorder loops, etc., for further optimizations)
1017  Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
1018  Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
1019  Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
1020 
1021  int maxFieldsLeft_;
1022  int maxFieldsRight_;
1023  int maxPointCount_;
1024  public:
1026  TensorData<Scalar,DeviceType> leftComponent,
1027  Data<Scalar,DeviceType> composedTransform,
1028  TensorData<Scalar,DeviceType> rightComponent,
1029  TensorData<Scalar,DeviceType> cellMeasures,
1030  int a_offset,
1031  int b_offset,
1032  int leftFieldOrdinalOffset,
1033  int rightFieldOrdinalOffset)
1034  :
1035  integralView_(integralData.template getUnderlyingView<integralViewRank>()),
1036  leftComponent_(leftComponent),
1037  composedTransform_(composedTransform),
1038  rightComponent_(rightComponent),
1039  cellMeasures_(cellMeasures),
1040  a_offset_(a_offset),
1041  b_offset_(b_offset),
1042  leftComponentSpan_(leftComponent.extent_int(2)),
1043  rightComponentSpan_(rightComponent.extent_int(2)),
1044  numTensorComponents_(leftComponent.numTensorComponents()),
1045  leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
1046  rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
1047  {
1048  INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");
1049 
1050  const int FIELD_DIM = 0;
1051  const int POINT_DIM = 1;
1052  maxFieldsLeft_ = 0;
1053  maxFieldsRight_ = 0;
1054  maxPointCount_ = 0;
1055  for (int r=0; r<numTensorComponents_; r++)
1056  {
1057  leftFieldBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
1058  maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
1059  rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
1060  maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
1061  pointBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
1062  maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
1063  }
1064 
1065  // prepare for allocation of temporary storage
1066  // note: tempStorage goes "backward", starting from the final component, which needs just one entry
1067 
1068  const bool allocateFadStorage = !std::is_pod<Scalar>::value;
1069  if (allocateFadStorage)
1070  {
1071  fad_size_output_ = dimension_scalar(integralView_);
1072  }
1073  }
1074 
1075  template<size_t maxComponents, size_t numComponents = maxComponents>
1076  KOKKOS_INLINE_FUNCTION
1077  int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
1078  const Kokkos::Array<int,maxComponents> &bounds) const
1079  {
1080  if (numComponents == 0) return -1;
1081  int r = static_cast<int>(numComponents - 1);
1082  while (arguments[r] + 1 >= bounds[r])
1083  {
1084  arguments[r] = 0; // reset
1085  r--;
1086  if (r < 0) break;
1087  }
1088  if (r >= 0) ++arguments[r];
1089  return r;
1090  }
1091 
1093  KOKKOS_INLINE_FUNCTION
1094  int incrementArgument( Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
1095  const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1096  const int &numComponents) const
1097  {
1098  if (numComponents == 0) return -1;
1099  int r = static_cast<int>(numComponents - 1);
1100  while (arguments[r] + 1 >= bounds[r])
1101  {
1102  arguments[r] = 0; // reset
1103  r--;
1104  if (r < 0) break;
1105  }
1106  if (r >= 0) ++arguments[r];
1107  return r;
1108  }
1109 
1110  template<size_t maxComponents, size_t numComponents = maxComponents>
1111  KOKKOS_INLINE_FUNCTION
1112  int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
1113  const Kokkos::Array<int,maxComponents> &bounds) const
1114  {
1115  if (numComponents == 0) return -1;
1116  int r = static_cast<int>(numComponents - 1);
1117  while (arguments[r] + 1 >= bounds[r])
1118  {
1119  r--;
1120  if (r < 0) break;
1121  }
1122  return r;
1123  }
1124 
1126  KOKKOS_INLINE_FUNCTION
1127  int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
1128  const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1129  const int &numComponents) const
1130  {
1131  if (numComponents == 0) return -1;
1132  int r = numComponents - 1;
1133  while (arguments[r] + 1 >= bounds[r])
1134  {
1135  r--;
1136  if (r < 0) break;
1137  }
1138  return r;
1139  }
1140 
1141  template<size_t maxComponents, size_t numComponents = maxComponents>
1142  KOKKOS_INLINE_FUNCTION
1143  int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
1144  const Kokkos::Array<int,maxComponents> &bounds,
1145  const int startIndex) const
1146  {
1147  // the following mirrors what is done in TensorData
1148  if (numComponents == 0)
1149  {
1150  return 0;
1151  }
1152  int enumerationIndex = 0;
1153  for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
1154  {
1155  enumerationIndex += arguments[r];
1156  enumerationIndex *= bounds[r-1];
1157  }
1158  enumerationIndex += arguments[startIndex];
1159  return enumerationIndex;
1160  }
1161 
1162  template<int rank>
1163  KOKKOS_INLINE_FUNCTION
1164  enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
1165  integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
1166  {
1167  return integralView(cellDataOrdinal,i,j);
1168  }
1169 
1170  template<int rank>
1171  KOKKOS_INLINE_FUNCTION
1172  enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
1173  integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
1174  {
1175  return integralView(i,j);
1176  }
1177 
1179  KOKKOS_INLINE_FUNCTION
1180  void runSpecialized3( const TeamMember & teamMember ) const
1181  {
1182  constexpr int numTensorComponents = 3;
1183 
1184  const int pointBounds_x = pointBounds_[0];
1185  const int pointBounds_y = pointBounds_[1];
1186  const int pointBounds_z = pointBounds_[2];
1187  const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1188 
1189  const int leftFieldBounds_x = leftFieldBounds_[0];
1190  const int rightFieldBounds_x = rightFieldBounds_[0];
1191  const int leftFieldBounds_y = leftFieldBounds_[1];
1192  const int rightFieldBounds_y = rightFieldBounds_[1];
1193  const int leftFieldBounds_z = leftFieldBounds_[2];
1194  const int rightFieldBounds_z = rightFieldBounds_[2];
1195 
1196  Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1197  Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1198  for (unsigned r=0; r<numTensorComponents; r++)
1199  {
1200  leftFieldBounds[r] = leftFieldBounds_[r];
1201  rightFieldBounds[r] = rightFieldBounds_[r];
1202  }
1203 
1204  const auto integralView = integralView_;
1205  const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1206  const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1207 
1208  const int cellDataOrdinal = teamMember.league_rank();
1209  const int threadNumber = teamMember.team_rank();
1210 
1211  const int numThreads = teamMember.team_size(); // num threads
1212  const int GyEntryCount = pointBounds_z; // for each thread: store one Gy value per z coordinate
1213  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals; // for caching Gx values: we integrate out the first component dimension for each coordinate in the remaining dimensios
1214  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals; // for caching Gy values (each thread gets a stack, of the same height as tensorComponents - 1)
1215  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GzIntegral; // for one Gz value that we sum into before summing into the destination matrix
1216  Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1217 
1218  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1219  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1220  Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1221  if (fad_size_output_ > 0) {
1222  GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1223  GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1224  GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads, fad_size_output_);
1225  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1226 
1227  leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1228  rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1229  leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1230  rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1231  leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1232  rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1233  }
1234  else {
1235  GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1236  GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1237  GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads);
1238  pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1239 
1240  leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1241  rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1242  leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1243  rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1244  leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1245  rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1246  }
1247 
1248 // int approximateFlopCount = 0;
1249 // int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;
1250 
1251  // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1252 
1253  // synchronize threads
1254  teamMember.team_barrier();
1255 
1256  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1257  {
1258  const int a = a_offset_ + a_component;
1259  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1260  {
1261  const int b = b_offset_ + b_component;
1262 
1263  const Data<Scalar,DeviceType> & leftTensorComponent_x = leftComponent_.getTensorComponent(0);
1264  const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
1265  const Data<Scalar,DeviceType> & leftTensorComponent_y = leftComponent_.getTensorComponent(1);
1266  const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
1267  const Data<Scalar,DeviceType> & leftTensorComponent_z = leftComponent_.getTensorComponent(2);
1268  const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);
1269 
1270  const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1271  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (const int& fieldOrdinal) {
1272  if (fieldOrdinal < leftTensorComponent_x.extent_int(0))
1273  {
1274  const int pointCount = leftTensorComponent_x.extent_int(1);
1275  const int leftRank = leftTensorComponent_x.rank();
1276  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1277  {
1278  leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1279  }
1280  }
1281  if (fieldOrdinal < leftTensorComponent_y.extent_int(0))
1282  {
1283  const int pointCount = leftTensorComponent_y.extent_int(1);
1284  const int leftRank = leftTensorComponent_y.rank();
1285  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1286  {
1287  leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1288  }
1289  }
1290  if (fieldOrdinal < leftTensorComponent_z.extent_int(0))
1291  {
1292  const int pointCount = leftTensorComponent_z.extent_int(1);
1293  const int leftRank = leftTensorComponent_z.rank();
1294  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1295  {
1296  leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1297  }
1298  }
1299  if (fieldOrdinal < rightTensorComponent_x.extent_int(0))
1300  {
1301  const int pointCount = rightTensorComponent_x.extent_int(1);
1302  const int rightRank = rightTensorComponent_x.rank();
1303  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1304  {
1305  rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1306  }
1307  }
1308  if (fieldOrdinal < rightTensorComponent_y.extent_int(0))
1309  {
1310  const int pointCount = rightTensorComponent_y.extent_int(1);
1311  const int rightRank = rightTensorComponent_y.rank();
1312  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1313  {
1314  rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1315  }
1316  }
1317  if (fieldOrdinal < rightTensorComponent_z.extent_int(0))
1318  {
1319  const int pointCount = rightTensorComponent_z.extent_int(1);
1320  const int rightRank = rightTensorComponent_z.rank();
1321  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1322  {
1323  rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1324  }
1325  }
1326  });
1327 
1328  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1329  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1330  });
1331 
1332  // synchronize threads
1333  teamMember.team_barrier();
1334 
1335  for (int i0=0; i0<leftFieldBounds_x; i0++)
1336  {
1337  for (int j0=0; j0<rightFieldBounds_x; j0++)
1338  {
1339  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1340  // first component is fastest-moving; we can get a tensorPointEnumerationOffset just by multiplying by the pointBounds in x
1341  const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions; // compute offset for pointWeights container, for which x is the fastest-moving
1342 
1343  Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1344 
1345  Gx = 0;
1346  if (fad_size_output_ == 0)
1347  {
1348  // not a Fad type; we're allow to have a vector range
1349  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1350  {
1351  integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1352  }, Gx);
1353  }
1354  else
1355  {
1356  for (int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1357  {
1358  Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1359  }
1360  }
1361  });
1362 
1363  // synchronize threads
1364  teamMember.team_barrier();
1365 
1366  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (const int& i1j1) {
1367  const int i1 = i1j1 % leftFieldBounds_y;
1368  const int j1 = i1j1 / leftFieldBounds_y;
1369 
1370  int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1371 
1372  int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1373  for (int lz=0; lz<pointBounds_z; lz++)
1374  {
1375  Scalar & Gy = GyIntegrals(Gy_index);
1376  Gy = 0.0;
1377 
1378  for (int ly=0; ly<pointBounds_y; ly++)
1379  {
1380  const Scalar & leftValue = leftFields_y(i1,ly);
1381  const Scalar & rightValue = rightFields_y(j1,ly);
1382 
1383  Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1384 
1385  pointEnumerationIndex++;
1386  }
1387  Gy_index++;
1388  }
1389 
1390  Scalar & Gz = GzIntegral(threadNumber); // one entry per thread
1391  for (int i2=0; i2<leftFieldBounds_z; i2++)
1392  {
1393  for (int j2=0; j2<rightFieldBounds_z; j2++)
1394  {
1395  Gz = 0.0;
1396 
1397  int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1398 
1399  for (int lz=0; lz<pointBounds_z; lz++)
1400  {
1401  const Scalar & leftValue = leftFields_z(i2,lz);
1402  const Scalar & rightValue = rightFields_z(j2,lz);
1403 
1404  Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1405 
1406  Gy_index++;
1407  }
1408 
1409  const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1410  const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1411  // the above are an optimization of the below, undertaken on the occasion of a weird Intel compiler segfault, possibly a compiler bug.
1412 // const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset;
1413 // const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset;
1414 
1415  integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1416  }
1417  }
1418  });
1419  // synchronize threads
1420  teamMember.team_barrier();
1421  }
1422  }
1423  }
1424  }
1425  }
1426 
1427  template<size_t numTensorComponents>
1428  KOKKOS_INLINE_FUNCTION
1429  void run( const TeamMember & teamMember ) const
1430  {
1431  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "implementation incomplete");
1432 // Kokkos::Array<int,numTensorComponents> pointBounds;
1433 // Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1434 // Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1435 //
1436 // int pointsInNonzeroComponentDimensions = 1;
1437 // for (unsigned r=0; r<numTensorComponents; r++)
1438 // {
1439 // pointBounds[r] = pointBounds_[r];
1440 // if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1441 // leftFieldBounds[r] = leftFieldBounds_[r];
1442 // rightFieldBounds[r] = rightFieldBounds_[r];
1443 // }
1444 //
1445 // const int cellDataOrdinal = teamMember.league_rank();
1446 // const int numThreads = teamMember.team_size(); // num threads
1447 // const int G_k_StackHeight = numTensorComponents - 1; // per thread
1448 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_0_IntegralsView; // for caching G0 values: we integrate out the first component dimension for each coordinate in the remaining dimensios
1449 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_k_StackView; // for caching G_k values (each thread gets a stack, of the same height as tensorComponents - 1)
1450 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1451 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values at each level for faster access
1452 // if (fad_size_output_ > 0) {
1453 // G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads, fad_size_output_);
1454 // G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1455 // pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1456 // leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1457 // rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1458 // }
1459 // else {
1460 // G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads);
1461 // G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1462 // pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1463 // leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1464 // rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1465 // }
1466 //
1469 //
1470 // constexpr int R = numTensorComponents - 1;
1471 //
1472 // // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1473 //
1474 // // synchronize threads
1475 // teamMember.team_barrier();
1476 //
1477 // for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1478 // {
1479 // const int a = a_offset_ + a_component;
1480 // for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1481 // {
1482 // const int b = b_offset_ + b_component;
1483 //
1484 // const Data<Scalar,DeviceType> & leftFirstComponent = leftComponent_.getTensorComponent(0);
1485 // const Data<Scalar,DeviceType> & rightFirstComponent = rightComponent_.getTensorComponent(0);
1486 //
1487 // const int numLeftFieldsFirst = leftFirstComponent.extent_int(0); // shape (F,P[,D])
1488 // const int numRightFieldsFirst = rightFirstComponent.extent_int(0); // shape (F,P[,D])
1489 //
1490 // const int numPointsFirst = leftFirstComponent.extent_int(1);
1491 //
1492 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1493 // pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1494 // });
1495 //
1496 // // synchronize threads
1497 // teamMember.team_barrier();
1498 //
1499 // for (int i0=0; i0<numLeftFieldsFirst; i0++)
1500 // {
1501 // for (int j0=0; j0<numRightFieldsFirst; j0++)
1502 // {
1503 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1504 // const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1505 // const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1506 // leftFields(pointOrdinal) = leftFirstComponent(fieldOrdinal,pointOrdinal);
1507 // });
1508 //
1509 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numRightFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1510 // const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1511 // const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1512 // rightFields(pointOrdinal) = rightFirstComponent(fieldOrdinal,pointOrdinal);
1513 // });
1514 //
1515 // // synchronize threads
1516 // teamMember.team_barrier();
1517 //
1518 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1519 // Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1520 // int remainingIndex = pointEnumerationIndexLaterDimensions;
1521 //
1522 // for (int d=R-1; d>0; d--) // last component (z in 3D hypercube) is fastest-moving // TODO: consider doing first component as fastest-moving. That would make indexing into pointWeights simpler
1523 // {
1524 // pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1];
1525 // remainingIndex /= pointBounds[d+1];
1526 // }
1527 // pointArgumentsInLaterDimensions[0] = remainingIndex;
1528 //
1529 // int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1530 // for (int d=R; d>0; d--)
1531 // {
1532 // tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1533 // tensorPointEnumerationOffset *= pointBounds[d-1];
1534 // }
1535 //
1536 // Scalar integralValue = 0;
1537 // if (fad_size_output_ == 0)
1538 // {
1539 // // not a Fad type; we're allow to have a vector range
1540 // Kokkos::parallel_reduce("first component integral", Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1541 // {
1542 // integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1543 // }, integralValue);
1544 // }
1545 // else
1546 // {
1547 // for (int pointOrdinal=0; pointOrdinal<numPointsFirst; pointOrdinal++)
1548 // {
1549 // integralValue += leftFields(pointOrdinal) * rightFields(pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1550 // }
1551 // }
1552 //
1553 // G_0_IntegralsView(pointEnumerationIndexLaterDimensions) = integralValue;
1554 // });
1555 //
1556 // // synchronize threads
1557 // teamMember.team_barrier();
1558 //
1559 // // TODO: finish this, probably after having written up the algorithm for arbitrary component count. (I have it written down for 3D.)
1560 // }
1561 // }
1562 // }
1563 // }
1565  }
1566 
1567  KOKKOS_INLINE_FUNCTION
1568  void operator()( const TeamMember & teamMember ) const
1569  {
1570  switch (numTensorComponents_)
1571  {
1572  case 1: run<1>(teamMember); break;
1573  case 2: run<2>(teamMember); break;
1574  case 3: runSpecialized3(teamMember); break;
1575  case 4: run<4>(teamMember); break;
1576  case 5: run<5>(teamMember); break;
1577  case 6: run<6>(teamMember); break;
1578  case 7: run<7>(teamMember); break;
1579 #ifdef INTREPID2_HAVE_DEBUG
1580  default:
1581  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
1582 #endif
1583  }
1584  }
1585 
1588  {
1589  // compute flop count on a single cell
1590  int flopCount = 0;
1591 
1592  constexpr int numTensorComponents = 3;
1593  Kokkos::Array<int,numTensorComponents> pointBounds;
1594  Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1595  Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1596 
1597  int pointsInNonzeroComponentDimensions = 1;
1598  for (unsigned r=0; r<numTensorComponents; r++)
1599  {
1600  pointBounds[r] = pointBounds_[r];
1601  if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1602  leftFieldBounds[r] = leftFieldBounds_[r];
1603  rightFieldBounds[r] = rightFieldBounds_[r];
1604  }
1605 
1606  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1607  {
1608  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1609  {
1610 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1611 // pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1612 // });
1613  flopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1614 
1615  for (int i0=0; i0<leftFieldBounds[0]; i0++)
1616  {
1617  for (int j0=0; j0<rightFieldBounds[0]; j0++)
1618  {
1619  flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3; // 3 flops per integration point in the loop commented out below
1620 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1621 // Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1622 // int remainingIndex = pointEnumerationIndexLaterDimensions;
1623 //
1624 // for (int d=0; d<R-1; d++) // first component is fastest-moving; this is determined by order of access in the lz/ly loop to compute Gy (integrals in y dimension)
1625 // {
1626 // pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1]; // d+1 because x dimension is being integrated away
1627 // remainingIndex /= pointBounds[d+1];
1628 // }
1629 // pointArgumentsInLaterDimensions[R-1] = remainingIndex;
1630 //
1631 // int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1632 // for (int d=R; d>0; d--)
1633 // {
1634 // tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1635 // tensorPointEnumerationOffset *= pointBounds[d-1];
1636 // }
1637 //
1638 // Scalar integralValue = 0;
1639 // if (fad_size_output_ == 0)
1640 // {
1641 // // not a Fad type; we're allow to have a vector range
1642 // Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1643 // {
1644 // integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1645 // }, integralValue);
1646 // }
1647 // else
1648 // {
1649 // for (int x_pointOrdinal=0; x_pointOrdinal<numPointsFirst; x_pointOrdinal++)
1650 // {
1651 // integralValue += leftFields_x(x_pointOrdinal) * rightFields_x(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1652 // }
1653 // }
1654 //
1655 // GxIntegrals(pointEnumerationIndexLaterDimensions) = integralValue;
1656 // });
1657 
1658 
1659  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3; // 3 flops for each Gy += line in the below
1660  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3; // 3 flops for each Gz += line in the below
1661  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1; // 1 flops for the integralView += line below
1662 
1663 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds[1] * rightFieldBounds[1]), [&] (const int& i1j1) {
1664 // const int i1 = i1j1 % leftFieldBounds[1];
1665 // const int j1 = i1j1 / leftFieldBounds[1];
1666 //
1668 //
1669 // int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1670 // for (int lz=0; lz<pointBounds[2]; lz++)
1671 // {
1672 // Scalar & Gy = GyIntegrals(Gy_index);
1673 // Gy = 0.0;
1674 //
1675 // const bool leftRankIs3 = ( leftFields_y.rank() == 3);
1676 // const bool rightRankIs3 = (rightFields_y.rank() == 3);
1677 // for (int ly=0; ly<pointBounds[1]; ly++)
1678 // {
1679 // const Scalar & leftValue = leftRankIs3 ? leftFields_y(i1,ly,a_component) : leftFields_y(i1,ly);
1680 // const Scalar & rightValue = rightRankIs3 ? rightFields_y(j1,ly,b_component) : rightFields_y(j1,ly);
1681 //
1682 // Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1683 //
1684 // pointEnumerationIndex++;
1685 // }
1686 // Gy_index++;
1687 // }
1688 //
1689 // for (int i2=0; i2<leftFieldBounds[2]; i2++)
1690 // {
1691 // for (int j2=0; j2<rightFieldBounds[2]; j2++)
1692 // {
1693 // Scalar Gz = 0.0;
1694 //
1695 // int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1696 //
1697 // const bool leftRankIs3 = ( leftFields_z.rank() == 3);
1698 // const bool rightRankIs3 = (rightFields_z.rank() == 3);
1699 // for (int lz=0; lz<pointBounds[2]; lz++)
1700 // {
1701 // const Scalar & leftValue = leftRankIs3 ? leftFields_z(i2,lz,a_component) : leftFields_z(i2,lz);
1702 // const Scalar & rightValue = rightRankIs3 ? rightFields_z(j2,lz,b_component) : rightFields_z(j2,lz);
1703 //
1704 // Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1705 //
1706 // Gy_index++;
1707 // }
1708 //
1709 // Kokkos::Array<int,3> leftArguments {i0,i1,i2};
1710 // Kokkos::Array<int,3> rightArguments {j0,j1,j2};
1711 //
1712 // const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset_;
1713 // const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset_;
1714 //
1715 // if (integralViewRank == 2)
1716 // {
1717 // integralView_.access(i,j,0) += Gz;
1718 // }
1719 // else
1720 // {
1721 // integralView_.access(cellDataOrdinal,i,j) += Gz;
1722 // }
1723 // }
1724 // }
1725 // });
1726  }
1727  }
1728  }
1729  }
1730  return flopCount;
1731  }
1732 
1734  int teamSize(const int &maxTeamSizeFromKokkos) const
1735  {
1736  // TODO: fix this to match the actual parallelism expressed
1737  const int R = numTensorComponents_ - 1;
1738  const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1739  return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1740  }
1741 
1743  size_t team_shmem_size (int numThreads) const
1744  {
1745  // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current-cell's field values
1746  size_t shmem_size = 0;
1747 
1748  const int GyEntryCount = pointBounds_[2]; // for each thread: store one Gy value per z coordinate
1749 
1750  int pointsInNonzeroComponentDimensions = 1;
1751  for (int d=1; d<numTensorComponents_; d++)
1752  {
1753  pointsInNonzeroComponentDimensions *= pointBounds_[d];
1754  }
1755 
1756  if (fad_size_output_ > 0)
1757  {
1758  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_); // GxIntegrals: entries with x integrated away
1759  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_); // GyIntegrals: entries with x,y integrated away
1760  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads, fad_size_output_); // GzIntegral: entry with x,y,z integrated away
1761  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1), fad_size_output_); // pointWeights
1762 
1763  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_); // leftFields_x
1764  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_); // rightFields_x
1765  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_); // leftFields_y
1766  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_); // rightFields_y
1767  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_); // leftFields_z
1768  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_); // rightFields_z
1769  }
1770  else
1771  {
1772  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions); // GxIntegrals: entries with x integrated away
1773  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads); // GyIntegrals: entries with x,y integrated away
1774  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads); // GzIntegral: entry with x,y,z integrated away
1775  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1)); // pointWeights
1776 
1777  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]); // leftFields_x
1778  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]); // rightFields_x
1779  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]); // leftFields_y
1780  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]); // rightFields_y
1781  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]); // leftFields_z
1782  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]); // rightFields_z
1783  }
1784 
1785  return shmem_size;
1786  }
1787  };
1788  }
1789 
1790 template<typename DeviceType>
1791 template<class Scalar>
1793  const TensorData<Scalar,DeviceType> cellMeasures,
1794  const TransformedBasisValues<Scalar,DeviceType> vectorDataRight)
1795 {
1796  // allocates a (C,F,F) container for storing integral data
1797 
1798  // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1799  // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1800  const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1801  const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1802  TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1803 
1804  // determine cellDataExtent and variation type. We currently support CONSTANT, MODULAR, and GENERAL as possible output variation types, depending on the inputs.
1805  // If cellMeasures has non-trivial tensor structure, the rank-1 cell Data object is the first component.
1806  // If cellMeasures has trivial tensor structure, then the first and only component has the cell index in its first dimension.
1807  // I.e., either way the relevant Data object is cellMeasures.getTensorComponent(0)
1808  const int CELL_DIM = 0;
1809  const auto cellMeasureData = cellMeasures.getTensorComponent(0);
1810  const auto leftTransform = vectorDataLeft.transform();
1811 
1812  DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1813  combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.transform().getDimensionInfo(CELL_DIM));
1814  combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.transform().getDimensionInfo(CELL_DIM));
1815 
1816  DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1817  int cellDataExtent = combinedCellDimInfo.dataExtent;
1818 
1819  const int numCells = vectorDataLeft.numCells();
1820  const int numFieldsLeft = vectorDataLeft.numFields();
1821  const int numFieldsRight = vectorDataRight.numFields();
1822 
1823  Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1824  Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
1825 
1826  if (cellVariationType != CONSTANT)
1827  {
1828  Kokkos::View<Scalar***,DeviceType> data("Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1829  return Data<Scalar,DeviceType>(data, extents, variationTypes);
1830  }
1831  else
1832  {
1833  Kokkos::View<Scalar**,DeviceType> data("Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1834  return Data<Scalar,DeviceType>(data, extents, variationTypes);
1835  }
1836 }
1837 
1841 template<typename DeviceType>
1842 template<class Scalar>
1844  const TensorData<Scalar,DeviceType> & cellMeasures,
1845  const TransformedBasisValues<Scalar,DeviceType> & basisValuesRight, const bool sumInto,
1846  double* approximateFlops)
1847 {
1848  using ExecutionSpace = typename DeviceType::execution_space;
1849 
1850  // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1851  // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1852  const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1853  const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
1854  TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1855 
1856  if (approximateFlops != NULL)
1857  {
1858  *approximateFlops = 0;
1859  }
1860 
1861  // integral data may have shape (C,F1,F2) or (if the variation type is CONSTANT in the cell dimension) shape (F1,F2)
1862  const int integralViewRank = integrals.getUnderlyingViewRank();
1863 
1864  if (!sumInto)
1865  {
1866  integrals.clear();
1867  }
1868 
1869  const int cellDataExtent = integrals.getDataExtent(0);
1870  const int numFieldsLeft = basisValuesLeft.numFields();
1871  const int numFieldsRight = basisValuesRight.numFields();
1872  const int spaceDim = basisValuesLeft.spaceDim();
1873 
1874  INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.spaceDim() != basisValuesRight.spaceDim(), std::invalid_argument, "vectorDataLeft and vectorDataRight must agree on the space dimension");
1875 
1876  const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies();
1877  const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
1878 
1879  // we require that the number of tensor components in the vectors are the same for each vector entry
1880  // this is not strictly necessary, but it makes implementation easier, and we don't at present anticipate other use cases
1881  int numTensorComponentsLeft = -1;
1882  const bool isVectorValued = basisValuesLeft.vectorData().isValid();
1883  if (isVectorValued)
1884  {
1885  const bool rightIsVectorValued = basisValuesRight.vectorData().isValid();
1886  INTREPID2_TEST_FOR_EXCEPTION(!rightIsVectorValued, std::invalid_argument, "left and right must either both be vector-valued, or both scalar-valued");
1887  const auto &refVectorLeft = basisValuesLeft.vectorData();
1888  int numFamiliesLeft = refVectorLeft.numFamilies();
1889  int numVectorComponentsLeft = refVectorLeft.numComponents();
1890  Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1891  Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
1892  for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1893  {
1894  for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1895  {
1896  const TensorData<Scalar,DeviceType> &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent);
1897  if (tensorData.numTensorComponents() > 0)
1898  {
1899  if (numTensorComponentsLeft == -1)
1900  {
1901  numTensorComponentsLeft = tensorData.numTensorComponents();
1902  }
1903  INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1904  for (int r=0; r<numTensorComponentsLeft; r++)
1905  {
1906  maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]);
1907  }
1908  }
1909  }
1910  }
1911  int numTensorComponentsRight = -1;
1912  const auto &refVectorRight = basisValuesRight.vectorData();
1913  int numFamiliesRight = refVectorRight.numFamilies();
1914  int numVectorComponentsRight = refVectorRight.numComponents();
1915  for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
1916  {
1917  for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
1918  {
1919  const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
1920  if (tensorData.numTensorComponents() > 0)
1921  {
1922  if (numTensorComponentsRight == -1)
1923  {
1924  numTensorComponentsRight = tensorData.numTensorComponents();
1925  }
1926  INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other");
1927  for (int r=0; r<numTensorComponentsRight; r++)
1928  {
1929  maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
1930  }
1931  }
1932  }
1933  }
1934  INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument, "Left and right vector entries must have the same number of tensorial components");
1935  }
1936  else
1937  {
1938  numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents(); // family ordinal 0
1939  for (int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
1940  {
1941  INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "All families must match in the number of tensor components");
1942  }
1943 
1944  // check that right tensor component count also agrees
1945  for (int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
1946  {
1947  INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
1948  }
1949  }
1950  const int numPointTensorComponents = cellMeasures.numTensorComponents() - 1;
1951 
1952  if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.axisAligned() && basisValuesRight.axisAligned())
1953  {
1954  // cellMeasures is a non-trivial tensor product, and the pullbacks are all diagonals.
1955 
1956  // in this case, the integrals in each tensorial direction are entirely separable
1957  // allocate some temporary storage for the integrals in each tensorial direction
1958 
1959  // if cellMeasures is a nontrivial tensor product, that means that all cells have the same shape, up to scaling.
1960 
1961  Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
1962  for (int r=0; r<numPointTensorComponents; r++)
1963  {
1964  // first tensorial component of cell measures is the cell dimension; after that we have (P1,P2,…)
1965  pointDimensions[r] = cellMeasures.getTensorComponent(r+1).extent_int(0);
1966  }
1967 
1968  // only one of these will be a valid container:
1969  Kokkos::View<Scalar**, DeviceType> integralView2;
1970  Kokkos::View<Scalar***, DeviceType> integralView3;
1971  if (integralViewRank == 3)
1972  {
1973  integralView3 = integrals.getUnderlyingView3();
1974  }
1975  else
1976  {
1977  integralView2 = integrals.getUnderlyingView2();
1978  }
1979  for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
1980  {
1981  int a_offset = 0; // left vector component offset
1982  int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
1983 
1984  const int leftVectorComponentCount = isVectorValued ? basisValuesLeft.vectorData().numComponents() : 1;
1985  for (int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
1986  {
1987  TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal)
1988  : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
1989  if (!leftComponent.isValid())
1990  {
1991  a_offset++; // empty components are understood to take up one dimension
1992  continue;
1993  }
1994  const int leftDimSpan = leftComponent.extent_int(2);
1995 
1996  const int leftComponentFieldCount = leftComponent.extent_int(0);
1997 
1998  for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
1999  {
2000  int b_offset = 0; // right vector component offset
2001  int rightFieldOffset = basisValuesRight.vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
2002 
2003  const int rightVectorComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2004  for (int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2005  {
2006  TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal)
2007  : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2008  if (!rightComponent.isValid())
2009  {
2010  b_offset++; // empty components are understood to take up one dimension
2011  continue;
2012  }
2013  const int rightDimSpan = rightComponent.extent_int(2);
2014 
2015  const int rightComponentFieldCount = rightComponent.extent_int(0);
2016 
2017  // we only accumulate for a == b (since this is the axis-aligned case). Do the a, b spans intersect?
2018  if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset)) // no intersection
2019  {
2020  b_offset += rightDimSpan;
2021 
2022  continue;
2023  }
2024 
2025  // if the a, b spans intersect, by assumption they should align exactly
2026  INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error, "left and right dimension offsets should match.");
2027  INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2028 
2029  const int d_start = a_offset;
2030  const int d_end = d_start + leftDimSpan;
2031 
2032  using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2033  ComponentIntegralsArray componentIntegrals;
2034  for (int r=0; r<numPointTensorComponents; r++)
2035  {
2036  /*
2037  Four vector data cases to consider:
2038  1. Both vector data containers are filled with axial components - first component in 3D has form (f,0,0), second (0,f,0), third (0,0,f).
2039  2. Both vector data containers have arbitrary components - in 3D: (f1,f2,f3) where f1 is given by the first component, f2 by the second, f3 by the third.
2040  3. First container is axial, second arbitrary.
2041  4. First is arbitrary, second axial.
2042 
2043  But note that in all four cases, the structure of the integral is the same: you have three vector component integrals that get summed. The actual difference between
2044  the cases does not show up in the reference-space integrals here, but in the accumulation in physical space below, where the tensor field numbering comes into play.
2045 
2046  The choice between axial and arbitrary affects the way the fields are numbered; the arbitrary components' indices refer to the same vector function, so they correspond,
2047  while the axial components refer to distinct scalar functions, so their numbering in the data container is cumulative.
2048  */
2049 
2050  Data<Scalar,DeviceType> quadratureWeights = cellMeasures.getTensorComponent(r+1);
2051  const int numPoints = pointDimensions[r];
2052 
2053  // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well. (And we could avoid some computations here.)
2054 
2055  Data<Scalar,DeviceType> leftTensorComponent = leftComponent.getTensorComponent(r);
2056  Data<Scalar,DeviceType> rightTensorComponent = rightComponent.getTensorComponent(r);
2057 
2058  const int leftTensorComponentRank = leftTensorComponent.rank();
2059  const int leftTensorComponentDimSpan = leftTensorComponent.extent_int(2);
2060  const int leftTensorComponentFields = leftTensorComponent.extent_int(0);
2061  const int rightTensorComponentRank = rightTensorComponent.rank();
2062  const int rightTensorComponentDimSpan = rightTensorComponent.extent_int(2);
2063  const int rightTensorComponentFields = rightTensorComponent.extent_int(0);
2064 
2065  INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2066 
2067  for (int d=d_start; d<d_end; d++)
2068  {
2069  const bool allocateFadStorage = !std::is_pod<Scalar>::value;
2070  if (allocateFadStorage)
2071  {
2072  auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
2073  componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2074  }
2075  else
2076  {
2077  componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2078  }
2079 
2080  auto componentIntegralView = componentIntegrals[r][d];
2081 
2082  auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2083 
2084  for (int a=0; a<leftTensorComponentDimSpan; a++)
2085  {
2086  Kokkos::parallel_for("compute componentIntegrals", policy,
2087  KOKKOS_LAMBDA (const int &i, const int &j) {
2088  Scalar refSpaceIntegral = 0.0;
2089  for (int ptOrdinal=0; ptOrdinal<numPoints; ptOrdinal++)
2090  {
2091  const Scalar & leftValue = ( leftTensorComponentRank == 2) ? leftTensorComponent(i,ptOrdinal) : leftTensorComponent(i,ptOrdinal,a);
2092  const Scalar & rightValue = (rightTensorComponentRank == 2) ? rightTensorComponent(j,ptOrdinal) : rightTensorComponent(j,ptOrdinal,a);
2093  refSpaceIntegral += leftValue * rightValue * quadratureWeights(ptOrdinal);
2094  }
2095  componentIntegralView(i,j) = refSpaceIntegral;
2096  });
2097  }
2098 
2099  if (approximateFlops != NULL)
2100  {
2101  *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3); // two multiplies, one add in innermost loop
2102  }
2103  } // d
2104  } // r
2105 
2106  ExecutionSpace().fence();
2107 
2108  Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount}; // separately declared in effort to get around Intel 17.0.1 compiler weirdness.
2109  Kokkos::Array<int,3> lowerBounds {0,0,0};
2110  auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2111  // TODO: note that for best performance, especially with Fad types, we should replace this parallel for with a Functor and use hierarchical parallelism
2112  Kokkos::parallel_for("compute field integrals", policy,
2113  KOKKOS_LAMBDA (const int &cellDataOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
2114  const Scalar & cellMeasureWeight = cellMeasures.getTensorComponent(0)(cellDataOrdinal);
2115 
2116  TensorArgumentIterator leftTensorIterator(leftComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2117  leftTensorIterator.setEnumerationIndex(leftFieldOrdinal);
2118 
2119  TensorArgumentIterator rightTensorIterator(rightComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2120  rightTensorIterator.setEnumerationIndex(rightFieldOrdinal);
2121 
2122  Scalar integralSum = 0.0;
2123  for (int d=d_start; d<d_end; d++)
2124  {
2125  const Scalar & transformLeft_d = basisValuesLeft.transformWeight(cellDataOrdinal,0,d,d);
2126  const Scalar & transformRight_d = basisValuesRight.transformWeight(cellDataOrdinal,0,d,d);
2127 
2128  const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2129  // approximateFlopCount++;
2130 
2131  Scalar integral_d = 1.0;
2132 
2133  for (int r=0; r<numPointTensorComponents; r++)
2134  {
2135  integral_d *= componentIntegrals[r][d](leftTensorIterator.argument(r),rightTensorIterator.argument(r));
2136  // approximateFlopCount++; // product
2137  }
2138  integralSum += leftRightTransform_d * integral_d;
2139  // approximateFlopCount += 2; // multiply and sum
2140 
2141  const int i = leftFieldOrdinal + leftFieldOffset;
2142  const int j = rightFieldOrdinal + rightFieldOffset;
2143 
2144  if (integralViewRank == 3)
2145  {
2146  integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2147  }
2148  else
2149  {
2150  integralView2(i,j) += cellMeasureWeight * integralSum;
2151  }
2152  }
2153  });
2154  b_offset += rightDimSpan;
2155  } // rightVectorComponentOrdinal
2156  } // rightFamilyOrdinal
2157  a_offset += leftDimSpan;
2158  } // leftVectorComponentOrdinal
2159  } // leftFamilyOrdinal
2160 
2161  if (approximateFlops != NULL)
2162  {
2163  // TODO: check the accuracy of this
2164  *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2165  }
2166  }
2167  else // general case (not axis-aligned + affine tensor-product structure)
2168  {
2169  // prepare composed transformation matrices
2170  const Data<Scalar,DeviceType> & leftTransform = basisValuesLeft.transform();
2171  const Data<Scalar,DeviceType> & rightTransform = basisValuesRight.transform();
2172  const bool transposeLeft = true;
2173  const bool transposeRight = false;
2174 // auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat");
2175 // timer->start();
2176  // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2
2177  const bool matrixTransform = (leftTransform.rank() == 4) || (rightTransform.rank() == 4);
2178  Data<Scalar,DeviceType> composedTransform;
2179  // invalid/empty transforms are used when the identity is intended.
2180  if (leftTransform.isValid() && rightTransform.isValid())
2181  {
2182  if (matrixTransform)
2183  {
2184  composedTransform = leftTransform.allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
2185  composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2186 
2187  // if the composedTransform matrices are full, the following is a good estimate. If they have some diagonal portions, this will overcount.
2188  if (approximateFlops != NULL)
2189  {
2190  *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
2191  }
2192  }
2193  else
2194  {
2195  composedTransform = leftTransform.allocateInPlaceCombinationResult(leftTransform, rightTransform);
2196  composedTransform.storeInPlaceProduct(leftTransform, rightTransform);
2197 
2198  // re-cast composedTranform as a rank 4 (C,P,D,D) object -- a 1 x 1 matrix at each (C,P).
2199  const int newRank = 4;
2200  auto extents = composedTransform.getExtents();
2201  auto variationTypes = composedTransform.getVariationTypes();
2202  composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2203  if (approximateFlops != NULL)
2204  {
2205  *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2206  }
2207  }
2208  }
2209  else if (leftTransform.isValid())
2210  {
2211  // rightTransform is the identity
2212  composedTransform = leftTransform;
2213  }
2214  else if (rightTransform.isValid())
2215  {
2216  // leftTransform is the identity
2217  composedTransform = rightTransform;
2218  }
2219  else
2220  {
2221  // both left and right transforms are identity
2222  Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.numCells(),basisValuesLeft.numPoints(),spaceDim,spaceDim};
2223  Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
2224 
2225  Kokkos::View<Scalar*,DeviceType> identityUnderlyingView("Intrepid2::FST::integrate() - identity view",spaceDim);
2226  Kokkos::deep_copy(identityUnderlyingView, 1.0);
2227  composedTransform = Data<Scalar,DeviceType>(identityUnderlyingView,extents,variationTypes);
2228  }
2229 
2230 // timer->stop();
2231 // std::cout << "Completed mat-mat in " << timer->totalElapsedTime() << " seconds.\n";
2232 // timer->reset();
2233 
2234  const int leftFamilyCount = basisValuesLeft. basisValues().numFamilies();
2235  const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
2236  const int leftComponentCount = isVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2237  const int rightComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2238 
2239  int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2240  for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2241  {
2242  // "a" keeps track of the spatial dimension over which we are integrating in the left vector
2243  // components are allowed to span several dimensions; we keep track of the offset for the component in a_offset
2244  int a_offset = 0;
2245  bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2246  for (int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2247  {
2248  TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal)
2249  : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2250  if (!leftComponent.isValid())
2251  {
2252  // represents zero
2253  a_offset += basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
2254  continue;
2255  }
2256 
2257  int rightFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2258  for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2259  {
2260  // "b" keeps track of the spatial dimension over which we are integrating in the right vector
2261  // components are allowed to span several dimensions; we keep track of the offset for the component in b_offset
2262  bool haveLaunchedContributionToCurrentFamilyRight = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2263  int b_offset = 0;
2264  for (int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2265  {
2266  TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal)
2267  : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2268  if (!rightComponent.isValid())
2269  {
2270  // represents zero
2271  b_offset += basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal);
2272  continue;
2273  }
2274 
2275  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(leftComponent.numTensorComponents() != rightComponent.numTensorComponents(), std::invalid_argument, "left TensorData and right TensorData have different number of tensor components. This is not supported.");
2276 
2277  const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2278  Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2279 
2280  // TODO: expose the options for forceNonSpecialized and usePointCacheForRank3Tensor through an IntegrationAlgorithm enumeration.
2281  // AUTOMATIC: let Intrepid2 choose an algorithm based on the inputs (and, perhaps, the execution space)
2282  // STANDARD: don't use sum factorization or axis alignment -- just do the simple contraction, a (p+1)^9 algorithm in 3D
2283  // SUM_FACTORIZATION // (p+1)^7 algorithm in 3D
2284  // SUM_FACTORIZATION_AXIS_ALIGNED // (p+1)^6 algorithm in 3D
2285  // SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION // mainly intended for testing purposes (specialized implementations perform better when they are provided)
2286  // SUM_FACTORIZATION_WITH_POINT_CACHE // novel (p+1)^7 (in 3D) algorithm in Intrepid2; unclear as yet when and whether this may be a superior approach
2287  bool forceNonSpecialized = false; // We might expose this in the integrate() arguments in the future. We *should* default to false in the future.
2288  bool usePointCacheForRank3Tensor = true; // EXPERIMENTAL; has better performance under CUDA, but slightly worse performance than standard on serial CPU
2289 
2290  // in one branch or another below, we will launch a parallel kernel that contributes to (leftFamily, rightFamily) field ordinal pairs.
2291  // if we have already launched something that contributes to that part of the integral container, we need a Kokkos fence() to ensure that these do not interfere with each other.
2292  if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2293  {
2294  ExecutionSpace().fence();
2295  }
2296  haveLaunchedContributionToCurrentFamilyLeft = true;
2297  haveLaunchedContributionToCurrentFamilyRight = true;
2298 
2299  if (integralViewRank == 2)
2300  {
2301  if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2302  {
2303  auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2304 
2305  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2306  const int teamSize = functor.teamSize(recommendedTeamSize);
2307 
2308  policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2309 
2310  Kokkos::parallel_for("F_IntegratePointValueCache rank 2", policy, functor);
2311 
2312  if (approximateFlops != NULL)
2313  {
2314  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2315  }
2316  }
2317  else
2318  {
2319  auto functor = Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2320 
2321  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2322  const int teamSize = functor.teamSize(recommendedTeamSize);
2323 
2324  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2325 
2326  Kokkos::parallel_for("F_Integrate rank 2", policy, functor);
2327 
2328  if (approximateFlops != NULL)
2329  {
2330  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2331  }
2332  }
2333  }
2334  else if (integralViewRank == 3)
2335  {
2336  if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2337  {
2338  auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2339 
2340  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2341  const int teamSize = functor.teamSize(recommendedTeamSize);
2342 
2343  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2344 
2345  Kokkos::parallel_for("F_IntegratePointValueCache rank 3", policy, functor);
2346 
2347  if (approximateFlops != NULL)
2348  {
2349  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2350  }
2351  }
2352  else
2353  {
2354  auto functor = Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2355 
2356  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2357  const int teamSize = functor.teamSize(recommendedTeamSize);
2358 
2359  policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2360 
2361  Kokkos::parallel_for("F_Integrate rank 3", policy, functor);
2362 
2363  if (approximateFlops != NULL)
2364  {
2365  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2366  }
2367  }
2368  }
2369  b_offset += isVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2370  }
2371  rightFieldOrdinalOffset += isVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2372  }
2373  a_offset += isVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2374  }
2375  leftFieldOrdinalOffset += isVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2376  }
2377  }
2378 // if (approximateFlops != NULL)
2379 // {
2380 // std::cout << "Approximate flop count (new): " << *approximateFlops << std::endl;
2381 // }
2382  ExecutionSpace().fence(); // make sure we've finished writing to integrals container before we return
2383 }
2384 
2385 } // end namespace Intrepid2
2386 
2387 #endif
arbitrary variation
KOKKOS_INLINE_FUNCTION int numPoints() const
Returns the logical extent in the points dimension, which is the 2 dimension in this container...
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
Hand-coded 3-component version.
KOKKOS_INLINE_FUNCTION DimensionInfo getDimensionInfo(const int &dim) const
Returns an object fully specifying the indicated dimension. This is used in determining appropriate s...
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums...
static Data< DataScalar, DeviceType > allocateInPlaceCombinationResult(const Data< DataScalar, DeviceType > &A, const Data< DataScalar, DeviceType > &B)
KOKKOS_INLINE_FUNCTION int numFields() const
Returns the logical extent in the fields dimension, which is the 1 dimension in this container...
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
runSpecialized implementations are hand-coded variants of run() for a particular number of components...
KOKKOS_INLINE_FUNCTION Kokkos::Array< int, 7 > getExtents() const
Returns an array containing the logical extents in each dimension.
#define INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(test, x, msg)
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewRank() const
returns the rank of the View that stores the unique data
Implementation of a general sum factorization algorithm, using a novel approach developed by Roberts...
size_t team_shmem_size(int team_size) const
Provide the shared memory capacity.
KOKKOS_INLINE_FUNCTION int familyFieldOrdinalOffset(const int &familyOrdinal) const
Returns the field ordinal offset for the specified family.
KOKKOS_INLINE_FUNCTION enable_if_t< rank==1, const Kokkos::View< typename RankExpander< DataScalar, rank >::value_type, DeviceType > & > getUnderlyingView() const
Returns the underlying view. Throws an exception if the underlying view is not rank 1...
void clear() const
Copies 0.0 to the underlying View.
KOKKOS_INLINE_FUNCTION constexpr bool isValid() const
returns true for containers that have data; false for those that don&#39;t (namely, those that have been ...
const Data< Scalar, DeviceType > & transform() const
Returns the transform matrix. An invalid/empty container indicates the identity transform.
one of two dimensions in a matrix; bottom-right part of matrix is diagonal
KOKKOS_INLINE_FUNCTION constexpr bool isValid() const
Returns true for containers that have data; false for those that don&#39;t (e.g., those that have been co...
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
static Data< Scalar, DeviceType > allocateIntegralData(const TransformedBasisValues< Scalar, DeviceType > vectorDataLeft, const TensorData< Scalar, DeviceType > cellMeasures, const TransformedBasisValues< Scalar, DeviceType > vectorDataRight)
Allocates storage for the contraction of vectorDataLeft and vectorDataRight containers on point and s...
KOKKOS_INLINE_FUNCTION bool underlyingMatchesLogical() const
Returns true if the underlying container has exactly the same rank and extents as the logical contain...
Defines TensorArgumentIterator, which allows systematic enumeration of a TensorData object...
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
static void integrate(Data< Scalar, DeviceType > integrals, const TransformedBasisValues< Scalar, DeviceType > &vectorDataLeft, const TensorData< Scalar, DeviceType > &cellMeasures, const TransformedBasisValues< Scalar, DeviceType > &vectorDataRight, const bool sumInto=false, double *approximateFlops=NULL)
Contracts vectorDataLeft and vectorDataRight containers on point and space dimensions, weighting each point according to cellMeasures, and stores the result in outputValues. The integrals container can be constructed using allocateIntegralData().
Struct expressing all variation information about a Data object in a single dimension, including its logical extent and storage extent.
KOKKOS_INLINE_FUNCTION unsigned rank() const
Returns the logical rank of the Data container.
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< std::is_pod< T >::value, unsigned >::type dimension_scalar(const Kokkos::DynRankView< T, P... >)
specialization of functions for pod types, returning the scalar dimension (1 for pod types) of a view...
size_t team_shmem_size(int numThreads) const
Provide the shared memory capacity.
KOKKOS_INLINE_FUNCTION int numCells() const
Returns the logical extent in the cell dimension, which is the 0 dimension in this container...
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
KOKKOS_INLINE_FUNCTION void setEnumerationIndex(const ordinal_type &enumerationIndex)
Sets the enumeration index; this refers to a 1D enumeration of the possible in-bound arguments...
KOKKOS_INLINE_FUNCTION int extent_int(const int &r) const
Returns the logical extent in the specified dimension.
KOKKOS_INLINE_FUNCTION const ordinal_type & argument(const ordinal_type &r) const
KOKKOS_INLINE_FUNCTION int numFamilies() const
For valid vectorData, returns the number of families in vectorData; otherwise, returns number of Tens...
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ****, DeviceType > & getUnderlyingView4() const
returns the View that stores the unique data. For rank-4 underlying containers.
void storeMatMat(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
KOKKOS_INLINE_FUNCTION std::enable_if< std::is_integral< iType >::value, ordinal_type >::type extent_int(const iType &d) const
Returns the logical extent in the requested dimension.
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
KOKKOS_INLINE_FUNCTION const Data< Scalar, DeviceType > & getTensorComponent(const ordinal_type &r) const
Returns the requested tensor component.
KOKKOS_INLINE_FUNCTION DimensionInfo combinedDimensionInfo(const DimensionInfo &myData, const DimensionInfo &otherData)
Returns DimensionInfo for a Data container that combines (through multiplication, say...
KOKKOS_INLINE_FUNCTION int spaceDim() const
Returns the logical extent in the space dimension, which is the 3 dimension in this container...
Defines the Intrepid2::FunctorIterator class, as well as the Intrepid2::functor_returns_ref SFINAE he...
void storeInPlaceProduct(const Data< DataScalar, DeviceType > &A, const Data< DataScalar, DeviceType > &B)
stores the in-place (entrywise) product, A .* B, into this container.
TensorDataType & tensorData()
TensorData accessor for single-family scalar data.
KOKKOS_INLINE_FUNCTION ordinal_type numTensorComponents() const
Return the number of tensorial components.
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
Data shallowCopy(const int rank, const Kokkos::Array< int, 7 > &extents, const Kokkos::Array< DataVariationType, 7 > &variationTypes)
Creates a new Data object with the same underlying view, but with the specified logical rank...
Structure-preserving representation of transformed vector data; reference space values and transforma...
KOKKOS_INLINE_FUNCTION int getDataExtent(const ordinal_type &d) const
returns the true extent of the data corresponding to the logical dimension provided; if the data does...
KOKKOS_INLINE_FUNCTION bool axisAligned() const
Returns true if the transformation matrix is diagonal.
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar **, DeviceType > & getUnderlyingView2() const
returns the View that stores the unique data. For rank-2 underlying containers.
View-like interface to tensor data; tensor components are stored separately and multiplied together a...
Allows systematic enumeration of all entries in a TensorData object, tracking indices for each tensor...
DataVariationType
Enumeration to indicate how data varies in a particular dimension of an Intrepid2::Data object...
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums...
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ***, DeviceType > & getUnderlyingView3() const
returns the View that stores the unique data. For rank-3 underlying containers.
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
Implementation of a general sum factorization algorithm, abstracted from the algorithm described by M...
KOKKOS_INLINE_FUNCTION Scalar transformWeight(const int &cellOrdinal, const int &pointOrdinal) const
Returns the specified entry in the (scalar) transform. (Only valid for scalar-valued BasisValues; see...
static Data< DataScalar, DeviceType > allocateMatMatResult(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
const VectorData< Scalar, DeviceType > & vectorData() const
Returns the reference-space vector data.
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewSize() const
returns the number of entries in the View that stores the unique data
static constexpr ordinal_type MaxTensorComponents
Maximum number of tensor/Cartesian products that can be taken: this allows hypercube basis in 7D to b...
KOKKOS_INLINE_FUNCTION const Kokkos::Array< DataVariationType, 7 > & getVariationTypes() const
Returns an array with the variation types in each logical dimension.