Halide 16.0.0
LoopNest.h

/** This file defines the LoopNest, which is our
 * representation of a Halide schedule, and contains methods to
 * generate candidates for scheduling as well as extract a
 * featurization that can be used to cost each candidate. */

#ifndef LOOP_NEST_H
#define LOOP_NEST_H

#include "ASLog.h"
#include "CostModel.h"
#include "FunctionDAG.h"
#include "GPULoopInfo.h"
#include "GPUMemInfo.h"
#include "PerfectHashMap.h"
#include "SearchSpaceOptions.h"
#include "Statistics.h"
#include "ThreadInfo.h"
#include "Tiling.h"
#include <set>
#include <vector>

namespace Halide {
namespace Internal {
namespace Autoscheduler {

template<typename T>
using NodeMap = PerfectHashMap<FunctionDAG::Node, T>;

template<typename T>
using StageMap = PerfectHashMap<FunctionDAG::Node::Stage, T>;

enum class GPU_parallelism { Block,
                             Thread,
                             Serial,
                             Simd,
                             Parallelized,
                             None };

std::string stringify(GPU_parallelism label);

// inlined => func is inlined so has no memory store location
enum class GPUMemoryType { Global,
                           Shared,
                           Local,
                           Registers,
                           Inlined };

bool may_subtile(const Anderson2021Params &params);

int64_t get_shared_memory_limit(const Anderson2021Params &params);

int64_t get_active_block_hardware_limit(const Anderson2021Params &params);

int64_t get_active_warp_hardware_limit(const Anderson2021Params &params);

constexpr int64_t get_register_mem_alloc_limit() {
    return 128;
}

int get_unroll_limit(const Target &target);

bool in_range_zero_one(double x);

bool are_valid_thread_extents(const vector<int64_t> &counts);

double get_idle_lane_wastage_limit_env_var();
double get_idle_lane_wastage_limit();

bool all(const vector<int> &v);
bool accessed_at_constant_indices(const std::vector<int> &unrolled, const FunctionDAG::Edge *e);

// We're going to do a tree search over possible schedules to find an
// optimal one. A tree search requires a state, and a function that
// gives you children of the state (with costs). The following struct
// represents the state, which is a partial schedule.
//
// A partial schedule is a tree. Each node is some portion of the for
// loop nest of some Func. If there are no children, it's the
// innermost set of loops. If there are children, it's a loop over
// tiles of that Func.
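//
// Illustrative sketch (not part of this header): the search that uses
// this struct has the shape of a generic beam search. SearchState and
// children() below are hypothetical stand-ins for LoopNest, the cost
// model, and candidate generators such as compute_in_tiles().
//
//     #include <algorithm>
//     #include <limits>
//     #include <utility>
//     #include <vector>
//
//     struct SearchState {
//         double cost = 0;        // estimated cost of this partial schedule
//         bool complete = false;  // true once no further children exist
//     };
//
//     template<typename ChildFn>
//     SearchState beam_search(SearchState root, ChildFn children, size_t beam_size) {
//         std::vector<SearchState> beam = {std::move(root)};
//         SearchState best;
//         best.cost = std::numeric_limits<double>::infinity();
//         while (!beam.empty()) {
//             std::vector<SearchState> next;
//             for (const SearchState &s : beam) {
//                 for (const SearchState &c : children(s)) {
//                     if (c.complete) {
//                         if (c.cost < best.cost) best = c;  // candidate full schedule
//                     } else {
//                         next.push_back(c);                 // still a partial schedule
//                     }
//                 }
//             }
//             // Keep only the beam_size cheapest partial schedules.
//             std::sort(next.begin(), next.end(),
//                       [](const SearchState &a, const SearchState &b) { return a.cost < b.cost; });
//             if (next.size() > beam_size) next.resize(beam_size);
//             beam = std::move(next);
//         }
//         return best;
//     }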
struct LoopNest {
    mutable RefCount ref_count;

    // The extents of this loop. Put another way, the number of tiles,
    // not the size of each tile.
    vector<int64_t> size;

    // The nodes inside the loop body
    vector<IntrusivePtr<const LoopNest>> children;

    // Funcs inlined into this inner loop, and the number of times
    // each is called. Only valid if children is empty.
    NodeMap<int64_t> inlined;

    // Funcs stored inside this loop
    std::set<const FunctionDAG::Node *> store_at;

    // The total bounds required of any given Func over all iterations
    // of this loop. In the paper, this is represented using the
    // little boxes to the left of the loop nest tree figures.
    mutable NodeMap<Bound> bounds;

    // The Func this loop nest belongs to
    const FunctionDAG::Node *node = nullptr;

    // The stage of the Func
    const FunctionDAG::Node::Stage *stage = nullptr;

    // Is this the innermost loop of this func (the SIMD loop)?
    bool innermost = false;

    // Are we permitted to tile this loop?
    bool tileable = false;

    // Is this the parallel outer loop?
    bool parallel = false;

    // What dimension is this Func vectorized over, in terms of the pure args of the Func?
    int vector_dim = -1;

    // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them.
    int vectorized_loop_index = -1;

    // Apply gpu threads to this loop nest
    GPU_parallelism gpu_label = GPU_parallelism::None;

    struct FeatureIntermediates {
        double inlined_calls;
        double num_vectors;
        double num_scalars;
        double vector_size;
        double innermost_pure_loop_extent;
        double outer_parallelism;
        double num_warps_per_block;
        double num_blocks;
        double block_occupancy;
        double warp_lane_utilization;
    };

    mutable std::map<uint64_t, StageMap<StageMap<FeatureIntermediates>>> feature_intermediates;
    mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features;

    bool is_gpu_serial(const Target &target) const {
        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Serial;
    }

    bool is_gpu_thread(const Target &target) const {
        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Thread;
    }

    bool is_gpu_block(const Target &target) const {
        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Block;
    }

    bool is_scalar() const {
        return size.empty();
    }

    // Given a newly inserted node f into this LoopNest, get the union of thread
    // counts in each dimension across all siblings of f.
    vector<int64_t> get_union_thread_counts(const FunctionDAG::Node *f) const;

    // Given a newly inserted node f into this LoopNest, get the size of
    // all of f's stages and their pure_dim indices.
    void get_stage_sizes(const FunctionDAG::Node *f,
                         vector<vector<int64_t>> &stage_sizes,
                         vector<vector<int>> &pure_dims,
                         vector<int> &vectorized_indices) const;

    // Given the loop nest of a stage to parallelize at root, figure out if using odd tile
    // sizes for the vectorized dimension will allow the resulting thread tiles to be
    // multiples of 32; if so, include these in the serial loop sizes.
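    // For example (illustrative numbers): with a vectorized loop extent of 96,
    // an odd serial tile size of 3 leaves 96 / 3 = 32 threads in that dimension,
    // exactly one warp width on NVIDIA GPUs, whereas an even tile size of 4
    // would leave only 24.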
    void generate_vec_dim_serial_tilings(vector<int> &serial_sizes) const;

    // Get the loop nests of a newly inserted node, f, that is marked GPU threads. Tiles
    // the newly inserted loop nests of f into a threads loop outside a serial loop.
    // v is the vectorized dimension of f. Adds loop nests created from each tiling
    // option to result.
    bool add_gpu_thread_tilings(const FunctionDAG::Node *f,
                                const Anderson2021Params &params,
                                const Target &target,
                                int v,
                                vector<IntrusivePtr<const LoopNest>> &result,
                                const vector<int64_t> &max_size);

    void copy_from(const LoopNest &n);
    void copy_from_including_features(const LoopNest &n);

    static void hash_combine(uint64_t &h, uint64_t next) {
        // From boost; 0x9e3779b9 is the 32-bit golden-ratio constant.
        h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
    }
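    // For example (illustrative), a caller could fold this loop's extents
    // into a single hash like so:
    //
    //     uint64_t h = 0;
    //     hash_combine(h, size.size());
    //     for (int64_t s : size) {
    //         hash_combine(h, (uint64_t)s);
    //     }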

    // Hash the loop structure and sizes up to a fixed depth. This is
    // used as the hash function for the coarse-to-fine beam search in
    // the paper.
    void structural_hash(uint64_t &h, int depth) const;

    // How many funcs are scheduled inside this loop level. Used in
    // the structural hash.
    size_t funcs_realized_or_inlined() const {
        size_t count = inlined.size() + store_at.size();
        for (const auto &c : children) {
            count += c->funcs_realized_or_inlined();
        }
        return count;
    }

    // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage.
    struct Sites {
        const LoopNest *compute = nullptr;    // Its containing compute_at site
        const LoopNest *store = nullptr;      // Its containing store_at site
        const LoopNest *produce = nullptr;    // Its own outermost node
        const LoopNest *innermost = nullptr;  // Its innermost node - usually a SIMD loop
        const LoopNest *task = nullptr;       // The parallel for loop it belongs to
        const LoopNest *thread = nullptr;     // Its containing gpu_thread loop
        GPUMemoryType gpu_store_memory_type;  // global, local, shared?
        int64_t allocation_size = 0;          // Allocation size in bytes
        bool is_constant_allocation = false;  // Does the allocation have constant size?
        int64_t num_realizations = 0;         // Number of times this stage is realized. Only valid for unscheduled producers
        bool inlined = false;                 // Is the Func inlined?
        std::vector<const LoopNest *> inlined_innermosts;  // The innermost loop nests at which the Func is inlined
        uint64_t hash_of_producers_stored_at_root;

        bool is_stored_in_global_mem() const {
            return gpu_store_memory_type == GPUMemoryType::Global;
        }
        bool is_stored_in_shared_mem() const {
            return gpu_store_memory_type == GPUMemoryType::Shared;
        }
        bool is_stored_in_local_mem() const {
            return gpu_store_memory_type == GPUMemoryType::Local;
        }
        bool is_stored_in_registers() const {
            return gpu_store_memory_type == GPUMemoryType::Registers;
        }
    };

    GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined = false) const;

    std::vector<int> unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const;

    void get_allocs_that_can_be_promoted_to_registers(const Target &target,
                                                      StageMap<Sites> &sites,
                                                      NodeMap<bool> &can_be_promoted_to_registers,
                                                      const LoopNest *grandparent,
                                                      const LoopNest *parent) const;

    bool promote_allocs_to_registers(const Target &target, StageMap<Sites> &sites) const;

    // Compute all the sites of interest for each pipeline stage
    void get_sites(const Target &target,
                   StageMap<Sites> &sites,
                   StageMap<int64_t> &shared_mem_alloc_sizes,
                   const LoopNest *task = nullptr,
                   const LoopNest *parent = nullptr,
                   const LoopNest *current_thread_loop = nullptr) const;

    // A helper for the working_set_at_task feature. Most features are
    // computed in the recursive pass 'compute_features' below, but
    // this one must be done in a second separate recursive pass.
    void set_working_set_at_task_feature(int64_t working_set,
                                         StageMap<ScheduleFeatures> *features) const {
        for (const auto &c : children) {
            c->set_working_set_at_task_feature(working_set, features);
            features->get(c->stage).working_set_at_task = working_set;
        }
    }

    bool exceeds_serial_extents_limit(const Target &target, const LoopNest *parent, bool in_threads_loop) const;

    bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const;

    bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const;

    const LoopNest *find_pure_stage_loop_nest(const FunctionDAG::Node *node) const;

    int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const;

    int get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const;

    // Get the stride over "node's" storage for a unit increment in the vectorized loop's
    // index
    double storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const;

    Strides compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo &thread_info, bool verbose = false) const;

    bool all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const;

    int get_actual_vector_dim(const Bound &store_bounds) const;

    void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector<int64_t> &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose = false) const;

    bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const;

    bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const;

    int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const;

    int vectorized_access_size(size_t loop_index, bool verbose = false) const;

    template<typename T>
    void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose = false) const;

    std::pair<double, double> compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const;

    template<typename T>
    MemInfoType<T> compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const;

    template<typename T>
    void compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType<T> &mem_info, double serial_loop_extents, bool verbose = false) const;

    double compute_local_mem_stride(double stride, double bytes) const;

    // Assumes block, serial, thread or block, thread nesting
    const LoopNest *get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const;

    std::pair<int64_t, int64_t> get_block_and_serial_extents(const LoopNest *block) const;

    bool all_paths_to_leaves_have_thread_loop() const;

    bool has_thread_loop_descendant() const;

    void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const;

    // Assume that when a block is active, all its warps are active
    void compute_warp_and_block_occupancy(const Anderson2021Params &params, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const;

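    // For example (illustrative numbers): if each block allocates 12 KB of
    // shared memory and the hardware provides 48 KB per multiprocessor, at
    // most 48 / 12 = 4 blocks can be resident at once, and the occupancy
    // estimate is capped accordingly.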
    void compute_shared_mem_occupancy(const Anderson2021Params &params, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const;

    std::pair<const LoopNest *, const LoopNest *> find_innermost_and_parent() const;

    int64_t points_accessed_per_thread(const Anderson2021Params &params, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector<const FunctionDAG::Edge *> &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const;

    int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const;

    void memoize_points_computed_minimum(StageMap<ScheduleFeatures> &memoized_features, const StageMap<ScheduleFeatures> *features) const;

    vector<pair<int, int>> collect_producers(const StageMap<Sites> &sites) const;

    uint64_t compute_hash_of_producers_stored_at_root(const StageMap<Sites> &sites) const;

    void collect_stages(std::set<const FunctionDAG::Node::Stage *> &stages) const;

    void memoize_features(StageMap<ScheduleFeatures> &memoized_features, const StageMap<ScheduleFeatures> *features) const;

    void compute_working_set_from_features(int64_t *working_set,
                                           const StageMap<ScheduleFeatures> *features) const;

    void recompute_inlined_features(const StageMap<Sites> &sites, StageMap<ScheduleFeatures> *features) const;

    std::pair<int64_t, bool> compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const;

    // Do a recursive walk over the loop nest computing features to feed the cost model.
    void compute_features(const FunctionDAG &dag,
                          const Anderson2021Params &params,
                          const Target &target,
                          const StageMap<Sites> &sites,
                          int64_t instances,
                          int64_t parallelism,
                          const LoopNest *parent,
                          const LoopNest *grandparent,
                          const LoopNest &root,
                          GPULoopInfo gpu_loop_info,
                          bool use_memoized_features,
                          const StageMap<int64_t> &total_shared_mem_alloc_sizes,
                          int64_t *working_set,
                          int64_t *working_set_local_constant,
                          int64_t *working_set_local_dynamic,
                          StageMap<ScheduleFeatures> *features,
                          Statistics &stats,
                          bool verbose = false) const;

    bool is_root() const {
        // The root is the sole node without a Func associated with
        // it.
        return node == nullptr;
    }

    // Set the region required of a Func at this site.
    const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const {
        return bounds.emplace(f, b);
    }

    // Get the region required of a Func at this site, from which we
    // know what region would be computed if it were scheduled here,
    // and what its loop nest would be.
    const Bound &get_bounds(const FunctionDAG::Node *f) const;

    // Get the region required of a Func at this site (but only to satisfy the
    // consumers along the given edge chain), from which we know what region
    // would be computed if it were scheduled here and what its loop nest
    // would be.
    Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector<const FunctionDAG::Edge *> &edge_chain) const;

    void dump() const;

    std::string to_string() const;

    // Recursively print a loop nest representation to stderr
    template<typename T>
    void dump(T &stream, string prefix, const LoopNest *parent) const;

    // Does this loop nest access the given Func
    bool calls(const FunctionDAG::Node *f) const;

    // What is the maximum number of inlined calls to a Func that
    // occur within this loop. Used to prune states that would
    // generate too much code.
    int64_t max_inlined_calls() const;

    // Does this loop nest access an input buffer? Used to select
    // tail strategies when splitting loops. We don't want to read
    // out of bounds on inputs, even if we don't intend to use the
    // values read. It could create annoying assertion failures for
    // the user. It's OK to read out of range of the values computed
    // on internal Funcs though. Allocation bounds inference just pads
    // out the bounds so that it won't fault.
    bool accesses_input_buffer() const;

    // Does this loop nest contain a computation of the given Func.
    bool computes(const FunctionDAG::Node *f) const;

    // Above here most methods query the loop nest. Below we have
    // methods that mutate the loop nest.

    // Inline a Func into all consumers within this loop.
    void inline_func(const FunctionDAG::Node *f);

    // Compute a Func at this site.
    bool compute_here(const FunctionDAG::Node *f,
                      bool tileable,
                      int v,
                      bool in_threads_loop,
                      const Anderson2021Params &params,
                      const Target &target);

    // Parallelize this loop according to the given tiling.
    IntrusivePtr<const LoopNest> parallelize_in_tiles(const vector<int64_t> &tiling,
                                                      const LoopNest *parent,
                                                      const Anderson2021Params &params,
                                                      const Target &target,
                                                      bool inner_tiling,
                                                      bool adjust_tiling,
                                                      bool move_all_rvars_inward = true,
                                                      const vector<int> &rvars_to_move_inward = {}) const;

    int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false, bool in_threads_loop = false) const;
    int64_t get_total_constant_local_mem_alloc_size() const;

    // All store_ats further in than the block level must be fixed-size
    // allocations. This method checks if f will require a dynamic
    // allocation.
    bool requires_dynamic_allocation(const FunctionDAG::Node *f, const Target &target, bool in_threads_loop) const;

    // Return all possible ways to compute f in tiles somewhere within
    // this loop nest.
    // in_threads_loop tracks whether or not the function is going to be placed inside a
    // loop marked gpu_threads, in which case f's loops cannot be gpu_threads.
    vector<IntrusivePtr<const LoopNest>> compute_in_tiles(const FunctionDAG::Node *f,
                                                          const LoopNest *parent,
                                                          const Anderson2021Params &params,
                                                          const Target &target,
                                                          const SearchSpaceOptions &search_space_options,
                                                          int v,
                                                          bool in_realization,
                                                          bool in_threads_loop,
                                                          bool is_pre_pass,
                                                          vector<int64_t> union_counts = vector<int64_t>()) const;

    // Below here we have methods that apply a schedule to a Halide pipeline.

    // A model of the state of the loop nest of a Func while applying
    // Halide's scheduling directives.

    // Note that StageScheduleState is movable-but-not-copyable thanks to its ostringstream member.
    struct StageScheduleState {
        // How much parallelism do we need to exploit with this Func?
        double num_cores = 0;

        // Which storage dimension is vectorized? We need to reorder it innermost
        int vector_dim = -1;
        int vectorized_loop_index = -1;

        // The various Vars and RVars used for scheduling a Func.
        struct FuncVar {
            // The top-level var or rvar this was split off from
            VarOrRVar orig;

            // This var.
            VarOrRVar var;

            // Source code to access this Var/RVar. Used for printing
            // valid Halide source for this schedule.
            string accessor;

            // Our estimate of the extent of this var. This is exact
            // when the constant_extent flag is true.
            int64_t extent = 0;

            // Which index in the symbolic loop nest does this var
            // belong to.
            size_t index = 0;

            // Some flags.
            bool innermost_pure_dim = false,
                 outermost = false,
                 parallel = false,
                 exists = false,
                 pure = false,
                 constant_extent = false;

            bool vectorized = false;
            bool gpu_threads = false;

            FuncVar()
                : orig(Var()), var(Var()) {
            }
        };
        bool parallel = false;
        bool vectorized = false;
        FuncVar vectorized_var;

        // In order from innermost to outermost. Each group of d is one tiling level.
        vector<FuncVar> vars;

        // In order from innermost to outermost. Each group of d is one tiling level.
        vector<FuncVar> ordered_vars;
        vector<int64_t> gpu_thread_extents;

        NodeMap<std::vector<std::pair<const LoopNest *, std::vector<const FunctionDAG::Edge *>>>> producers_to_be_staged;

        // From outermost in
        vector<StageScheduleState *> ancestors;

        std::ostringstream schedule_source;
    };
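
    // Rough illustration (hypothetical Func and Vars, not produced by this
    // header): a StageScheduleState describing an 8-wide split of a pure
    // var x, with the outer piece parallelized, corresponds to scheduling
    // source along the lines of:
    //
    //     Var x("x"), xo("xo"), xi("xi");
    //     Func f("f");
    //     f(x) = x;
    //     f.split(x, xo, xi, 8)  // one tiling level: xi (inner), xo (outer)
    //         .vectorize(xi)     // the FuncVar at vectorized_loop_index
    //         .parallel(xo);     // a FuncVar with parallel = true
    //
    // schedule_source accumulates the equivalent source text as the
    // directives are applied.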

    bool has_constant_region_computed(const FunctionDAG::Node *node) const;
    bool has_constant_region_required(const FunctionDAG::Node *node) const;
    bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const;
    int num_serial_loops(const FunctionDAG::Node::Stage *stage) const;
    int num_serial_loops() const;
    bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const;

    void update_producers_to_be_staged(StageScheduleState &state, const NodeMap<bool> &all_inlined) const;
    bool region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const;

    // Apply the schedule represented by this loop nest to a Halide pipeline.
    void apply(LoopLevel here,
               StageMap<std::unique_ptr<StageScheduleState>> &state_map,
               double num_cores,
               int depth,
               const LoopNest *parent,
               const LoopNest *compute_site,
               const Target &target,
               std::vector<StageScheduleState *> &ancestors,
               const NodeMap<bool> &all_inlined) const;

    double max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const;

    bool has_valid_thread_extents() const;

    void collect_nodes_that_should_be_inlined(const NodeMap<bool> &nodes_to_freeze, NodeMap<bool> &inlined_nodes) const;

    void collect_all_inlined(NodeMap<bool> &all_inlined) const;

    int64_t product_of_self_and_descendants(int loop_index) const;
    int64_t product_of_descendants(int loop_index) const;

    void get_stages_computed_in_each_compute_root_loop(StageMap<StageMap<bool>> &descendants, const LoopNest *compute_root_loop_nest = nullptr) const;
};

struct Filter {
    const LoopNest *loop_nest;
    bool logging = false;

    explicit Filter(const LoopNest *loop_nest)
        : loop_nest{loop_nest}, logging{enable_filter_printing()} {
        if (logging) {
            std::cerr << "\nState filtered: \n";
            loop_nest->dump();
            std::cerr << "Reason: ";
        }
    }

    template<typename T>
    Filter &operator<<(T &&x) {
        if (logging) {
            std::cerr << std::forward<T>(x);
        }
        return *this;
    }

    static bool enable_filter_printing();
};
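
// Illustrative use while pruning candidate schedules:
//
//     if (!are_valid_thread_extents(counts)) {
//         Filter(loop_nest) << "Invalid thread extents\n";
//         return false;
//     }
//
// where `loop_nest` and `counts` stand for whatever candidate and extents
// the caller is validating.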

}  // namespace Autoscheduler
}  // namespace Internal
}  // namespace Halide

#endif  // LOOP_NEST_H