Halide 16.0.0
Halide compiler and libraries
 
Loading...
Searching...
No Matches
State.h
Go to the documentation of this file.
1#ifndef STATE_H
2#define STATE_H
3
4#include "ASLog.h"
5#include "CostModel.h"
6#include "DefaultCostModel.h"
7#include "Featurization.h"
8#include "FunctionDAG.h"
9#include "LoopNest.h"
10#include "PerfectHashMap.h"
11#include <set>
12#include <unordered_set>
13#include <vector>
14
15namespace Halide {
16namespace Internal {
17namespace Autoscheduler {
18
19using std::map;
20using std::pair;
21using std::set;
22using std::string;
23using std::unordered_set;
24using std::vector;
25
27
29
31
// Byte budget for GPU local (per-thread) memory; presumably the threshold
// checked by State::exceeds_local_memory_limit() below — confirm in the .cpp.
constexpr int kLocalMemoryLimit = 524288;  // 512 KB
33
34// Stack memory limit = Total GPU Memory / (# of SMs * maximum threads per SM)
35// = 103232 bytes
36// Not all 103232 bytes will be free for allocations so reduce it by factor to
37// allow a buffer
39
41
43
45 void operator()(LoopNest *new_loop_nest) const {
46 }
47};
48
49template<typename PostCreateMutator>
50void deep_copy_loop_nest(LoopNest *new_loop_nest, const LoopNest *new_loop_nest_parent, const IntrusivePtr<const LoopNest> &existing_loop_nest, const PostCreateMutator &post_create_mutator) {
51 new_loop_nest->copy_from(*existing_loop_nest);
52
53 for (std::size_t i = 0, N = new_loop_nest->children.size(); i < N; ++i) {
54 LoopNest *new_child = new LoopNest;
55 new_loop_nest->children[i] = new_child;
56 deep_copy_loop_nest(new_child, new_loop_nest, existing_loop_nest->children[i], post_create_mutator);
57 }
58
59 post_create_mutator(new_loop_nest);
60}
61
62template<typename PostCreateMutator>
63LoopNest *deep_copy_loop_nest(const IntrusivePtr<const LoopNest> &loop_nest, const PostCreateMutator &post_create_mutator) {
64 LoopNest *new_loop_nest = new LoopNest;
65 deep_copy_loop_nest(new_loop_nest, nullptr, loop_nest, post_create_mutator);
66 return new_loop_nest;
67}
68
// One point in the autoscheduler's search space: a (partial) loop nest
// together with its estimated cost and the bookkeeping the beam search needs.
struct State {
    mutable RefCount ref_count;  // enables IntrusivePtr<State>

    // NOTE(review): the extraction this copy came from dropped member
    // declarations here — the cross-reference index lists
    // `IntrusivePtr<const LoopNest> root` and `IntrusivePtr<const State> parent`.
    // Restore from the upstream header; `create_feature_root` below reads `root`.

    // Estimated cost of this state's schedule (lower is better).
    double cost = 0;
    // Per-stage breakdown of `cost`.
    std::vector<double> cost_per_stage;

    // NOTE(review): another dropped declaration here (index lists
    // `NodeMap<bool> always_consider_inline`).

    // Number of scheduling decisions taken to reach this state (search depth).
    int num_decisions_made = 0;
    // Whether this state's cost has already been penalized — presumably for
    // re-visiting a structural hash; confirm against the search driver.
    bool penalized = false;
    // Generated human-readable schedule source for this state.
    string schedule_source;

    State() = default;
    // States are reference-counted and shared; copying/moving is disallowed.
    State(const State &) = delete;
    State(State &&) = delete;
    void operator=(const State &) = delete;
    void operator=(State &&) = delete;

    // Hash of the loop-nest structure down to `depth`; used to recognize
    // structurally identical states.
    uint64_t structural_hash(int depth) const;

    // Compute the parent and depth of every loop nest node
    void compute_loop_nest_parents(map<const LoopNest *, pair<const LoopNest *, int>> &p,
                                   const LoopNest *here, int depth) const;

    // Walk up the `parent` map to find the deepest node that is an ancestor
    // of both `a` and `b`.
    const LoopNest *deepest_common_ancestor(const map<const LoopNest *, pair<const LoopNest *, int>> &parent,
                                            const LoopNest *a, const LoopNest *b) const;

    // We use the post_create_mutator so that the loop nests can be modified
    // before they become IntrusivePtr<const LoopNest> as children and cannot be modified
    template<typename PostCreateMutator>
    LoopNest *create_feature_root(const PostCreateMutator &post_create_mutator) const {
        LoopNest *new_root = new LoopNest;
        deep_copy_loop_nest<PostCreateMutator>(new_root, nullptr, root, post_create_mutator);
        return new_root;
    }

    // NOTE(review): the extraction dropped the opening of a nested mutator
    // struct here — the three members below, up to the first closing `};`,
    // belong to that nested struct, not to State itself. Restore from the
    // upstream header.

    void operator()(LoopNest *new_loop_nest) const;

    // In phase 2, any compute_root loop marked 'none' will be split into
    // blocks, threads, and serial loops. To enable the cost model to make a
    // meaningful prediction on these pre-split loops, we assume a split into
    // blocks and threads with a single full warp (if possible)
    void split_compute_root_loops(LoopNest *loop_nest) const;

    // If a loop nest does not have thread loops, split the outermost serial
    // loops to create thread loops with extents 1
    void add_outer_thread_loops(LoopNest *loop_nest) const;
    };  // NOTE(review): closes the nested mutator struct whose header was lost.

    // NOTE(review): a dropped declaration here — index lists
    // `IntrusivePtr<const LoopNest> get_root_for_features(params, target)`.

    // Record in `site` where `loop` stores its data on the GPU; exact
    // semantics live in the .cpp (not visible here).
    void set_gpu_store_site(const map<const LoopNest *, pair<const LoopNest *, int>> &parent, const LoopNest *loop, LoopNest::Sites &site) const;

    // Compute schedule features for this state into `features`. Returns false
    // on failure — presumably when the state cannot be featurized; confirm.
    bool compute_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, StageMap<ScheduleFeatures> *features, Statistics &stats, bool verbose = false) const;

    // Serialize this state's featurization to `out`.
    void save_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, std::ostream &out) const;

    // Name suggests: whether `parent`'s subtree contains a store_at of any
    // node in `outermost_store_at` — verify against the .cpp.
    bool contains_store_at(const set<const FunctionDAG::Node *> &outermost_store_at, const IntrusivePtr<const LoopNest> &parent) const;

    // For GPU, only allow store_at root or inside the outermost loop nest. Any
    // store_ats further in will be hoisted and expanded, increasing the
    // amount of shared memory required.
    // NOTE(review): the declaration this comment documents was dropped by the
    // extraction (index lists `contains_store_at_further_in_than_outermost()`).

    bool exceeds_serial_extents_limit(const Target &target) const;

    int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const;

    // Validity checks against GPU memory budgets.
    bool exceeds_shared_memory_limit(const Anderson2021Params &params, const Target &target) const;

    bool exceeds_local_memory_limit(const Anderson2021Params &params, const Target &target) const;

    // Run the cost model over this state; returns false on failure.
    bool calculate_cost(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose = false);

    // Make a child copy of this state. The loop nest is const (we
    // make mutated copies of it, rather than mutating it), so we can
    // continue to point to the same one and so this is a cheap
    // operation.
    // NOTE(review): the `make_child()` declaration this comment documents was
    // dropped by the extraction.

    void dump() const;

    // NOTE(review): a declaration was dropped here (the original line
    // numbering jumps).

    void fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector<VarOrRVar> &parallel_vars, const vector<int64_t> &parallel_extents, const vector<int> &constant_extents) const;

    void mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector<VarOrRVar> &parallel_vars, const vector<int64_t> &parallel_extents) const;

    bool mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set<std::string> &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const;

    bool can_fuse_gpu(const vector<int64_t> &parallel_extents) const;

    // Apply the schedule represented by this state to a Halide
    // Pipeline. Also generate source code for the schedule for the
    // user to copy-paste to freeze this schedule as permanent artifact.
    void apply_schedule(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target);

    // NOTE(review): several declarations dropped here (index lists
    // `should_always_consider_inline`, `add_to_always_consider_inline_options`,
    // `update_always_consider_inline_options`).

    const LoopNest *deepest_valid_compute_location(const Anderson2021Params &params, const map<const LoopNest *, pair<const LoopNest *, int>> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap<int64_t> &total_shared_mem_alloc_sizes) const;
    int64_t total_loop_extents_of_ancestors(const map<const LoopNest *, pair<const LoopNest *, int>> &parent, const LoopNest *loop) const;
};
182
183// A priority queue of states, sorted according to increasing
184// cost. Never shrinks, to avoid reallocations.
185// Can't use std::priority_queue because it doesn't support unique_ptr.
187private:
188 struct CompareStates {
189 bool operator()(const IntrusivePtr<State> &a, const IntrusivePtr<State> &b) const {
190 return a->cost > b->cost;
191 }
192 };
193
194 std::vector<IntrusivePtr<State>> storage;
195 size_t sz = 0;
196
197public:
199 if (sz >= storage.size()) {
200 storage.resize(std::max(sz * 2, (size_t)64));
201 }
202 internal_assert(sz < storage.size()) << sz << " " << storage.size() << "\n";
203 storage[sz] = std::move(s);
204 sz++;
205 std::push_heap(storage.begin(), storage.begin() + sz, CompareStates{});
206 }
207
209 internal_assert(sz <= storage.size()) << sz << " " << storage.size() << "\n";
210 std::pop_heap(storage.begin(), storage.begin() + sz, CompareStates{});
211 sz--;
212 return std::move(storage[sz]);
213 }
214
216 return storage[0];
217 }
218
219 bool empty() const {
220 return sz == 0;
221 }
222
    // Number of live states; the heap occupies storage[0..sz), and storage
    // may be larger (the queue never shrinks).
    size_t size() const {
        return sz;
    }
226
227 void swap(StateQueue &other) {
228 storage.swap(other.storage);
229 std::swap(sz, other.sz);
230 }
231
233 return storage[idx];
234 }
235
236 void resort() {
237 std::make_heap(storage.begin(), storage.begin() + sz, CompareStates{});
238 }
239
240 void clear() {
241 for (size_t i = 0; i < sz; i++) {
242 storage[i] = IntrusivePtr<State>{};
243 }
244 sz = 0;
245 }
246};
247
248} // namespace Autoscheduler
249} // namespace Internal
250} // namespace Halide
251
252#endif // STATE_H
#define internal_assert(c)
Definition Errors.h:19
const IntrusivePtr< State > & top()
Definition State.h:215
void emplace(IntrusivePtr< State > &&s)
Definition State.h:198
IntrusivePtr< State > operator[](int idx) const
Definition State.h:232
A class representing a reference count to be used with IntrusivePtr.
A single definition of a Func.
Definition Func.h:70
PerfectHashMap< FunctionDAG::Node::Stage, T > StageMap
Definition LoopNest.h:24
constexpr int kLocalMemoryLimit
Definition State.h:32
PerfectHashMap< FunctionDAG::Node, T > NodeMap
Definition LoopNest.h:21
void deep_copy_loop_nest(LoopNest *new_loop_nest, const LoopNest *new_loop_nest_parent, const IntrusivePtr< const LoopNest > &existing_loop_nest, const PostCreateMutator &post_create_mutator)
Definition State.h:50
RefCount & ref_count(const T *t) noexcept
Because in this header we don't yet know how client classes store their RefCount (and we don't want t...
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
unsigned __INT64_TYPE__ uint64_t
signed __INT64_TYPE__ int64_t
std::vector< IntrusivePtr< const LoopNest > > children
Definition LoopNest.h:42
void operator()(LoopNest *new_loop_nest) const
Definition State.h:45
void save_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, std::ostream &out) const
bool exceeds_serial_extents_limit(const Target &target) const
int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const
void compute_loop_nest_parents(map< const LoopNest *, pair< const LoopNest *, int > > &p, const LoopNest *here, int depth) const
void update_always_consider_inline_options(const FunctionDAG::Node *node)
bool can_fuse_gpu(const vector< int64_t > &parallel_extents) const
bool should_always_consider_inline(const FunctionDAG::Node *node) const
bool contains_store_at_further_in_than_outermost() const
int64_t total_loop_extents_of_ancestors(const map< const LoopNest *, pair< const LoopNest *, int > > &parent, const LoopNest *loop) const
void operator=(const State &)=delete
void fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector< VarOrRVar > &parallel_vars, const vector< int64_t > &parallel_extents, const vector< int > &constant_extents) const
bool mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set< std::string > &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const
NodeMap< bool > always_consider_inline
Definition State.h:75
IntrusivePtr< const State > parent
Definition State.h:26
uint64_t structural_hash(int depth) const
IntrusivePtr< const LoopNest > root
Definition State.h:24
bool exceeds_shared_memory_limit(const Anderson2021Params &params, const Target &target) const
const LoopNest * deepest_valid_compute_location(const Anderson2021Params &params, const map< const LoopNest *, pair< const LoopNest *, int > > &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap< int64_t > &total_shared_mem_alloc_sizes) const
bool compute_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, StageMap< ScheduleFeatures > *features, Statistics &stats, bool verbose=false) const
bool contains_store_at(const set< const FunctionDAG::Node * > &outermost_store_at, const IntrusivePtr< const LoopNest > &parent) const
void mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector< VarOrRVar > &parallel_vars, const vector< int64_t > &parallel_extents) const
IntrusivePtr< const LoopNest > get_root_for_features(const Anderson2021Params &params, const Target &target) const
bool calculate_cost(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose=false)
LoopNest * create_feature_root(const PostCreateMutator &post_create_mutator) const
Definition State.h:98
void set_gpu_store_site(const map< const LoopNest *, pair< const LoopNest *, int > > &parent, const LoopNest *loop, LoopNest::Sites &site) const
IntrusivePtr< State > make_child() const
bool exceeds_local_memory_limit(const Anderson2021Params &params, const Target &target) const
const LoopNest * deepest_common_ancestor(const map< const LoopNest *, pair< const LoopNest *, int > > &parent, const LoopNest *a, const LoopNest *b) const
void add_to_always_consider_inline_options(const FunctionDAG::Node *node)
void apply_schedule(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target)
std::vector< double > cost_per_stage
Definition State.h:74
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
A struct representing a target machine and os to generate code for.
Definition Target.h:19