#include <algorithm>
#include <cstdint>
#include <map>
#include <set>
#include <sstream>
#include <unordered_set>
#include <utility>
#include <vector>

#include "CostModel.h"
#include "FunctionDAG.h"
#include "LoopNest.h"
#include "PerfectHashMap.h"

using std::map;
using std::pair;
using std::set;
using std::unordered_set;
using std::vector;
template<typename PostCreateMutator>
void deep_copy_loop_nest(LoopNest *new_loop_nest,
                         const LoopNest *new_loop_nest_parent,
                         const IntrusivePtr<const LoopNest> &existing_loop_nest,
                         const PostCreateMutator &post_create_mutator) {
    new_loop_nest->copy_from(*existing_loop_nest);

    // copy_from() leaves the children pointing at the original nodes; replace
    // each child with a fresh deep copy.
    for (std::size_t i = 0, N = new_loop_nest->children.size(); i < N; ++i) {
        LoopNest *new_child = new LoopNest;
        new_loop_nest->children[i] = new_child;
        deep_copy_loop_nest(new_child, new_loop_nest, existing_loop_nest->children[i], post_create_mutator);
    }

    post_create_mutator(new_loop_nest);
}
// Convenience overload: deep-copy an entire loop nest, returning the new root.
template<typename PostCreateMutator>
LoopNest *deep_copy_loop_nest(const IntrusivePtr<const LoopNest> &loop_nest,
                              const PostCreateMutator &post_create_mutator) {
    LoopNest *new_loop_nest = new LoopNest;
    deep_copy_loop_nest(new_loop_nest, nullptr, loop_nest, post_create_mutator);
    return new_loop_nest;
}
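// Usage sketch (illustrative only; `ResetScheduled` and `existing_root` are
// hypothetical names, not part of this header). Any callable taking a
// `LoopNest *` works as the PostCreateMutator. Note that the mutator runs
// bottom-up: a node's children are deep-copied (and mutated) before the node
// itself is passed to the mutator.
//
//     struct ResetScheduled {
//         void operator()(LoopNest *new_loop_nest) const {
//             // ... adjust the freshly copied node here ...
//         }
//     };
//
//     IntrusivePtr<const LoopNest> copy{deep_copy_loop_nest(existing_root, ResetScheduled{})};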
// A single candidate schedule in the beam search: a loop nest for the whole
// pipeline plus the cost estimated for it.
struct State {
    mutable RefCount ref_count;
    IntrusivePtr<const LoopNest> root;
    IntrusivePtr<const State> parent;
    double cost = 0;
    std::vector<double> cost_per_stage;
    NodeMap<bool> always_consider_inline;

    State() = default;
    State(const State &) = delete;
    void operator=(const State &) = delete;
    void operator=(State &&) = delete;

    uint64_t structural_hash(int depth) const;

    // Compute the parent and depth of every loop nest node.
    void compute_loop_nest_parents(map<const LoopNest *, pair<const LoopNest *, int>> &p,
                                   const LoopNest *here,
                                   int depth) const;

    const LoopNest *deepest_common_ancestor(const map<const LoopNest *, pair<const LoopNest *, int>> &parent,
                                            const LoopNest *a, const LoopNest *b) const;

    const LoopNest *deepest_valid_compute_location(const Anderson2021Params &params,
                                                   const map<const LoopNest *, pair<const LoopNest *, int>> &parent,
                                                   const FunctionDAG::Node &node, const LoopNest *loop,
                                                   const LoopNest *root,
                                                   StageMap<int64_t> &total_shared_mem_alloc_sizes) const;

    int64_t total_loop_extents_of_ancestors(const map<const LoopNest *, pair<const LoopNest *, int>> &parent,
                                            const LoopNest *loop) const;
    int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const;

    bool exceeds_shared_memory_limit(const Anderson2021Params &params, const Target &target) const;
    bool exceeds_local_memory_limit(const Anderson2021Params &params, const Target &target) const;
    bool exceeds_serial_extents_limit(const Target &target) const;

    bool has_dynamic_allocation_inside_thread() const;
    bool has_loop_nest_without_thread_loops() const;
    bool has_compute_root_loops_without_blocks() const;

    // Mutator applied to every node of a freshly deep-copied loop nest when
    // building the root used for GPU feature computation.
    struct FeatureLoopNestMutator {
        const Anderson2021Params &params;
        const Target &target;

        void operator()(LoopNest *new_loop_nest) const;

        // Split any compute_root loops into blocks, threads, and serial loops.
        void split_compute_root_loops(LoopNest *loop_nest) const;

        // If a loop nest does not have thread loops, split its outermost
        // serial loops to create them.
        void add_outer_thread_loops(LoopNest *loop_nest) const;
    };

    template<typename PostCreateMutator>
    LoopNest *create_feature_root(const PostCreateMutator &post_create_mutator) const {
        LoopNest *new_root = new LoopNest;
        deep_copy_loop_nest(new_root, nullptr, root, post_create_mutator);
        return new_root;
    }

    IntrusivePtr<const LoopNest> get_root_for_features(const Anderson2021Params &params, const Target &target) const;

    void set_gpu_store_site(const map<const LoopNest *, pair<const LoopNest *, int>> &parent,
                            const LoopNest *loop, LoopNest::Sites &site) const;

    bool compute_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target,
                               StageMap<ScheduleFeatures> *features, Statistics &stats, bool verbose = false) const;
    void save_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target,
                            std::ostream &out) const;

    bool contains_store_at(const set<const FunctionDAG::Node *> &outermost_store_at,
                           const IntrusivePtr<const LoopNest> &parent) const;
    bool contains_store_at_further_in_than_outermost() const;

    bool calculate_cost(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target,
                        CostModel *cost_model, Statistics &stats, bool verbose = false);

    IntrusivePtr<State> make_child() const;

    void print_compute_locations() const;

    void fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage,
                         const vector<VarOrRVar> &parallel_vars, const vector<int64_t> &parallel_extents,
                         const vector<int> &constant_extents) const;
    void mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage,
                         const vector<VarOrRVar> &parallel_vars, const vector<int64_t> &parallel_extents) const;
    bool mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage,
                          std::unordered_set<std::string> &new_serial_vars,
                          std::ostringstream &staged_funcs_schedule_source) const;
    bool can_fuse_gpu(const vector<int64_t> &parallel_extents) const;

    void apply_schedule(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target);

    bool should_always_consider_inline(const FunctionDAG::Node *node) const;
    void add_to_always_consider_inline_options(const FunctionDAG::Node *node);
    void update_always_consider_inline_options(const FunctionDAG::Node *node);
};
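// Sketch of the assumed wiring between FeatureLoopNestMutator and
// create_feature_root above (get_root_for_features is only declared here, so
// this is an illustration, not its actual body):
//
//     FeatureLoopNestMutator mutator{params, target};
//     // Deep-copies `root`, applying the mutator to every copied node so the
//     // result has the block/thread/serial structure featurization expects.
//     LoopNest *root_for_features = create_feature_root(mutator);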
// A priority queue of states, ordered by increasing cost.
class StateQueue {
private:
    struct CompareStates {
        bool operator()(const IntrusivePtr<State> &a, const IntrusivePtr<State> &b) const {
            // Comparing with '>' makes the std::*_heap calls below behave as
            // a min-heap on cost.
            return a->cost > b->cost;
        }
    };

    std::vector<IntrusivePtr<State>> storage;
    size_t sz = 0;

public:
    void emplace(IntrusivePtr<State> &&s) {
        if (sz >= storage.size()) {
            storage.resize(std::max(sz * 2, (size_t)64));
        }
        internal_assert(sz < storage.size()) << sz << " " << storage.size() << "\n";
        storage[sz] = std::move(s);
        sz++;
        std::push_heap(storage.begin(), storage.begin() + sz, CompareStates{});
    }

    IntrusivePtr<State> pop() {
        internal_assert(sz <= storage.size()) << sz << " " << storage.size() << "\n";
        std::pop_heap(storage.begin(), storage.begin() + sz, CompareStates{});
        sz--;
        return std::move(storage[sz]);
    }

    const IntrusivePtr<State> &top() { return storage[0]; }
    bool empty() const { return sz == 0; }
    size_t size() const { return sz; }
    IntrusivePtr<State> operator[](int idx) const { return storage[idx]; }

    void swap(StateQueue &other) {
        storage.swap(other.storage);
        std::swap(sz, other.sz);
    }

    // Rebuild the heap invariant from scratch, e.g. after costs have changed.
    void resort() {
        std::make_heap(storage.begin(), storage.begin() + sz, CompareStates{});
    }

    void clear() {
        for (size_t i = 0; i < sz; i++) {
            storage[i] = IntrusivePtr<State>{};
        }
        sz = 0;
    }
};
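// Usage sketch (illustrative only): because CompareStates orders on `>`, the
// heap is a min-heap on cost, so pop() always yields the cheapest remaining
// state, which is the order a beam search expands them in.
//
//     StateQueue q;
//     q.emplace(IntrusivePtr<State>(new State));
//     while (!q.empty()) {
//         IntrusivePtr<State> cheapest = q.pop();
//         // ... expand `cheapest`, emplace() its children ...
//     }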