30 #ifndef ANKERL_NANOBENCH_H_INCLUDED 31 #define ANKERL_NANOBENCH_H_INCLUDED 34 #define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes 35 #define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes 36 #define ANKERL_NANOBENCH_VERSION_PATCH 4 // backwards-compatible bug fixes 48 #define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x() 50 #define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus 51 #define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L 52 #define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L 53 #define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L 54 #define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L 56 #if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17) 57 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]] 59 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD() 62 #if defined(__clang__) 63 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \ 64 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"") 65 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop") 67 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() 68 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() 72 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"") 73 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop") 75 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() 76 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() 79 #if defined(ANKERL_NANOBENCH_LOG_ENABLED) 81 # define ANKERL_NANOBENCH_LOG(x) \ 83 std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \ 86 # define ANKERL_NANOBENCH_LOG(x) \ 91 #if defined(__linux__) && defined(PERF_EVENT_IOC_ID) && defined(PERF_COUNT_HW_REF_CPU_CYCLES) && defined(PERF_FLAG_FD_CLOEXEC) && \ 92 !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS) 95 # define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1 97 # define 
ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0 100 #if defined(__clang__) 101 # define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__))) 103 # define ANKERL_NANOBENCH_NO_SANITIZE(...) 106 #if defined(_MSC_VER) 107 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline) 109 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline)) 114 #if defined(__GNUC__) && __GNUC__ < 5 115 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) 117 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value 123 namespace nanobench {
// Prefer high_resolution_clock, but only when it is steady (monotonic);
// otherwise fall back to steady_clock so measured intervals can never go
// backwards when the wall clock is adjusted.
using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
                               std::chrono::steady_clock>::type;
281 void render(
char const* mustacheTemplate,
Bench const& bench, std::ostream& out);
282 void render(std::string
const& mustacheTemplate,
Bench const& bench, std::ostream& out);
292 void render(
char const* mustacheTemplate, std::vector<Result>
const& results, std::ostream& out);
293 void render(std::string
const& mustacheTemplate, std::vector<Result>
const& results, std::ostream& out);
296 namespace templates {
307 char const*
csv() noexcept;
327 char const*
pyperf() noexcept;
338 char const*
json() noexcept;
344 template <
typename T>
350 #if ANKERL_NANOBENCH(PERF_COUNTERS) 351 class LinuxPerformanceCounters;
361 namespace nanobench {
364 template <
typename T>
365 struct PerfCountSet {
379 std::string mBenchmarkTitle =
"benchmark";
380 std::string mBenchmarkName =
"noname";
381 std::string mUnit =
"op";
383 double mComplexityN = -1.0;
384 size_t mNumEpochs = 11;
385 size_t mClockResolutionMultiple =
static_cast<size_t>(1000);
386 std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100);
387 std::chrono::nanoseconds mMinEpochTime{};
388 uint64_t mMinEpochIterations{1};
389 uint64_t mEpochIterations{0};
390 uint64_t mWarmup = 0;
391 std::ostream* mOut =
nullptr;
392 std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1};
393 std::string mTimeUnitName =
"ns";
394 bool mShowPerformanceCounters =
true;
395 bool mIsRelative =
false;
432 void add(
Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
437 ANKERL_NANOBENCH(NODISCARD)
double medianAbsolutePercentError(Measure m) const;
450 static
Measure fromString(
std::
string const& str);
454 std::vector<std::vector<double>> mNameToMeasurements{};
482 static constexpr uint64_t(min)();
483 static constexpr uint64_t(max)();
495 Rng& operator=(
Rng const&) =
delete;
498 Rng(
Rng&&) noexcept = default;
499 Rng& operator=(
Rng&&) noexcept = default;
500 ~
Rng() noexcept = default;
527 explicit
Rng(uint64_t seed) noexcept;
528 Rng(uint64_t x, uint64_t y) noexcept;
529 Rng(
std::vector<uint64_t> const& data);
543 inline uint64_t operator()() noexcept;
561 inline uint32_t bounded(uint32_t range) noexcept;
572 inline
double uniform01() noexcept;
581 template <typename Container>
582 void shuffle(Container& container) noexcept;
590 std::vector<uint64_t> state() const;
593 static constexpr uint64_t rotl(uint64_t x,
unsigned k) noexcept;
645 template <
typename Op>
647 Bench& run(
char const* benchmarkName, Op&& op);
649 template <
typename Op>
651 Bench& run(std::string
const& benchmarkName, Op&& op);
657 template <
typename Op>
666 Bench& title(
char const* benchmarkTitle);
667 Bench& title(std::string
const& benchmarkTitle);
672 Bench&
name(std::string
const& benchmarkName);
684 template <
typename T>
685 Bench& batch(T b) noexcept;
696 Bench& unit(
char const* unit);
697 Bench& unit(std::string
const& unit);
709 Bench& timeUnit(std::chrono::duration<double>
const& tu, std::string
const& tuName);
710 ANKERL_NANOBENCH(NODISCARD) std::string
const& timeUnitName()
const noexcept;
711 ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double>
const& timeUnit()
const noexcept;
720 Bench& output(std::ostream* outstream) noexcept;
743 Bench& clockResolutionMultiple(
size_t multiple) noexcept;
744 ANKERL_NANOBENCH(NODISCARD)
size_t clockResolutionMultiple()
const noexcept;
761 Bench& epochs(
size_t numEpochs) noexcept;
774 Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
775 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime()
const noexcept;
787 Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
788 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime()
const noexcept;
800 Bench& minEpochIterations(uint64_t numIters) noexcept;
809 Bench& epochIterations(uint64_t numIters) noexcept;
821 Bench& warmup(uint64_t numWarmupIters) noexcept;
841 Bench& relative(
bool isRelativeEnabled) noexcept;
863 ANKERL_NANOBENCH(NODISCARD) std::vector<Result>
const& results()
const noexcept;
872 template <
typename Arg>
889 template <
typename T>
890 Bench& complexityN(T b) noexcept;
924 std::vector<BigO> complexityBigO()
const;
949 template <
typename Op>
950 BigO complexityBigO(
char const*
name, Op op)
const;
952 template <
typename Op>
953 BigO complexityBigO(std::string
const&
name, Op op)
const;
962 Bench&
render(
char const* templateContent, std::ostream& os);
963 Bench&
render(std::string
const& templateContent, std::ostream& os);
970 std::vector<Result> mResults{};
980 template <
typename Arg>
985 #if defined(_MSC_VER) 986 void doNotOptimizeAwaySink(
void const*);
988 template <
typename T>
996 template <
typename T>
999 asm volatile(
"" : :
"r,m"(val) :
"memory");
1002 template <
typename T>
1004 # if defined(__clang__) 1006 asm volatile(
"" :
"+r,m"(val) : :
"memory");
1009 asm volatile(
"" :
"+m,r"(val) : :
"memory");
1025 void moveResultTo(std::vector<Result>& results) noexcept;
1042 void beginMeasure();
1044 void updateResults(uint64_t numIters);
1050 #if ANKERL_NANOBENCH(PERF_COUNTERS) 1051 LinuxPerformanceCounters* mPc =
nullptr;
1067 template <
typename Op>
1069 for (
auto& rangeMeasure : data) {
1070 rangeMeasure.first = op(rangeMeasure.first);
1077 template <
typename Op>
1081 template <
typename Op>
1089 ANKERL_NANOBENCH(NODISCARD)
double normalizedRootMeanSquare() const noexcept;
1097 std::ostream&
operator<<(std::ostream& os, BigO
const& bigO);
1098 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO>
const& bigOs);
1106 namespace nanobench {
1113 return (std::numeric_limits<uint64_t>::max)();
1117 uint64_t
Rng::operator()() noexcept {
1120 mX = UINT64_C(15241094284759029579) * mY;
1121 mY = rotl(mY - x, 27);
1127 uint32_t
Rng::bounded(uint32_t range) noexcept {
1128 uint64_t r32 =
static_cast<uint32_t
>(operator()());
1129 auto multiresult = r32 * range;
1130 return static_cast<uint32_t
>(multiresult >> 32U);
1134 auto i = (UINT64_C(0x3ff) << 52U) | (
operator()() >> 12U);
1138 std::memcpy(&d, &i,
sizeof(
double));
1142 template <
typename Container>
1144 auto size =
static_cast<uint32_t
>(container.size());
1145 for (
auto i = size; i > 1U; --i) {
1147 auto p = bounded(i);
1148 swap(container[i - 1], container[p]);
// Rotate-left of a 64-bit value by k bits.
// NOTE(review): k must be in (0, 64) — k == 0 or k >= 64 would make one of
// the shifts undefined behavior. The visible caller uses k = 27.
constexpr uint64_t Rng::rotl(uint64_t x,
                             unsigned k) noexcept {
    return (x << k) | (x >> (64U - k));
}
1157 template <
typename Op>
1164 while (
auto n = iterationLogic.numIters()) {
1166 Clock::time_point before = Clock::now();
1170 Clock::time_point after = Clock::now();
1172 pc.updateResults(iterationLogic.numIters());
1173 iterationLogic.
add(after - before, pc);
1180 template <
typename Op>
1182 name(benchmarkName);
1183 return run(std::forward<Op>(op));
1186 template <
typename Op>
1188 name(benchmarkName);
1189 return run(std::forward<Op>(op));
1192 template <
typename Op>
1197 template <
typename Op>
1204 template <
typename T>
1206 mConfig.mBatch =
static_cast<double>(b);
1211 template <
typename T>
1213 mConfig.mComplexityN =
static_cast<double>(n);
1218 template <
typename Arg>
1225 template <
typename Arg>
1232 #if defined(_MSC_VER) 1233 template <
typename T>
1235 doNotOptimizeAwaySink(&val);
1244 #if defined(ANKERL_NANOBENCH_IMPLEMENT) 1250 # include <algorithm> 1256 # include <iostream> 1260 # include <stdexcept> 1262 # if defined(__linux__) 1263 # include <unistd.h> 1265 # if ANKERL_NANOBENCH(PERF_COUNTERS) 1268 # include <linux/perf_event.h> 1269 # include <sys/ioctl.h> 1270 # include <sys/syscall.h> 1271 # include <unistd.h> 1277 namespace nanobench {
1288 class StreamStateRestorer;
1290 class MarkDownColumn;
1301 namespace nanobench {
1303 uint64_t splitMix64(uint64_t& state) noexcept;
// Shorthand helper: widen any arithmetic value to double.
template <typename T>
inline double d(T value) noexcept {
    return static_cast<double>(value);
}
1312 inline double d(Clock::duration duration) noexcept {
1313 return std::chrono::duration_cast<std::chrono::duration<double>>(duration).
count();
1317 inline Clock::duration clockResolution() noexcept;
1321 namespace templates {
1323 char const*
csv() noexcept {
1324 return R
"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total" 1325 {{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}} 1330 return R
"DELIM(<html> 1333 <script src="https://cdn.plot.ly/plotly-latest.min.js"></script> 1337 <div id="myDiv"></div> 1342 y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}], 1346 var title = '{{title}}'; 1348 data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' })); 1349 var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true}); 1356 char const*
pyperf() noexcept {
1363 {{#measurement}} {{elapsed}}{{^-last}}, 1364 {{/last}}{{/measurement}} 1371 "loops": {{sum(iterations)}}, 1372 "inner_loops": {{batch}}, 1373 "name": "{{title}}", 1380 char const*
json() noexcept {
1384 "title": "{{title}}", 1388 "complexityN": {{complexityN}}, 1389 "epochs": {{epochs}}, 1390 "clockResolution": {{clockResolution}}, 1391 "clockResolutionMultiple": {{clockResolutionMultiple}}, 1392 "maxEpochTime": {{maxEpochTime}}, 1393 "minEpochTime": {{minEpochTime}}, 1394 "minEpochIterations": {{minEpochIterations}}, 1395 "epochIterations": {{epochIterations}}, 1396 "warmup": {{warmup}}, 1397 "relative": {{relative}}, 1398 "median(elapsed)": {{median(elapsed)}}, 1399 "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}}, 1400 "median(instructions)": {{median(instructions)}}, 1401 "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}}, 1402 "median(cpucycles)": {{median(cpucycles)}}, 1403 "median(contextswitches)": {{median(contextswitches)}}, 1404 "median(pagefaults)": {{median(pagefaults)}}, 1405 "median(branchinstructions)": {{median(branchinstructions)}}, 1406 "median(branchmisses)": {{median(branchmisses)}}, 1407 "totalTime": {{sumProduct(iterations, elapsed)}}, 1410 "iterations": {{iterations}}, 1411 "elapsed": {{elapsed}}, 1412 "pagefaults": {{pagefaults}}, 1413 "cpucycles": {{cpucycles}}, 1414 "contextswitches": {{contextswitches}}, 1415 "instructions": {{instructions}}, 1416 "branchinstructions": {{branchinstructions}}, 1417 "branchmisses": {{branchmisses}} 1418 }{{^-last}},{{/-last}} 1420 }{{^-last}},{{/-last}} 1427 enum class Type { tag, content, section, inverted_section };
1431 std::vector<Node> children;
1436 bool operator==(
char const (&str)[N])
const noexcept {
1437 return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
1442 static std::vector<Node> parseMustacheTemplate(
char const** tpl) {
1443 std::vector<Node> nodes;
1446 auto begin = std::strstr(*tpl,
"{{");
1448 if (begin !=
nullptr) {
1450 end = std::strstr(begin,
"}}");
1453 if (begin ==
nullptr || end ==
nullptr) {
1455 nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
1459 nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
1469 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
1473 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
1477 nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
1483 static bool generateFirstLast(Node
const& n,
size_t idx,
size_t size, std::ostream& out) {
1485 bool matchFirst = n ==
"-first";
1486 bool matchLast = n ==
"-last";
1487 if (!matchFirst && !matchLast) {
1491 bool doWrite =
false;
1492 if (n.type == Node::Type::section) {
1493 doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1494 }
else if (n.type == Node::Type::inverted_section) {
1495 doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1499 for (
auto const& child : n.children) {
1500 if (child.type == Node::Type::content) {
1501 out.write(child.begin, std::distance(child.begin, child.end));
1508 static bool matchCmdArgs(std::string
const& str, std::vector<std::string>& matchResult) {
1509 matchResult.clear();
1510 auto idxOpen = str.find(
'(');
1511 auto idxClose = str.find(
')', idxOpen);
1512 if (idxClose == std::string::npos) {
1516 matchResult.emplace_back(str.substr(0, idxOpen));
1519 matchResult.emplace_back(std::string{});
1520 for (
size_t i = idxOpen + 1; i != idxClose; ++i) {
1521 if (str[i] ==
' ' || str[i] ==
'\t') {
1525 if (str[i] ==
',') {
1527 matchResult.emplace_back(std::string{});
1531 matchResult.back() += str[i];
1536 static bool generateConfigTag(Node
const& n, Config
const& config, std::ostream& out) {
1540 out << config.mBenchmarkTitle;
1542 }
else if (n ==
"name") {
1543 out << config.mBenchmarkName;
1545 }
else if (n ==
"unit") {
1546 out << config.mUnit;
1548 }
else if (n ==
"batch") {
1549 out << config.mBatch;
1551 }
else if (n ==
"complexityN") {
1552 out << config.mComplexityN;
1554 }
else if (n ==
"epochs") {
1555 out << config.mNumEpochs;
1557 }
else if (n ==
"clockResolution") {
1558 out << d(detail::clockResolution());
1560 }
else if (n ==
"clockResolutionMultiple") {
1561 out << config.mClockResolutionMultiple;
1563 }
else if (n ==
"maxEpochTime") {
1564 out << d(config.mMaxEpochTime);
1566 }
else if (n ==
"minEpochTime") {
1567 out << d(config.mMinEpochTime);
1569 }
else if (n ==
"minEpochIterations") {
1570 out << config.mMinEpochIterations;
1572 }
else if (n ==
"epochIterations") {
1573 out << config.mEpochIterations;
1575 }
else if (n ==
"warmup") {
1576 out << config.mWarmup;
1578 }
else if (n ==
"relative") {
1579 out << config.mIsRelative;
1585 static std::ostream& generateResultTag(Node
const& n,
Result const& r, std::ostream& out) {
1586 if (generateConfigTag(n, r.config(), out)) {
1594 std::vector<std::string> matchResult;
1595 if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1596 if (matchResult.size() == 2) {
1602 if (matchResult[0] ==
"median") {
1603 return out << r.median(m);
1605 if (matchResult[0] ==
"average") {
1606 return out << r.average(m);
1608 if (matchResult[0] ==
"medianAbsolutePercentError") {
1609 return out << r.medianAbsolutePercentError(m);
1611 if (matchResult[0] ==
"sum") {
1612 return out << r.sum(m);
1614 if (matchResult[0] ==
"minimum") {
1615 return out << r.minimum(m);
1617 if (matchResult[0] ==
"maximum") {
1618 return out << r.maximum(m);
1620 }
else if (matchResult.size() == 3) {
1627 if (matchResult[0] ==
"sumProduct") {
1628 return out << r.sumProduct(m1, m2);
1637 throw std::runtime_error(
"command '" + std::string(n.begin, n.end) +
"' not understood");
1640 static void generateResultMeasurement(std::vector<Node>
const& nodes,
size_t idx,
Result const& r, std::ostream& out) {
1641 for (
auto const& n : nodes) {
1642 if (!generateFirstLast(n, idx, r.size(), out)) {
1645 case Node::Type::content:
1646 out.write(n.begin, std::distance(n.begin, n.end));
1649 case Node::Type::inverted_section:
1650 throw std::runtime_error(
"got a inverted section inside measurement");
1652 case Node::Type::section:
1653 throw std::runtime_error(
"got a section inside measurement");
1655 case Node::Type::tag: {
1660 out << r.get(idx, m);
1669 static void generateResult(std::vector<Node>
const& nodes,
size_t idx, std::vector<Result>
const& results, std::ostream& out) {
1670 auto const& r = results[idx];
1671 for (
auto const& n : nodes) {
1672 if (!generateFirstLast(n, idx, results.size(), out)) {
1675 case Node::Type::content:
1676 out.write(n.begin, std::distance(n.begin, n.end));
1679 case Node::Type::inverted_section:
1680 throw std::runtime_error(
"got a inverted section inside result");
1682 case Node::Type::section:
1683 if (n ==
"measurement") {
1684 for (
size_t i = 0; i < r.size(); ++i) {
1685 generateResultMeasurement(n.children, i, r, out);
1688 throw std::runtime_error(
"got a section inside result");
1692 case Node::Type::tag:
1693 generateResultTag(n, r, out);
1705 char const* getEnv(
char const*
name);
1706 bool isEndlessRunning(std::string
const&
name);
1707 bool isWarningsEnabled();
1709 template <
typename T>
1710 T parseFile(std::string
const& filename);
1712 void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
1713 void printStabilityInformationOnce(std::ostream* os);
1716 uint64_t& singletonHeaderHash() noexcept;
1719 Clock::duration calcClockResolution(
size_t numEvaluations) noexcept;
1726 class NumSep :
public std::numpunct<char> {
1728 explicit NumSep(
char sep);
1729 char do_thousands_sep()
const override;
1730 std::string do_grouping()
const override;
1739 class StreamStateRestorer {
1741 explicit StreamStateRestorer(std::ostream& s);
1742 ~StreamStateRestorer();
1748 StreamStateRestorer(StreamStateRestorer
const&) =
delete;
1749 StreamStateRestorer& operator=(StreamStateRestorer
const&) =
delete;
1750 StreamStateRestorer(StreamStateRestorer&&) =
delete;
1751 StreamStateRestorer& operator=(StreamStateRestorer&&) =
delete;
1754 std::ostream& mStream;
1755 std::locale mLocale;
1756 std::streamsize
const mPrecision;
1757 std::streamsize
const mWidth;
1758 std::ostream::char_type
const mFill;
1759 std::ostream::fmtflags
const mFmtFlags;
1766 Number(
int width,
int precision,
double value);
1767 Number(
int width,
int precision, int64_t value);
1768 std::string to_s()
const;
1771 friend std::ostream&
operator<<(std::ostream& os, Number
const& n);
1772 std::ostream& write(std::ostream& os)
const;
1780 std::string to_s(uint64_t s);
1782 std::ostream&
operator<<(std::ostream& os, Number
const& n);
1784 class MarkDownColumn {
1786 MarkDownColumn(
int w,
int prec, std::string
const& tit, std::string
const& suff,
double val);
1787 std::string title()
const;
1788 std::string separator()
const;
1789 std::string invalid()
const;
1790 std::string value()
const;
1796 std::string mSuffix;
1801 class MarkDownCode {
1803 explicit MarkDownCode(std::string
const& what);
1806 friend std::ostream&
operator<<(std::ostream& os, MarkDownCode
const& mdCode);
1807 std::ostream& write(std::ostream& os)
const;
1809 std::string mWhat{};
1812 std::ostream&
operator<<(std::ostream& os, MarkDownCode
const& mdCode);
1822 namespace nanobench {
1824 void render(
char const* mustacheTemplate, std::vector<Result>
const& results, std::ostream& out) {
1825 detail::fmt::StreamStateRestorer restorer(out);
1827 out.precision(std::numeric_limits<double>::digits10);
1828 auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1830 for (
auto const& n : nodes) {
1833 case templates::Node::Type::content:
1834 out.write(n.begin, std::distance(n.begin, n.end));
1837 case templates::Node::Type::inverted_section:
1838 throw std::runtime_error(
"unknown list '" + std::string(n.begin, n.end) +
"'");
1840 case templates::Node::Type::section:
1841 if (n ==
"result") {
1842 const size_t nbResults = results.size();
1843 for (
size_t i = 0; i < nbResults; ++i) {
1844 generateResult(n.children, i, results, out);
1846 }
else if (n ==
"measurement") {
1847 if (results.size() != 1) {
1848 throw std::runtime_error(
1849 "render: can only use section 'measurement' here if there is a single result, but there are " +
1850 detail::fmt::to_s(results.size()));
1853 auto const& r = results.front();
1854 for (
size_t i = 0; i < r.size(); ++i) {
1855 generateResultMeasurement(n.children, i, r, out);
1858 throw std::runtime_error(
"render: unknown section '" + std::string(n.begin, n.end) +
"'");
1862 case templates::Node::Type::tag:
1863 if (results.size() == 1) {
1865 generateResultTag(n, results.front(), out);
1868 if (!generateConfigTag(n, results.back().config(), out)) {
1869 throw std::runtime_error(
"unknown tag '" + std::string(n.begin, n.end) +
"'");
// Renders `mustacheTemplate` over all given results into `out`.
// Convenience overload: forwards to the char-const* variant.
void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
    render(mustacheTemplate.c_str(), results, out);
}
// Renders `mustacheTemplate` using the benchmark's accumulated results.
void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
    render(mustacheTemplate, bench.results(), out);
}
// Renders `mustacheTemplate` using the benchmark's accumulated results.
// Convenience overload: forwards to the char-const* variant.
void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
    render(mustacheTemplate.c_str(), bench.results(), out);
}
1892 # if defined(__clang__) 1893 # pragma clang diagnostic push 1894 # pragma clang diagnostic ignored "-Wexit-time-destructors" 1896 static PerformanceCounters pc;
1897 # if defined(__clang__) 1898 # pragma clang diagnostic pop 1907 # if defined(_MSC_VER) 1908 # pragma optimize("", off) 1909 void doNotOptimizeAwaySink(
void const*) {}
1910 # pragma optimize("", on) 1913 template <
typename T>
1914 T parseFile(std::string
const& filename) {
// Reads an environment variable; returns nullptr when it is not set.
char const* getEnv(char const* name) {
#    if defined(_MSC_VER)
#        pragma warning(push)
#        pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.
#    endif
    char const* value = std::getenv(name);
#    if defined(_MSC_VER)
#        pragma warning(pop)
#    endif
    return value;
}

// True when NANOBENCH_ENDLESS is set and names exactly this benchmark.
bool isEndlessRunning(std::string const& name) {
    char const* endless = getEnv("NANOBENCH_ENDLESS");
    if (nullptr == endless) {
        return false;
    }
    return name == endless;
}
1938 bool isWarningsEnabled() {
1939 auto suppression = getEnv(
"NANOBENCH_SUPPRESS_WARNINGS");
1940 return nullptr == suppression || suppression == std::string(
"0");
1943 void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
1945 recommendations.clear();
1947 bool recommendCheckFlags =
false;
1950 warnings.emplace_back(
"DEBUG defined");
1951 recommendCheckFlags =
true;
1954 bool recommendPyPerf =
false;
1955 # if defined(__linux__) 1956 auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
1958 warnings.emplace_back(
"couldn't figure out number of processors - no governor, turbo check possible");
1962 for (
long id = 0;
id < nprocs; ++id) {
1963 auto idStr = detail::fmt::to_s(static_cast<uint64_t>(
id));
1964 auto sysCpu =
"/sys/devices/system/cpu/cpu" + idStr;
1965 auto minFreq = parseFile<int64_t>(sysCpu +
"/cpufreq/scaling_min_freq");
1966 auto maxFreq = parseFile<int64_t>(sysCpu +
"/cpufreq/scaling_max_freq");
1967 if (minFreq != maxFreq) {
1968 auto minMHz =
static_cast<double>(minFreq) / 1000.0;
1969 auto maxMHz =
static_cast<double>(maxFreq) / 1000.0;
1970 warnings.emplace_back(
"CPU frequency scaling enabled: CPU " + idStr +
" between " +
1971 detail::fmt::Number(1, 1, minMHz).to_s() +
" and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
1973 recommendPyPerf =
true;
1978 auto currentGovernor = parseFile<std::string>(
"/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor");
1979 if (
"performance" != currentGovernor) {
1980 warnings.emplace_back(
"CPU governor is '" + currentGovernor +
"' but should be 'performance'");
1981 recommendPyPerf =
true;
1984 if (0 == parseFile<int>(
"/sys/devices/system/cpu/intel_pstate/no_turbo")) {
1985 warnings.emplace_back(
"Turbo is enabled, CPU frequency will fluctuate");
1986 recommendPyPerf =
true;
1991 if (recommendCheckFlags) {
1992 recommendations.emplace_back(
"Make sure you compile for Release");
1994 if (recommendPyPerf) {
1995 recommendations.emplace_back(
"Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
1999 void printStabilityInformationOnce(std::ostream* outStream) {
2000 static bool shouldPrint =
true;
2001 if (shouldPrint && outStream && isWarningsEnabled()) {
2002 auto& os = *outStream;
2003 shouldPrint =
false;
2004 std::vector<std::string> warnings;
2005 std::vector<std::string> recommendations;
2006 gatherStabilityInformation(warnings, recommendations);
2007 if (warnings.empty()) {
2011 os <<
"Warning, results might be unstable:" << std::endl;
2012 for (
auto const& w : warnings) {
2013 os <<
"* " << w << std::endl;
2016 os << std::endl <<
"Recommendations" << std::endl;
2017 for (
auto const& r : recommendations) {
2018 os <<
"* " << r << std::endl;
2024 uint64_t& singletonHeaderHash() noexcept {
2025 static uint64_t sHeaderHash{};
// Boost-style hash combiner: folds `val` into `seed` and returns the result.
inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
    uint64_t const mixed = val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U);
    return seed ^ mixed;
}
2035 Clock::duration calcClockResolution(
size_t numEvaluations) noexcept {
2036 auto bestDuration = Clock::duration::max();
2037 Clock::time_point tBegin;
2038 Clock::time_point tEnd;
2039 for (
size_t i = 0; i < numEvaluations; ++i) {
2040 tBegin = Clock::now();
2042 tEnd = Clock::now();
2043 }
while (tBegin == tEnd);
2044 bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2046 return bestDuration;
2050 Clock::duration clockResolution() noexcept {
2051 static Clock::duration sResolution = calcClockResolution(20);
2056 struct IterationLogic::Impl {
2057 enum class State { warmup, upscaling_runtime, measuring, endless };
2059 explicit Impl(Bench
const& bench)
2061 , mResult(bench.config()) {
2062 printStabilityInformationOnce(mBench.output());
2065 mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
2066 if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
2067 mTargetRuntimePerEpoch = mBench.maxEpochTime();
2069 if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
2070 mTargetRuntimePerEpoch = mBench.minEpochTime();
2073 if (isEndlessRunning(mBench.name())) {
2074 std::cerr <<
"NANOBENCH_ENDLESS set: running '" << mBench.name() <<
"' endlessly" << std::endl;
2075 mNumIters = (std::numeric_limits<uint64_t>::max)();
2076 mState = State::endless;
2077 }
else if (0 != mBench.warmup()) {
2078 mNumIters = mBench.warmup();
2079 mState = State::warmup;
2080 }
else if (0 != mBench.epochIterations()) {
2082 mNumIters = mBench.epochIterations();
2083 mState = State::measuring;
2085 mNumIters = mBench.minEpochIterations();
2086 mState = State::upscaling_runtime;
2091 ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
2092 auto doubleElapsed = d(elapsed);
2093 auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
2094 auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);
2096 auto doubleMinEpochIters = d(mBench.minEpochIterations());
2097 if (doubleNewIters < doubleMinEpochIters) {
2098 doubleNewIters = doubleMinEpochIters;
2100 doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();
2104 return static_cast<uint64_t
>(doubleNewIters + 0.5);
2108 if (elapsed * 10 < mTargetRuntimePerEpoch) {
2110 if (mNumIters * 10 < mNumIters) {
2112 showResult(
"iterations overflow. Maybe your code got optimized away?");
2118 mNumIters = calcBestNumIters(elapsed, mNumIters);
2122 void add(std::chrono::nanoseconds elapsed, PerformanceCounters
const& pc) noexcept {
2123 # if defined(ANKERL_NANOBENCH_LOG_ENABLED) 2124 auto oldIters = mNumIters;
2129 if (isCloseEnoughForMeasurements(elapsed)) {
2132 mState = State::measuring;
2133 mNumIters = calcBestNumIters(elapsed, mNumIters);
2136 mState = State::upscaling_runtime;
2141 case State::upscaling_runtime:
2142 if (isCloseEnoughForMeasurements(elapsed)) {
2144 mState = State::measuring;
2145 mTotalElapsed += elapsed;
2146 mTotalNumIters += mNumIters;
2147 mResult.add(elapsed, mNumIters, pc);
2148 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2154 case State::measuring:
2157 mTotalElapsed += elapsed;
2158 mTotalNumIters += mNumIters;
2159 mResult.add(elapsed, mNumIters, pc);
2160 if (0 != mBench.epochIterations()) {
2161 mNumIters = mBench.epochIterations();
2163 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2167 case State::endless:
2168 mNumIters = (std::numeric_limits<uint64_t>::max)();
2172 if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
2178 ANKERL_NANOBENCH_LOG(mBench.name() <<
": " << detail::fmt::Number(20, 3, static_cast<double>(elapsed.count())) <<
" elapsed, " 2179 << detail::fmt::Number(20, 3, static_cast<double>(mTargetRuntimePerEpoch.count()))
2180 <<
" target. oldIters=" << oldIters <<
", mNumIters=" << mNumIters
2181 <<
", mState=" <<
static_cast<int>(mState));
2184 void showResult(std::string
const& errorMessage)
const {
2187 if (mBench.output() !=
nullptr) {
2189 std::vector<fmt::MarkDownColumn> columns;
2193 if (mBench.relative()) {
2195 if (!mBench.results().empty()) {
2198 columns.emplace_back(11, 1,
"relative",
"%", d);
2201 if (mBench.complexityN() > 0) {
2202 columns.emplace_back(14, 0,
"complexityN",
"", mBench.complexityN());
2205 columns.emplace_back(22, 2, mBench.timeUnitName() +
"/" + mBench.unit(),
"",
2206 rMedian / (mBench.timeUnit().count() * mBench.batch()));
2207 columns.emplace_back(22, 2, mBench.unit() +
"/s",
"", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);
2210 columns.emplace_back(10, 1,
"err%",
"%", rErrorMedian * 100.0);
2212 double rInsMedian = -1.0;
2215 columns.emplace_back(18, 2,
"ins/" + mBench.unit(),
"", rInsMedian / mBench.batch());
2218 double rCycMedian = -1.0;
2221 columns.emplace_back(18, 2,
"cyc/" + mBench.unit(),
"", rCycMedian / mBench.batch());
2223 if (rInsMedian > 0.0 && rCycMedian > 0.0) {
2224 columns.emplace_back(9, 3,
"IPC",
"", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
2228 columns.emplace_back(17, 2,
"bra/" + mBench.unit(),
"", rBraMedian / mBench.batch());
2231 if (rBraMedian >= 1e-9) {
2234 columns.emplace_back(10, 1,
"miss%",
"%", p);
2241 auto& os = *mBench.output();
2245 hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
2246 hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
2247 hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
2248 hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
2249 hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
2250 hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);
2252 if (hash != singletonHeaderHash()) {
2253 singletonHeaderHash() = hash;
2257 for (
auto const& col : columns) {
2260 os <<
"| " << mBench.title() << std::endl;
2262 for (
auto const& col : columns) {
2263 os << col.separator();
2265 os <<
"|:" << std::string(mBench.title().size() + 1U,
'-') << std::endl;
2268 if (!errorMessage.empty()) {
2269 for (
auto const& col : columns) {
2270 os << col.invalid();
2272 os <<
"| :boom: " << fmt::MarkDownCode(mBench.name()) <<
" (" << errorMessage <<
')' << std::endl;
2274 for (
auto const& col : columns) {
2278 auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
2280 os <<
":wavy_dash: ";
2282 os << fmt::MarkDownCode(mBench.name());
2284 auto avgIters =
static_cast<double>(mTotalNumIters) / static_cast<double>(mBench.epochs());
2286 auto suggestedIters =
static_cast<uint64_t
>(avgIters * 10 + 0.5);
2288 os <<
" (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
2289 <<
" iters. Increase `minEpochIterations` to e.g. " << suggestedIters <<
")";
2296 ANKERL_NANOBENCH(NODISCARD)
bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed)
const noexcept {
2297 return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
// Number of iterations to run in the next epoch; starts at 1 and is
// scaled up/down by the iteration logic.
2300 uint64_t mNumIters = 1;
// The benchmark configuration this logic operates on (non-owning reference).
2301 Bench
const& mBench;
// Runtime each epoch should take; value-initialized to zero here,
// presumably computed from the Bench config elsewhere — extraction gap, confirm.
2302 std::chrono::nanoseconds mTargetRuntimePerEpoch{};
// Accumulated wall time over all measured epochs so far.
2305 std::chrono::nanoseconds mTotalElapsed{};
// Accumulated iteration count over all measured epochs so far.
2306 uint64_t mTotalNumIters = 0;
// Current phase of the measurement state machine; begins by scaling the
// iteration count upward until the target epoch runtime is reached.
2308 State mState = State::upscaling_runtime;
// Constructs the iteration logic for a benchmark; all state lives in the
// heap-allocated pImpl.
2312 IterationLogic::IterationLogic(Bench
const& bench) noexcept
2313 : mPimpl(
new Impl(bench)) {}
// Destructor body not visible here (extraction gap) — presumably releases
// mPimpl; confirm against the full source.
2315 IterationLogic::~IterationLogic() {
// Number of iterations the caller should run in the next measurement epoch.
2321 uint64_t IterationLogic::numIters() const noexcept {
2323 return mPimpl->mNumIters;
// Feeds one epoch's elapsed time and performance-counter readings into the
// iteration state machine.
2326 void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters
const& pc) noexcept {
2327 mPimpl->add(elapsed, pc);
// Moves the accumulated Result into the caller's results vector.
2330 void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
2331 results.emplace_back(std::move(mPimpl->mResult));
2334 # if ANKERL_NANOBENCH(PERF_COUNTERS) 2337 class LinuxPerformanceCounters {
2340 Target(uint64_t* targetValue_,
bool correctMeasuringOverhead_,
bool correctLoopOverhead_)
2341 : targetValue(targetValue_)
2342 , correctMeasuringOverhead(correctMeasuringOverhead_)
2343 , correctLoopOverhead(correctLoopOverhead_) {}
2345 uint64_t* targetValue{};
2346 bool correctMeasuringOverhead{};
2347 bool correctLoopOverhead{};
2350 ~LinuxPerformanceCounters();
2353 inline void start() {}
2355 inline void stop() {}
2357 bool monitor(perf_sw_ids swId, Target target);
2358 bool monitor(perf_hw_id hwId, Target target);
2360 bool hasError() const noexcept {
2366 inline void beginMeasure() {
2372 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
2378 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
2381 inline void endMeasure() {
2387 mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
2392 auto const numBytes =
sizeof(uint64_t) * mCounters.size();
2393 auto ret = read(mFd, mCounters.data(), numBytes);
2394 mHasError = ret !=
static_cast<ssize_t
>(numBytes);
2397 void updateResults(uint64_t numIters);
2400 template <
typename T>
2401 static inline T divRounded(T a, T divisor) {
2402 return (a + divisor / 2) / divisor;
2405 template <
typename Op>
2407 void calibrate(Op&& op) {
2409 for (
auto& v : mCalibratedOverhead) {
2414 auto newCalibration = mCalibratedOverhead;
2415 for (
auto& v : newCalibration) {
2416 v = (std::numeric_limits<uint64_t>::max)();
2418 for (
size_t iter = 0; iter < 100; ++iter) {
2426 for (
size_t i = 0; i < newCalibration.size(); ++i) {
2427 auto diff = mCounters[i];
2428 if (newCalibration[i] > diff) {
2429 newCalibration[i] = diff;
2434 mCalibratedOverhead = std::move(newCalibration);
2441 uint64_t
const numIters = 100000U + (std::random_device{}() & 3);
2442 uint64_t n = numIters;
2443 uint32_t x = 1234567;
2456 auto measure1 = mCounters;
2467 auto measure2 = mCounters;
2469 for (
size_t i = 0; i < mCounters.size(); ++i) {
2471 auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
2472 auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
2473 auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;
2475 mLoopOverhead[i] = divRounded(overhead, numIters);
2481 bool monitor(uint32_t type, uint64_t eventid, Target target);
2483 std::map<uint64_t, Target> mIdToTarget{};
2486 std::vector<uint64_t> mCounters{3};
2487 std::vector<uint64_t> mCalibratedOverhead{3};
2488 std::vector<uint64_t> mLoopOverhead{3};
2490 uint64_t mTimeEnabledNanos = 0;
2491 uint64_t mTimeRunningNanos = 0;
2493 bool mHasError =
false;
2497 LinuxPerformanceCounters::~LinuxPerformanceCounters() {
// Convenience overload: registers a software perf event (e.g. page faults,
// context switches) by forwarding to the generic monitor().
2503 bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
2504 return monitor(PERF_TYPE_SOFTWARE, swId, target);
// Convenience overload: registers a hardware perf event (e.g. cycles,
// instructions, branches) by forwarding to the generic monitor().
2507 bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
2508 return monitor(PERF_TYPE_HARDWARE, hwId, target);
2513 void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
2515 for (
auto& id_value : mIdToTarget) {
2516 *id_value.second.targetValue = UINT64_C(0);
2523 mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
2524 mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];
2526 for (uint64_t i = 0; i < mCounters[0]; ++i) {
2527 auto idx =
static_cast<size_t>(3 + i * 2 + 0);
2528 auto id = mCounters[idx + 1U];
2530 auto it = mIdToTarget.find(
id);
2531 if (it != mIdToTarget.end()) {
2533 auto& tgt = it->second;
2534 *tgt.targetValue = mCounters[idx];
2535 if (tgt.correctMeasuringOverhead) {
2536 if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
2537 *tgt.targetValue -= mCalibratedOverhead[idx];
2539 *tgt.targetValue = 0U;
2542 if (tgt.correctLoopOverhead) {
2543 auto correctionVal = mLoopOverhead[idx] * numIters;
2544 if (*tgt.targetValue >= correctionVal) {
2545 *tgt.targetValue -= correctionVal;
2547 *tgt.targetValue = 0U;
2554 bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
2555 *target.targetValue = (std::numeric_limits<uint64_t>::max)();
2560 auto pea = perf_event_attr();
2561 std::memset(&pea, 0,
sizeof(perf_event_attr));
2563 pea.size =
sizeof(perf_event_attr);
2564 pea.config = eventid;
2566 pea.exclude_kernel = 1;
2570 pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
2574 # if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14 2575 const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
2577 const unsigned long flags = 0;
2580 auto fd =
static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd,
flags));
2590 if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &
id)) {
2596 mIdToTarget.emplace(
id, target);
2599 auto size = 3 + 2 * mIdToTarget.size();
2600 mCounters.resize(size);
2601 mCalibratedOverhead.resize(size);
2602 mLoopOverhead.resize(size);
2607 PerformanceCounters::PerformanceCounters()
2608 : mPc(new LinuxPerformanceCounters())
2612 mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults,
true,
false));
2613 mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles,
true,
false));
2614 mHas.contextSwitches =
2615 mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches,
true,
false));
2616 mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions,
true,
true));
2617 mHas.branchInstructions =
2618 mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions,
true,
false));
2619 mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses,
true,
false));
2624 auto before = ankerl::nanobench::Clock::now();
2625 auto after = ankerl::nanobench::Clock::now();
2630 if (mPc->hasError()) {
2632 mHas = PerfCountSet<bool>{};
2636 PerformanceCounters::~PerformanceCounters() {
2637 if (
nullptr != mPc) {
2642 void PerformanceCounters::beginMeasure() {
2643 mPc->beginMeasure();
2646 void PerformanceCounters::endMeasure() {
2650 void PerformanceCounters::updateResults(uint64_t numIters) {
2651 mPc->updateResults(numIters);
// Fallback implementation used when Linux perf counters are not available
// (presumably the #else branch of ANKERL_NANOBENCH(PERF_COUNTERS) — the
// surrounding preprocessor lines are not visible here). All operations are
// no-ops, so measurements simply report no counter data.
2656 PerformanceCounters::PerformanceCounters() =
default;
2657 PerformanceCounters::~PerformanceCounters() =
default;
2658 void PerformanceCounters::beginMeasure() {}
2659 void PerformanceCounters::endMeasure() {}
2660 void PerformanceCounters::updateResults(uint64_t) {}
2664 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t>
const& PerformanceCounters::val() const noexcept {
2667 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool>
const& PerformanceCounters::has() const noexcept {
2675 NumSep::NumSep(
char sep)
2678 char NumSep::do_thousands_sep()
const {
2682 std::string NumSep::do_grouping()
const {
2687 StreamStateRestorer::StreamStateRestorer(std::ostream& s)
2689 , mLocale(s.getloc())
2690 , mPrecision(s.precision())
2693 , mFmtFlags(s.
flags()) {}
2695 StreamStateRestorer::~StreamStateRestorer() {
2700 void StreamStateRestorer::restore() {
2701 mStream.imbue(mLocale);
2702 mStream.precision(mPrecision);
2703 mStream.width(mWidth);
2704 mStream.fill(mFill);
2705 mStream.flags(mFmtFlags);
2708 Number::Number(
int width,
int precision, int64_t value)
2710 , mPrecision(precision)
2711 , mValue(static_cast<double>(value)) {}
2713 Number::Number(
int width,
int precision,
double value)
2715 , mPrecision(precision)
2718 std::ostream& Number::write(std::ostream& os)
const {
2719 StreamStateRestorer restorer(os);
2720 os.imbue(std::locale(os.getloc(),
new NumSep(
',')));
2721 os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
2725 std::string Number::to_s()
const {
2726 std::stringstream ss;
2731 std::string to_s(uint64_t n) {
2734 str +=
static_cast<char>(
'0' +
static_cast<char>(n % 10));
2737 std::reverse(str.begin(), str.end());
2741 std::ostream&
operator<<(std::ostream& os, Number
const& n) {
2745 MarkDownColumn::MarkDownColumn(
int w,
int prec, std::string
const& tit, std::string
const& suff,
double val)
2752 std::string MarkDownColumn::title()
const {
2753 std::stringstream ss;
2754 ss <<
'|' << std::setw(mWidth - 2) << std::right << mTitle <<
' ';
2758 std::string MarkDownColumn::separator()
const {
2759 std::string sep(static_cast<size_t>(mWidth),
'-');
2765 std::string MarkDownColumn::invalid()
const {
2766 std::string sep(static_cast<size_t>(mWidth),
' ');
2768 sep[sep.size() - 2] =
'-';
2772 std::string MarkDownColumn::value()
const {
2773 std::stringstream ss;
2774 auto width = mWidth - 2 -
static_cast<int>(mSuffix.size());
2775 ss <<
'|' << Number(width, mPrecision, mValue) << mSuffix <<
' ';
2780 MarkDownCode::MarkDownCode(std::string
const& what) {
2781 mWhat.reserve(what.size() + 2);
2782 mWhat.push_back(
'`');
2783 for (
char c : what) {
2786 mWhat.push_back(
'`');
2789 mWhat.push_back(
'`');
2792 std::ostream& MarkDownCode::write(std::ostream& os)
const {
2796 std::ostream&
operator<<(std::ostream& os, MarkDownCode
const& mdCode) {
2797 return mdCode.write(os);
// Out-of-line defaulted special members (rule of five). Defining them here
// in the implementation part keeps the class declaration free of inline
// definitions.
2803 Config::Config() =
default;
2804 Config::~Config() =
default;
2805 Config& Config::operator=(Config
const&) =
default;
2806 Config& Config::operator=(Config&&) =
default;
2807 Config::Config(Config
const&) =
default;
2808 Config::Config(Config&&) noexcept = default;
// Converts an enum value to its underlying integral type, like C++23's
// std::to_underlying. Used to index measurement arrays by Result::Measure.
template <typename E>
inline constexpr auto u(E val) noexcept -> typename std::underlying_type<E>::type {
    return static_cast<typename std::underlying_type<E>::type>(val);
}
2826 : mConfig(benchmarkConfig)
2829 void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters
const& pc) {
2833 double dIters = d(iters);
2834 mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);
2836 mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
2837 if (pc.has().pageFaults) {
2838 mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
2840 if (pc.has().cpuCycles) {
2841 mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
2843 if (pc.has().contextSwitches) {
2844 mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
2846 if (pc.has().instructions) {
2847 mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
2849 if (pc.has().branchInstructions) {
2850 double branchInstructions = 0.0;
2852 if (pc.val().branchInstructions > iters + 1U) {
2853 branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
2855 mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);
2857 if (pc.has().branchMisses) {
2859 double branchMisses = d(pc.val().branchMisses);
2860 if (branchMisses > branchInstructions) {
2862 branchMisses = branchInstructions;
2866 branchMisses -= 1.0;
2867 if (branchMisses < 1.0) {
2870 mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
2875 Config
const& Result::config() const noexcept {
// Computes the median of the given samples. Sorts `data` in place.
// For an odd count the middle element is returned; for an even count the
// mean of the two middle elements.
// Fix: an empty vector previously hit undefined behavior
// (data[midIdx - 1U] with midIdx == 0); it now returns 0.0.
inline double calcMedian(std::vector<double>& data) {
    if (data.empty()) {
        return 0.0;
    }
    std::sort(data.begin(), data.end());
    auto midIdx = data.size() / 2U;
    if (1U == (data.size() & 1U)) {
        // odd number of samples: exact middle element
        return data[midIdx];
    }
    // even number of samples: average of the two middle elements
    return (data[midIdx - 1U] + data[midIdx]) / 2U;
}
2892 double Result::median(Measure m)
const {
2894 auto data = mNameToMeasurements[detail::u(m)];
2895 return calcMedian(data);
2898 double Result::average(Measure m)
const {
2900 auto const& data = mNameToMeasurements[detail::u(m)];
2906 return sum(m) / d(data.size());
2909 double Result::medianAbsolutePercentError(Measure m)
const {
2911 auto data = mNameToMeasurements[detail::u(m)];
2915 auto med = calcMedian(data);
2918 for (
auto& x : data) {
2924 return calcMedian(data);
2928 auto const& data = mNameToMeasurements[detail::u(m)];
2929 return std::accumulate(data.begin(), data.end(), 0.0);
2932 double Result::sumProduct(Measure m1, Measure m2)
const noexcept {
2933 auto const& data1 = mNameToMeasurements[detail::u(m1)];
2934 auto const& data2 = mNameToMeasurements[detail::u(m2)];
2936 if (data1.size() != data2.size()) {
2940 double result = 0.0;
2941 for (
size_t i = 0, s = data1.size(); i != s; ++i) {
2942 result += data1[i] * data2[i];
2947 bool Result::has(Measure m)
const noexcept {
2948 return !mNameToMeasurements[detail::u(m)].empty();
2951 double Result::get(
size_t idx, Measure m)
const {
2952 auto const& data = mNameToMeasurements[detail::u(m)];
2953 return data.at(idx);
2956 bool Result::empty() const noexcept {
2957 return 0U == size();
2960 size_t Result::size() const noexcept {
2961 auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
2965 double Result::minimum(Measure m)
const noexcept {
2966 auto const& data = mNameToMeasurements[detail::u(m)];
2972 return *std::min_element(data.begin(), data.end());
2975 double Result::maximum(Measure m)
const noexcept {
2976 auto const& data = mNameToMeasurements[detail::u(m)];
2982 return *std::max_element(data.begin(), data.end());
2985 Result::Measure Result::fromString(std::string
const& str) {
2986 if (str ==
"elapsed") {
2987 return Measure::elapsed;
2988 }
else if (str ==
"iterations") {
2989 return Measure::iterations;
2990 }
else if (str ==
"pagefaults") {
2991 return Measure::pagefaults;
2992 }
else if (str ==
"cpucycles") {
2993 return Measure::cpucycles;
2994 }
else if (str ==
"contextswitches") {
2995 return Measure::contextswitches;
2996 }
else if (str ==
"instructions") {
2997 return Measure::instructions;
2998 }
else if (str ==
"branchinstructions") {
2999 return Measure::branchinstructions;
3000 }
else if (str ==
"branchmisses") {
3001 return Measure::branchmisses;
3004 return Measure::_size;
3010 mConfig.mOut = &std::cout;
3013 Bench::Bench(Bench&&) =
default;
3014 Bench& Bench::operator=(Bench&&) =
default;
3015 Bench::Bench(Bench
const&) =
default;
3016 Bench& Bench::operator=(Bench
const&) =
default;
3017 Bench::~Bench() noexcept = default;
3019 double Bench::batch() const noexcept {
3020 return mConfig.mBatch;
3023 double Bench::complexityN() const noexcept {
3024 return mConfig.mComplexityN;
3029 Bench& Bench::relative(
bool isRelativeEnabled) noexcept {
3030 mConfig.mIsRelative = isRelativeEnabled;
3033 bool Bench::relative() const noexcept {
3034 return mConfig.mIsRelative;
3038 mConfig.mShowPerformanceCounters = showPerformanceCounters;
3042 return mConfig.mShowPerformanceCounters;
3048 Bench& Bench::unit(
char const* u) {
3049 if (u != mConfig.mUnit) {
3056 Bench& Bench::unit(std::string
const& u) {
3057 return unit(u.c_str());
3060 std::string
const& Bench::unit() const noexcept {
3061 return mConfig.mUnit;
3064 Bench& Bench::timeUnit(std::chrono::duration<double>
const& tu, std::string
const& tuName) {
3065 mConfig.mTimeUnit = tu;
3066 mConfig.mTimeUnitName = tuName;
3070 std::string
const& Bench::timeUnitName() const noexcept {
3071 return mConfig.mTimeUnitName;
3074 std::chrono::duration<double>
const& Bench::timeUnit() const noexcept {
3075 return mConfig.mTimeUnit;
3079 Bench& Bench::title(
const char* benchmarkTitle) {
3080 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3083 mConfig.mBenchmarkTitle = benchmarkTitle;
3086 Bench& Bench::title(std::string
const& benchmarkTitle) {
3087 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3090 mConfig.mBenchmarkTitle = benchmarkTitle;
3094 std::string
const& Bench::title() const noexcept {
3095 return mConfig.mBenchmarkTitle;
3099 mConfig.mBenchmarkName = benchmarkName;
3103 Bench&
Bench::name(std::string
const& benchmarkName) {
3104 mConfig.mBenchmarkName = benchmarkName;
3109 return mConfig.mBenchmarkName;
3113 Bench& Bench::epochs(
size_t numEpochs) noexcept {
3114 mConfig.mNumEpochs = numEpochs;
3117 size_t Bench::epochs() const noexcept {
3118 return mConfig.mNumEpochs;
3122 Bench& Bench::clockResolutionMultiple(
size_t multiple) noexcept {
3123 mConfig.mClockResolutionMultiple = multiple;
3126 size_t Bench::clockResolutionMultiple() const noexcept {
3127 return mConfig.mClockResolutionMultiple;
3131 Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
3132 mConfig.mMaxEpochTime = t;
3135 std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
3136 return mConfig.mMaxEpochTime;
3140 Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
3141 mConfig.mMinEpochTime = t;
3144 std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
3145 return mConfig.mMinEpochTime;
3148 Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
3149 mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
3152 uint64_t Bench::minEpochIterations() const noexcept {
3153 return mConfig.mMinEpochIterations;
3156 Bench& Bench::epochIterations(uint64_t numIters) noexcept {
3157 mConfig.mEpochIterations = numIters;
3160 uint64_t Bench::epochIterations() const noexcept {
3161 return mConfig.mEpochIterations;
3164 Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
3165 mConfig.mWarmup = numWarmupIters;
3168 uint64_t Bench::warmup() const noexcept {
3169 return mConfig.mWarmup;
3172 Bench& Bench::config(Config
const& benchmarkConfig) {
3173 mConfig = benchmarkConfig;
3176 Config
const& Bench::config() const noexcept {
3180 Bench& Bench::output(std::ostream* outstream) noexcept {
3181 mConfig.mOut = outstream;
3186 return mConfig.mOut;
3189 std::vector<Result>
const& Bench::results() const noexcept {
3193 Bench&
Bench::render(
char const* templateContent, std::ostream& os) {
3198 Bench&
Bench::render(std::string
const& templateContent, std::ostream& os) {
3203 std::vector<BigO> Bench::complexityBigO()
const {
3204 std::vector<BigO> bigOs;
3205 auto rangeMeasure = BigO::collectRangeMeasure(mResults);
3206 bigOs.emplace_back(
"O(1)", rangeMeasure, [](
double) {
3209 bigOs.emplace_back(
"O(n)", rangeMeasure, [](
double n) {
3212 bigOs.emplace_back(
"O(log n)", rangeMeasure, [](
double n) {
3213 return std::log2(n);
3215 bigOs.emplace_back(
"O(n log n)", rangeMeasure, [](
double n) {
3216 return n * std::log2(n);
3218 bigOs.emplace_back(
"O(n^2)", rangeMeasure, [](
double n) {
3221 bigOs.emplace_back(
"O(n^3)", rangeMeasure, [](
double n) {
3224 std::sort(bigOs.begin(), bigOs.end());
3231 std::random_device rd;
3232 std::uniform_int_distribution<uint64_t> dist;
3236 }
while (mX == 0 && mY == 0);
// One step of Sebastiano Vigna's SplitMix64 generator: advances `state` by
// the golden-gamma constant and returns a bit-mixed output. Used here to
// expand a single 64-bit seed into independent RNG state words.
uint64_t splitMix64(uint64_t& state) noexcept {
    state += UINT64_C(0x9e3779b97f4a7c15);
    uint64_t mixed = state;
    mixed = (mixed ^ (mixed >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
    mixed = (mixed ^ (mixed >> 27U)) * UINT64_C(0x94d049bb133111eb);
    mixed ^= mixed >> 31U;
    return mixed;
}
3248 Rng::Rng(uint64_t seed) noexcept
3249 : mX(splitMix64(seed))
3250 , mY(splitMix64(seed)) {
3251 for (
size_t i = 0; i < 10; ++i) {
3257 Rng::Rng(uint64_t x, uint64_t y) noexcept
3261 Rng Rng::copy() const noexcept {
3265 Rng::Rng(std::vector<uint64_t>
const& data)
3268 if (data.size() != 2) {
3269 throw std::runtime_error(
"ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
3270 detail::fmt::to_s(data.size()));
3276 std::vector<uint64_t> Rng::state()
const {
3277 std::vector<uint64_t> data(2);
3283 BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result>
const& results) {
3284 BigO::RangeMeasure rangeMeasure;
3285 for (
auto const& result : results) {
3286 if (result.config().mComplexityN > 0.0) {
3287 rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
3290 return rangeMeasure;
3293 BigO::BigO(std::string
const& bigOName, RangeMeasure
const& rangeMeasure)
3297 double sumRangeMeasure = 0.0;
3298 double sumRangeRange = 0.0;
3300 for (
size_t i = 0; i < rangeMeasure.size(); ++i) {
3301 sumRangeMeasure += rangeMeasure[i].first * rangeMeasure[i].second;
3302 sumRangeRange += rangeMeasure[i].first * rangeMeasure[i].first;
3304 mConstant = sumRangeMeasure / sumRangeRange;
3308 double sumMeasure = 0.0;
3309 for (
size_t i = 0; i < rangeMeasure.size(); ++i) {
3310 auto diff = mConstant * rangeMeasure[i].first - rangeMeasure[i].second;
3313 sumMeasure += rangeMeasure[i].second;
3316 auto n =
static_cast<double>(rangeMeasure.size());
3317 auto mean = sumMeasure / n;
3318 mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
3321 BigO::BigO(
const char* bigOName, RangeMeasure
const& rangeMeasure)
3322 : BigO(
std::string(bigOName), rangeMeasure) {}
3324 std::string
const&
BigO::name() const noexcept {
3328 double BigO::constant() const noexcept {
3332 double BigO::normalizedRootMeanSquare() const noexcept {
3333 return mNormalizedRootMeanSquare;
3337 return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
3340 std::ostream&
operator<<(std::ostream& os, BigO
const& bigO) {
3341 return os << bigO.constant() <<
" * " << bigO.name() <<
", rms=" << bigO.normalizedRootMeanSquare();
3344 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO>
const& bigOs) {
3345 detail::fmt::StreamStateRestorer restorer(os);
3346 os << std::endl <<
"| coefficient | err% | complexity" << std::endl <<
"|--------------:|-------:|------------" << std::endl;
3347 for (
auto const& bigO : bigOs) {
3348 os <<
"|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() <<
" ";
3349 os <<
"|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) <<
"% ";
3350 os <<
"| " << bigO.name();
3359 #endif // ANKERL_NANOBENCH_IMPLEMENT 3360 #endif // ANKERL_NANOBENCH_H_INCLUDED char const * json() noexcept
Template to generate JSON data.
void moveResultTo(std::vector< Result > &results) noexcept
#define ANKERL_NANOBENCH_LOG(x)
bool operator==(const CNetAddr &a, const CNetAddr &b)
std::ostream & operator<<(std::ostream &os, std::vector< ankerl::nanobench::BigO > const &bigOs)
static RangeMeasure collectRangeMeasure(std::vector< Result > const &results)
void add(std::chrono::nanoseconds elapsed, PerformanceCounters const &pc) noexcept
ANKERL_NANOBENCH(NODISCARD) std ANKERL_NANOBENCH(NODISCARD) double const ant() const noexcept
void render(char const *mustacheTemplate, Bench const &bench, std::ostream &out)
Renders output from a mustache-like template and benchmark results.
double uniform01() noexcept
Provides a random uniform double value between 0 and 1.
char const * htmlBoxplot() noexcept
HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an exampl...
An extremely fast random generator.
State
The various states a (txhash,peer) pair can be in.
std::ostream & operator<<(std::ostream &os, BigO const &bigO)
std::vector< BigO > complexityBigO() const
#define ANKERL_NANOBENCH_NO_SANITIZE(...)
Bench & run(char const *benchmarkName, Op &&op)
Repeatedly calls op() based on the configuration, and performs measurements.
BigO(std::string const &bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
void doNotOptimizeAway(Arg &&arg)
Makes sure none of the given arguments are optimized away by the compiler.
Bench & complexityN(T b) noexcept
std::conditional< std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock >::type Clock
static Measure fromString(std::string const &str)
void render(std::string const &mustacheTemplate, std::vector< Result > const &results, std::ostream &out)
#define ANKERL_NANOBENCH(x)
char const * csv() noexcept
CSV data for the benchmark results.
void shuffle(Container &container) noexcept
Shuffles all entries in the given container.
bool operator<(const CNetAddr &a, const CNetAddr &b)
ANKERL_NANOBENCH(NODISCARD) std Bench & doNotOptimizeAway(Arg &&arg)
Retrieves all benchmark results collected by the bench object so far.
static constexpr uint64_t() max()
BigO(char const *bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
uint64_t result_type
This RNG provides 64bit randomness.
Main entry point to nanobench's benchmarking facility.
double mNormalizedRootMeanSquare
char const * pyperf() noexcept
Output in pyperf compatible JSON format, which can be used for more analyzations. ...
void doNotOptimizeAway(T const &val)
ANKERL_NANOBENCH(NODISCARD) std Bench & batch(T b) noexcept
Sets the batch size.
std::vector< std::pair< double, double > > RangeMeasure
PerformanceCounters & performanceCounters()
static constexpr uint64_t() min()
static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op)