30 #ifndef ANKERL_NANOBENCH_H_INCLUDED
31 #define ANKERL_NANOBENCH_H_INCLUDED
34 #define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35 #define ANKERL_NANOBENCH_VERSION_MINOR 0 // backwards-compatible changes
36 #define ANKERL_NANOBENCH_VERSION_PATCH 0 // backwards-compatible bug fixes
48 #define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
50 #define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
51 #define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
52 #define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
53 #define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
54 #define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
56 #if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
57 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
59 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
62 #if defined(__clang__)
63 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
64 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
65 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
67 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
68 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
72 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
73 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
75 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
76 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
79 #if defined(ANKERL_NANOBENCH_LOG_ENABLED)
81 # define ANKERL_NANOBENCH_LOG(x) std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl
83 # define ANKERL_NANOBENCH_LOG(x)
86 #if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
87 # define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
89 # define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
92 #if defined(__clang__)
93 # define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
95 # define ANKERL_NANOBENCH_NO_SANITIZE(...)
99 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
101 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
106 #if defined(__GNUC__) && __GNUC__ < 5
107 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
109 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
115 namespace nanobench {
117 using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
118 std::chrono::steady_clock>::type;
271 void render(
char const* mustacheTemplate,
Bench const& bench, std::ostream& out);
281 void render(
char const* mustacheTemplate, std::vector<Result>
const& results, std::ostream& out);
284 namespace templates {
295 char const*
csv() noexcept;
318 char const*
json() noexcept;
324 template <
typename T>
330 #if ANKERL_NANOBENCH(PERF_COUNTERS)
331 class LinuxPerformanceCounters;
341 namespace nanobench {
344 template <
typename T>
345 struct PerfCountSet {
359 std::string mBenchmarkTitle =
"benchmark";
360 std::string mBenchmarkName =
"noname";
361 std::string mUnit =
"op";
363 double mComplexityN = -1.0;
364 size_t mNumEpochs = 11;
365 size_t mClockResolutionMultiple =
static_cast<size_t>(1000);
366 std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100);
367 std::chrono::nanoseconds mMinEpochTime{};
368 uint64_t mMinEpochIterations{1};
369 uint64_t mEpochIterations{0};
370 uint64_t mWarmup = 0;
371 std::ostream* mOut =
nullptr;
372 bool mShowPerformanceCounters =
true;
373 bool mIsRelative =
false;
377 Config& operator=(Config
const&);
378 Config& operator=(Config&&);
379 Config(Config
const&);
380 Config(Config&&) noexcept;
400 explicit Result(Config
const& benchmarkConfig);
403 Result& operator=(Result
const&);
404 Result& operator=(Result&&);
406 Result(Result&&) noexcept;
410 void add(
Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
428 static
Measure fromString(
std::
string const& str);
432 std::vector<std::vector<double>> mNameToMeasurements{};
460 static constexpr uint64_t(min)();
461 static constexpr uint64_t(max)();
473 Rng& operator=(
Rng const&) =
delete;
476 Rng(
Rng&&) noexcept = default;
477 Rng& operator=(
Rng&&) noexcept = default;
478 ~
Rng() noexcept = default;
505 explicit
Rng(uint64_t seed) noexcept;
506 Rng(uint64_t x, uint64_t y) noexcept;
520 inline uint64_t operator()() noexcept;
538 inline uint32_t bounded(uint32_t range) noexcept;
549 inline
double uniform01() noexcept;
558 template <typename Container>
559 void shuffle(Container& container) noexcept;
562 static constexpr uint64_t rotl(uint64_t x,
unsigned k) noexcept;
590 Bench(Bench&& other);
591 Bench& operator=(Bench&& other);
592 Bench(Bench
const& other);
593 Bench& operator=(Bench
const& other);
614 template <
typename Op>
616 Bench& run(
char const* benchmarkName, Op&& op);
618 template <
typename Op>
620 Bench& run(std::string
const& benchmarkName, Op&& op);
626 template <
typename Op>
635 Bench& title(
char const* benchmarkTitle);
636 Bench& title(std::string
const& benchmarkTitle);
640 Bench&
name(
char const* benchmarkName);
641 Bench&
name(std::string
const& benchmarkName);
653 template <
typename T>
654 Bench& batch(T b) noexcept;
665 Bench& unit(
char const* unit);
666 Bench& unit(std::string
const& unit);
676 Bench& output(std::ostream* outstream) noexcept;
699 Bench& clockResolutionMultiple(
size_t multiple) noexcept;
717 Bench& epochs(
size_t numEpochs) noexcept;
730 Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
743 Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
756 Bench& minEpochIterations(uint64_t numIters) noexcept;
765 Bench& epochIterations(uint64_t numIters) noexcept;
777 Bench& warmup(uint64_t numWarmupIters) noexcept;
797 Bench& relative(
bool isRelativeEnabled) noexcept;
828 template <
typename Arg>
845 template <
typename T>
846 Bench& complexityN(T b) noexcept;
880 std::vector<BigO> complexityBigO()
const;
905 template <
typename Op>
906 BigO complexityBigO(
char const* name, Op op)
const;
908 template <
typename Op>
909 BigO complexityBigO(std::string
const& name, Op op)
const;
918 Bench&
render(
char const* templateContent, std::ostream& os);
920 Bench& config(Config
const& benchmarkConfig);
925 std::vector<Result> mResults{};
935 template <typename Arg>
940 #if defined(_MSC_VER)
941 void doNotOptimizeAwaySink(
void const*);
943 template <
typename T>
949 template <
typename T>
951 using Decayed =
typename std::decay<T>::type;
955 template <
typename T>
956 typename std::enable_if<!doNotOptimizeNeedsIndirect<T>()>::type
doNotOptimizeAway(T
const& val) {
958 asm volatile(
"" ::
"r"(val));
961 template <
typename T>
962 typename std::enable_if<doNotOptimizeNeedsIndirect<T>()>::type
doNotOptimizeAway(T
const& val) {
964 asm volatile(
"" ::
"m"(val) :
"memory");
974 explicit IterationLogic(
Bench const& config) noexcept;
979 void moveResultTo(std::vector<Result>& results) noexcept;
990 PerformanceCounters(PerformanceCounters
const&) =
delete;
991 PerformanceCounters& operator=(PerformanceCounters
const&) =
delete;
993 PerformanceCounters();
994 ~PerformanceCounters();
998 void updateResults(uint64_t numIters);
1004 #if ANKERL_NANOBENCH(PERF_COUNTERS)
1005 LinuxPerformanceCounters* mPc =
nullptr;
1021 template <
typename Op>
1023 for (
auto& rangeMeasure : data) {
1024 rangeMeasure.first = op(rangeMeasure.first);
1029 static RangeMeasure collectRangeMeasure(std::vector<Result>
const& results);
1031 template <
typename Op>
1033 : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1035 template <
typename Op>
1037 : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1039 BigO(
char const* bigOName, RangeMeasure
const& scaledRangeMeasure);
1040 BigO(std::string
const& bigOName, RangeMeasure
const& scaledRangeMeasure);
1049 double mNormalizedRootMeanSquare{};
1051 std::ostream&
operator<<(std::ostream& os, BigO
const& bigO);
1052 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO>
const& bigOs);
1060 namespace nanobench {
1067 return (std::numeric_limits<uint64_t>::max)();
1071 uint64_t
Rng::operator()() noexcept {
1074 mX = UINT64_C(15241094284759029579) * mY;
1075 mY = rotl(mY - x, 27);
1081 uint32_t
Rng::bounded(uint32_t range) noexcept {
1082 uint64_t r32 =
static_cast<uint32_t
>(operator()());
1083 auto multiresult = r32 * range;
1084 return static_cast<uint32_t
>(multiresult >> 32U);
1088 auto i = (UINT64_C(0x3ff) << 52U) | (
operator()() >> 12U);
1096 template <
typename Container>
1098 auto size =
static_cast<uint32_t
>(container.size());
1099 for (
auto i = size; i > 1U; --i) {
1101 auto p = bounded(i);
1102 swap(container[i - 1], container[p]);
1106 constexpr uint64_t
Rng::rotl(uint64_t x,
unsigned k) noexcept {
1107 return (x << k) | (x >> (64U - k));
1110 template <
typename Op>
1117 while (
auto n = iterationLogic.numIters()) {
1119 Clock::time_point before = Clock::now();
1123 Clock::time_point after = Clock::now();
1125 pc.updateResults(iterationLogic.numIters());
1126 iterationLogic.
add(after - before, pc);
1133 template <
typename Op>
1135 name(benchmarkName);
1136 return run(std::forward<Op>(op));
1139 template <
typename Op>
1141 name(benchmarkName);
1142 return run(std::forward<Op>(op));
1145 template <
typename Op>
1150 template <
typename Op>
1157 template <
typename T>
1159 mConfig.mBatch =
static_cast<double>(b);
1164 template <
typename T>
1166 mConfig.mComplexityN =
static_cast<double>(n);
1171 template <
typename Arg>
1178 template <
typename Arg>
1185 #if defined(_MSC_VER)
1186 template <
typename T>
1188 doNotOptimizeAwaySink(&val);
1197 #if defined(ANKERL_NANOBENCH_IMPLEMENT)
1203 # include <algorithm>
1209 # include <iostream>
1213 # include <stdexcept>
1215 # if defined(__linux__)
1216 # include <unistd.h>
1218 # if ANKERL_NANOBENCH(PERF_COUNTERS)
1221 # include <linux/perf_event.h>
1222 # include <sys/ioctl.h>
1223 # include <sys/syscall.h>
1224 # include <unistd.h>
1230 namespace nanobench {
1241 class StreamStateRestorer;
1243 class MarkDownColumn;
1254 namespace nanobench {
1256 uint64_t splitMix64(uint64_t& state) noexcept;
1261 template <
typename T>
1262 inline double d(T t) noexcept {
1263 return static_cast<double>(t);
1265 inline double d(Clock::duration duration) noexcept {
1266 return std::chrono::duration_cast<std::chrono::duration<double>>(duration).
count();
1270 inline Clock::duration clockResolution() noexcept;
1274 namespace templates {
1276 char const*
csv() noexcept {
1277 return R
"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
1278 {{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
1283 return R
"DELIM(<html>
1286 <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
1290 <div id="myDiv"></div>
1295 y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
1299 var title = '{{title}}';
1301 data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
1302 var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
1309 char const*
json() noexcept {
1313 "title": "{{title}}",
1317 "complexityN": {{complexityN}},
1318 "epochs": {{epochs}},
1319 "clockResolution": {{clockResolution}},
1320 "clockResolutionMultiple": {{clockResolutionMultiple}},
1321 "maxEpochTime": {{maxEpochTime}},
1322 "minEpochTime": {{minEpochTime}},
1323 "minEpochIterations": {{minEpochIterations}},
1324 "epochIterations": {{epochIterations}},
1325 "warmup": {{warmup}},
1326 "relative": {{relative}},
1327 "median(elapsed)": {{median(elapsed)}},
1328 "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
1329 "median(instructions)": {{median(instructions)}},
1330 "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
1331 "median(cpucycles)": {{median(cpucycles)}},
1332 "median(contextswitches)": {{median(contextswitches)}},
1333 "median(pagefaults)": {{median(pagefaults)}},
1334 "median(branchinstructions)": {{median(branchinstructions)}},
1335 "median(branchmisses)": {{median(branchmisses)}},
1336 "totalTime": {{sumProduct(iterations, elapsed)}},
1339 "iterations": {{iterations}},
1340 "elapsed": {{elapsed}},
1341 "pagefaults": {{pagefaults}},
1342 "cpucycles": {{cpucycles}},
1343 "contextswitches": {{contextswitches}},
1344 "instructions": {{instructions}},
1345 "branchinstructions": {{branchinstructions}},
1346 "branchmisses": {{branchmisses}}
1347 }{{^-last}},{{/-last}}
1349 }{{^-last}},{{/-last}}
// Kinds of nodes produced by the mustache-template parser (see
// parseMustacheTemplate below): literal text (content), a {{tag}},
// a {{#section}}, and a {{^inverted_section}}.
1356 enum class Type { tag, content, section, inverted_section };
1360 std::vector<Node> children;
1365 bool operator==(
char const (&str)[N])
const noexcept {
1366 return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
1371 static
std::vector<Node> parseMustacheTemplate(
char const** tpl) {
1372 std::vector<Node> nodes;
1375 auto begin = std::strstr(*tpl,
"{{");
1377 if (begin !=
nullptr) {
1379 end = std::strstr(begin,
"}}");
1382 if (begin ==
nullptr || end ==
nullptr) {
1384 nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
1388 nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
1398 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
1402 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
1406 nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
1412 static bool generateFirstLast(Node
const& n,
size_t idx,
size_t size, std::ostream& out) {
1413 bool matchFirst = n ==
"-first";
1414 bool matchLast = n ==
"-last";
1415 if (!matchFirst && !matchLast) {
1419 bool doWrite =
false;
1420 if (n.type == Node::Type::section) {
1421 doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1422 }
else if (n.type == Node::Type::inverted_section) {
1423 doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1427 for (
auto const& child : n.children) {
1428 if (child.type == Node::Type::content) {
1429 out.write(child.begin, std::distance(child.begin, child.end));
1436 static bool matchCmdArgs(std::string
const& str, std::vector<std::string>& matchResult) {
1437 matchResult.clear();
1438 auto idxOpen = str.find(
'(');
1439 auto idxClose = str.find(
')', idxOpen);
1440 if (idxClose == std::string::npos) {
1444 matchResult.emplace_back(str.substr(0, idxOpen));
1447 matchResult.emplace_back(std::string{});
1448 for (
size_t i = idxOpen + 1; i != idxClose; ++i) {
1449 if (str[i] ==
' ' || str[i] ==
'\t') {
1453 if (str[i] ==
',') {
1455 matchResult.emplace_back(std::string{});
1459 matchResult.back() += str[i];
1464 static bool generateConfigTag(Node
const& n, Config
const& config, std::ostream& out) {
1468 out << config.mBenchmarkTitle;
1470 }
else if (n ==
"name") {
1471 out << config.mBenchmarkName;
1473 }
else if (n ==
"unit") {
1474 out << config.mUnit;
1476 }
else if (n ==
"batch") {
1477 out << config.mBatch;
1479 }
else if (n ==
"complexityN") {
1480 out << config.mComplexityN;
1482 }
else if (n ==
"epochs") {
1483 out << config.mNumEpochs;
1485 }
else if (n ==
"clockResolution") {
1486 out << d(detail::clockResolution());
1488 }
else if (n ==
"clockResolutionMultiple") {
1489 out << config.mClockResolutionMultiple;
1491 }
else if (n ==
"maxEpochTime") {
1492 out << d(config.mMaxEpochTime);
1494 }
else if (n ==
"minEpochTime") {
1495 out << d(config.mMinEpochTime);
1497 }
else if (n ==
"minEpochIterations") {
1498 out << config.mMinEpochIterations;
1500 }
else if (n ==
"epochIterations") {
1501 out << config.mEpochIterations;
1503 }
else if (n ==
"warmup") {
1504 out << config.mWarmup;
1506 }
else if (n ==
"relative") {
1507 out << config.mIsRelative;
1513 static std::ostream& generateResultTag(Node
const& n,
Result const& r, std::ostream& out) {
1514 if (generateConfigTag(n, r.config(), out)) {
1522 std::vector<std::string> matchResult;
1523 if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1524 if (matchResult.size() == 2) {
1530 if (matchResult[0] ==
"median") {
1531 return out << r.median(m);
1533 if (matchResult[0] ==
"average") {
1534 return out << r.average(m);
1536 if (matchResult[0] ==
"medianAbsolutePercentError") {
1537 return out << r.medianAbsolutePercentError(m);
1539 if (matchResult[0] ==
"sum") {
1540 return out << r.sum(m);
1542 if (matchResult[0] ==
"minimum") {
1543 return out << r.minimum(m);
1545 if (matchResult[0] ==
"maximum") {
1546 return out << r.maximum(m);
1548 }
else if (matchResult.size() == 3) {
1555 if (matchResult[0] ==
"sumProduct") {
1556 return out << r.sumProduct(m1, m2);
1565 throw std::runtime_error(
"command '" + std::string(n.begin, n.end) +
"' not understood");
1568 static void generateResultMeasurement(std::vector<Node>
const& nodes,
size_t idx,
Result const& r, std::ostream& out) {
1569 for (
auto const& n : nodes) {
1570 if (!generateFirstLast(n, idx, r.size(), out)) {
1573 case Node::Type::content:
1574 out.write(n.begin, std::distance(n.begin, n.end));
1577 case Node::Type::inverted_section:
1578 throw std::runtime_error(
"got a inverted section inside measurement");
1580 case Node::Type::section:
1581 throw std::runtime_error(
"got a section inside measurement");
1583 case Node::Type::tag: {
1588 out << r.get(idx, m);
1597 static void generateResult(std::vector<Node>
const& nodes,
size_t idx, std::vector<Result>
const& results, std::ostream& out) {
1598 auto const& r = results[idx];
1599 for (
auto const& n : nodes) {
1600 if (!generateFirstLast(n, idx, results.size(), out)) {
1603 case Node::Type::content:
1604 out.write(n.begin, std::distance(n.begin, n.end));
1607 case Node::Type::inverted_section:
1608 throw std::runtime_error(
"got a inverted section inside result");
1610 case Node::Type::section:
1611 if (n ==
"measurement") {
1612 for (
size_t i = 0; i < r.size(); ++i) {
1613 generateResultMeasurement(n.children, i, r, out);
1616 throw std::runtime_error(
"got a section inside result");
1620 case Node::Type::tag:
1621 generateResultTag(n, r, out);
1633 char const* getEnv(
char const*
name);
1634 bool isEndlessRunning(std::string
const&
name);
1636 template <
typename T>
1637 T parseFile(std::string
const& filename);
1639 void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
1640 void printStabilityInformationOnce(std::ostream* os);
1643 uint64_t& singletonHeaderHash() noexcept;
1646 Clock::duration calcClockResolution(
size_t numEvaluations) noexcept;
1653 class NumSep : public
std::numpunct<
char> {
1655 explicit NumSep(
char sep);
1656 char do_thousands_sep()
const override;
1657 std::string do_grouping()
const override;
1666 class StreamStateRestorer {
1668 explicit StreamStateRestorer(std::ostream& s);
1669 ~StreamStateRestorer();
1675 StreamStateRestorer(StreamStateRestorer
const&) =
delete;
1676 StreamStateRestorer& operator=(StreamStateRestorer
const&) =
delete;
1677 StreamStateRestorer(StreamStateRestorer&&) =
delete;
1678 StreamStateRestorer& operator=(StreamStateRestorer&&) =
delete;
1681 std::ostream& mStream;
1682 std::locale mLocale;
1683 std::streamsize
const mPrecision;
1684 std::streamsize
const mWidth;
1685 std::ostream::char_type
const mFill;
1686 std::ostream::fmtflags
const mFmtFlags;
1693 Number(
int width,
int precision,
double value);
1694 Number(
int width,
int precision, int64_t value);
1695 std::string to_s()
const;
1698 friend std::ostream&
operator<<(std::ostream& os, Number
const& n);
1699 std::ostream& write(std::ostream& os)
const;
1707 std::string to_s(uint64_t s);
1709 std::ostream&
operator<<(std::ostream& os, Number
const& n);
1711 class MarkDownColumn {
1713 MarkDownColumn(
int w,
int prec, std::string
const& tit, std::string
const& suff,
double val);
1714 std::string title()
const;
1715 std::string separator()
const;
1716 std::string invalid()
const;
1717 std::string value()
const;
1723 std::string mSuffix;
1728 class MarkDownCode {
1730 explicit MarkDownCode(std::string
const& what);
1733 friend std::ostream&
operator<<(std::ostream& os, MarkDownCode
const& mdCode);
1734 std::ostream& write(std::ostream& os)
const;
1736 std::string mWhat{};
1739 std::ostream&
operator<<(std::ostream& os, MarkDownCode
const& mdCode);
1749 namespace nanobench {
1751 void render(
char const* mustacheTemplate, std::vector<Result>
const& results, std::ostream& out) {
1752 detail::fmt::StreamStateRestorer restorer(out);
1754 out.precision(std::numeric_limits<double>::digits10);
1755 auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1757 for (
auto const& n : nodes) {
1760 case templates::Node::Type::content:
1761 out.write(n.begin, std::distance(n.begin, n.end));
1764 case templates::Node::Type::inverted_section:
1765 throw std::runtime_error(
"unknown list '" + std::string(n.begin, n.end) +
"'");
1767 case templates::Node::Type::section:
1768 if (n ==
"result") {
1769 const size_t nbResults = results.size();
1770 for (
size_t i = 0; i < nbResults; ++i) {
1771 generateResult(n.children, i, results, out);
1774 throw std::runtime_error(
"unknown section '" + std::string(n.begin, n.end) +
"'");
1778 case templates::Node::Type::tag:
1780 if (!generateConfigTag(n, results.back().config(), out)) {
1781 throw std::runtime_error(
"unknown tag '" + std::string(n.begin, n.end) +
"'");
1788 void render(
char const* mustacheTemplate,
const Bench& bench, std::ostream& out) {
1789 render(mustacheTemplate, bench.results(), out);
1795 # if defined(__clang__)
1796 # pragma clang diagnostic push
1797 # pragma clang diagnostic ignored "-Wexit-time-destructors"
1799 static PerformanceCounters pc;
1800 # if defined(__clang__)
1801 # pragma clang diagnostic pop
1810 # if defined(_MSC_VER)
1811 # pragma optimize("", off)
// MSVC-only out-of-line sink: intentionally empty. It sits between
// `#pragma optimize("", off)` / `#pragma optimize("", on)` so the optimizer
// cannot see through the call; passing a value's address here keeps that
// value observably "used" and prevents it from being optimized away.
1812 void doNotOptimizeAwaySink(
void const*) {}
1813 # pragma optimize("", on)
1816 template <
typename T>
1817 T parseFile(std::string
const& filename) {
1824 char const* getEnv(
char const*
name) {
1825 # if defined(_MSC_VER)
1826 # pragma warning(push)
1827 # pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.
1829 return std::getenv(name);
1830 # if defined(_MSC_VER)
1831 # pragma warning(pop)
1835 bool isEndlessRunning(std::string
const&
name) {
1836 auto endless = getEnv(
"NANOBENCH_ENDLESS");
1837 return nullptr != endless && endless ==
name;
1840 void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
1842 recommendations.clear();
1844 bool recommendCheckFlags =
false;
1847 warnings.emplace_back(
"DEBUG defined");
1848 recommendCheckFlags =
true;
1851 bool recommendPyPerf =
false;
1852 # if defined(__linux__)
1853 auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
1855 warnings.emplace_back(
"couldn't figure out number of processors - no governor, turbo check possible");
1859 for (
long id = 0;
id < nprocs; ++id) {
1860 auto idStr = detail::fmt::to_s(static_cast<uint64_t>(
id));
1861 auto sysCpu =
"/sys/devices/system/cpu/cpu" + idStr;
1862 auto minFreq = parseFile<int64_t>(sysCpu +
"/cpufreq/scaling_min_freq");
1863 auto maxFreq = parseFile<int64_t>(sysCpu +
"/cpufreq/scaling_max_freq");
1864 if (minFreq != maxFreq) {
1865 auto minMHz =
static_cast<double>(minFreq) / 1000.0;
1866 auto maxMHz =
static_cast<double>(maxFreq) / 1000.0;
1867 warnings.emplace_back(
"CPU frequency scaling enabled: CPU " + idStr +
" between " +
1868 detail::fmt::Number(1, 1, minMHz).to_s() +
" and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
1870 recommendPyPerf =
true;
1875 auto currentGovernor = parseFile<std::string>(
"/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor");
1876 if (
"performance" != currentGovernor) {
1877 warnings.emplace_back(
"CPU governor is '" + currentGovernor +
"' but should be 'performance'");
1878 recommendPyPerf =
true;
1881 if (0 == parseFile<int>(
"/sys/devices/system/cpu/intel_pstate/no_turbo")) {
1882 warnings.emplace_back(
"Turbo is enabled, CPU frequency will fluctuate");
1883 recommendPyPerf =
true;
1888 if (recommendCheckFlags) {
1889 recommendations.emplace_back(
"Make sure you compile for Release");
1891 if (recommendPyPerf) {
1892 recommendations.emplace_back(
"Use 'pyperf system tune' before benchmarking. See https://github.com/vstinner/pyperf");
1896 void printStabilityInformationOnce(std::ostream* outStream) {
1897 static bool shouldPrint =
true;
1898 if (shouldPrint && outStream) {
1899 auto& os = *outStream;
1900 shouldPrint =
false;
1901 std::vector<std::string> warnings;
1902 std::vector<std::string> recommendations;
1903 gatherStabilityInformation(warnings, recommendations);
1904 if (warnings.empty()) {
1908 os <<
"Warning, results might be unstable:" << std::endl;
1909 for (
auto const& w : warnings) {
1910 os <<
"* " << w << std::endl;
1913 os << std::endl <<
"Recommendations" << std::endl;
1914 for (
auto const& r : recommendations) {
1915 os <<
"* " << r << std::endl;
1921 uint64_t& singletonHeaderHash() noexcept {
1922 static uint64_t sHeaderHash{};
1927 inline uint64_t fnv1a(
std::
string const& str) noexcept {
1928 auto val = UINT64_C(14695981039346656037);
1929 for (
auto c : str) {
1930 val = (val ^
static_cast<uint8_t
>(c)) * UINT64_C(1099511628211);
1936 inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
1937 return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
1941 Clock::duration calcClockResolution(
size_t numEvaluations) noexcept {
1942 auto bestDuration = Clock::duration::max();
1943 Clock::time_point tBegin;
1944 Clock::time_point tEnd;
1945 for (
size_t i = 0; i < numEvaluations; ++i) {
1946 tBegin = Clock::now();
1948 tEnd = Clock::now();
1949 }
while (tBegin == tEnd);
1950 bestDuration = (std::min)(bestDuration, tEnd - tBegin);
1952 return bestDuration;
1956 Clock::duration clockResolution() noexcept {
1957 static Clock::duration sResolution = calcClockResolution(20);
1962 struct IterationLogic::Impl {
1963 enum class State { warmup, upscaling_runtime, measuring, endless };
1965 explicit Impl(Bench
const& bench)
1967 , mResult(bench.config()) {
1968 printStabilityInformationOnce(mBench.output());
1971 mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
1972 if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
1973 mTargetRuntimePerEpoch = mBench.maxEpochTime();
1975 if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
1976 mTargetRuntimePerEpoch = mBench.minEpochTime();
1979 if (isEndlessRunning(mBench.name())) {
1980 std::cerr <<
"NANOBENCH_ENDLESS set: running '" << mBench.name() <<
"' endlessly" << std::endl;
1981 mNumIters = (std::numeric_limits<uint64_t>::max)();
1982 mState = State::endless;
1983 }
else if (0 != mBench.warmup()) {
1984 mNumIters = mBench.warmup();
1985 mState = State::warmup;
1986 }
else if (0 != mBench.epochIterations()) {
1988 mNumIters = mBench.epochIterations();
1989 mState = State::measuring;
1991 mNumIters = mBench.minEpochIterations();
1992 mState = State::upscaling_runtime;
1998 auto doubleElapsed = d(elapsed);
1999 auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
2000 auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);
2002 auto doubleMinEpochIters = d(mBench.minEpochIterations());
2003 if (doubleNewIters < doubleMinEpochIters) {
2004 doubleNewIters = doubleMinEpochIters;
2006 doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();
2010 return static_cast<uint64_t
>(doubleNewIters + 0.5);
2014 if (elapsed * 10 < mTargetRuntimePerEpoch) {
2016 if (mNumIters * 10 < mNumIters) {
2018 showResult(
"iterations overflow. Maybe your code got optimized away?");
2024 mNumIters = calcBestNumIters(elapsed, mNumIters);
2028 void add(std::chrono::nanoseconds elapsed, PerformanceCounters
const& pc) noexcept {
2029 # if defined(ANKERL_NANOBENCH_LOG_ENABLED)
2030 auto oldIters = mNumIters;
2035 if (isCloseEnoughForMeasurements(elapsed)) {
2038 mState = State::measuring;
2039 mNumIters = calcBestNumIters(elapsed, mNumIters);
2042 mState = State::upscaling_runtime;
2047 case State::upscaling_runtime:
2048 if (isCloseEnoughForMeasurements(elapsed)) {
2050 mState = State::measuring;
2051 mTotalElapsed += elapsed;
2052 mTotalNumIters += mNumIters;
2053 mResult.add(elapsed, mNumIters, pc);
2054 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2060 case State::measuring:
2063 mTotalElapsed += elapsed;
2064 mTotalNumIters += mNumIters;
2065 mResult.add(elapsed, mNumIters, pc);
2066 if (0 != mBench.epochIterations()) {
2067 mNumIters = mBench.epochIterations();
2069 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2073 case State::endless:
2074 mNumIters = (std::numeric_limits<uint64_t>::max)();
2078 if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
2084 ANKERL_NANOBENCH_LOG(mBench.name() <<
": " << detail::fmt::Number(20, 3, static_cast<double>(elapsed.count())) <<
" elapsed, "
2085 << detail::fmt::Number(20, 3, static_cast<double>(mTargetRuntimePerEpoch.count()))
2086 <<
" target. oldIters=" << oldIters <<
", mNumIters=" << mNumIters
2087 <<
", mState=" <<
static_cast<int>(mState));
2090 void showResult(std::string
const& errorMessage)
const {
2093 if (mBench.output() !=
nullptr) {
2095 std::vector<fmt::MarkDownColumn> columns;
2099 if (mBench.relative()) {
2101 if (!mBench.results().empty()) {
2104 columns.emplace_back(11, 1,
"relative",
"%", d);
2107 if (mBench.complexityN() > 0) {
2108 columns.emplace_back(14, 0,
"complexityN",
"", mBench.complexityN());
2111 columns.emplace_back(22, 2,
"ns/" + mBench.unit(),
"", 1e9 * rMedian / mBench.batch());
2112 columns.emplace_back(22, 2, mBench.unit() +
"/s",
"", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);
2115 columns.emplace_back(10, 1,
"err%",
"%", rErrorMedian * 100.0);
2117 double rInsMedian = -1.0;
2120 columns.emplace_back(18, 2,
"ins/" + mBench.unit(),
"", rInsMedian / mBench.batch());
2123 double rCycMedian = -1.0;
2126 columns.emplace_back(18, 2,
"cyc/" + mBench.unit(),
"", rCycMedian / mBench.batch());
2128 if (rInsMedian > 0.0 && rCycMedian > 0.0) {
2129 columns.emplace_back(9, 3,
"IPC",
"", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
2133 columns.emplace_back(17, 2,
"bra/" + mBench.unit(),
"", rBraMedian / mBench.batch());
2136 if (rBraMedian >= 1e-9) {
2139 columns.emplace_back(10, 1,
"miss%",
"%", p);
2146 auto& os = *mBench.output();
2149 hash = hash_combine(fnv1a(mBench.unit()), hash);
2150 hash = hash_combine(fnv1a(mBench.title()), hash);
2151 hash = hash_combine(mBench.relative(), hash);
2152 hash = hash_combine(mBench.performanceCounters(), hash);
2154 if (hash != singletonHeaderHash()) {
2155 singletonHeaderHash() = hash;
2159 for (
auto const& col : columns) {
2162 os <<
"| " << mBench.title() << std::endl;
2164 for (
auto const& col : columns) {
2165 os << col.separator();
2167 os <<
"|:" << std::string(mBench.title().size() + 1U,
'-') << std::endl;
2170 if (!errorMessage.empty()) {
2171 for (
auto const& col : columns) {
2172 os << col.invalid();
2174 os <<
"| :boom: " << fmt::MarkDownCode(mBench.name()) <<
" (" << errorMessage <<
')' << std::endl;
2176 for (
auto const& col : columns) {
2180 auto showUnstable = rErrorMedian >= 0.05;
2182 os <<
":wavy_dash: ";
2184 os << fmt::MarkDownCode(mBench.name());
2186 auto avgIters =
static_cast<double>(mTotalNumIters) / static_cast<double>(mBench.epochs());
2188 auto suggestedIters =
static_cast<uint64_t
>(avgIters * 10 + 0.5);
2190 os <<
" (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
2191 <<
" iters. Increase `minEpochIterations` to e.g. " << suggestedIters <<
")";
2199 return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
2202 uint64_t mNumIters = 1;
2203 Bench
const& mBench;
2204 std::chrono::nanoseconds mTargetRuntimePerEpoch{};
2207 std::chrono::nanoseconds mTotalElapsed{};
2208 uint64_t mTotalNumIters = 0;
2210 State mState = State::upscaling_runtime;
2214 IterationLogic::IterationLogic(Bench const& bench) noexcept
2215 : mPimpl(new Impl(bench)) {}
2217 IterationLogic::~IterationLogic() {
2223 uint64_t IterationLogic::numIters() const noexcept {
2225 return mPimpl->mNumIters;
2228 void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters
const& pc) noexcept {
2229 mPimpl->add(elapsed, pc);
2232 void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
2233 results.emplace_back(std::move(mPimpl->mResult));
2236 # if ANKERL_NANOBENCH(PERF_COUNTERS)
2239 class LinuxPerformanceCounters {
2242 Target(uint64_t* targetValue_,
bool correctMeasuringOverhead_,
bool correctLoopOverhead_)
2243 : targetValue(targetValue_)
2244 , correctMeasuringOverhead(correctMeasuringOverhead_)
2245 , correctLoopOverhead(correctLoopOverhead_) {}
2247 uint64_t* targetValue{};
2248 bool correctMeasuringOverhead{};
2249 bool correctLoopOverhead{};
2252 ~LinuxPerformanceCounters();
2255 inline void start() {}
2257 inline void stop() {}
2259 bool monitor(perf_sw_ids swId, Target target);
2260 bool monitor(perf_hw_id hwId, Target target);
2262 bool hasError() const noexcept {
2268 inline void beginMeasure() {
2274 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
2280 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
2283 inline void endMeasure() {
2289 mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
2294 auto const numBytes =
sizeof(uint64_t) * mCounters.size();
2295 auto ret = read(mFd, mCounters.data(), numBytes);
2296 mHasError = ret !=
static_cast<ssize_t
>(numBytes);
2299 void updateResults(uint64_t numIters);
2302 template <
typename T>
2303 static inline T divRounded(T a, T divisor) {
2304 return (a + divisor / 2) / divisor;
2307 template <
typename Op>
2309 void calibrate(Op&& op) {
2311 for (
auto& v : mCalibratedOverhead) {
2316 auto newCalibration = mCalibratedOverhead;
2317 for (
auto& v : newCalibration) {
2318 v = (std::numeric_limits<uint64_t>::max)();
2320 for (
size_t iter = 0; iter < 100; ++iter) {
2328 for (
size_t i = 0; i < newCalibration.size(); ++i) {
2329 auto diff = mCounters[i];
2330 if (newCalibration[i] > diff) {
2331 newCalibration[i] = diff;
2336 mCalibratedOverhead = std::move(newCalibration);
2343 uint64_t
const numIters = 100000U + (std::random_device{}() & 3);
2344 uint64_t n = numIters;
2345 uint32_t x = 1234567;
2358 auto measure1 = mCounters;
2369 auto measure2 = mCounters;
2371 for (
size_t i = 0; i < mCounters.size(); ++i) {
2373 auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
2374 auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
2375 auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;
2377 mLoopOverhead[i] = divRounded(overhead, numIters);
2383 bool monitor(uint32_t type, uint64_t eventid, Target target);
2385 std::map<uint64_t, Target> mIdToTarget{};
2388 std::vector<uint64_t> mCounters{3};
2389 std::vector<uint64_t> mCalibratedOverhead{3};
2390 std::vector<uint64_t> mLoopOverhead{3};
2392 uint64_t mTimeEnabledNanos = 0;
2393 uint64_t mTimeRunningNanos = 0;
2395 bool mHasError =
false;
2399 LinuxPerformanceCounters::~LinuxPerformanceCounters() {
2405 bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
2406 return monitor(PERF_TYPE_SOFTWARE, swId, target);
2409 bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
2410 return monitor(PERF_TYPE_HARDWARE, hwId, target);
2415 void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
2417 for (
auto& id_value : mIdToTarget) {
2418 *id_value.second.targetValue = UINT64_C(0);
2425 mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
2426 mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];
2428 for (uint64_t i = 0; i < mCounters[0]; ++i) {
2429 auto idx =
static_cast<size_t>(3 + i * 2 + 0);
2430 auto id = mCounters[idx + 1U];
2432 auto it = mIdToTarget.find(
id);
2433 if (
it != mIdToTarget.end()) {
2435 auto& tgt =
it->second;
2436 *tgt.targetValue = mCounters[idx];
2437 if (tgt.correctMeasuringOverhead) {
2438 if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
2439 *tgt.targetValue -= mCalibratedOverhead[idx];
2441 *tgt.targetValue = 0U;
2444 if (tgt.correctLoopOverhead) {
2445 auto correctionVal = mLoopOverhead[idx] * numIters;
2446 if (*tgt.targetValue >= correctionVal) {
2447 *tgt.targetValue -= correctionVal;
2449 *tgt.targetValue = 0U;
2456 bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
2457 *target.targetValue = (std::numeric_limits<uint64_t>::max)();
2462 auto pea = perf_event_attr();
2463 std::memset(&pea, 0,
sizeof(perf_event_attr));
2465 pea.size =
sizeof(perf_event_attr);
2466 pea.config = eventid;
2468 pea.exclude_kernel = 1;
2472 pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
2476 # if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
2477 const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
2479 const unsigned long flags = 0;
2482 auto fd =
static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
2492 if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &
id)) {
2498 mIdToTarget.emplace(
id, target);
2501 auto size = 3 + 2 * mIdToTarget.size();
2502 mCounters.resize(size);
2503 mCalibratedOverhead.resize(size);
2504 mLoopOverhead.resize(size);
2509 PerformanceCounters::PerformanceCounters()
2510 : mPc(new LinuxPerformanceCounters())
2514 mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults,
true,
false));
2515 mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles,
true,
false));
2516 mHas.contextSwitches =
2517 mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches,
true,
false));
2518 mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions,
true,
true));
2519 mHas.branchInstructions =
2520 mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions,
true,
false));
2521 mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses,
true,
false));
2526 auto before = ankerl::nanobench::Clock::now();
2527 auto after = ankerl::nanobench::Clock::now();
2532 if (mPc->hasError()) {
2534 mHas = PerfCountSet<bool>{};
2538 PerformanceCounters::~PerformanceCounters() {
2539 if (
nullptr != mPc) {
2544 void PerformanceCounters::beginMeasure() {
2545 mPc->beginMeasure();
2548 void PerformanceCounters::endMeasure() {
2552 void PerformanceCounters::updateResults(uint64_t numIters) {
2553 mPc->updateResults(numIters);
2558 PerformanceCounters::PerformanceCounters() =
default;
2559 PerformanceCounters::~PerformanceCounters() =
default;
2560 void PerformanceCounters::beginMeasure() {}
2561 void PerformanceCounters::endMeasure() {}
2562 void PerformanceCounters::updateResults(uint64_t) {}
2577 NumSep::NumSep(
char sep)
2580 char NumSep::do_thousands_sep()
const {
2584 std::string NumSep::do_grouping()
const {
2589 StreamStateRestorer::StreamStateRestorer(std::ostream& s)
2591 , mLocale(s.getloc())
2592 , mPrecision(s.precision())
2595 , mFmtFlags(s.flags()) {}
2597 StreamStateRestorer::~StreamStateRestorer() {
2602 void StreamStateRestorer::restore() {
2603 mStream.imbue(mLocale);
2604 mStream.precision(mPrecision);
2605 mStream.width(mWidth);
2606 mStream.fill(mFill);
2607 mStream.flags(mFmtFlags);
2610 Number::Number(
int width,
int precision, int64_t value)
2612 , mPrecision(precision)
2613 , mValue(static_cast<double>(value)) {}
2615 Number::Number(
int width,
int precision,
double value)
2617 , mPrecision(precision)
2620 std::ostream& Number::write(std::ostream& os)
const {
2621 StreamStateRestorer restorer(os);
2622 os.imbue(std::locale(os.getloc(),
new NumSep(
',')));
2623 os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
2627 std::string Number::to_s()
const {
2628 std::stringstream ss;
2633 std::string to_s(uint64_t n) {
2636 str +=
static_cast<char>(
'0' +
static_cast<char>(n % 10));
2639 std::reverse(str.begin(), str.end());
2643 std::ostream&
operator<<(std::ostream& os, Number
const& n) {
2647 MarkDownColumn::MarkDownColumn(
int w,
int prec, std::string
const& tit, std::string
const& suff,
double val)
2654 std::string MarkDownColumn::title()
const {
2655 std::stringstream ss;
2656 ss <<
'|' << std::setw(mWidth - 2) << std::right << mTitle <<
' ';
2660 std::string MarkDownColumn::separator()
const {
2661 std::string sep(static_cast<size_t>(mWidth),
'-');
2667 std::string MarkDownColumn::invalid()
const {
2668 std::string sep(static_cast<size_t>(mWidth),
' ');
2670 sep[sep.size() - 2] =
'-';
2674 std::string MarkDownColumn::value()
const {
2675 std::stringstream ss;
2676 auto width = mWidth - 2 -
static_cast<int>(mSuffix.size());
2677 ss <<
'|' << Number(width, mPrecision, mValue) << mSuffix <<
' ';
2682 MarkDownCode::MarkDownCode(std::string
const& what) {
2683 mWhat.reserve(what.size() + 2);
2684 mWhat.push_back(
'`');
2685 for (
char c : what) {
2688 mWhat.push_back(
'`');
2691 mWhat.push_back(
'`');
2694 std::ostream& MarkDownCode::write(std::ostream& os)
const {
2698 std::ostream&
operator<<(std::ostream& os, MarkDownCode
const& mdCode) {
2699 return mdCode.write(os);
2705 Config::Config() =
default;
2706 Config::~Config() =
default;
2707 Config& Config::operator=(Config
const&) =
default;
2708 Config& Config::operator=(Config&&) =
default;
2709 Config::Config(Config
const&) =
default;
2710 Config::Config(Config&&) noexcept = default;
2720 template <
typename T>
2721 inline constexpr
typename std::underlying_type<T>::type u(T val) noexcept {
2722 return static_cast<typename std::underlying_type<T>::type
>(val);
2728 : mConfig(benchmarkConfig)
2731 void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters
const& pc) {
2735 double dIters = d(iters);
2736 mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);
2738 mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
2739 if (pc.has().pageFaults) {
2740 mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
2742 if (pc.has().cpuCycles) {
2743 mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
2745 if (pc.has().contextSwitches) {
2746 mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
2748 if (pc.has().instructions) {
2749 mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
2751 if (pc.has().branchInstructions) {
2752 double branchInstructions = 0.0;
2754 if (pc.val().branchInstructions > iters + 1U) {
2755 branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
2757 mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);
2759 if (pc.has().branchMisses) {
2761 double branchMisses = d(pc.val().branchMisses);
2762 if (branchMisses > branchInstructions) {
2764 branchMisses = branchInstructions;
2768 branchMisses -= 1.0;
2769 if (branchMisses < 1.0) {
2772 mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
2777 Config
const& Result::config() const noexcept {
2781 inline double calcMedian(std::vector<double>& data) {
2785 std::sort(data.begin(), data.end());
2787 auto midIdx = data.size() / 2U;
2788 if (1U == (data.size() & 1U)) {
2789 return data[midIdx];
2791 return (data[midIdx - 1U] + data[midIdx]) / 2U;
2794 double Result::median(Measure m)
const {
2796 auto data = mNameToMeasurements[detail::u(m)];
2797 return calcMedian(data);
2800 double Result::average(Measure m)
const {
2802 auto const& data = mNameToMeasurements[detail::u(m)];
2808 return sum(m) / d(data.size());
2811 double Result::medianAbsolutePercentError(Measure m)
const {
2813 auto data = mNameToMeasurements[detail::u(m)];
2817 auto med = calcMedian(data);
2820 for (
auto& x : data) {
2826 return calcMedian(data);
2830 auto const& data = mNameToMeasurements[detail::u(m)];
2831 return std::accumulate(data.begin(), data.end(), 0.0);
2834 double Result::sumProduct(Measure m1, Measure m2)
const noexcept {
2835 auto const& data1 = mNameToMeasurements[detail::u(m1)];
2836 auto const& data2 = mNameToMeasurements[detail::u(m2)];
2838 if (data1.size() != data2.size()) {
2842 double result = 0.0;
2843 for (
size_t i = 0, s = data1.size(); i != s; ++i) {
2844 result += data1[i] * data2[i];
2849 bool Result::has(Measure m)
const noexcept {
2850 return !mNameToMeasurements[detail::u(m)].empty();
2853 double Result::get(
size_t idx, Measure m)
const {
2854 auto const& data = mNameToMeasurements[detail::u(m)];
2855 return data.at(idx);
2858 bool Result::empty() const noexcept {
2859 return 0U == size();
2862 size_t Result::size() const noexcept {
2863 auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
2867 double Result::minimum(Measure m)
const noexcept {
2868 auto const& data = mNameToMeasurements[detail::u(m)];
2874 return *std::min_element(data.begin(), data.end());
2877 double Result::maximum(Measure m)
const noexcept {
2878 auto const& data = mNameToMeasurements[detail::u(m)];
2884 return *std::max_element(data.begin(), data.end());
2887 Result::Measure Result::fromString(std::string
const& str) {
2888 if (str ==
"elapsed") {
2889 return Measure::elapsed;
2890 }
else if (str ==
"iterations") {
2891 return Measure::iterations;
2892 }
else if (str ==
"pagefaults") {
2893 return Measure::pagefaults;
2894 }
else if (str ==
"cpucycles") {
2895 return Measure::cpucycles;
2896 }
else if (str ==
"contextswitches") {
2897 return Measure::contextswitches;
2898 }
else if (str ==
"instructions") {
2899 return Measure::instructions;
2900 }
else if (str ==
"branchinstructions") {
2901 return Measure::branchinstructions;
2902 }
else if (str ==
"branchmisses") {
2903 return Measure::branchmisses;
2906 return Measure::_size;
2912 mConfig.mOut = &std::cout;
2915 Bench::Bench(Bench&&) =
default;
2916 Bench& Bench::operator=(Bench&&) =
default;
2917 Bench::Bench(Bench
const&) =
default;
2918 Bench& Bench::operator=(Bench
const&) =
default;
2919 Bench::~Bench() noexcept = default;
2921 double Bench::batch() const noexcept {
2922 return mConfig.mBatch;
2925 double Bench::complexityN() const noexcept {
2926 return mConfig.mComplexityN;
2931 Bench& Bench::relative(
bool isRelativeEnabled) noexcept {
2932 mConfig.mIsRelative = isRelativeEnabled;
2935 bool Bench::relative() const noexcept {
2936 return mConfig.mIsRelative;
2940 mConfig.mShowPerformanceCounters = showPerformanceCounters;
2944 return mConfig.mShowPerformanceCounters;
2950 Bench& Bench::unit(
char const* u) {
2951 if (u != mConfig.mUnit) {
2958 Bench& Bench::unit(std::string
const& u) {
2959 return unit(u.c_str());
2962 std::string
const& Bench::unit() const noexcept {
2963 return mConfig.mUnit;
2967 Bench& Bench::title(
const char* benchmarkTitle) {
2968 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
2971 mConfig.mBenchmarkTitle = benchmarkTitle;
2974 Bench& Bench::title(std::string
const& benchmarkTitle) {
2975 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
2978 mConfig.mBenchmarkTitle = benchmarkTitle;
2982 std::string
const& Bench::title() const noexcept {
2983 return mConfig.mBenchmarkTitle;
2987 mConfig.mBenchmarkName = benchmarkName;
2991 Bench&
Bench::name(std::string
const& benchmarkName) {
2992 mConfig.mBenchmarkName = benchmarkName;
2997 return mConfig.mBenchmarkName;
3001 Bench& Bench::epochs(
size_t numEpochs) noexcept {
3002 mConfig.mNumEpochs = numEpochs;
3005 size_t Bench::epochs() const noexcept {
3006 return mConfig.mNumEpochs;
3010 Bench& Bench::clockResolutionMultiple(
size_t multiple) noexcept {
3011 mConfig.mClockResolutionMultiple = multiple;
3014 size_t Bench::clockResolutionMultiple() const noexcept {
3015 return mConfig.mClockResolutionMultiple;
3019 Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
3020 mConfig.mMaxEpochTime = t;
3023 std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
3024 return mConfig.mMaxEpochTime;
3028 Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
3029 mConfig.mMinEpochTime = t;
3032 std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
3033 return mConfig.mMinEpochTime;
3036 Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
3037 mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
3040 uint64_t Bench::minEpochIterations() const noexcept {
3041 return mConfig.mMinEpochIterations;
3044 Bench& Bench::epochIterations(uint64_t numIters) noexcept {
3045 mConfig.mEpochIterations = numIters;
3048 uint64_t Bench::epochIterations() const noexcept {
3049 return mConfig.mEpochIterations;
3052 Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
3053 mConfig.mWarmup = numWarmupIters;
3056 uint64_t Bench::warmup() const noexcept {
3057 return mConfig.mWarmup;
3060 Bench& Bench::config(Config
const& benchmarkConfig) {
3061 mConfig = benchmarkConfig;
3064 Config
const& Bench::config() const noexcept {
3068 Bench& Bench::output(std::ostream* outstream) noexcept {
3069 mConfig.mOut = outstream;
3074 return mConfig.mOut;
3077 std::vector<Result>
const& Bench::results() const noexcept {
3081 Bench&
Bench::render(
char const* templateContent, std::ostream& os) {
3086 std::vector<BigO> Bench::complexityBigO()
const {
3087 std::vector<BigO> bigOs;
3088 auto rangeMeasure = BigO::collectRangeMeasure(mResults);
3089 bigOs.emplace_back(
"O(1)", rangeMeasure, [](
double) {
3092 bigOs.emplace_back(
"O(n)", rangeMeasure, [](
double n) {
3095 bigOs.emplace_back(
"O(log n)", rangeMeasure, [](
double n) {
3096 return std::log2(n);
3098 bigOs.emplace_back(
"O(n log n)", rangeMeasure, [](
double n) {
3099 return n * std::log2(n);
3101 bigOs.emplace_back(
"O(n^2)", rangeMeasure, [](
double n) {
3104 bigOs.emplace_back(
"O(n^3)", rangeMeasure, [](
double n) {
3107 std::sort(bigOs.begin(), bigOs.end());
3114 std::random_device rd;
3115 std::uniform_int_distribution<uint64_t> dist;
3119 }
while (mX == 0 && mY == 0);
3123 uint64_t splitMix64(uint64_t& state) noexcept {
3124 uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
3125 z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
3126 z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
3127 return z ^ (z >> 31U);
3131 Rng::Rng(uint64_t seed) noexcept
3132 : mX(splitMix64(seed))
3133 , mY(splitMix64(seed)) {
3134 for (
size_t i = 0; i < 10; ++i) {
3140 Rng::Rng(uint64_t x, uint64_t y) noexcept
3144 Rng Rng::copy() const noexcept {
3148 BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result>
const& results) {
3149 BigO::RangeMeasure rangeMeasure;
3150 for (
auto const& result : results) {
3151 if (result.config().mComplexityN > 0.0) {
3152 rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
3155 return rangeMeasure;
3158 BigO::BigO(std::string
const& bigOName, RangeMeasure
const& rangeMeasure)
3162 double sumRangeMeasure = 0.0;
3163 double sumRangeRange = 0.0;
3165 for (
size_t i = 0; i < rangeMeasure.size(); ++i) {
3166 sumRangeMeasure += rangeMeasure[i].first * rangeMeasure[i].second;
3167 sumRangeRange += rangeMeasure[i].first * rangeMeasure[i].first;
3169 mConstant = sumRangeMeasure / sumRangeRange;
3173 double sumMeasure = 0.0;
3174 for (
size_t i = 0; i < rangeMeasure.size(); ++i) {
3175 auto diff = mConstant * rangeMeasure[i].first - rangeMeasure[i].second;
3178 sumMeasure += rangeMeasure[i].second;
3181 auto n =
static_cast<double>(rangeMeasure.size());
3182 auto mean = sumMeasure / n;
3183 mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
3186 BigO::BigO(
const char* bigOName, RangeMeasure
const& rangeMeasure)
3187 : BigO(
std::string(bigOName), rangeMeasure) {}
3189 std::string
const&
BigO::name() const noexcept {
3193 double BigO::constant() const noexcept {
3197 double BigO::normalizedRootMeanSquare() const noexcept {
3198 return mNormalizedRootMeanSquare;
3202 return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
3205 std::ostream&
operator<<(std::ostream& os, BigO
const& bigO) {
3206 return os << bigO.constant() <<
" * " << bigO.name() <<
", rms=" << bigO.normalizedRootMeanSquare();
3209 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO>
const& bigOs) {
3210 detail::fmt::StreamStateRestorer restorer(os);
3211 os << std::endl <<
"| coefficient | err% | complexity" << std::endl <<
"|--------------:|-------:|------------" << std::endl;
3212 for (
auto const& bigO : bigOs) {
3213 os <<
"|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() <<
" ";
3214 os <<
"|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) <<
"% ";
3215 os <<
"| " << bigO.name();
3224 #endif // ANKERL_NANOBENCH_IMPLEMENT
3225 #endif // ANKERL_NANOBENCH_H_INCLUDED
char const * json() noexcept
Template to generate JSON data.
void moveResultTo(std::vector< Result > &results) noexcept
#define ANKERL_NANOBENCH_LOG(x)
std::vector< std::pair< double, double >> RangeMeasure
bool operator==(const CNetAddr &a, const CNetAddr &b)
std::ostream & operator<<(std::ostream &os, std::vector< ankerl::nanobench::BigO > const &bigOs)
static RangeMeasure collectRangeMeasure(std::vector< Result > const &results)
void add(std::chrono::nanoseconds elapsed, PerformanceCounters const &pc) noexcept
void render(char const *mustacheTemplate, Bench const &bench, std::ostream &out)
Renders output from a mustache-like template and benchmark results.
double uniform01() noexcept
Provides a random uniform double value between 0 and 1.
char const * htmlBoxplot() noexcept
HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an exampl...
An extremely fast random generator.
State
The various states a (txhash,peer) pair can be in.
std::ostream & operator<<(std::ostream &os, BigO const &bigO)
std::enable_if<!doNotOptimizeNeedsIndirect< T >)>::type doNotOptimizeAway(T const &val)
#define ANKERL_NANOBENCH_NO_SANITIZE(...)
Bench & run(char const *benchmarkName, Op &&op)
Repeatedly calls op() based on the configuration, and performs measurements.
BigO(std::string const &bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
void doNotOptimizeAway(Arg &&arg)
Makes sure none of the given arguments are optimized away by the compiler.
Bench & complexityN(T b) noexcept
static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept
std::conditional< std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock >::type Clock
static Measure fromString(std::string const &str)
#define ANKERL_NANOBENCH(x)
void render(char const *mustacheTemplate, std::vector< Result > const &results, std::ostream &out)
Same as render(char const* mustacheTemplate, Bench const& bench, std::ostream& out), but for when you only have results available.
constexpr bool doNotOptimizeNeedsIndirect()
char const * csv() noexcept
CSV data for the benchmark results.
void shuffle(Container &container) noexcept
Shuffles all entries in the given container.
bool operator<(const CNetAddr &a, const CNetAddr &b)
ANKERL_NANOBENCH(NODISCARD) std Bench & doNotOptimizeAway(Arg &&arg)
Retrieves all benchmark results collected by the bench object so far.
void * memcpy(void *a, const void *b, size_t c)
static constexpr uint64_t() max()
BigO(char const *bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
uint64_t result_type
This RNG provides 64bit randomness.
Main entry point to nanobench's benchmarking facility.
std::vector< BigO > complexityBigO() const
ANKERL_NANOBENCH(NODISCARD) std Bench & batch(T b) noexcept
Sets the batch size.
PerformanceCounters & performanceCounters()
static constexpr uint64_t() min()
static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op)
#define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...)