#include #include #ifdef __x86_64__ #include #endif #include "third_party/nanobench.h" struct Job { int *input; // Returned void* is a function pointer to the next continuation. We have to // use void* because otherwise the type would be recursive. typedef void *(*continuation)(Job *); continuation next; }; void *stepJob(Job *j) { auto done = --(*j->input) == 0; #ifdef __x86_64__ _mm_clflush(j->input); #endif return done ? nullptr : (void *)stepJob; } // So we can look at the disassembly more easily extern "C" { void sequential(Job **jobs, int count) { for (int i = 0; i < count; ++i) { do { jobs[i]->next = (Job::continuation)jobs[i]->next(jobs[i]); } while (jobs[i]->next); } } void sequentialNoFuncPtr(Job **jobs, int count) { for (int i = 0; i < count; ++i) { while (stepJob(jobs[i])) ; } } void interleaveSwapping(Job **jobs, int remaining) { int current = 0; while (remaining > 0) { auto next = (Job::continuation)jobs[current]->next(jobs[current]); jobs[current]->next = next; if (next == nullptr) { jobs[current] = jobs[remaining - 1]; --remaining; } else { ++current; } if (current == remaining) { current = 0; } } } void interleaveBoundedCyclicList(Job **jobs, int count) { if (count == 0) { return; } constexpr int kConcurrent = 32; Job *inProgress[kConcurrent]; int nextJob[kConcurrent]; int started = std::min(kConcurrent, count); for (int i = 0; i < started; i++) { inProgress[i] = jobs[i]; nextJob[i] = i + 1; } nextJob[started - 1] = 0; int prevJob = started - 1; int job = 0; for (;;) { auto next = (Job::continuation)inProgress[job]->next(inProgress[job]); inProgress[job]->next = next; if (next == nullptr) { if (started == count) { if (prevJob == job) break; nextJob[prevJob] = nextJob[job]; job = prevJob; } else { int temp = started++; inProgress[job] = jobs[temp]; } } prevJob = job; job = nextJob[job]; } } void interleaveCyclicList(Job **jobs, int count) { auto *nextJob = (int *)alloca(sizeof(int) * count); for (int i = 0; i < count - 1; ++i) { nextJob[i] = i + 1; } nextJob[count - 1] = 0; int prevJob = count - 1; int job = 0; for (;;) { auto next = (Job::continuation)jobs[job]->next(jobs[job]); jobs[job]->next = next; if (next == nullptr) { if (prevJob == job) break; nextJob[prevJob] = nextJob[job]; job = prevJob; } prevJob = job; job = nextJob[job]; } } } int main() { ankerl::nanobench::Bench bench; constexpr int kNumJobs = 100; bench.relative(true); Job jobs[kNumJobs]; Job jobsCopy[kNumJobs]; int iters = 0; int originalInput[kNumJobs]; for (int i = 0; i < kNumJobs; ++i) { originalInput[i] = rand() % 5 + 3; jobs[i].input = new int{originalInput[i]}; jobs[i].next = stepJob; iters += *jobs[i].input; } bench.batch(iters); for (auto [scheduler, name] : {std::make_pair(sequentialNoFuncPtr, "sequentialNoFuncPtr"), std::make_pair(sequential, "sequential"), std::make_pair(interleaveSwapping, "interleavingSwapping"), std::make_pair(interleaveBoundedCyclicList, "interleaveBoundedCyclicList"), std::make_pair(interleaveCyclicList, "interleaveCyclicList")}) { for (int i = 0; i < kNumJobs; ++i) { *jobs[i].input = originalInput[i]; } memcpy(jobsCopy, jobs, sizeof(jobs)); Job *ps[kNumJobs]; for (int i = 0; i < kNumJobs; ++i) { ps[i] = jobsCopy + i; } scheduler(ps, kNumJobs); for (int i = 0; i < kNumJobs; ++i) { if (*jobsCopy[i].input != 0) { fprintf(stderr, "%s failed\n", name); abort(); } } bench.run(name, [&]() { for (int i = 0; i < kNumJobs; ++i) { *jobs[i].input = originalInput[i]; } memcpy(jobsCopy, jobs, sizeof(jobs)); Job *ps[kNumJobs]; for (int i = 0; i < kNumJobs; ++i) { ps[i] = jobsCopy + i; } scheduler(ps, kNumJobs); }); } for (int i = 0; i < kNumJobs; ++i) { delete jobs[i].input; } }