diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5cda168..93f6502 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -358,6 +358,13 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
   set_target_properties(server_bench PROPERTIES SKIP_BUILD_RPATH ON)
 
   add_executable(interleaving_test InterleavingTest.cpp)
+  # GCC may not support the musttail attribute, in which case MUSTTAIL in
+  # InterleavingTest.cpp expands to nothing. Debug builds would then keep a
+  # stack frame per continuation call, so ask for sibling-call optimization.
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_BUILD_TYPE STREQUAL "Debug")
+    target_compile_options(interleaving_test PRIVATE -Og
+                           -foptimize-sibling-calls)
+  endif()
   target_link_libraries(interleaving_test PRIVATE nanobench)
 endif()
 
diff --git a/InterleavingTest.cpp b/InterleavingTest.cpp
index dbc166a..4923548 100644
--- a/InterleavingTest.cpp
+++ b/InterleavingTest.cpp
@@ -22,9 +22,6 @@ void *stepJob(Job *j) {
   return done ? nullptr : (void *)stepJob;
 }
 
-// So we can look at the disassembly more easily
-
-extern "C" {
 void sequential(Job **jobs, int count) {
   for (int i = 0; i < count; ++i) {
     do {
@@ -94,6 +91,117 @@ void interleaveBoundedCyclicList(Job **jobs, int count) {
   }
 }
 
+#ifndef __has_attribute
+#define __has_attribute(x) 0
+#endif
+
+// MUSTTAIL marks a return statement whose call must be compiled as a tail
+// call. It expands to nothing when the compiler does not report support for
+// the attribute; the call is then only a tail call if the optimizer makes it
+// one.
+#if __has_attribute(musttail)
+#define MUSTTAIL __attribute__((musttail))
+#else
+#define MUSTTAIL
+#endif
+
+// State shared by the continuations that useTailCalls drives. The occupied
+// slots form a cyclic singly-linked list threaded through nextJob.
+struct Context {
+  constexpr static int kConcurrent = 32;
+  // All jobs to run, as passed to useTailCalls.
+  Job **jobs;
+  // The job currently assigned to each slot.
+  Job *inProgress[kConcurrent];
+  // The function to call the next time each slot is visited: job is the slot
+  // being visited, prevJob the slot that links to it, started how many of the
+  // count jobs have been assigned to a slot so far.
+  void (*continuation[kConcurrent])(Context *, int64_t prevJob, int64_t job,
+                                    int64_t started, int64_t count);
+  // Index of the slot visited after each slot.
+  int nextJob[kConcurrent];
+};
+
+// Moves on to the slot after job and tail-calls its continuation.
+void keepGoing(Context *context, int64_t prevJob, int64_t job, int64_t started,
+               int64_t count) {
+  prevJob = job;
+  job = context->nextJob[job];
+  MUSTTAIL return context->continuation[job](context, prevJob, job, started,
+                                             count);
+}
+
+void stepJobTailCall(Context *context, int64_t prevJob, int64_t job,
+                     int64_t started, int64_t count);
+
+// Called when the job in slot job has finished. Refills the slot with the next
+// unstarted job or, once every job has been started, unlinks the slot from
+// the cycle. Returns, ending the chain of calls, when the last remaining slot
+// finishes; otherwise continues with the next slot.
+void complete(Context *context, int64_t prevJob, int64_t job, int64_t started,
+              int64_t count) {
+  if (started == count) {
+    if (prevJob == job) {
+      // This slot links to itself, so it was the only one left.
+      return;
+    }
+    context->nextJob[prevJob] = context->nextJob[job];
+    // Resume from the predecessor so the advance below lands on the slot that
+    // followed the one just unlinked.
+    job = prevJob;
+  } else {
+    context->inProgress[job] = context->jobs[started++];
+    context->continuation[job] = stepJobTailCall;
+  }
+  prevJob = job;
+  job = context->nextJob[job];
+  MUSTTAIL return context->continuation[job](context, prevJob, job, started,
+                                             count);
+}
+
+// Performs one step of the job in slot job by decrementing the counter its
+// input points to; the job is done once that counter reaches zero.
+void stepJobTailCall(Context *context, int64_t prevJob, int64_t job,
+                     int64_t started, int64_t count) {
+  auto *j = context->inProgress[job];
+  auto done = --(*j->input) == 0;
+#ifdef __x86_64__
+  // NOTE(review): presumably flushed so that the next step of this job misses
+  // the cache - confirm.
+  _mm_clflush(j->input);
+#endif
+  if (done) {
+    MUSTTAIL return complete(context, prevJob, job, started, count);
+  } else {
+    context->continuation[job] = stepJobTailCall;
+    MUSTTAIL return keepGoing(context, prevJob, job, started, count);
+  }
+}
+
+// Runs count jobs from jobs to completion, keeping up to Context::kConcurrent
+// of them in progress at once and stepping them in turn through tail calls.
+// Does nothing if count is not positive.
+void useTailCalls(Job **jobs, int count) {
+  // A negative count would make nextJob[started - 1] below index out of
+  // bounds, so reject it along with zero.
+  if (count <= 0) {
+    return;
+  }
+  Context context;
+  context.jobs = jobs;
+  int64_t started = std::min(Context::kConcurrent, count);
+  for (int i = 0; i < started; i++) {
+    context.inProgress[i] = jobs[i];
+    context.nextJob[i] = i + 1;
+    context.continuation[i] = stepJobTailCall;
+  }
+  // Close the cycle: the last occupied slot links back to slot 0.
+  context.nextJob[started - 1] = 0;
+  int64_t prevJob = started - 1;
+  int64_t job = 0;
+  return context.continuation[job](&context, prevJob, job, started, count);
+}
+
 void interleaveCyclicList(Job **jobs, int count) {
   auto *nextJob = (int *)alloca(sizeof(int) * count);
 
@@ -117,12 +225,11 @@ void interleaveCyclicList(Job **jobs, int count) {
     job = nextJob[job];
   }
 }
-}
 
 int main() {
   ankerl::nanobench::Bench bench;
 
-  constexpr int kNumJobs = 100;
+  constexpr int kNumJobs = 10000;
   bench.relative(true);
 
   Job jobs[kNumJobs];
@@ -140,6 +247,7 @@ int main() {
   for (auto [scheduler, name] :
        {std::make_pair(sequentialNoFuncPtr, "sequentialNoFuncPtr"),
         std::make_pair(sequential, "sequential"),
+        std::make_pair(useTailCalls, "useTailCalls"),
         std::make_pair(interleaveSwapping, "interleavingSwapping"),
         std::make_pair(interleaveBoundedCyclicList,
                        "interleaveBoundedCyclicList"),