Commit 31699d7

Author: Aaron Boxer
benchmarks: add benchmark comparing taskflow to good old thread pool (taskflow#383)
Thanks!!
1 parent f87d598 commit 31699d7

File tree

4 files changed: +227 -1 lines changed
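For orientation before the diffs, the sketch below condenses the two execution paths this benchmark compares: running a batch of independent tasks through a tf::Executor versus pushing the same batch through the simple ThreadPool added in this commit. It is an illustrative, minimal program (not part of the commit), assuming only the Taskflow API and the ThreadPool.hpp header shown in the files below.

#include <taskflow/taskflow.hpp>
#include "ThreadPool.hpp"   // the simple pool added by this commit
#include <cstdint>
#include <future>
#include <vector>

// The pool's static singleton members must be defined once per program,
// exactly as benchmark.cpp does below.
ThreadPool* ThreadPool::singleton = nullptr;
std::mutex ThreadPool::singleton_mutex;

int main() {
  const uint64_t n = 1000;

  // Path 1: Taskflow -- build a graph of n independent tasks and run it on an executor.
  tf::Executor executor;
  tf::Taskflow taskflow;
  for (uint64_t i = 0; i < n; ++i)
    taskflow.emplace([] { /* per-task work */ });
  executor.run(taskflow).wait();

  // Path 2: plain thread pool -- enqueue n callables and block on the returned futures.
  std::vector<std::future<int>> results;
  for (uint64_t i = 0; i < n; ++i)
    results.emplace_back(ThreadPool::get()->enqueue([] { return 0; }));
  for (auto& r : results)
    r.get();

  ThreadPool::release();   // joins the pool's worker threads
  return 0;
}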

benchmarks/CMakeLists.txt

Lines changed: 14 additions & 0 deletions

@@ -264,6 +264,20 @@ target_link_libraries(
 )
 set_target_properties(graph_pipeline PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
 
+## benchmark 13: comparison with simple thread pool
+add_executable(
+  threadpool
+  ${TF_BENCHMARK_DIR}/threadpool/benchmark.cpp
+)
+target_include_directories(threadpool PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)
+target_link_libraries(
+  threadpool
+  ${PROJECT_NAME}
+  ${TBB_IMPORTED_TARGETS}
+  ${OpenMP_CXX_LIBRARIES}
+  tf::default_settings
+)
+set_target_properties(threadpool PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
 
 ###############################################################################
 # CUDA benchmarks

benchmarks/benchmarks.md

Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ Results are illustrated in a plot and saved to `result.png`.
 -o result.png
 ```
 
-When the program completes, you will see a combined plot of all specified benchmarsk.
+When the program completes, you will see a combined plot of all specified benchmarks.
 The x-axis represents the growth of problem size and the y-axis denotes the runtime
 in millisecond.

benchmarks/threadpool/ThreadPool.hpp

Lines changed: 129 additions & 0 deletions

@@ -0,0 +1,129 @@
#pragma once

#include <vector>
#include <queue>
#include <memory>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <future>
#include <functional>
#include <stdexcept>
#include <map>
#include <type_traits>
#include <iostream>
#include <cassert>   // for assert() in enqueue()
#include <cstdint>   // for uint32_t in the singleton helpers

// A minimal fixed-size thread pool with a lazily created global singleton,
// used as the baseline that the taskflow benchmark compares against.
class ThreadPool
{
public:
  ThreadPool(size_t);
  template<class F, class... Args>
  auto enqueue(F&& f, Args&&... args)
    -> std::future<typename std::invoke_result<F, Args...>::type>;
  ~ThreadPool();
  int thread_number(std::thread::id id)
  {
    if(id_map.find(id) != id_map.end())
      return (int)id_map[id];
    return -1;
  }
  size_t num_threads()
  {
    return num_threads_;
  }

  static ThreadPool* get()
  {
    return instance(0);
  }
  static ThreadPool* instance(uint32_t numthreads)
  {
    std::unique_lock<std::mutex> lock(singleton_mutex);
    if(!singleton)
      singleton = new ThreadPool(numthreads ? numthreads : hardware_concurrency());
    return singleton;
  }
  static void release()
  {
    std::unique_lock<std::mutex> lock(singleton_mutex);
    delete singleton;
    singleton = nullptr;
  }
  static uint32_t hardware_concurrency()
  {
    return std::thread::hardware_concurrency();
  }

private:
  std::vector<std::thread> workers;
  std::queue<std::function<void()>> tasks;
  std::mutex queue_mutex;
  std::condition_variable condition;
  bool stop;
  std::map<std::thread::id, size_t> id_map;
  size_t num_threads_;
  static ThreadPool* singleton;
  static std::mutex singleton_mutex;
};

inline ThreadPool::ThreadPool(size_t threads) : stop(false), num_threads_(threads)
{
  // with a single thread, no workers are spawned (enqueue() is then disallowed by its assert)
  if(threads == 1)
    return;

  for(size_t i = 0; i < threads; ++i)
    workers.emplace_back([this] {
      for(;;)
      {
        std::function<void()> task;
        {
          std::unique_lock<std::mutex> lock(this->queue_mutex);
          this->condition.wait(lock,
                               [this] { return this->stop || !this->tasks.empty(); });
          if(this->stop && this->tasks.empty())
            return;
          task = std::move(this->tasks.front());
          this->tasks.pop();
        }
        task();
      }
    });
  size_t thread_count = 0;
  for(std::thread& worker : workers)
  {
    id_map[worker.get_id()] = thread_count;
    thread_count++;
  }
}

// add new work item to the pool
template<class F, class... Args>
auto ThreadPool::enqueue(F&& f, Args&&... args)
  -> std::future<typename std::invoke_result<F, Args...>::type>
{
  assert(num_threads_ > 1);
  using return_type = typename std::invoke_result<F, Args...>::type;

  auto task = std::make_shared<std::packaged_task<return_type()>>(
    std::bind(std::forward<F>(f), std::forward<Args>(args)...));

  std::future<return_type> res = task->get_future();
  {
    std::unique_lock<std::mutex> lock(queue_mutex);
    if(stop)
      throw std::runtime_error("enqueue on stopped ThreadPool");

    tasks.emplace([task]() { (*task)(); });
  }
  condition.notify_one();
  return res;
}

inline ThreadPool::~ThreadPool()
{
  {
    std::unique_lock<std::mutex> lock(queue_mutex);
    stop = true;
  }
  condition.notify_all();
  for(std::thread& worker : workers)
    worker.join();
}
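As a quick usage note (not part of the commit), this is how the pool's public interface composes: enqueue() forwards arbitrary arguments to the callable and hands back a std::future of its result, while release() tears the singleton down and joins the workers. A minimal sketch, assuming the header above:

#include "ThreadPool.hpp"
#include <cstdio>

// Static singleton members, defined once per program (benchmark.cpp does the same).
ThreadPool* ThreadPool::singleton = nullptr;
std::mutex ThreadPool::singleton_mutex;

int main() {
  ThreadPool* pool = ThreadPool::instance(4);                  // singleton with 4 workers
  auto fut = pool->enqueue([](int a, int b) { return a + b; }, 2, 3);
  std::printf("2 + 3 = %d on a pool of %zu threads\n",
              fut.get(), pool->num_threads());                 // get() blocks until the task ran
  ThreadPool::release();                                       // destructor joins the workers
  return 0;
}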
benchmarks/threadpool/benchmark.cpp

Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
#include <taskflow/taskflow.hpp>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>
#include <future>
#include "ThreadPool.hpp"

ThreadPool* ThreadPool::singleton = nullptr;
std::mutex ThreadPool::singleton_mutex;
tf::Executor executor;

class ChronoTimer {
public:
  ChronoTimer(void) {
  }
  void start(void){
    startTime = std::chrono::high_resolution_clock::now();
  }
  void finish(std::string msg){
    auto finish = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = finish - startTime;
    printf("%s : %f ms\n", msg.c_str(), elapsed.count() * 1000);
  }
private:
  std::chrono::high_resolution_clock::time_point startTime;
};

// busy-work kernel; note that acc is never read, so an optimizing compiler may elide the loop
void benchFunc(uint64_t loopLen){
  float acc = 0;
  for (uint64_t k = 0; k < loopLen; ++k)
    acc += k;
}

void bench(uint32_t iter){
  printf("Benchmark with %u iterations\n", iter);
  const uint64_t num_blocks = 1000;
  const uint64_t loopLen = 100;
  ChronoTimer timer;
  ThreadPool *pool = ThreadPool::get();

  // taskflow: each iteration builds a fresh graph of num_blocks independent tasks and runs it
  timer.start();
  for (uint64_t it = 0; it < iter; ++it) {
    tf::Taskflow taskflow;
    tf::Task node[num_blocks];
    for (uint64_t i = 0; i < num_blocks; i++)
      node[i] = taskflow.placeholder();
    for (uint64_t i = 0; i < num_blocks; i++) {
      node[i].work([=]() {
        benchFunc(loopLen);
      });
    }
    executor.run(taskflow).wait();
  }
  timer.finish("taskflow: time in ms: ");

  // thread pool: each iteration enqueues the same work and waits on all returned futures
  timer.start();
  for (uint64_t it = 0; it < iter; ++it) {
    std::vector<std::future<int>> results;
    for (uint64_t i = 0; i < num_blocks; i++) {
      results.emplace_back(pool->enqueue([=]() {
        benchFunc(loopLen);
        return 0;
      }));
    }
    for(auto& result : results)
    {
      result.get();
    }
  }
  timer.finish("threadpool: time in ms: ");
}

int main() {
  for (uint32_t i = 0; i < 5; ++i)
    bench(100);
  for (uint32_t i = 0; i < 5; ++i)
    bench(50);
  for (uint32_t i = 0; i < 5; ++i)
    bench(20);
  for (uint32_t i = 0; i < 5; ++i)
    bench(10);
  for (uint32_t i = 0; i < 5; ++i)
    bench(5);
}
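One design note on the measurement: the taskflow loop above rebuilds the graph (placeholder() plus work()) on every iteration, so graph construction is included in its timing, whereas the thread-pool loop only pays for enqueueing. A hypothetical variant (an assumption, not part of the commit) that builds the graph once with Taskflow's emplace() and re-runs it would isolate execution time:

#include <taskflow/taskflow.hpp>
#include <chrono>
#include <cstdint>
#include <cstdio>

// Hypothetical helper, not in the commit: time only the repeated execution of a
// pre-built graph of num_blocks independent tasks.
void bench_taskflow_prebuilt(tf::Executor& executor, uint32_t iter) {
  const uint64_t num_blocks = 1000;
  const uint64_t loopLen = 100;

  tf::Taskflow taskflow;
  for (uint64_t i = 0; i < num_blocks; i++) {
    taskflow.emplace([=]() {
      float acc = 0;
      for (uint64_t k = 0; k < loopLen; ++k)
        acc += k;                           // same busy-work as benchFunc above
    });
  }

  auto t0 = std::chrono::high_resolution_clock::now();
  for (uint32_t it = 0; it < iter; ++it)
    executor.run(taskflow).wait();          // re-run the same graph each iteration
  std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - t0;
  std::printf("taskflow (graph built once): %f ms\n", elapsed.count() * 1000);
}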

0 commit comments