diff --git a/.gitattributes b/.gitattributes index 2930e6490..ecfdac56f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,11 @@ +# Mark non-source directories as vendored +# Common directories in Taskflow might include external dependencies or other assets. benchmark/* linguist-vendored doc/* linguist-vendored image/* linguist-vendored 3rd-party/* linguist-vendored + +# Mark C++ source files for Taskflow +*.cpp linguist-language=C++ +*.hpp linguist-language=C++ +*.h linguist-language=C++ diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 0d1bb4b76..b575ed622 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -3,45 +3,45 @@ name: macOS on: [push, pull_request] jobs: - debug-test: + debug-test-cpp17: runs-on: macos-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake - run: cmake -S . -B build -D CMAKE_BUILD_TYPE=Debug -D CMAKE_BUILD_BENCHMARKS=ON -D CMAKE_BUILD_PROFILER=ON + run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Debug -DCMAKE_BUILD_BENCHMARKS=ON -DCMAKE_BUILD_PROFILER=ON -DCMAKE_CXX_STANDARD=17 - name: build run: cmake --build build --parallel 10 - name: test run: cd build ; ctest --output-on-failure - release-test: + release-test-cpp17: runs-on: macos-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake - run: cmake -S . -B build -D CMAKE_BUILD_TYPE=Release + run: cmake -S . -B build -D CMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=17 - name: build run: cmake --build build --parallel 10 - name: test run: cd build ; ctest --output-on-failure - undefined-test: + undefined-test-cpp17: runs-on: macos-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake - run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=undefined -g" + run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=undefined -g" -DCMAKE_CXX_STANDARD=17 - name: build run: cmake --build build --parallel 10 - name: test run: cd build ; ctest --output-on-failure - tsan-test: + tsan-test-cpp17: runs-on: macos-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake - run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=thread -g" + run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=thread -g" -DCMAKE_CXX_STANDARD=17 - name: build run: cmake --build build --parallel 10 - name: test @@ -50,11 +50,22 @@ jobs: ############################################################################### # C++ 20 standard test: ############################################################################### + + debug-test-cpp20: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - name: cmake + run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Debug -DCMAKE_BUILD_BENCHMARKS=ON -DCMAKE_BUILD_PROFILER=ON -DCMAKE_CXX_STANDARD=20 + - name: build + run: cmake --build build --parallel 10 + - name: test + run: cd build ; ctest --output-on-failure release-test-cpp20: - runs-on: ubuntu-latest + runs-on: macos-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=20 - name: build @@ -62,10 +73,21 @@ jobs: - name: test run: cd build ; ctest --output-on-failure + undefined-test-cpp20: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - name: cmake + run: cmake -S . 
-B build -DCMAKE_CXX_FLAGS="-fsanitize=undefined -g" -DCMAKE_CXX_STANDARD=20 + - name: build + run: cmake --build build --parallel 10 + - name: test + run: cd build ; ctest --output-on-failure + tsan-test-cpp20: - runs-on: ubuntu-latest + runs-on: macos-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=thread -g" -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=20 - name: build diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 933fc3639..3eec21182 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -3,34 +3,34 @@ name: Ubuntu on: [push, pull_request] jobs: - debug-test: + debug-test-cpp17: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake - run: cmake -S . -B build -D CMAKE_BUILD_TYPE=Debug -D CMAKE_BUILD_BENCHMARKS=ON -D CMAKE_BUILD_PROFILER=ON + run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Debug -DCMAKE_BUILD_BENCHMARKS=ON -DCMAKE_BUILD_PROFILER=ON -DCMAKE_CXX_STANDARD=17 - name: build run: cmake --build build --parallel 10 - name: test run: cd build ; ctest --output-on-failure - release-test: + release-test-cpp17: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake - run: cmake -S . -B build -D CMAKE_BUILD_TYPE=Release + run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=17 - name: build run: cmake --build build --parallel 10 - name: test run: cd build ; ctest --output-on-failure - leak-test: + leak-test-cpp17: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake - run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=address -fsanitize=leak -g" + run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=address -fsanitize=leak -g" -DCMAKE_CXX_STANDARD=17 - name: build run: cmake --build build --parallel 10 - name: test @@ -39,7 +39,7 @@ jobs: #undefined-test: # runs-on: ubuntu-latest # steps: - # - uses: actions/checkout@v2 + # - uses: actions/checkout@v3 # - name: cmake # run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=undefined -g" # - name: build @@ -47,12 +47,12 @@ jobs: # - name: test # run: cd build ; ctest -j 10 --output-on-failure - tsan-test: + tsan-test-cpp17: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake - run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=thread -g" + run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=thread -g" -DCMAKE_CXX_STANDARD=17 - name: build run: cmake --build build --parallel 10 - name: test @@ -61,26 +61,48 @@ jobs: ############################################################################### # C++ 20 standard test: ############################################################################### + + debug-test-cpp20: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: cmake + run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Debug -DCMAKE_BUILD_BENCHMARKS=ON -DCMAKE_BUILD_PROFILER=ON -DCMAKE_CXX_STANDARD=20 + - name: build + run: cmake --build build --parallel 10 + - name: test + run: cd build ; ctest --output-on-failure release-test-cpp20: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake - run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=20 -DCMAKE_CXX_FLAGS="-stdlib=libc++" + run: cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=20 - name: build run: cmake --build build --parallel 10 - name: test run: cd build ; ctest --output-on-failure - # temporarily disable due to error in linking libc++ - #tsan-test-cpp20: - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v2 - # - name: cmake - # run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-stdlib=libc++ -fsanitize=thread -g" -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=20 - # - name: build - # run: cmake --build build --parallel 10 - # - name: test - # run: cd build ; ctest --output-on-failure + leak-test-cpp20: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: cmake + run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=address -fsanitize=leak -g" -DCMAKE_CXX_STANDARD=20 + - name: build + run: cmake --build build --parallel 10 + - name: test + run: cd build ; ctest --output-on-failure + + tsan-test-cpp20: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: cmake + run: cmake -S . -B build -DCMAKE_CXX_FLAGS="-fsanitize=thread -g" -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=20 + - name: build + run: cmake --build build --parallel 10 + - name: test + run: cd build ; ctest --output-on-failure + diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 49264c0e0..9db2f9ac4 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -3,18 +3,62 @@ name: Windows on: [push, pull_request] jobs: - msvc2019: - # The CMake configure and build commands are platform agnostic and should work equally - # well on Windows or Mac. You can convert this to a matrix build if you need - # cross-platform coverage. - # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + debug-test-cpp17: runs-on: windows-latest + steps: + - uses: actions/checkout@v3 + - name: cmake + run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Debug -DCMAKE_BUILD_BENCHMARKS=ON -DCMAKE_BUILD_PROFILER=ON -DCMAKE_CXX_STANDARD=17 + - name: build + run: cmake --build build --parallel 10 + - name: test + run: cd build ; ctest --exclude-regex "test-unicode" --output-on-failure + release-test-cpp17: + runs-on: windows-latest + steps: + - uses: actions/checkout@v3 + - name: cmake + run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_BUILD_BENCHMARKS=ON -DCMAKE_BUILD_PROFILER=ON -DCMAKE_CXX_STANDARD=17 + - name: build + run: cmake --build build --parallel 10 + - name: test + run: cd build ; ctest --exclude-regex "test-unicode" --output-on-failure + +############################################################################### +# C++ 20 standard test: +############################################################################### + + debug-test-cpp20: + runs-on: windows-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: cmake - run: cmake -S . -B build -D CMAKE_BUILD_TYPE=Release -D CMAKE_BUILD_BENCHMARKS=ON -D CMAKE_BUILD_PROFILER=ON + run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_STANDARD=20 - name: build run: cmake --build build --parallel 10 - name: test run: cd build ; ctest --exclude-regex "test-unicode" --output-on-failure + + release-test-cpp20: + runs-on: windows-latest + steps: + - uses: actions/checkout@v3 + - name: cmake + run: cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=20 + - name: build + run: cmake --build build --parallel 10 + - name: test + run: cd build ; ctest --exclude-regex "test-unicode" --output-on-failure + + #release-test-cpp20-atomic-notifier: + # runs-on: windows-latest + # steps: + # - uses: actions/checkout@v3 + # - name: cmake + # run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=20 -DCMAKE_CXX_FLAGS="-DTF_ENABLE_ATOMIC_NOTIFIER=1" + # - name: build + # run: cmake --build build --parallel 10 + # - name: test + # run: cd build ; ctest --exclude-regex "test-unicode" --output-on-failure + diff --git a/3rd-party/CLI11/CLI11.hpp b/3rd-party/CLI11/CLI11.hpp index 27256e61b..9fa9cc026 100644 --- a/3rd-party/CLI11/CLI11.hpp +++ b/3rd-party/CLI11/CLI11.hpp @@ -1,15 +1,11 @@ -#pragma once - -// CLI11: Version 1.7.1 +// CLI11: Version 2.5.0 // Originally designed by Henry Schreiner // https://github.com/CLIUtils/CLI11 // // This is a standalone header file generated by MakeSingleHeader.py in CLI11/scripts -// from: v1.7.1 +// from: v2.5.0 // -// From LICENSE: -// -// CLI11 1.7 Copyright (c) 2017-2019 University of Cincinnati, developed by Henry +// CLI11 2.5.0 Copyright (c) 2017-2025 University of Cincinnati, developed by Henry // Schreiner under NSF AWARD 1414736. All rights reserved. // // Redistribution and use in source and binary forms of CLI11, with or without @@ -35,18 +31,25 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once // Standard combined includes: - #include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include #include #include -#include #include +#include #include #include #include @@ -55,29 +58,21 @@ #include #include #include -#include -#include #include #include #include #include -// Verbatim copy from CLI/Version.hpp: - +#define CLI11_VERSION_MAJOR 2 +#define CLI11_VERSION_MINOR 5 +#define CLI11_VERSION_PATCH 0 +#define CLI11_VERSION "2.5.0" -#define CLI11_VERSION_MAJOR 1 -#define CLI11_VERSION_MINOR 7 -#define CLI11_VERSION_PATCH 1 -#define CLI11_VERSION "1.7.1" - -// Verbatim copy from CLI/Macros.hpp: - - -// The following version macro is very similar to the one in PyBind11 +// The following version macro is very similar to the one in pybind11 #if !(defined(_MSC_VER) && __cplusplus == 199711L) && !defined(__INTEL_COMPILER) #if __cplusplus >= 201402L #define CLI11_CPP14 @@ -85,18 +80,27 @@ #define CLI11_CPP17 #if __cplusplus > 201703L #define CLI11_CPP20 +#if __cplusplus > 202002L +#define CLI11_CPP23 +#if __cplusplus > 202302L +#define CLI11_CPP26 +#endif +#endif #endif #endif #endif #elif defined(_MSC_VER) && __cplusplus == 199711L -// MSVC sets _MSVC_LANG rather than __cplusplus (supposedly until the standard is fully implemented) +// MSVC sets _MSVC_LANG rather than __cplusplus (supposedly until the standard was fully implemented) // Unless you use the /Zc:__cplusplus flag on Visual Studio 2017 15.7 Preview 3 or newer #if _MSVC_LANG >= 201402L #define CLI11_CPP14 #if _MSVC_LANG > 201402L && _MSC_VER >= 1910 #define CLI11_CPP17 -#if __MSVC_LANG > 201703L && _MSC_VER >= 1910 +#if _MSVC_LANG > 201703L && _MSC_VER >= 1910 #define CLI11_CPP20 +#if _MSVC_LANG > 202002L && _MSC_VER >= 1922 +#define CLI11_CPP23 +#endif #endif #endif #endif @@ -110,138 +114,411 @@ #define CLI11_DEPRECATED(reason) __attribute__((deprecated(reason))) #endif +// GCC < 10 doesn't ignore this in 
unevaluated contexts
+#if !defined(CLI11_CPP17) || \
+    (defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 10 && __GNUC__ > 4)
+#define CLI11_NODISCARD
+#else
+#define CLI11_NODISCARD [[nodiscard]]
+#endif
+
+/** detection of rtti */
+#ifndef CLI11_USE_STATIC_RTTI
+#if (defined(_HAS_STATIC_RTTI) && _HAS_STATIC_RTTI)
+#define CLI11_USE_STATIC_RTTI 1
+#elif defined(__cpp_rtti)
+#if (defined(_CPPRTTI) && _CPPRTTI == 0)
+#define CLI11_USE_STATIC_RTTI 1
+#else
+#define CLI11_USE_STATIC_RTTI 0
+#endif
+#elif (defined(__GCC_RTTI) && __GXX_RTTI)
+#define CLI11_USE_STATIC_RTTI 0
+#else
+#define CLI11_USE_STATIC_RTTI 1
+#endif
+#endif
+
+/** availability */
+#if defined CLI11_CPP17 && defined __has_include && !defined CLI11_HAS_FILESYSTEM
+#if __has_include(<filesystem>)
+// Filesystem cannot be used if targeting macOS < 10.15
+#if defined __MAC_OS_X_VERSION_MIN_REQUIRED && __MAC_OS_X_VERSION_MIN_REQUIRED < 101500
+#define CLI11_HAS_FILESYSTEM 0
+#elif defined(__wasi__)
+// As of wasi-sdk-14, filesystem is not implemented
+#define CLI11_HAS_FILESYSTEM 0
+#else
+#include <filesystem>
+#if defined __cpp_lib_filesystem && __cpp_lib_filesystem >= 201703
+#if defined _GLIBCXX_RELEASE && _GLIBCXX_RELEASE >= 9
+#define CLI11_HAS_FILESYSTEM 1
+#elif defined(__GLIBCXX__)
+// if we are using gcc and Version <9 default to no filesystem
+#define CLI11_HAS_FILESYSTEM 0
+#else
+#define CLI11_HAS_FILESYSTEM 1
+#endif
+#else
+#define CLI11_HAS_FILESYSTEM 0
+#endif
+#endif
+#endif
+#endif
+
+/** availability */
+#if !defined(CLI11_CPP26) && !defined(CLI11_HAS_CODECVT)
+#if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 5
+#define CLI11_HAS_CODECVT 0
+#else
+#define CLI11_HAS_CODECVT 1
+#include <codecvt>
+#endif
+#else
+#if defined(CLI11_HAS_CODECVT)
+#if CLI11_HAS_CODECVT > 0
+#include <codecvt>
+#endif
+#else
+#define CLI11_HAS_CODECVT 0
+#endif
+#endif
+
+/** disable deprecations */
+#if defined(__GNUC__)  // GCC or clang
+#define CLI11_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
+#define CLI11_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
+#define CLI11_DIAGNOSTIC_IGNORE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#elif defined(_MSC_VER)
+#define CLI11_DIAGNOSTIC_PUSH __pragma(warning(push))
+#define CLI11_DIAGNOSTIC_POP __pragma(warning(pop))
-// Verbatim copy from CLI/Optional.hpp:
+#define CLI11_DIAGNOSTIC_IGNORE_DEPRECATED __pragma(warning(disable : 4996))
-#ifdef __has_include
+#else
+#define CLI11_DIAGNOSTIC_PUSH
+#define CLI11_DIAGNOSTIC_POP
-// You can explicitly enable or disable support
-// by defining these to 1 or 0.
-#if defined(CLI11_CPP17) && __has_include() && \ - !defined(CLI11_STD_OPTIONAL) -#define CLI11_STD_OPTIONAL 1 -#elif !defined(CLI11_STD_OPTIONAL) -#define CLI11_STD_OPTIONAL 0 -#endif +#define CLI11_DIAGNOSTIC_IGNORE_DEPRECATED -#if defined(CLI11_CPP14) && __has_include() && \ - !defined(CLI11_EXPERIMENTAL_OPTIONAL) \ - && (!defined(CLI11_STD_OPTIONAL) || CLI11_STD_OPTIONAL == 0) -#define CLI11_EXPERIMENTAL_OPTIONAL 1 -#elif !defined(CLI11_EXPERIMENTAL_OPTIONAL) -#define CLI11_EXPERIMENTAL_OPTIONAL 0 #endif -#if __has_include() && !defined(CLI11_BOOST_OPTIONAL) -#include -#if BOOST_VERSION >= 105800 -#define CLI11_BOOST_OPTIONAL 1 -#endif -#elif !defined(CLI11_BOOST_OPTIONAL) -#define CLI11_BOOST_OPTIONAL 0 +/** Inline macro **/ +#ifdef CLI11_COMPILE +#define CLI11_INLINE +#else +#define CLI11_INLINE inline #endif + + +#if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0 +#include // NOLINT(build/include) +#else +#include +#include #endif -#if CLI11_STD_OPTIONAL -#include + + + +#ifdef CLI11_CPP17 +#include +#endif // CLI11_CPP17 + +#if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0 +#include +#include // NOLINT(build/include) +#endif // CLI11_HAS_FILESYSTEM + + + +#if defined(_WIN32) +#if !(defined(_AMD64_) || defined(_X86_) || defined(_ARM_)) +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || \ + defined(_M_AMD64) +#define _AMD64_ +#elif defined(i386) || defined(__i386) || defined(__i386__) || defined(__i386__) || defined(_M_IX86) +#define _X86_ +#elif defined(__arm__) || defined(_M_ARM) || defined(_M_ARMT) +#define _ARM_ +#elif defined(__aarch64__) || defined(_M_ARM64) +#define _ARM64_ +#elif defined(_M_ARM64EC) +#define _ARM64EC_ +#endif #endif -#if CLI11_EXPERIMENTAL_OPTIONAL -#include + +// first +#ifndef NOMINMAX +// if NOMINMAX is already defined we don't want to mess with that either way +#define NOMINMAX +#include +#undef NOMINMAX +#else +#include #endif -#if CLI11_BOOST_OPTIONAL -#include + +// second +#include +// third +#include +#include #endif -// From CLI/Version.hpp: +namespace CLI { + +/// Convert a wide string to a narrow string. +CLI11_INLINE std::string narrow(const std::wstring &str); +CLI11_INLINE std::string narrow(const wchar_t *str); +CLI11_INLINE std::string narrow(const wchar_t *str, std::size_t size); +/// Convert a narrow string to a wide string. +CLI11_INLINE std::wstring widen(const std::string &str); +CLI11_INLINE std::wstring widen(const char *str); +CLI11_INLINE std::wstring widen(const char *str, std::size_t size); -// From CLI/Macros.hpp: +#ifdef CLI11_CPP17 +CLI11_INLINE std::string narrow(std::wstring_view str); +CLI11_INLINE std::wstring widen(std::string_view str); +#endif // CLI11_CPP17 +#if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0 +/// Convert a char-string to a native path correctly. 
+CLI11_INLINE std::filesystem::path to_path(std::string_view str); +#endif // CLI11_HAS_FILESYSTEM -// From CLI/Optional.hpp: -namespace CLI { -#if CLI11_STD_OPTIONAL -template std::istream &operator>>(std::istream &in, std::optional &val) { - T v; - in >> v; - val = v; - return in; +namespace detail { + +#if !CLI11_HAS_CODECVT +/// Attempt to set one of the acceptable unicode locales for conversion +CLI11_INLINE void set_unicode_locale() { + static const std::array unicode_locales{{"C.UTF-8", "en_US.UTF-8", ".UTF-8"}}; + + for(const auto &locale_name : unicode_locales) { + if(std::setlocale(LC_ALL, locale_name) != nullptr) { + return; + } + } + throw std::runtime_error("CLI::narrow: could not set locale to C.UTF-8"); } -#endif -#if CLI11_EXPERIMENTAL_OPTIONAL -template std::istream &operator>>(std::istream &in, std::experimental::optional &val) { - T v; - in >> v; - val = v; - return in; +template struct scope_guard_t { + F closure; + + explicit scope_guard_t(F closure_) : closure(closure_) {} + ~scope_guard_t() { closure(); } +}; + +template CLI11_NODISCARD CLI11_INLINE scope_guard_t scope_guard(F &&closure) { + return scope_guard_t{std::forward(closure)}; } -#endif -#if CLI11_BOOST_OPTIONAL -template std::istream &operator>>(std::istream &in, boost::optional &val) { - T v; - in >> v; - val = v; - return in; +#endif // !CLI11_HAS_CODECVT + +CLI11_DIAGNOSTIC_PUSH +CLI11_DIAGNOSTIC_IGNORE_DEPRECATED + +CLI11_INLINE std::string narrow_impl(const wchar_t *str, std::size_t str_size) { +#if CLI11_HAS_CODECVT +#ifdef _WIN32 + return std::wstring_convert>().to_bytes(str, str + str_size); + +#else + return std::wstring_convert>().to_bytes(str, str + str_size); + +#endif // _WIN32 +#else // CLI11_HAS_CODECVT + (void)str_size; + std::mbstate_t state = std::mbstate_t(); + const wchar_t *it = str; + + std::string old_locale = std::setlocale(LC_ALL, nullptr); + auto sg = scope_guard([&] { std::setlocale(LC_ALL, old_locale.c_str()); }); + set_unicode_locale(); + + std::size_t new_size = std::wcsrtombs(nullptr, &it, 0, &state); + if(new_size == static_cast(-1)) { + throw std::runtime_error("CLI::narrow: conversion error in std::wcsrtombs at offset " + + std::to_string(it - str)); + } + std::string result(new_size, '\0'); + std::wcsrtombs(const_cast(result.data()), &str, new_size, &state); + + return result; + +#endif // CLI11_HAS_CODECVT } -#endif -// Export the best optional to the CLI namespace -#if CLI11_STD_OPTIONAL -using std::optional; -#elif CLI11_EXPERIMENTAL_OPTIONAL -using std::experimental::optional; -#elif CLI11_BOOST_OPTIONAL -using boost::optional; -#endif +CLI11_INLINE std::wstring widen_impl(const char *str, std::size_t str_size) { +#if CLI11_HAS_CODECVT +#ifdef _WIN32 + return std::wstring_convert>().from_bytes(str, str + str_size); + +#else + return std::wstring_convert>().from_bytes(str, str + str_size); + +#endif // _WIN32 +#else // CLI11_HAS_CODECVT + (void)str_size; + std::mbstate_t state = std::mbstate_t(); + const char *it = str; + + std::string old_locale = std::setlocale(LC_ALL, nullptr); + auto sg = scope_guard([&] { std::setlocale(LC_ALL, old_locale.c_str()); }); + set_unicode_locale(); + + std::size_t new_size = std::mbsrtowcs(nullptr, &it, 0, &state); + if(new_size == static_cast(-1)) { + throw std::runtime_error("CLI::widen: conversion error in std::mbsrtowcs at offset " + + std::to_string(it - str)); + } + std::wstring result(new_size, L'\0'); + std::mbsrtowcs(const_cast(result.data()), &str, new_size, &state); + + return result; + +#endif // CLI11_HAS_CODECVT +} + 
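
A minimal round-trip sketch of the CLI::narrow/CLI::widen helpers implemented above (an
illustration alongside this diff, not part of the vendored header; it assumes the updated
single-header CLI11.hpp is on the include path):

    #include <cassert>
    #include <string>
    #include "CLI11.hpp"

    int main() {
        // U+20AC (euro sign) written as explicit UTF-8 bytes, so the example
        // does not depend on the source file's encoding.
        const std::string utf8 = "\xE2\x82\xAC";
        const std::wstring wide = CLI::widen(utf8);   // narrow -> wide
        const std::string back = CLI::narrow(wide);   // wide -> narrow
        assert(back == utf8);                         // the round trip is lossless
        return 0;
    }

Note that on the non-codecvt fallback path above, the conversion temporarily switches to a
UTF-8 locale and throws std::runtime_error if none of the candidate locales can be set.
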
+CLI11_DIAGNOSTIC_POP + +} // namespace detail + +CLI11_INLINE std::string narrow(const wchar_t *str, std::size_t str_size) { return detail::narrow_impl(str, str_size); } +CLI11_INLINE std::string narrow(const std::wstring &str) { return detail::narrow_impl(str.data(), str.size()); } +// Flawfinder: ignore +CLI11_INLINE std::string narrow(const wchar_t *str) { return detail::narrow_impl(str, std::wcslen(str)); } + +CLI11_INLINE std::wstring widen(const char *str, std::size_t str_size) { return detail::widen_impl(str, str_size); } +CLI11_INLINE std::wstring widen(const std::string &str) { return detail::widen_impl(str.data(), str.size()); } +// Flawfinder: ignore +CLI11_INLINE std::wstring widen(const char *str) { return detail::widen_impl(str, std::strlen(str)); } + +#ifdef CLI11_CPP17 +CLI11_INLINE std::string narrow(std::wstring_view str) { return detail::narrow_impl(str.data(), str.size()); } +CLI11_INLINE std::wstring widen(std::string_view str) { return detail::widen_impl(str.data(), str.size()); } +#endif // CLI11_CPP17 + +#if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0 +CLI11_INLINE std::filesystem::path to_path(std::string_view str) { + return std::filesystem::path{ +#ifdef _WIN32 + widen(str) +#else + str +#endif // _WIN32 + }; +} +#endif // CLI11_HAS_FILESYSTEM + -// This is true if any optional is found -#if CLI11_STD_OPTIONAL || CLI11_EXPERIMENTAL_OPTIONAL || CLI11_BOOST_OPTIONAL -#define CLI11_OPTIONAL 1 + + +namespace detail { +#ifdef _WIN32 +/// Decode and return UTF-8 argv from GetCommandLineW. +CLI11_INLINE std::vector compute_win32_argv(); #endif +} // namespace detail -} // namespace CLI -// From CLI/StringTools.hpp: -namespace CLI { namespace detail { -// Based on http://stackoverflow.com/questions/236129/split-a-string-in-c -/// Split a string by a delim -inline std::vector split(const std::string &s, char delim) { - std::vector elems; - // Check to see if empty string, give consistent result - if(s.empty()) - elems.emplace_back(""); - else { - std::stringstream ss; - ss.str(s); - std::string item; - while(std::getline(ss, item, delim)) { - elems.push_back(item); - } +#ifdef _WIN32 +CLI11_INLINE std::vector compute_win32_argv() { + std::vector result; + int argc = 0; + + auto deleter = [](wchar_t **ptr) { LocalFree(ptr); }; + // NOLINTBEGIN(*-avoid-c-arrays) + auto wargv = std::unique_ptr(CommandLineToArgvW(GetCommandLineW(), &argc), deleter); + // NOLINTEND(*-avoid-c-arrays) + + if(wargv == nullptr) { + throw std::runtime_error("CommandLineToArgvW failed with code " + std::to_string(GetLastError())); } - return elems; + + result.reserve(static_cast(argc)); + for(size_t i = 0; i < static_cast(argc); ++i) { + result.push_back(narrow(wargv[i])); + } + + return result; +} +#endif + +} // namespace detail + + + + +/// Include the items in this namespace to get free conversion of enums to/from streams. +/// (This is available inside CLI as well, so CLI11 will use this without a using statement). 
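
A usage sketch for the enum streaming operator declared in the namespace below (illustration
only, not part of the vendored header; assumes the updated CLI11.hpp is on the include path):

    #include <iostream>
    #include "CLI11.hpp"

    enum class Level { low = 0, high = 1 };

    int main() {
        using namespace CLI::enums;  // make operator<< for enums visible here
        Level lv = Level::high;
        std::cout << lv << '\n';     // prints the underlying value: 1
        return 0;
    }
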
+namespace enums {
+
+/// output streaming for enumerations
+template <typename T, typename = typename std::enable_if<std::is_enum<T>::value>::type>
+std::ostream &operator<<(std::ostream &in, const T &item) {
+    // make sure this is out of the detail namespace otherwise it won't be found when needed
+    return in << static_cast<typename std::underlying_type<T>::type>(item);
+}
+
+}  // namespace enums
+
+/// Export to CLI namespace
+using enums::operator<<;
+
+namespace detail {
+/// a constant defining an expected max vector size defined to be a big number that could be multiplied by 4 and not
+/// produce overflow for some expected uses
+constexpr int expected_max_vector_size{1 << 29};
+
+// Based on http://stackoverflow.com/questions/236129/split-a-string-in-c
+/// Split a string by a delim
+CLI11_INLINE std::vector<std::string> split(const std::string &s, char delim);
+
 /// Simple function to join a string
 template <typename T> std::string join(const T &v, std::string delim = ",") {
     std::ostringstream s;
-    size_t start = 0;
-    for(const auto &i : v) {
-        if(start++ > 0)
+    auto beg = std::begin(v);
+    auto end = std::end(v);
+    if(beg != end)
+        s << *beg++;
+    while(beg != end) {
+        s << delim << *beg++;
+    }
+    auto rval = s.str();
+    if(!rval.empty() && delim.size() == 1 && rval.back() == delim[0]) {
+        // remove trailing delimiter if the last entry was empty
+        rval.pop_back();
+    }
+    return rval;
+}
+
+/// Simple function to join a string from processed elements
+template <typename T,
+          typename Callable,
+          typename = typename std::enable_if<!std::is_constructible<std::string, Callable>::value>::type>
+std::string join(const T &v, Callable func, std::string delim = ",") {
+    std::ostringstream s;
+    auto beg = std::begin(v);
+    auto end = std::end(v);
+    auto loc = s.tellp();
+    while(beg != end) {
+        auto nloc = s.tellp();
+        if(nloc > loc) {
             s << delim;
-        s << i;
+            loc = nloc;
+        }
+        s << func(*beg++);
     }
     return s.str();
 }
@@ -249,7 +526,7 @@ template <typename T> std::string join(const T &v, std::string delim = ",") {
 /// Join a string in reverse order
 template <typename T> std::string rjoin(const T &v, std::string delim = ",") {
     std::ostringstream s;
-    for(size_t start = 0; start < v.size(); start++) {
+    for(std::size_t start = 0; start < v.size(); start++) {
         if(start > 0)
             s << delim;
         s << v[v.size() - start - 1];
@@ -260,33 +537,16 @@ template <typename T> std::string rjoin(const T &v, std::string delim = ",") {
 // Based roughly on http://stackoverflow.com/questions/25829143/c-trim-whitespace-from-a-string
 /// Trim whitespace from left of string
-inline std::string &ltrim(std::string &str) {
-    auto it = std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch, std::locale()); });
-    str.erase(str.begin(), it);
-    return str;
-}
+CLI11_INLINE std::string &ltrim(std::string &str);

 /// Trim anything from left of string
-inline std::string &ltrim(std::string &str, const std::string &filter) {
-    auto it = std::find_if(str.begin(), str.end(), [&filter](char ch) { return filter.find(ch) == std::string::npos; });
-    str.erase(str.begin(), it);
-    return str;
-}
+CLI11_INLINE std::string &ltrim(std::string &str, const std::string &filter);

 /// Trim whitespace from right of string
-inline std::string &rtrim(std::string &str) {
-    auto it = std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch, std::locale()); });
-    str.erase(it.base(), str.end());
-    return str;
-}
+CLI11_INLINE std::string &rtrim(std::string &str);

 /// Trim anything from right of string
-inline std::string &rtrim(std::string &str, const std::string &filter) {
-    auto it =
-        std::find_if(str.rbegin(), str.rend(), [&filter](char ch) { return filter.find(ch) == std::string::npos; });
-    str.erase(it.base(), str.end());
-    return str;
-}
+CLI11_INLINE std::string &rtrim(std::string &str, const
std::string &filter); /// Trim whitespace from string inline std::string &trim(std::string &str) { return ltrim(rtrim(str)); } @@ -300,40 +560,59 @@ inline std::string trim_copy(const std::string &str) { return trim(s); } +/// remove quotes at the front and back of a string either '"' or '\'' +CLI11_INLINE std::string &remove_quotes(std::string &str); + +/// remove quotes from all elements of a string vector and process escaped components +CLI11_INLINE void remove_quotes(std::vector &args); + +/// Add a leader to the beginning of all new lines (nothing is added +/// at the start of the first line). `"; "` would be for ini files +/// +/// Can't use Regex, or this would be a subs. +CLI11_INLINE std::string fix_newlines(const std::string &leader, std::string input); + /// Make a copy of the string and then trim it, any filter string can be used (any char in string is filtered) inline std::string trim_copy(const std::string &str, const std::string &filter) { std::string s = str; return trim(s, filter); } -/// Print a two part "help" string -inline std::ostream &format_help(std::ostream &out, std::string name, std::string description, size_t wid) { - name = " " + name; - out << std::setw(static_cast(wid)) << std::left << name; - if(!description.empty()) { - if(name.length() >= wid) - out << "\n" << std::setw(static_cast(wid)) << ""; - out << description; - } - out << "\n"; - return out; -} + +/// Print subcommand aliases +CLI11_INLINE std::ostream &format_aliases(std::ostream &out, const std::vector &aliases, std::size_t wid); /// Verify the first character of an option -template bool valid_first_char(T c) { return std::isalpha(c, std::locale()) || c == '_'; } +/// - is a trigger character, ! has special meaning and new lines would just be annoying to deal with +template bool valid_first_char(T c) { + return ((c != '-') && (static_cast(c) > 33)); // space and '!' not allowed +} /// Verify following characters of an option template bool valid_later_char(T c) { - return std::isalnum(c, std::locale()) || c == '_' || c == '.' 
|| c == '-'; + // = and : are value separators, { has special meaning for option defaults, + // and control codes other than tab would just be annoying to deal with in many places allowing space here has too + // much potential for inadvertent entry errors and bugs + return ((c != '=') && (c != ':') && (c != '{') && ((static_cast(c) > 32) || c == '\t')); } -/// Verify an option name -inline bool valid_name_string(const std::string &str) { - if(str.empty() || !valid_first_char(str[0])) - return false; - for(auto c : str.substr(1)) - if(!valid_later_char(c)) - return false; - return true; +/// Verify an option/subcommand name +CLI11_INLINE bool valid_name_string(const std::string &str); + +/// Verify an app name +inline bool valid_alias_name_string(const std::string &str) { + static const std::string badChars(std::string("\n") + '\0'); + return (str.find_first_of(badChars) == std::string::npos); +} + +/// check if a string is a container segment separator (empty or "%%") +inline bool is_separator(const std::string &str) { + static const std::string sep("%%"); + return (str.empty() || str == sep); +} + +/// Verify that str consists of letters only +inline bool isalpha(const std::string &str) { + return std::all_of(str.begin(), str.end(), [](char c) { return std::isalpha(c, std::locale()); }); } /// Return a lower case version of a string @@ -351,137 +630,687 @@ inline std::string remove_underscore(std::string str) { } /// Find and replace a substring with another substring -inline std::string find_and_replace(std::string str, std::string from, std::string to) { +CLI11_INLINE std::string find_and_replace(std::string str, std::string from, std::string to); - size_t start_pos = 0; +/// check if the flag definitions has possible false flags +inline bool has_default_flag_values(const std::string &flags) { + return (flags.find_first_of("{!") != std::string::npos); +} - while((start_pos = str.find(from, start_pos)) != std::string::npos) { - str.replace(start_pos, from.length(), to); - start_pos += to.length(); - } +CLI11_INLINE void remove_default_flag_values(std::string &flags); - return str; -} +/// Check if a string is a member of a list of strings and optionally ignore case or ignore underscores +CLI11_INLINE std::ptrdiff_t find_member(std::string name, + const std::vector names, + bool ignore_case = false, + bool ignore_underscore = false); /// Find a trigger string and call a modify callable function that takes the current string and starting position of the /// trigger and returns the position in the string to search for the next trigger string template inline std::string find_and_modify(std::string str, std::string trigger, Callable modify) { - size_t start_pos = 0; + std::size_t start_pos = 0; while((start_pos = str.find(trigger, start_pos)) != std::string::npos) { start_pos = modify(str, start_pos); } return str; } -/// Split a string '"one two" "three"' into 'one two', 'three' -/// Quote characters can be ` ' or " -inline std::vector split_up(std::string str) { - - const std::string delims("\'\"`"); - auto find_ws = [](char ch) { return std::isspace(ch, std::locale()); }; - trim(str); +/// close a sequence of characters indicated by a closure character. 
Brackets allows sub sequences +/// recognized bracket sequences include "'`[(<{ other closure characters are assumed to be literal strings +CLI11_INLINE std::size_t close_sequence(const std::string &str, std::size_t start, char closure_char); - std::vector output; - bool embeddedQuote = false; - char keyChar = ' '; - while(!str.empty()) { - if(delims.find_first_of(str[0]) != std::string::npos) { - keyChar = str[0]; - auto end = str.find_first_of(keyChar, 1); - while((end != std::string::npos) && (str[end - 1] == '\\')) { // deal with escaped quotes - end = str.find_first_of(keyChar, end + 1); - embeddedQuote = true; - } - if(end != std::string::npos) { - output.push_back(str.substr(1, end - 1)); - str = str.substr(end + 1); - } else { - output.push_back(str.substr(1)); - str = ""; - } - } else { - auto it = std::find_if(std::begin(str), std::end(str), find_ws); - if(it != std::end(str)) { - std::string value = std::string(str.begin(), it); - output.push_back(value); - str = std::string(it, str.end()); - } else { - output.push_back(str); - str = ""; - } - } - // transform any embedded quotes into the regular character - if(embeddedQuote) { - output.back() = find_and_replace(output.back(), std::string("\\") + keyChar, std::string(1, keyChar)); - embeddedQuote = false; - } - trim(str); - } - return output; -} +/// Split a string '"one two" "three"' into 'one two', 'three' +/// Quote characters can be ` ' or " or bracket characters [{(< with matching to the matching bracket +CLI11_INLINE std::vector split_up(std::string str, char delimiter = '\0'); -/// Add a leader to the beginning of all new lines (nothing is added -/// at the start of the first line). `"; "` would be for ini files -/// -/// Can't use Regex, or this would be a subs. -inline std::string fix_newlines(std::string leader, std::string input) { - std::string::size_type n = 0; - while(n != std::string::npos && n < input.size()) { - n = input.find('\n', n); - if(n != std::string::npos) { - input = input.substr(0, n + 1) + leader + input.substr(n + 1); - n += leader.size(); - } - } - return input; -} +/// get the value of an environmental variable or empty string if empty +CLI11_INLINE std::string get_environment_value(const std::string &env_name); /// This function detects an equal or colon followed by an escaped quote after an argument /// then modifies the string to replace the equality with a space. This is needed /// to allow the split up function to work properly and is intended to be used with the find_and_modify function /// the return value is the offset+1 which is required by the find_and_modify function. -inline size_t escape_detect(std::string &str, size_t offset) { - auto next = str[offset + 1]; - if((next == '\"') || (next == '\'') || (next == '`')) { - auto astart = str.find_last_of("-/ \"\'`", offset - 1); - if(astart != std::string::npos) { - if(str[astart] == ((str[offset] == '=') ? '-' : '/')) - str[offset] = ' '; // interpret this as a space so the split_up works properly - } - } - return offset + 1; -} +CLI11_INLINE std::size_t escape_detect(std::string &str, std::size_t offset); -/// Add quotes if the string contains spaces -inline std::string &add_quotes_if_needed(std::string &str) { - if((str.front() != '"' && str.front() != '\'') || str.front() != str.back()) { - char quote = str.find('"') < str.find('\'') ? 
'\'' : '"'; - if(str.find(' ') != std::string::npos) { - str.insert(0, 1, quote); - str.append(1, quote); - } - } - return str; -} +/// @brief detect if a string has escapable characters +/// @param str the string to do the detection on +/// @return true if the string has escapable characters +CLI11_INLINE bool has_escapable_character(const std::string &str); -} // namespace detail -} // namespace CLI +/// @brief escape all escapable characters +/// @param str the string to escape +/// @return a string with the escapable characters escaped with '\' +CLI11_INLINE std::string add_escaped_characters(const std::string &str); -// From CLI/Error.hpp: +/// @brief replace the escaped characters with their equivalent +CLI11_INLINE std::string remove_escaped_characters(const std::string &str); -namespace CLI { +/// generate a string with all non printable characters escaped to hex codes +CLI11_INLINE std::string binary_escape_string(const std::string &string_to_escape); -// Use one of these on all error classes. -// These are temporary and are undef'd at the end of this file. -#define CLI11_ERROR_DEF(parent, name) \ - protected: \ - name(std::string ename, std::string msg, int exit_code) : parent(std::move(ename), std::move(msg), exit_code) {} \ - name(std::string ename, std::string msg, ExitCodes exit_code) \ - : parent(std::move(ename), std::move(msg), exit_code) {} \ - \ - public: \ - name(std::string msg, ExitCodes exit_code) : parent(#name, std::move(msg), exit_code) {} \ - name(std::string msg, int exit_code) : parent(#name, std::move(msg), exit_code) {} +CLI11_INLINE bool is_binary_escaped_string(const std::string &escaped_string); + +/// extract an escaped binary_string +CLI11_INLINE std::string extract_binary_string(const std::string &escaped_string); + +/// process a quoted string, remove the quotes and if appropriate handle escaped characters +CLI11_INLINE bool process_quoted_string(std::string &str, char string_char = '\"', char literal_char = '\''); + +/// This function formats the given text as a paragraph with fixed width and applies correct line wrapping +/// with a custom line prefix. The paragraph will get streamed to the given ostream. 
+CLI11_INLINE std::ostream &streamOutAsParagraph(std::ostream &out,
+                                                const std::string &text,
+                                                std::size_t paragraphWidth,
+                                                const std::string &linePrefix = "",
+                                                bool skipPrefixOnFirstLine = false);
+
+}  // namespace detail
+
+
+
+namespace detail {
+CLI11_INLINE std::vector<std::string> split(const std::string &s, char delim) {
+    std::vector<std::string> elems;
+    // Check to see if empty string, give consistent result
+    if(s.empty()) {
+        elems.emplace_back();
+    } else {
+        std::stringstream ss;
+        ss.str(s);
+        std::string item;
+        while(std::getline(ss, item, delim)) {
+            elems.push_back(item);
+        }
+    }
+    return elems;
+}
+
+CLI11_INLINE std::string &ltrim(std::string &str) {
+    auto it = std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch, std::locale()); });
+    str.erase(str.begin(), it);
+    return str;
+}
+
+CLI11_INLINE std::string &ltrim(std::string &str, const std::string &filter) {
+    auto it = std::find_if(str.begin(), str.end(), [&filter](char ch) { return filter.find(ch) == std::string::npos; });
+    str.erase(str.begin(), it);
+    return str;
+}
+
+CLI11_INLINE std::string &rtrim(std::string &str) {
+    auto it = std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch, std::locale()); });
+    str.erase(it.base(), str.end());
+    return str;
+}
+
+CLI11_INLINE std::string &rtrim(std::string &str, const std::string &filter) {
+    auto it =
+        std::find_if(str.rbegin(), str.rend(), [&filter](char ch) { return filter.find(ch) == std::string::npos; });
+    str.erase(it.base(), str.end());
+    return str;
+}
+
+CLI11_INLINE std::string &remove_quotes(std::string &str) {
+    if(str.length() > 1 && (str.front() == '"' || str.front() == '\'' || str.front() == '`')) {
+        if(str.front() == str.back()) {
+            str.pop_back();
+            str.erase(str.begin(), str.begin() + 1);
+        }
+    }
+    return str;
+}
+
+CLI11_INLINE std::string &remove_outer(std::string &str, char key) {
+    if(str.length() > 1 && (str.front() == key)) {
+        if(str.front() == str.back()) {
+            str.pop_back();
+            str.erase(str.begin(), str.begin() + 1);
+        }
+    }
+    return str;
+}
+
+CLI11_INLINE std::string fix_newlines(const std::string &leader, std::string input) {
+    std::string::size_type n = 0;
+    while(n != std::string::npos && n < input.size()) {
+        n = input.find('\n', n);
+        if(n != std::string::npos) {
+            input = input.substr(0, n + 1) + leader + input.substr(n + 1);
+            n += leader.size();
+        }
+    }
+    return input;
+}
+
+CLI11_INLINE std::ostream &format_aliases(std::ostream &out, const std::vector<std::string> &aliases, std::size_t wid) {
+    if(!aliases.empty()) {
+        out << std::setw(static_cast<int>(wid)) << " aliases: ";
+        bool front = true;
+        for(const auto &alias : aliases) {
+            if(!front) {
+                out << ", ";
+            } else {
+                front = false;
+            }
+            out << detail::fix_newlines(" ", alias);
+        }
+        out << "\n";
+    }
+    return out;
+}
+
+CLI11_INLINE bool valid_name_string(const std::string &str) {
+    if(str.empty() || !valid_first_char(str[0])) {
+        return false;
+    }
+    auto e = str.end();
+    for(auto c = str.begin() + 1; c != e; ++c)
+        if(!valid_later_char(*c))
+            return false;
+    return true;
+}
+
+CLI11_INLINE std::string find_and_replace(std::string str, std::string from, std::string to) {
+
+    std::size_t start_pos = 0;
+
+    while((start_pos = str.find(from, start_pos)) != std::string::npos) {
+        str.replace(start_pos, from.length(), to);
+        start_pos += to.length();
+    }
+
+    return str;
+}
+
+CLI11_INLINE void remove_default_flag_values(std::string &flags) {
+    auto loc = flags.find_first_of('{', 2);
+    while(loc != std::string::npos) {
+        auto finish = flags.find_first_of("},", loc + 1);
+
if((finish != std::string::npos) && (flags[finish] == '}')) { + flags.erase(flags.begin() + static_cast(loc), + flags.begin() + static_cast(finish) + 1); + } + loc = flags.find_first_of('{', loc + 1); + } + flags.erase(std::remove(flags.begin(), flags.end(), '!'), flags.end()); +} + +CLI11_INLINE std::ptrdiff_t +find_member(std::string name, const std::vector names, bool ignore_case, bool ignore_underscore) { + auto it = std::end(names); + if(ignore_case) { + if(ignore_underscore) { + name = detail::to_lower(detail::remove_underscore(name)); + it = std::find_if(std::begin(names), std::end(names), [&name](std::string local_name) { + return detail::to_lower(detail::remove_underscore(local_name)) == name; + }); + } else { + name = detail::to_lower(name); + it = std::find_if(std::begin(names), std::end(names), [&name](std::string local_name) { + return detail::to_lower(local_name) == name; + }); + } + + } else if(ignore_underscore) { + name = detail::remove_underscore(name); + it = std::find_if(std::begin(names), std::end(names), [&name](std::string local_name) { + return detail::remove_underscore(local_name) == name; + }); + } else { + it = std::find(std::begin(names), std::end(names), name); + } + + return (it != std::end(names)) ? (it - std::begin(names)) : (-1); +} + +static const std::string escapedChars("\b\t\n\f\r\"\\"); +static const std::string escapedCharsCode("btnfr\"\\"); +static const std::string bracketChars{"\"'`[(<{"}; +static const std::string matchBracketChars("\"'`])>}"); + +CLI11_INLINE bool has_escapable_character(const std::string &str) { + return (str.find_first_of(escapedChars) != std::string::npos); +} + +CLI11_INLINE std::string add_escaped_characters(const std::string &str) { + std::string out; + out.reserve(str.size() + 4); + for(char s : str) { + auto sloc = escapedChars.find_first_of(s); + if(sloc != std::string::npos) { + out.push_back('\\'); + out.push_back(escapedCharsCode[sloc]); + } else { + out.push_back(s); + } + } + return out; +} + +CLI11_INLINE std::uint32_t hexConvert(char hc) { + int hcode{0}; + if(hc >= '0' && hc <= '9') { + hcode = (hc - '0'); + } else if(hc >= 'A' && hc <= 'F') { + hcode = (hc - 'A' + 10); + } else if(hc >= 'a' && hc <= 'f') { + hcode = (hc - 'a' + 10); + } else { + hcode = -1; + } + return static_cast(hcode); +} + +CLI11_INLINE char make_char(std::uint32_t code) { return static_cast(static_cast(code)); } + +CLI11_INLINE void append_codepoint(std::string &str, std::uint32_t code) { + if(code < 0x80) { // ascii code equivalent + str.push_back(static_cast(code)); + } else if(code < 0x800) { // \u0080 to \u07FF + // 110yyyyx 10xxxxxx; 0x3f == 0b0011'1111 + str.push_back(make_char(0xC0 | code >> 6)); + str.push_back(make_char(0x80 | (code & 0x3F))); + } else if(code < 0x10000) { // U+0800...U+FFFF + if(0xD800 <= code && code <= 0xDFFF) { + throw std::invalid_argument("[0xD800, 0xDFFF] are not valid UTF-8."); + } + // 1110yyyy 10yxxxxx 10xxxxxx + str.push_back(make_char(0xE0 | code >> 12)); + str.push_back(make_char(0x80 | (code >> 6 & 0x3F))); + str.push_back(make_char(0x80 | (code & 0x3F))); + } else if(code < 0x110000) { // U+010000 ... 
U+10FFFF + // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx + str.push_back(make_char(0xF0 | code >> 18)); + str.push_back(make_char(0x80 | (code >> 12 & 0x3F))); + str.push_back(make_char(0x80 | (code >> 6 & 0x3F))); + str.push_back(make_char(0x80 | (code & 0x3F))); + } +} + +CLI11_INLINE std::string remove_escaped_characters(const std::string &str) { + + std::string out; + out.reserve(str.size()); + for(auto loc = str.begin(); loc < str.end(); ++loc) { + if(*loc == '\\') { + if(str.end() - loc < 2) { + throw std::invalid_argument("invalid escape sequence " + str); + } + auto ecloc = escapedCharsCode.find_first_of(*(loc + 1)); + if(ecloc != std::string::npos) { + out.push_back(escapedChars[ecloc]); + ++loc; + } else if(*(loc + 1) == 'u') { + // must have 4 hex characters + if(str.end() - loc < 6) { + throw std::invalid_argument("unicode sequence must have 4 hex codes " + str); + } + std::uint32_t code{0}; + std::uint32_t mplier{16 * 16 * 16}; + for(int ii = 2; ii < 6; ++ii) { + std::uint32_t res = hexConvert(*(loc + ii)); + if(res > 0x0F) { + throw std::invalid_argument("unicode sequence must have 4 hex codes " + str); + } + code += res * mplier; + mplier = mplier / 16; + } + append_codepoint(out, code); + loc += 5; + } else if(*(loc + 1) == 'U') { + // must have 8 hex characters + if(str.end() - loc < 10) { + throw std::invalid_argument("unicode sequence must have 8 hex codes " + str); + } + std::uint32_t code{0}; + std::uint32_t mplier{16 * 16 * 16 * 16 * 16 * 16 * 16}; + for(int ii = 2; ii < 10; ++ii) { + std::uint32_t res = hexConvert(*(loc + ii)); + if(res > 0x0F) { + throw std::invalid_argument("unicode sequence must have 8 hex codes " + str); + } + code += res * mplier; + mplier = mplier / 16; + } + append_codepoint(out, code); + loc += 9; + } else if(*(loc + 1) == '0') { + out.push_back('\0'); + ++loc; + } else { + throw std::invalid_argument(std::string("unrecognized escape sequence \\") + *(loc + 1) + " in " + str); + } + } else { + out.push_back(*loc); + } + } + return out; +} + +CLI11_INLINE std::size_t close_string_quote(const std::string &str, std::size_t start, char closure_char) { + std::size_t loc{0}; + for(loc = start + 1; loc < str.size(); ++loc) { + if(str[loc] == closure_char) { + break; + } + if(str[loc] == '\\') { + // skip the next character for escaped sequences + ++loc; + } + } + return loc; +} + +CLI11_INLINE std::size_t close_literal_quote(const std::string &str, std::size_t start, char closure_char) { + auto loc = str.find_first_of(closure_char, start + 1); + return (loc != std::string::npos ? 
loc : str.size());
+}
+
+CLI11_INLINE std::size_t close_sequence(const std::string &str, std::size_t start, char closure_char) {
+
+    auto bracket_loc = matchBracketChars.find(closure_char);
+    switch(bracket_loc) {
+    case 0:
+        return close_string_quote(str, start, closure_char);
+    case 1:
+    case 2:
+    case std::string::npos:
+        return close_literal_quote(str, start, closure_char);
+    default:
+        break;
+    }
+
+    std::string closures(1, closure_char);
+    auto loc = start + 1;
+
+    while(loc < str.size()) {
+        if(str[loc] == closures.back()) {
+            closures.pop_back();
+            if(closures.empty()) {
+                return loc;
+            }
+        }
+        bracket_loc = bracketChars.find(str[loc]);
+        if(bracket_loc != std::string::npos) {
+            switch(bracket_loc) {
+            case 0:
+                loc = close_string_quote(str, loc, str[loc]);
+                break;
+            case 1:
+            case 2:
+                loc = close_literal_quote(str, loc, str[loc]);
+                break;
+            default:
+                closures.push_back(matchBracketChars[bracket_loc]);
+                break;
+            }
+        }
+        ++loc;
+    }
+    if(loc > str.size()) {
+        loc = str.size();
+    }
+    return loc;
+}
+
+CLI11_INLINE std::vector<std::string> split_up(std::string str, char delimiter) {
+
+    auto find_ws = [delimiter](char ch) {
+        return (delimiter == '\0') ? std::isspace(ch, std::locale()) : (ch == delimiter);
+    };
+    trim(str);
+
+    std::vector<std::string> output;
+    while(!str.empty()) {
+        if(bracketChars.find_first_of(str[0]) != std::string::npos) {
+            auto bracketLoc = bracketChars.find_first_of(str[0]);
+            auto end = close_sequence(str, 0, matchBracketChars[bracketLoc]);
+            if(end >= str.size()) {
+                output.push_back(std::move(str));
+                str.clear();
+            } else {
+                output.push_back(str.substr(0, end + 1));
+                if(end + 2 < str.size()) {
+                    str = str.substr(end + 2);
+                } else {
+                    str.clear();
+                }
+            }
+
+        } else {
+            auto it = std::find_if(std::begin(str), std::end(str), find_ws);
+            if(it != std::end(str)) {
+                std::string value = std::string(str.begin(), it);
+                output.push_back(value);
+                str = std::string(it + 1, str.end());
+            } else {
+                output.push_back(str);
+                str.clear();
+            }
+        }
+        trim(str);
+    }
+    return output;
+}
+
+CLI11_INLINE std::size_t escape_detect(std::string &str, std::size_t offset) {
+    auto next = str[offset + 1];
+    if((next == '\"') || (next == '\'') || (next == '`')) {
+        auto astart = str.find_last_of("-/ \"\'`", offset - 1);
+        if(astart != std::string::npos) {
+            if(str[astart] == ((str[offset] == '=') ? '-' : '/'))
+                str[offset] = ' ';  // interpret this as a space so the split_up works properly
+        }
+    }
+    return offset + 1;
+}
+
+CLI11_INLINE std::string binary_escape_string(const std::string &string_to_escape) {
+    // s is our escaped output string
+    std::string escaped_string{};
+    // loop through all characters
+    for(char c : string_to_escape) {
+        // check if a given character is printable
+        // the cast is necessary to avoid undefined behaviour
+        if(isprint(static_cast<unsigned char>(c)) == 0) {
+            std::stringstream stream;
+            // if the character is not printable
+            // we'll convert it to a hex string using a stringstream
+            // note that since char is signed we have to cast it to unsigned first
+            stream << std::hex << static_cast<unsigned int>(static_cast<unsigned char>(c));
+            std::string code = stream.str();
+            escaped_string += std::string("\\x") + (code.size() < 2 ? "0" : "") + code;
+        } else if(c == 'x' || c == 'X') {
+            // need to check for inadvertent binary sequences
+            if(!escaped_string.empty() && escaped_string.back() == '\\') {
+                escaped_string += std::string("\\x") + (c == 'x' ? "78" : "58");
+            } else {
+                escaped_string.push_back(c);
+            }
+
+        } else {
+            escaped_string.push_back(c);
+        }
+    }
+    if(escaped_string != string_to_escape) {
+        auto sqLoc = escaped_string.find('\'');
+        while(sqLoc != std::string::npos) {
+            escaped_string[sqLoc] = '\\';
+            escaped_string.insert(sqLoc + 1, "x27");
+            sqLoc = escaped_string.find('\'');
+        }
+        escaped_string.insert(0, "'B\"(");
+        escaped_string.push_back(')');
+        escaped_string.push_back('"');
+        escaped_string.push_back('\'');
+    }
+    return escaped_string;
+}
+
+CLI11_INLINE bool is_binary_escaped_string(const std::string &escaped_string) {
+    size_t ssize = escaped_string.size();
+    if(escaped_string.compare(0, 3, "B\"(") == 0 && escaped_string.compare(ssize - 2, 2, ")\"") == 0) {
+        return true;
+    }
+    return (escaped_string.compare(0, 4, "'B\"(") == 0 && escaped_string.compare(ssize - 3, 3, ")\"'") == 0);
+}
+
+CLI11_INLINE std::string extract_binary_string(const std::string &escaped_string) {
+    std::size_t start{0};
+    std::size_t tail{0};
+    size_t ssize = escaped_string.size();
+    if(escaped_string.compare(0, 3, "B\"(") == 0 && escaped_string.compare(ssize - 2, 2, ")\"") == 0) {
+        start = 3;
+        tail = 2;
+    } else if(escaped_string.compare(0, 4, "'B\"(") == 0 && escaped_string.compare(ssize - 3, 3, ")\"'") == 0) {
+        start = 4;
+        tail = 3;
+    }
+
+    if(start == 0) {
+        return escaped_string;
+    }
+    std::string outstring;
+
+    outstring.reserve(ssize - start - tail);
+    std::size_t loc = start;
+    while(loc < ssize - tail) {
+        // ssize-2 to skip )" at the end
+        if(escaped_string[loc] == '\\' && (escaped_string[loc + 1] == 'x' || escaped_string[loc + 1] == 'X')) {
+            auto c1 = escaped_string[loc + 2];
+            auto c2 = escaped_string[loc + 3];
+
+            std::uint32_t res1 = hexConvert(c1);
+            std::uint32_t res2 = hexConvert(c2);
+            if(res1 <= 0x0F && res2 <= 0x0F) {
+                loc += 4;
+                outstring.push_back(static_cast<char>(res1 * 16 + res2));
+                continue;
+            }
+        }
+        outstring.push_back(escaped_string[loc]);
+        ++loc;
+    }
+    return outstring;
+}
+
+CLI11_INLINE void remove_quotes(std::vector<std::string> &args) {
+    for(auto &arg : args) {
+        if(arg.front() == '\"' && arg.back() == '\"') {
+            remove_quotes(arg);
+            // only remove escaped for string arguments not literal strings
+            arg = remove_escaped_characters(arg);
+        } else {
+            remove_quotes(arg);
+        }
+    }
+}
+
+CLI11_INLINE void handle_secondary_array(std::string &str) {
+    if(str.size() >= 2 && str.front() == '[' && str.back() == ']') {
+        // handle some special array processing for arguments if it might be interpreted as a secondary array
+        std::string tstr{"[["};
+        for(std::size_t ii = 1; ii < str.size(); ++ii) {
+            tstr.push_back(str[ii]);
+            tstr.push_back(str[ii]);
+        }
+        str = std::move(tstr);
+    }
+}
+
+CLI11_INLINE bool process_quoted_string(std::string &str, char string_char, char literal_char) {
+    if(str.size() <= 1) {
+        return false;
+    }
+    if(detail::is_binary_escaped_string(str)) {
+        str = detail::extract_binary_string(str);
+        handle_secondary_array(str);
+        return true;
+    }
+    if(str.front() == string_char && str.back() == string_char) {
+        detail::remove_outer(str, string_char);
+        if(str.find_first_of('\\') != std::string::npos) {
+            str = detail::remove_escaped_characters(str);
+        }
+        handle_secondary_array(str);
+        return true;
+    }
+    if((str.front() == literal_char || str.front() == '`') && str.back() == str.front()) {
+        detail::remove_outer(str, str.front());
+        handle_secondary_array(str);
+        return true;
+    }
+    return false;
+}
+
+CLI11_INLINE std::string get_environment_value(const std::string &env_name) {
+    char *buffer = nullptr;
+    std::string
ename_string; + +#ifdef _MSC_VER + // Windows version + std::size_t sz = 0; + if(_dupenv_s(&buffer, &sz, env_name.c_str()) == 0 && buffer != nullptr) { + ename_string = std::string(buffer); + free(buffer); + } +#else + // This also works on Windows, but gives a warning + buffer = std::getenv(env_name.c_str()); + if(buffer != nullptr) { + ename_string = std::string(buffer); + } +#endif + return ename_string; +} + +CLI11_INLINE std::ostream &streamOutAsParagraph(std::ostream &out, + const std::string &text, + std::size_t paragraphWidth, + const std::string &linePrefix, + bool skipPrefixOnFirstLine) { + if(!skipPrefixOnFirstLine) + out << linePrefix; // First line prefix + + std::istringstream lss(text); + std::string line = ""; + while(std::getline(lss, line)) { + std::istringstream iss(line); + std::string word = ""; + std::size_t charsWritten = 0; + + while(iss >> word) { + if(word.length() + charsWritten > paragraphWidth) { + out << '\n' << linePrefix; + charsWritten = 0; + } + + out << word << " "; + charsWritten += word.length() + 1; + } + + if(!lss.eof()) + out << '\n' << linePrefix; + } + return out; +} + +} // namespace detail + + + +// Use one of these on all error classes. +// These are temporary and are undef'd at the end of this file. +#define CLI11_ERROR_DEF(parent, name) \ + protected: \ + name(std::string ename, std::string msg, int exit_code) : parent(std::move(ename), std::move(msg), exit_code) {} \ + name(std::string ename, std::string msg, ExitCodes exit_code) \ + : parent(std::move(ename), std::move(msg), exit_code) {} \ + \ + public: \ + name(std::string msg, ExitCodes exit_code) : parent(#name, std::move(msg), exit_code) {} \ + name(std::string msg, int exit_code) : parent(#name, std::move(msg), exit_code) {} // This is added after the one above if a class is used directly and builds its own message #define CLI11_ERROR_SIMPLE(name) \ @@ -523,9 +1352,9 @@ class Error : public std::runtime_error { std::string error_name{"Error"}; public: - int get_exit_code() const { return actual_exit_code; } + CLI11_NODISCARD int get_exit_code() const { return actual_exit_code; } - std::string get_name() const { return error_name; } + CLI11_NODISCARD std::string get_name() const { return error_name; } Error(std::string name, std::string msg, int exit_code = static_cast(ExitCodes::BaseClass)) : runtime_error(msg), actual_exit_code(exit_code), error_name(std::move(name)) {} @@ -573,9 +1402,15 @@ class BadNameString : public ConstructionError { CLI11_ERROR_DEF(ConstructionError, BadNameString) CLI11_ERROR_SIMPLE(BadNameString) static BadNameString OneCharName(std::string name) { return BadNameString("Invalid one char name: " + name); } + static BadNameString MissingDash(std::string name) { + return BadNameString("Long names strings require 2 dashes " + name); + } static BadNameString BadLongName(std::string name) { return BadNameString("Bad long name: " + name); } - static BadNameString DashesOnly(std::string name) { - return BadNameString("Must have a name, not just dashes: " + name); + static BadNameString BadPositionalName(std::string name) { + return BadNameString("Invalid positional Name: " + name); + } + static BadNameString ReservedName(std::string name) { + return BadNameString("Names '-','--','++' are reserved and not allowed as option names " + name); } static BadNameString MultiPositionalNames(std::string name) { return BadNameString("Only one positional name allowed, remove: " + name); @@ -588,10 +1423,10 @@ class OptionAlreadyAdded : public ConstructionError { explicit 
OptionAlreadyAdded(std::string name)
        : OptionAlreadyAdded(name + " is already added", ExitCodes::OptionAlreadyAdded) {}
     static OptionAlreadyAdded Requires(std::string name, std::string other) {
-        return OptionAlreadyAdded(name + " requires " + other, ExitCodes::OptionAlreadyAdded);
+        return {name + " requires " + other, ExitCodes::OptionAlreadyAdded};
     }
     static OptionAlreadyAdded Excludes(std::string name, std::string other) {
-        return OptionAlreadyAdded(name + " excludes " + other, ExitCodes::OptionAlreadyAdded);
+        return {name + " excludes " + other, ExitCodes::OptionAlreadyAdded};
     }
 };
 
@@ -611,19 +1446,26 @@ class Success : public ParseError {
 };
 
 /// -h or --help on command line
-class CallForHelp : public ParseError {
-    CLI11_ERROR_DEF(ParseError, CallForHelp)
+class CallForHelp : public Success {
+    CLI11_ERROR_DEF(Success, CallForHelp)
     CallForHelp() : CallForHelp("This should be caught in your main function, see examples", ExitCodes::Success) {}
 };
 
 /// Usually something like --help-all on command line
-class CallForAllHelp : public ParseError {
-    CLI11_ERROR_DEF(ParseError, CallForAllHelp)
+class CallForAllHelp : public Success {
+    CLI11_ERROR_DEF(Success, CallForAllHelp)
     CallForAllHelp()
         : CallForAllHelp("This should be caught in your main function, see examples", ExitCodes::Success) {}
 };
 
-/// Does not output a diagnostic in CLI11_PARSE, but allows to return from main() with a specific error code.
+/// -v or --version on command line
+class CallForVersion : public Success {
+    CLI11_ERROR_DEF(Success, CallForVersion)
+    CallForVersion()
+        : CallForVersion("This should be caught in your main function, see examples", ExitCodes::Success) {}
+};
+
+/// Does not output a diagnostic in CLI11_PARSE, but allows main() to return with a specific error code.
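// [Editor's sketch, not part of the patch] With CallForHelp, CallForAllHelp,
// and the new CallForVersion all deriving from Success, every "successful
// early exit" can be caught through the common base. Success itself derives
// from ParseError, so the Success handler must come first.
#include "CLI11.hpp"

int main(int argc, char **argv) {
    CLI::App app{"demo"};
    app.set_version_flag("--version", "1.0.0");  // throws CallForVersion when used
    try {
        app.parse(argc, argv);
    } catch(const CLI::Success &e) {     // --help, --help-all, --version
        return app.exit(e);
    } catch(const CLI::ParseError &e) {  // genuine parsing failures
        return app.exit(e);
    }
    return 0;
}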
class RuntimeError : public ParseError { CLI11_ERROR_DEF(ParseError, RuntimeError) explicit RuntimeError(int exit_code = 1) : RuntimeError("Runtime error", exit_code) {} @@ -663,12 +1505,34 @@ class ValidationError : public ParseError { class RequiredError : public ParseError { CLI11_ERROR_DEF(ParseError, RequiredError) explicit RequiredError(std::string name) : RequiredError(name + " is required", ExitCodes::RequiredError) {} - static RequiredError Subcommand(size_t min_subcom) { - if(min_subcom == 1) + static RequiredError Subcommand(std::size_t min_subcom) { + if(min_subcom == 1) { return RequiredError("A subcommand"); - else - return RequiredError("Requires at least " + std::to_string(min_subcom) + " subcommands", - ExitCodes::RequiredError); + } + return {"Requires at least " + std::to_string(min_subcom) + " subcommands", ExitCodes::RequiredError}; + } + static RequiredError + Option(std::size_t min_option, std::size_t max_option, std::size_t used, const std::string &option_list) { + if((min_option == 1) && (max_option == 1) && (used == 0)) + return RequiredError("Exactly 1 option from [" + option_list + "]"); + if((min_option == 1) && (max_option == 1) && (used > 1)) { + return {"Exactly 1 option from [" + option_list + "] is required but " + std::to_string(used) + + " were given", + ExitCodes::RequiredError}; + } + if((min_option == 1) && (used == 0)) + return RequiredError("At least 1 option from [" + option_list + "]"); + if(used < min_option) { + return {"Requires at least " + std::to_string(min_option) + " options used but only " + + std::to_string(used) + " were given from [" + option_list + "]", + ExitCodes::RequiredError}; + } + if(max_option == 1) + return {"Requires at most 1 options be given from [" + option_list + "]", ExitCodes::RequiredError}; + + return {"Requires at most " + std::to_string(max_option) + " options be used but " + std::to_string(used) + + " were given from [" + option_list + "]", + ExitCodes::RequiredError}; } }; @@ -676,19 +1540,31 @@ class RequiredError : public ParseError { class ArgumentMismatch : public ParseError { CLI11_ERROR_DEF(ParseError, ArgumentMismatch) CLI11_ERROR_SIMPLE(ArgumentMismatch) - ArgumentMismatch(std::string name, int expected, size_t recieved) + ArgumentMismatch(std::string name, int expected, std::size_t received) : ArgumentMismatch(expected > 0 ? 
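// [Editor's sketch, not part of the patch] RequiredError::Option() above
// formats the diagnostics behind App::require_option(min, max); giving
// neither or both flags here trips the "Exactly 1 option from [...]" branches.
#include "CLI11.hpp"

int main(int argc, char **argv) {
    CLI::App app{"demo"};
    app.add_flag("-a");
    app.add_flag("-b");
    app.require_option(1, 1);  // exactly one of -a / -b must be used
    CLI11_PARSE(app, argc, argv);
    return 0;
}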
("Expected exactly " + std::to_string(expected) + " arguments to " + name + - ", got " + std::to_string(recieved)) + ", got " + std::to_string(received)) : ("Expected at least " + std::to_string(-expected) + " arguments to " + name + - ", got " + std::to_string(recieved)), + ", got " + std::to_string(received)), ExitCodes::ArgumentMismatch) {} - static ArgumentMismatch AtLeast(std::string name, int num) { - return ArgumentMismatch(name + ": At least " + std::to_string(num) + " required"); + static ArgumentMismatch AtLeast(std::string name, int num, std::size_t received) { + return ArgumentMismatch(name + ": At least " + std::to_string(num) + " required but received " + + std::to_string(received)); + } + static ArgumentMismatch AtMost(std::string name, int num, std::size_t received) { + return ArgumentMismatch(name + ": At Most " + std::to_string(num) + " required but received " + + std::to_string(received)); } static ArgumentMismatch TypedAtLeast(std::string name, int num, std::string type) { return ArgumentMismatch(name + ": " + std::to_string(num) + " required " + type + " missing"); } + static ArgumentMismatch FlagOverride(std::string name) { + return ArgumentMismatch(name + " was given a disallowed flag override"); + } + static ArgumentMismatch PartialType(std::string name, int num, std::string type) { + return ArgumentMismatch(name + ": " + type + " only partially specified: " + std::to_string(num) + + " required for each element"); + } }; /// Thrown when a requires option is missing @@ -713,6 +1589,12 @@ class ExtrasError : public ParseError { : "The following argument was not expected: ") + detail::rjoin(args, " "), ExitCodes::ExtrasError) {} + ExtrasError(const std::string &name, std::vector args) + : ExtrasError(name, + (args.size() > 1 ? "The following arguments were not expected: " + : "The following argument was not expected: ") + + detail::rjoin(args, " "), + ExitCodes::ExtrasError) {} }; /// Thrown when extra values are found in an INI file @@ -742,7 +1624,7 @@ class HorribleError : public ParseError { // After parsing -/// Thrown when counting a non-existent option +/// Thrown when counting a nonexistent option class OptionNotFound : public Error { CLI11_ERROR_DEF(Error, OptionNotFound) explicit OptionNotFound(std::string name) : OptionNotFound(name + " not found", ExitCodes::OptionNotFound) {} @@ -753,3545 +1635,8818 @@ class OptionNotFound : public Error { /// @} -} // namespace CLI -// From CLI/TypeTools.hpp: -namespace CLI { // Type tools +// Utilities for type enabling +namespace detail { +// Based generally on https://rmf.io/cxx11/almost-static-if +/// Simple empty scoped class +enum class enabler {}; + +/// An instance to use in EnableIf +constexpr enabler dummy = {}; +} // namespace detail + /// A copy of enable_if_t from C++14, compatible with C++11. /// /// We could check to see if C++14 is being used, but it does not hurt to redefine this -/// (even Google does this: https://github.com/google/skia/blob/master/include/private/SkTLogic.h) +/// (even Google does this: https://github.com/google/skia/blob/main/include/private/SkTLogic.h) /// It is not in the std namespace anyway, so no harm done. 
- template using enable_if_t = typename std::enable_if::type; -/// Check to see if something is a vector (fail check by default) -template struct is_vector { static const bool value = false; }; +/// A copy of std::void_t from C++17 (helper for C++11 and C++14) +template struct make_void { + using type = void; +}; + +/// A copy of std::void_t from C++17 - same reasoning as enable_if_t, it does not hurt to redefine +template using void_t = typename make_void::type; -/// Check to see if something is a vector (true if actually a vector) -template struct is_vector> { static bool const value = true; }; +/// A copy of std::conditional_t from C++14 - same reasoning as enable_if_t, it does not hurt to redefine +template using conditional_t = typename std::conditional::type; /// Check to see if something is bool (fail check by default) -template struct is_bool { static const bool value = false; }; +template struct is_bool : std::false_type {}; /// Check to see if something is bool (true if actually a bool) -template <> struct is_bool { static bool const value = true; }; +template <> struct is_bool : std::true_type {}; + +/// Check to see if something is a shared pointer +template struct is_shared_ptr : std::false_type {}; + +/// Check to see if something is a shared pointer (True if really a shared pointer) +template struct is_shared_ptr> : std::true_type {}; + +/// Check to see if something is a shared pointer (True if really a shared pointer) +template struct is_shared_ptr> : std::true_type {}; + +/// Check to see if something is copyable pointer +template struct is_copyable_ptr { + static bool const value = is_shared_ptr::value || std::is_pointer::value; +}; + +/// This can be specialized to override the type deduction for IsMember. +template struct IsMemberType { + using type = T; +}; + +/// The main custom type needed here is const char * should be a string. +template <> struct IsMemberType { + using type = std::string; +}; + +namespace adl_detail { +/// Check for existence of user-supplied lexical_cast. +/// +/// This struct has to be in a separate namespace so that it doesn't see our lexical_cast overloads in CLI::detail. +/// Standard says it shouldn't see them if it's defined before the corresponding lexical_cast declarations, but this +/// requires a working implementation of two-phase lookup, and not all compilers can boast that (msvc, ahem). +template class is_lexical_castable { + template + static auto test(int) -> decltype(lexical_cast(std::declval(), std::declval()), std::true_type()); + + template static auto test(...) -> std::false_type; + + public: + static constexpr bool value = decltype(test(0))::value; +}; +} // namespace adl_detail namespace detail { -// Based generally on https://rmf.io/cxx11/almost-static-if -/// Simple empty scoped class -enum class enabler {}; -/// An instance to use in EnableIf -constexpr enabler dummy = {}; +// These are utilities for IsMember and other transforming objects -// Type name print +/// Handy helper to access the element_type generically. This is not part of is_copyable_ptr because it requires that +/// pointer_traits be valid. 
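// [Editor's sketch, not part of the patch] is_lexical_castable above detects a
// user-supplied lexical_cast found through argument-dependent lookup, which is
// how a custom type can opt in to parsing without a streaming operator. The
// Point type and its parser are hypothetical.
#include "CLI11.hpp"
#include <string>

namespace myapp {
struct Point {
    int x{0}, y{0};
};
// Signature CLI11 looks for; found by ADL on the Point argument.
inline bool lexical_cast(const std::string &input, Point &out) {
    auto comma = input.find(',');
    if(comma == std::string::npos)
        return false;
    out.x = std::stoi(input.substr(0, comma));
    out.y = std::stoi(input.substr(comma + 1));
    return true;
}
}  // namespace myapp

static_assert(CLI::adl_detail::is_lexical_castable<myapp::Point>::value,
              "Point is parseable through the ADL hook");
int main() { return 0; }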
-/// Was going to be based on -/// http://stackoverflow.com/questions/1055452/c-get-name-of-type-in-template -/// But this is cleaner and works better in this case +/// not a pointer +template struct element_type { + using type = T; +}; -template ::value && std::is_signed::value, detail::enabler> = detail::dummy> -constexpr const char *type_name() { - return "INT"; -} +template struct element_type::value>::type> { + using type = typename std::pointer_traits::element_type; +}; -template ::value && std::is_unsigned::value, detail::enabler> = detail::dummy> -constexpr const char *type_name() { - return "UINT"; +/// Combination of the element type and value type - remove pointer (including smart pointers) and get the value_type of +/// the container +template struct element_value_type { + using type = typename element_type::type::value_type; +}; + +/// Adaptor for set-like structure: This just wraps a normal container in a few utilities that do almost nothing. +template struct pair_adaptor : std::false_type { + using value_type = typename T::value_type; + using first_type = typename std::remove_const::type; + using second_type = typename std::remove_const::type; + + /// Get the first value (really just the underlying value) + template static auto first(Q &&pair_value) -> decltype(std::forward(pair_value)) { + return std::forward(pair_value); + } + /// Get the second value (really just the underlying value) + template static auto second(Q &&pair_value) -> decltype(std::forward(pair_value)) { + return std::forward(pair_value); + } +}; + +/// Adaptor for map-like structure (true version, must have key_type and mapped_type). +/// This wraps a mapped container in a few utilities access it in a general way. +template +struct pair_adaptor< + T, + conditional_t, void>> + : std::true_type { + using value_type = typename T::value_type; + using first_type = typename std::remove_const::type; + using second_type = typename std::remove_const::type; + + /// Get the first value (really just the underlying value) + template static auto first(Q &&pair_value) -> decltype(std::get<0>(std::forward(pair_value))) { + return std::get<0>(std::forward(pair_value)); + } + /// Get the second value (really just the underlying value) + template static auto second(Q &&pair_value) -> decltype(std::get<1>(std::forward(pair_value))) { + return std::get<1>(std::forward(pair_value)); + } +}; + +// Warning is suppressed due to "bug" in gcc<5.0 and gcc 7.0 with c++17 enabled that generates a -Wnarrowing warning +// in the unevaluated context even if the function that was using this wasn't used. The standard says narrowing in +// brace initialization shouldn't be allowed but for backwards compatibility gcc allows it in some contexts. It is a +// little fuzzy what happens in template constructs and I think that was something GCC took a little while to work out. 
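// [Editor's sketch, not part of the patch] pair_adaptor gives IsMember and the
// container-conversion code a uniform first()/second() view: a pass-through
// for plain containers, std::get<0>/std::get<1> for map-like ones.
#include "CLI11.hpp"
#include <map>
#include <string>
#include <vector>

using VecAdapt = CLI::detail::pair_adaptor<std::vector<std::string>>;
using MapAdapt = CLI::detail::pair_adaptor<std::map<std::string, int>>;
static_assert(!VecAdapt::value, "set-like: first() is the element itself");
static_assert(MapAdapt::value, "map-like: first()/second() unpack the pair");

int main() {
    std::pair<const std::string, int> kv{"answer", 42};
    return (MapAdapt::first(kv) == "answer" && MapAdapt::second(kv) == 42) ? 0 : 1;
}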
+// But regardless some versions of gcc generate a warning when they shouldn't from the following code so that should be +// suppressed +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wnarrowing" +#endif +// check for constructibility from a specific type and copy assignable used in the parse detection +template class is_direct_constructible { + template + static auto test(int, std::true_type) -> decltype( +// NVCC warns about narrowing conversions here +#ifdef __CUDACC__ +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 2361 +#else +#pragma diag_suppress 2361 +#endif +#endif + TT{std::declval()} +#ifdef __CUDACC__ +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_default 2361 +#else +#pragma diag_default 2361 +#endif +#endif + , + std::is_move_assignable()); + + template static auto test(int, std::false_type) -> std::false_type; + + template static auto test(...) -> std::false_type; + + public: + static constexpr bool value = decltype(test(0, typename std::is_constructible::type()))::value; +}; +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +// Check for output streamability +// Based on https://stackoverflow.com/questions/22758291/how-can-i-detect-if-a-type-can-be-streamed-to-an-stdostream + +template class is_ostreamable { + template + static auto test(int) -> decltype(std::declval() << std::declval(), std::true_type()); + + template static auto test(...) -> std::false_type; + + public: + static constexpr bool value = decltype(test(0))::value; +}; + +/// Check for input streamability +template class is_istreamable { + template + static auto test(int) -> decltype(std::declval() >> std::declval(), std::true_type()); + + template static auto test(...) -> std::false_type; + + public: + static constexpr bool value = decltype(test(0))::value; +}; + +/// Check for complex +template class is_complex { + template + static auto test(int) -> decltype(std::declval().real(), std::declval().imag(), std::true_type()); + + template static auto test(...) -> std::false_type; + + public: + static constexpr bool value = decltype(test(0))::value; +}; + +/// Templated operation to get a value from a stream +template ::value, detail::enabler> = detail::dummy> +bool from_stream(const std::string &istring, T &obj) { + std::istringstream is; + is.str(istring); + is >> obj; + return !is.fail() && !is.rdbuf()->in_avail(); } -template ::value, detail::enabler> = detail::dummy> -constexpr const char *type_name() { - return "FLOAT"; +template ::value, detail::enabler> = detail::dummy> +bool from_stream(const std::string & /*istring*/, T & /*obj*/) { + return false; } -/// This one should not be used, since vector types print the internal type -template ::value, detail::enabler> = detail::dummy> -constexpr const char *type_name() { - return "VECTOR"; +// check to see if an object is a mutable container (fail by default) +template struct is_mutable_container : std::false_type {}; + +/// type trait to test if a type is a mutable container meaning it has a value_type, it has an iterator, a clear, and +/// end methods and an insert function. 
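// [Editor's sketch, not part of the patch] The detectors above in action:
// Streamable (a hypothetical type) has operator>> but no operator<<, and
// from_stream() succeeds only when the whole string parses without error.
#include "CLI11.hpp"
#include <istream>

struct Streamable {
    int v{0};
};
std::istream &operator>>(std::istream &is, Streamable &s) { return is >> s.v; }

static_assert(CLI::detail::is_istreamable<Streamable>::value, "has operator>>");
static_assert(!CLI::detail::is_ostreamable<Streamable>::value, "no operator<<");

int main() {
    Streamable s;
    bool ok = CLI::detail::from_stream("42", s);  // true: all input consumed
    return (ok && s.v == 42) ? 0 : 1;
}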
And for our purposes we exclude std::string and types that can be constructed +/// from a std::string +template +struct is_mutable_container< + T, + conditional_t().end()), + decltype(std::declval().clear()), + decltype(std::declval().insert(std::declval().end())>(), + std::declval()))>, + void>> : public conditional_t::value || + std::is_constructible::value, + std::false_type, + std::true_type> {}; + +// check to see if an object is a mutable container (fail by default) +template struct is_readable_container : std::false_type {}; + +/// type trait to test if a type is a container meaning it has a value_type, it has an iterator, and an end +/// method. +template +struct is_readable_container< + T, + conditional_t().end()), decltype(std::declval().begin())>, void>> + : public std::true_type {}; + +// check to see if an object is a wrapper (fail by default) +template struct is_wrapper : std::false_type {}; + +// check if an object is a wrapper (it has a value_type defined) +template +struct is_wrapper, void>> : public std::true_type {}; + +// Check for tuple like types, as in classes with a tuple_size type trait +// Even though in C++26 std::complex gains a std::tuple interface, for our purposes we treat is as NOT a tuple +template class is_tuple_like { + template ::value, detail::enabler> = detail::dummy> + // static auto test(int) + // -> decltype(std::conditional<(std::tuple_size::value > 0), std::true_type, std::false_type>::type()); + static auto test(int) -> decltype(std::tuple_size::type>::value, std::true_type{}); + template static auto test(...) -> std::false_type; + + public: + static constexpr bool value = decltype(test(0))::value; +}; + +/// This will only trigger for actual void type +template struct type_count_base { + static const int value{0}; +}; + +/// Type size for regular object types that do not look like a tuple +template +struct type_count_base::value && !is_mutable_container::value && + !std::is_void::value>::type> { + static constexpr int value{1}; +}; + +/// the base tuple size +template +struct type_count_base::value && !is_mutable_container::value>::type> { + static constexpr int value{// cppcheck-suppress unusedStructMember + std::tuple_size::type>::value}; +}; + +/// Type count base for containers is the type_count_base of the individual element +template struct type_count_base::value>::type> { + static constexpr int value{type_count_base::value}; +}; + +/// Convert an object to a string (directly forward if this can become a string) +template ::value, detail::enabler> = detail::dummy> +auto to_string(T &&value) -> decltype(std::forward(value)) { + return std::forward(value); } +/// Construct a string from the object template ::value && !std::is_integral::value && !is_vector::value, + enable_if_t::value && !std::is_convertible::value, detail::enabler> = detail::dummy> -constexpr const char *type_name() { - return "TEXT"; +std::string to_string(T &&value) { + return std::string(value); // NOLINT(google-readability-casting) } -// Lexical cast - -/// Signed integers / enums +/// Convert an object to a string (streaming must be supported for that type) template ::value && std::is_signed::value), detail::enabler> = detail::dummy> -bool lexical_cast(std::string input, T &output) { - try { - size_t n = 0; - long long output_ll = std::stoll(input, &n, 0); - output = static_cast(output_ll); - return n == input.size() && static_cast(output) == output_ll; - } catch(const std::invalid_argument &) { - return false; - } catch(const std::out_of_range &) { - return false; - } 
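// [Editor's sketch, not part of the patch] A few of the classifications above:
// std::vector is a mutable container, std::string is deliberately excluded,
// and type_count_base counts tuple elements (1 for ordinary types).
#include "CLI11.hpp"
#include <string>
#include <tuple>
#include <vector>

static_assert(CLI::detail::is_mutable_container<std::vector<int>>::value,
              "vector has insert/clear/end and is not string-constructible");
static_assert(!CLI::detail::is_mutable_container<std::string>::value,
              "string-constructible types are excluded on purpose");
static_assert(CLI::detail::type_count_base<std::tuple<int, double>>::value == 2,
              "tuple size is reported directly");
static_assert(CLI::detail::type_count_base<double>::value == 1,
              "ordinary types count as one");
int main() { return 0; }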
+ enable_if_t::value && !std::is_constructible::value && + is_ostreamable::value, + detail::enabler> = detail::dummy> +std::string to_string(T &&value) { + std::stringstream stream; + stream << value; + return stream.str(); } -/// Unsigned integers -template ::value && std::is_unsigned::value, detail::enabler> = detail::dummy> -bool lexical_cast(std::string input, T &output) { - if(!input.empty() && input.front() == '-') - return false; // std::stoull happily converts negative values to junk without any errors. - - try { - size_t n = 0; - unsigned long long output_ll = std::stoull(input, &n, 0); - output = static_cast(output_ll); - return n == input.size() && static_cast(output) == output_ll; - } catch(const std::invalid_argument &) { - return false; - } catch(const std::out_of_range &) { - return false; - } -} +// additional forward declarations -/// Floats -template ::value, detail::enabler> = detail::dummy> -bool lexical_cast(std::string input, T &output) { - try { - size_t n = 0; - output = static_cast(std::stold(input, &n)); - return n == input.size(); - } catch(const std::invalid_argument &) { - return false; - } catch(const std::out_of_range &) { - return false; - } -} +/// Print tuple value string for tuples of size ==1 +template ::value && !std::is_constructible::value && + !is_ostreamable::value && is_tuple_like::value && type_count_base::value == 1, + detail::enabler> = detail::dummy> +inline std::string to_string(T &&value); -/// String and similar +/// Print tuple value string for tuples of size > 1 template ::value && !std::is_integral::value && - std::is_assignable::value, + enable_if_t::value && !std::is_constructible::value && + !is_ostreamable::value && is_tuple_like::value && type_count_base::value >= 2, detail::enabler> = detail::dummy> -bool lexical_cast(std::string input, T &output) { - output = input; - return true; +inline std::string to_string(T &&value); + +/// If conversion is not supported, return an empty string (streaming is not supported for that type) +template < + typename T, + enable_if_t::value && !std::is_constructible::value && + !is_ostreamable::value && !is_readable_container::type>::value && + !is_tuple_like::value, + detail::enabler> = detail::dummy> +inline std::string to_string(T &&) { + return {}; } -/// Non-string parsable +/// convert a readable container to a string template ::value && !std::is_integral::value && - !std::is_assignable::value, + enable_if_t::value && !std::is_constructible::value && + !is_ostreamable::value && is_readable_container::value, detail::enabler> = detail::dummy> -bool lexical_cast(std::string input, T &output) { - std::istringstream is; - - is.str(input); - is >> output; - return !is.fail() && !is.rdbuf()->in_avail(); +inline std::string to_string(T &&variable) { + auto cval = variable.begin(); + auto end = variable.end(); + if(cval == end) { + return {"{}"}; + } + std::vector defaults; + while(cval != end) { + defaults.emplace_back(CLI::detail::to_string(*cval)); + ++cval; + } + return {"[" + detail::join(defaults) + "]"}; } -} // namespace detail -} // namespace CLI +/// Convert a tuple like object to a string -// From CLI/Split.hpp: +/// forward declarations for tuple_value_strings +template +inline typename std::enable_if::value, std::string>::type tuple_value_string(T && /*value*/); -namespace CLI { -namespace detail { +/// Recursively generate the tuple value string +template +inline typename std::enable_if<(I < type_count_base::value), std::string>::type tuple_value_string(T &&value); -// Returns false if not 
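// [Editor's sketch, not part of the patch] to_string() picks its overload by
// trait: string-convertible values forward, streamable values go through
// operator<<, and readable containers render as a bracketed, comma-joined list.
#include "CLI11.hpp"
#include <iostream>
#include <vector>

int main() {
    std::vector<int> v{1, 2, 3};
    std::cout << CLI::detail::to_string(v) << '\n';    // [1,2,3]
    std::cout << CLI::detail::to_string(4.25) << '\n'; // streamed: 4.25
    return 0;
}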
a short option. Otherwise, sets opt name and rest and returns true -inline bool split_short(const std::string ¤t, std::string &name, std::string &rest) { - if(current.size() > 1 && current[0] == '-' && valid_first_char(current[1])) { - name = current.substr(1, 1); - rest = current.substr(2); - return true; - } else - return false; +/// Print tuple value string for tuples of size ==1 +template ::value && !std::is_constructible::value && + !is_ostreamable::value && is_tuple_like::value && type_count_base::value == 1, + detail::enabler>> +inline std::string to_string(T &&value) { + return to_string(std::get<0>(value)); } -// Returns false if not a long option. Otherwise, sets opt name and other side of = and returns true -inline bool split_long(const std::string ¤t, std::string &name, std::string &value) { - if(current.size() > 2 && current.substr(0, 2) == "--" && valid_first_char(current[2])) { - auto loc = current.find_first_of('='); - if(loc != std::string::npos) { - name = current.substr(2, loc - 2); - value = current.substr(loc + 1); - } else { - name = current.substr(2); - value = ""; - } - return true; - } else - return false; +/// Print tuple value string for tuples of size > 1 +template ::value && !std::is_constructible::value && + !is_ostreamable::value && is_tuple_like::value && type_count_base::value >= 2, + detail::enabler>> +inline std::string to_string(T &&value) { + auto tname = std::string(1, '[') + tuple_value_string(value); + tname.push_back(']'); + return tname; } -// Returns false if not a windows style option. Otherwise, sets opt name and value and returns true -inline bool split_windows(const std::string ¤t, std::string &name, std::string &value) { - if(current.size() > 1 && current[0] == '/' && valid_first_char(current[1])) { - auto loc = current.find_first_of(':'); - if(loc != std::string::npos) { - name = current.substr(1, loc - 1); - value = current.substr(loc + 1); - } else { - name = current.substr(1); - value = ""; - } - return true; - } else - return false; +/// Empty string if the index > tuple size +template +inline typename std::enable_if::value, std::string>::type tuple_value_string(T && /*value*/) { + return std::string{}; } -// Splits a string into multiple long and short names -inline std::vector split_names(std::string current) { - std::vector output; - size_t val; - while((val = current.find(",")) != std::string::npos) { - output.push_back(trim_copy(current.substr(0, val))); - current = current.substr(val + 1); - } - output.push_back(trim_copy(current)); - return output; +/// Recursively generate the tuple value string +template +inline typename std::enable_if<(I < type_count_base::value), std::string>::type tuple_value_string(T &&value) { + auto str = std::string{to_string(std::get(value))} + ',' + tuple_value_string(value); + if(str.back() == ',') + str.pop_back(); + return str; } -/// Get a vector of short names, one of long names, and a single name -inline std::tuple, std::vector, std::string> -get_names(const std::vector &input) { +/// special template overload +template ::value, detail::enabler> = detail::dummy> +auto checked_to_string(T &&value) -> decltype(to_string(std::forward(value))) { + return to_string(std::forward(value)); +} - std::vector short_names; - std::vector long_names; - std::string pos_name; +/// special template overload +template ::value, detail::enabler> = detail::dummy> +std::string checked_to_string(T &&) { + return std::string{}; +} +/// get a string as a convertible value for arithmetic types +template ::value, 
detail::enabler> = detail::dummy> +std::string value_string(const T &value) { + return std::to_string(value); +} +/// get a string as a convertible value for enumerations +template ::value, detail::enabler> = detail::dummy> +std::string value_string(const T &value) { + return std::to_string(static_cast::type>(value)); +} +/// for other types just use the regular to_string function +template ::value && !std::is_arithmetic::value, detail::enabler> = detail::dummy> +auto value_string(const T &value) -> decltype(to_string(value)) { + return to_string(value); +} - for(std::string name : input) { - if(name.length() == 0) - continue; - else if(name.length() > 1 && name[0] == '-' && name[1] != '-') { - if(name.length() == 2 && valid_first_char(name[1])) - short_names.emplace_back(1, name[1]); - else - throw BadNameString::OneCharName(name); - } else if(name.length() > 2 && name.substr(0, 2) == "--") { - name = name.substr(2); - if(valid_name_string(name)) - long_names.push_back(name); - else - throw BadNameString::BadLongName(name); - } else if(name == "-" || name == "--") { - throw BadNameString::DashesOnly(name); - } else { - if(pos_name.length() > 0) - throw BadNameString::MultiPositionalNames(name); - pos_name = name; - } - } +/// template to get the underlying value type if it exists or use a default +template struct wrapped_type { + using type = def; +}; - return std::tuple, std::vector, std::string>( - short_names, long_names, pos_name); -} +/// Type size for regular object types that do not look like a tuple +template struct wrapped_type::value>::type> { + using type = typename T::value_type; +}; -} // namespace detail -} // namespace CLI +/// Set of overloads to get the type size of an object -// From CLI/ConfigFwd.hpp: +/// forward declare the subtype_count structure +template struct subtype_count; -namespace CLI { +/// forward declare the subtype_count_min structure +template struct subtype_count_min; -class App; +/// This will only trigger for actual void type +template struct type_count { + static const int value{0}; +}; -namespace detail { +/// Type size for regular object types that do not look like a tuple +template +struct type_count::value && !is_tuple_like::value && !is_complex::value && + !std::is_void::value>::type> { + static constexpr int value{1}; +}; -/// Comma separated join, adds quotes if needed -inline std::string ini_join(std::vector args) { - std::ostringstream s; - size_t start = 0; - for(const auto &arg : args) { - if(start++ > 0) - s << " "; +/// Type size for complex since it sometimes looks like a wrapper +template struct type_count::value>::type> { + static constexpr int value{2}; +}; - auto it = std::find_if(arg.begin(), arg.end(), [](char ch) { return std::isspace(ch, std::locale()); }); - if(it == arg.end()) - s << arg; - else if(arg.find(R"(")") == std::string::npos) - s << R"(")" << arg << R"(")"; - else - s << R"(')" << arg << R"(')"; - } +/// Type size of types that are wrappers,except complex and tuples(which can also be wrappers sometimes) +template struct type_count::value>::type> { + static constexpr int value{subtype_count::value}; +}; - return s.str(); +/// Type size of types that are wrappers,except containers complex and tuples(which can also be wrappers sometimes) +template +struct type_count::value && !is_complex::value && !is_tuple_like::value && + !is_mutable_container::value>::type> { + static constexpr int value{type_count::value}; +}; + +/// 0 if the index > tuple size +template +constexpr typename std::enable_if::value, int>::type 
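// [Editor's sketch, not part of the patch] value_string() above renders
// enumerations through their underlying integer type, which is how enum
// defaults appear in help text. The Level enum is hypothetical.
#include "CLI11.hpp"
#include <string>

enum class Level : int { low = 1, high = 7 };

int main() {
    std::string s = CLI::detail::value_string(Level::high);  // "7"
    return (s == "7") ? 0 : 1;
}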
tuple_type_size() { + return 0; } -} // namespace detail +/// Recursively generate the tuple type name +template + constexpr typename std::enable_if < I::value, int>::type tuple_type_size() { + return subtype_count::type>::value + tuple_type_size(); +} -/// Holds values to load into Options -struct ConfigItem { - /// This is the list of parents - std::vector parents; +/// Get the type size of the sum of type sizes for all the individual tuple types +template struct type_count::value>::type> { + static constexpr int value{tuple_type_size()}; +}; - /// This is the name - std::string name; +/// definition of subtype count +template struct subtype_count { + static constexpr int value{is_mutable_container::value ? expected_max_vector_size : type_count::value}; +}; - /// Listing of inputs - std::vector inputs; +/// This will only trigger for actual void type +template struct type_count_min { + static const int value{0}; +}; - /// The list of parents and name joined by "." - std::string fullname() const { - std::vector tmp = parents; - tmp.emplace_back(name); - return detail::join(tmp, "."); - } +/// Type size for regular object types that do not look like a tuple +template +struct type_count_min< + T, + typename std::enable_if::value && !is_tuple_like::value && !is_wrapper::value && + !is_complex::value && !std::is_void::value>::type> { + static constexpr int value{type_count::value}; }; -/// This class provides a converter for configuration files. -class Config { - protected: - std::vector items; +/// Type size for complex since it sometimes looks like a wrapper +template struct type_count_min::value>::type> { + static constexpr int value{1}; +}; - public: - /// Convert an app into a configuration - virtual std::string to_config(const App *, bool, bool, std::string) const = 0; +/// Type size min of types that are wrappers,except complex and tuples(which can also be wrappers sometimes) +template +struct type_count_min< + T, + typename std::enable_if::value && !is_complex::value && !is_tuple_like::value>::type> { + static constexpr int value{subtype_count_min::value}; +}; + +/// 0 if the index > tuple size +template +constexpr typename std::enable_if::value, int>::type tuple_type_size_min() { + return 0; +} + +/// Recursively generate the tuple type name +template + constexpr typename std::enable_if < I::value, int>::type tuple_type_size_min() { + return subtype_count_min::type>::value + tuple_type_size_min(); +} + +/// Get the type size of the sum of type sizes for all the individual tuple types +template struct type_count_min::value>::type> { + static constexpr int value{tuple_type_size_min()}; +}; + +/// definition of subtype count +template struct subtype_count_min { + static constexpr int value{is_mutable_container::value + ? ((type_count::value < expected_max_vector_size) ? 
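// [Editor's sketch, not part of the patch] type_count flattens nested
// tuple-like types and treats std::complex as two values; mutable containers
// report the expected_max_vector_size sentinel through subtype_count.
#include "CLI11.hpp"
#include <complex>
#include <tuple>
#include <utility>

static_assert(CLI::detail::type_count<double>::value == 1, "scalar");
static_assert(CLI::detail::type_count<std::complex<double>>::value == 2,
              "complex is two values");
static_assert(CLI::detail::type_count<std::tuple<int, std::pair<int, int>>>::value == 3,
              "nested tuple-likes are summed");
int main() { return 0; }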
type_count::value : 0) + : type_count_min::value}; +}; + +/// This will only trigger for actual void type +template struct expected_count { + static const int value{0}; +}; + +/// For most types the number of expected items is 1 +template +struct expected_count::value && !is_wrapper::value && + !std::is_void::value>::type> { + static constexpr int value{1}; +}; +/// number of expected items in a vector +template struct expected_count::value>::type> { + static constexpr int value{expected_max_vector_size}; +}; + +/// number of expected items in a vector +template +struct expected_count::value && is_wrapper::value>::type> { + static constexpr int value{expected_count::value}; +}; + +// Enumeration of the different supported categorizations of objects +enum class object_category : int { + char_value = 1, + integral_value = 2, + unsigned_integral = 4, + enumeration = 6, + boolean_value = 8, + floating_point = 10, + number_constructible = 12, + double_constructible = 14, + integer_constructible = 16, + // string like types + string_assignable = 23, + string_constructible = 24, + wstring_assignable = 25, + wstring_constructible = 26, + other = 45, + // special wrapper or container types + wrapper_value = 50, + complex_number = 60, + tuple_value = 70, + container_value = 80, + +}; + +/// Set of overloads to classify an object according to type + +/// some type that is not otherwise recognized +template struct classify_object { + static constexpr object_category value{object_category::other}; +}; + +/// Signed integers +template +struct classify_object< + T, + typename std::enable_if::value && !std::is_same::value && std::is_signed::value && + !is_bool::value && !std::is_enum::value>::type> { + static constexpr object_category value{object_category::integral_value}; +}; + +/// Unsigned integers +template +struct classify_object::value && std::is_unsigned::value && + !std::is_same::value && !is_bool::value>::type> { + static constexpr object_category value{object_category::unsigned_integral}; +}; + +/// single character values +template +struct classify_object::value && !std::is_enum::value>::type> { + static constexpr object_category value{object_category::char_value}; +}; + +/// Boolean values +template struct classify_object::value>::type> { + static constexpr object_category value{object_category::boolean_value}; +}; + +/// Floats +template struct classify_object::value>::type> { + static constexpr object_category value{object_category::floating_point}; +}; +#if defined _MSC_VER +// in MSVC wstring should take precedence if available this isn't as useful on other compilers due to the broader use of +// utf-8 encoding +#define WIDE_STRING_CHECK \ + !std::is_assignable::value && !std::is_constructible::value +#define STRING_CHECK true +#else +#define WIDE_STRING_CHECK true +#define STRING_CHECK !std::is_assignable::value && !std::is_constructible::value +#endif + +/// String and similar direct assignment +template +struct classify_object< + T, + typename std::enable_if::value && !std::is_integral::value && WIDE_STRING_CHECK && + std::is_assignable::value>::type> { + static constexpr object_category value{object_category::string_assignable}; +}; + +/// String and similar constructible and copy assignment +template +struct classify_object< + T, + typename std::enable_if::value && !std::is_integral::value && + !std::is_assignable::value && (type_count::value == 1) && + WIDE_STRING_CHECK && std::is_constructible::value>::type> { + static constexpr object_category 
value{object_category::string_constructible}; +}; + +/// Wide strings +template +struct classify_object::value && !std::is_integral::value && + STRING_CHECK && std::is_assignable::value>::type> { + static constexpr object_category value{object_category::wstring_assignable}; +}; + +template +struct classify_object< + T, + typename std::enable_if::value && !std::is_integral::value && + !std::is_assignable::value && (type_count::value == 1) && + STRING_CHECK && std::is_constructible::value>::type> { + static constexpr object_category value{object_category::wstring_constructible}; +}; + +/// Enumerations +template struct classify_object::value>::type> { + static constexpr object_category value{object_category::enumeration}; +}; + +template struct classify_object::value>::type> { + static constexpr object_category value{object_category::complex_number}; +}; + +/// Handy helper to contain a bunch of checks that rule out many common types (integers, string like, floating point, +/// vectors, and enumerations +template struct uncommon_type { + using type = typename std::conditional< + !std::is_floating_point::value && !std::is_integral::value && + !std::is_assignable::value && !std::is_constructible::value && + !std::is_assignable::value && !std::is_constructible::value && + !is_complex::value && !is_mutable_container::value && !std::is_enum::value, + std::true_type, + std::false_type>::type; + static constexpr bool value = type::value; +}; + +/// wrapper type +template +struct classify_object::value && is_wrapper::value && + !is_tuple_like::value && uncommon_type::value)>::type> { + static constexpr object_category value{object_category::wrapper_value}; +}; + +/// Assignable from double or int +template +struct classify_object::value && type_count::value == 1 && + !is_wrapper::value && is_direct_constructible::value && + is_direct_constructible::value>::type> { + static constexpr object_category value{object_category::number_constructible}; +}; + +/// Assignable from int +template +struct classify_object::value && type_count::value == 1 && + !is_wrapper::value && !is_direct_constructible::value && + is_direct_constructible::value>::type> { + static constexpr object_category value{object_category::integer_constructible}; +}; + +/// Assignable from double +template +struct classify_object::value && type_count::value == 1 && + !is_wrapper::value && is_direct_constructible::value && + !is_direct_constructible::value>::type> { + static constexpr object_category value{object_category::double_constructible}; +}; + +/// Tuple type +template +struct classify_object< + T, + typename std::enable_if::value && + ((type_count::value >= 2 && !is_wrapper::value) || + (uncommon_type::value && !is_direct_constructible::value && + !is_direct_constructible::value) || + (uncommon_type::value && type_count::value >= 2))>::type> { + static constexpr object_category value{object_category::tuple_value}; + // the condition on this class requires it be like a tuple, but on some compilers (like Xcode) tuples can be + // constructed from just the first element so tuples of can be constructed from a string, which + // could lead to issues so there are two variants of the condition, the first isolates things with a type size >=2 + // mainly to get tuples on Xcode with the exception of wrappers, the second is the main one and just separating out + // those cases that are caught by other object classifications +}; + +/// container type +template struct classify_object::value>::type> { + static constexpr object_category 
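// [Editor's sketch, not part of the patch] classify_object is the single
// dispatch point: every lexical_cast overload further below selects on one of
// these category values.
#include "CLI11.hpp"
#include <complex>
#include <string>

namespace cd = CLI::detail;
static_assert(cd::classify_object<int>::value == cd::object_category::integral_value, "");
static_assert(cd::classify_object<unsigned>::value == cd::object_category::unsigned_integral, "");
static_assert(cd::classify_object<bool>::value == cd::object_category::boolean_value, "");
static_assert(cd::classify_object<std::string>::value == cd::object_category::string_assignable, "");
static_assert(cd::classify_object<std::complex<double>>::value == cd::object_category::complex_number, "");
int main() { return 0; }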
value{object_category::container_value}; +}; + +// Type name print + +/// Was going to be based on +/// http://stackoverflow.com/questions/1055452/c-get-name-of-type-in-template +/// But this is cleaner and works better in this case + +template ::value == object_category::char_value, detail::enabler> = detail::dummy> +constexpr const char *type_name() { + return "CHAR"; +} + +template ::value == object_category::integral_value || + classify_object::value == object_category::integer_constructible, + detail::enabler> = detail::dummy> +constexpr const char *type_name() { + return "INT"; +} + +template ::value == object_category::unsigned_integral, detail::enabler> = detail::dummy> +constexpr const char *type_name() { + return "UINT"; +} + +template ::value == object_category::floating_point || + classify_object::value == object_category::number_constructible || + classify_object::value == object_category::double_constructible, + detail::enabler> = detail::dummy> +constexpr const char *type_name() { + return "FLOAT"; +} + +/// Print name for enumeration types +template ::value == object_category::enumeration, detail::enabler> = detail::dummy> +constexpr const char *type_name() { + return "ENUM"; +} + +/// Print name for enumeration types +template ::value == object_category::boolean_value, detail::enabler> = detail::dummy> +constexpr const char *type_name() { + return "BOOLEAN"; +} + +/// Print name for enumeration types +template ::value == object_category::complex_number, detail::enabler> = detail::dummy> +constexpr const char *type_name() { + return "COMPLEX"; +} + +/// Print for all other types +template ::value >= object_category::string_assignable && + classify_object::value <= object_category::other, + detail::enabler> = detail::dummy> +constexpr const char *type_name() { + return "TEXT"; +} +/// typename for tuple value +template ::value == object_category::tuple_value && type_count_base::value >= 2, + detail::enabler> = detail::dummy> +std::string type_name(); // forward declaration + +/// Generate type name for a wrapper or container value +template ::value == object_category::container_value || + classify_object::value == object_category::wrapper_value, + detail::enabler> = detail::dummy> +std::string type_name(); // forward declaration + +/// Print name for single element tuple types +template ::value == object_category::tuple_value && type_count_base::value == 1, + detail::enabler> = detail::dummy> +inline std::string type_name() { + return type_name::type>::type>(); +} + +/// Empty string if the index > tuple size +template +inline typename std::enable_if::value, std::string>::type tuple_name() { + return std::string{}; +} + +/// Recursively generate the tuple type name +template +inline typename std::enable_if<(I < type_count_base::value), std::string>::type tuple_name() { + auto str = std::string{type_name::type>::type>()} + ',' + + tuple_name(); + if(str.back() == ',') + str.pop_back(); + return str; +} + +/// Print type name for tuples with 2 or more elements +template ::value == object_category::tuple_value && type_count_base::value >= 2, + detail::enabler>> +inline std::string type_name() { + auto tname = std::string(1, '[') + tuple_name(); + tname.push_back(']'); + return tname; +} + +/// get the type name for a type that has a value_type member +template ::value == object_category::container_value || + classify_object::value == object_category::wrapper_value, + detail::enabler>> +inline std::string type_name() { + return type_name(); +} + +// Lexical cast + +/// Convert 
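// [Editor's sketch, not part of the patch] type_name() is what appears as the
// value placeholder in generated help text; containers report their element's
// name.
#include "CLI11.hpp"
#include <iostream>
#include <vector>

int main() {
    std::cout << CLI::detail::type_name<int>() << '\n';                  // INT
    std::cout << CLI::detail::type_name<double>() << '\n';               // FLOAT
    std::cout << CLI::detail::type_name<std::vector<double>>() << '\n';  // FLOAT
    return 0;
}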
to an unsigned integral +template ::value, detail::enabler> = detail::dummy> +bool integral_conversion(const std::string &input, T &output) noexcept { + if(input.empty() || input.front() == '-') { + return false; + } + char *val{nullptr}; + errno = 0; + std::uint64_t output_ll = std::strtoull(input.c_str(), &val, 0); + if(errno == ERANGE) { + return false; + } + output = static_cast(output_ll); + if(val == (input.c_str() + input.size()) && static_cast(output) == output_ll) { + return true; + } + val = nullptr; + std::int64_t output_sll = std::strtoll(input.c_str(), &val, 0); + if(val == (input.c_str() + input.size())) { + output = (output_sll < 0) ? static_cast(0) : static_cast(output_sll); + return (static_cast(output) == output_sll); + } + // remove separators + if(input.find_first_of("_'") != std::string::npos) { + std::string nstring = input; + nstring.erase(std::remove(nstring.begin(), nstring.end(), '_'), nstring.end()); + nstring.erase(std::remove(nstring.begin(), nstring.end(), '\''), nstring.end()); + return integral_conversion(nstring, output); + } + if(std::isspace(static_cast(input.back()))) { + return integral_conversion(trim_copy(input), output); + } + if(input.compare(0, 2, "0o") == 0 || input.compare(0, 2, "0O") == 0) { + val = nullptr; + errno = 0; + output_ll = std::strtoull(input.c_str() + 2, &val, 8); + if(errno == ERANGE) { + return false; + } + output = static_cast(output_ll); + return (val == (input.c_str() + input.size()) && static_cast(output) == output_ll); + } + if(input.compare(0, 2, "0b") == 0 || input.compare(0, 2, "0B") == 0) { + // LCOV_EXCL_START + // In some new compilers including the coverage testing one binary strings are handled properly in strtoull + // automatically so this coverage is missing but is well tested in other compilers + val = nullptr; + errno = 0; + output_ll = std::strtoull(input.c_str() + 2, &val, 2); + if(errno == ERANGE) { + return false; + } + output = static_cast(output_ll); + return (val == (input.c_str() + input.size()) && static_cast(output) == output_ll); + // LCOV_EXCL_STOP + } + return false; +} + +/// Convert to a signed integral +template ::value, detail::enabler> = detail::dummy> +bool integral_conversion(const std::string &input, T &output) noexcept { + if(input.empty()) { + return false; + } + char *val = nullptr; + errno = 0; + std::int64_t output_ll = std::strtoll(input.c_str(), &val, 0); + if(errno == ERANGE) { + return false; + } + output = static_cast(output_ll); + if(val == (input.c_str() + input.size()) && static_cast(output) == output_ll) { + return true; + } + if(input == "true") { + // this is to deal with a few oddities with flags and wrapper int types + output = static_cast(1); + return true; + } + // remove separators and trailing spaces + if(input.find_first_of("_'") != std::string::npos) { + std::string nstring = input; + nstring.erase(std::remove(nstring.begin(), nstring.end(), '_'), nstring.end()); + nstring.erase(std::remove(nstring.begin(), nstring.end(), '\''), nstring.end()); + return integral_conversion(nstring, output); + } + if(std::isspace(static_cast(input.back()))) { + return integral_conversion(trim_copy(input), output); + } + if(input.compare(0, 2, "0o") == 0 || input.compare(0, 2, "0O") == 0) { + val = nullptr; + errno = 0; + output_ll = std::strtoll(input.c_str() + 2, &val, 8); + if(errno == ERANGE) { + return false; + } + output = static_cast(output_ll); + return (val == (input.c_str() + input.size()) && static_cast(output) == output_ll); + } + if(input.compare(0, 2, "0b") == 0 || 
input.compare(0, 2, "0B") == 0) { + // LCOV_EXCL_START + // In some new compilers including the coverage testing one binary strings are handled properly in strtoll + // automatically so this coverage is missing but is well tested in other compilers + val = nullptr; + errno = 0; + output_ll = std::strtoll(input.c_str() + 2, &val, 2); + if(errno == ERANGE) { + return false; + } + output = static_cast(output_ll); + return (val == (input.c_str() + input.size()) && static_cast(output) == output_ll); + // LCOV_EXCL_STOP + } + return false; +} + +/// Convert a flag into an integer value typically binary flags sets errno to nonzero if conversion failed +inline std::int64_t to_flag_value(std::string val) noexcept { + static const std::string trueString("true"); + static const std::string falseString("false"); + if(val == trueString) { + return 1; + } + if(val == falseString) { + return -1; + } + val = detail::to_lower(val); + std::int64_t ret = 0; + if(val.size() == 1) { + if(val[0] >= '1' && val[0] <= '9') { + return (static_cast(val[0]) - '0'); + } + switch(val[0]) { + case '0': + case 'f': + case 'n': + case '-': + ret = -1; + break; + case 't': + case 'y': + case '+': + ret = 1; + break; + default: + errno = EINVAL; + return -1; + } + return ret; + } + if(val == trueString || val == "on" || val == "yes" || val == "enable") { + ret = 1; + } else if(val == falseString || val == "off" || val == "no" || val == "disable") { + ret = -1; + } else { + char *loc_ptr{nullptr}; + ret = std::strtoll(val.c_str(), &loc_ptr, 0); + if(loc_ptr != (val.c_str() + val.size()) && errno == 0) { + errno = EINVAL; + } + } + return ret; +} + +/// Integer conversion +template ::value == object_category::integral_value || + classify_object::value == object_category::unsigned_integral, + detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + return integral_conversion(input, output); +} + +/// char values +template ::value == object_category::char_value, detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + if(input.size() == 1) { + output = static_cast(input[0]); + return true; + } + return integral_conversion(input, output); +} + +/// Boolean values +template ::value == object_category::boolean_value, detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + errno = 0; + auto out = to_flag_value(input); + if(errno == 0) { + output = (out > 0); + } else if(errno == ERANGE) { + output = (input[0] != '-'); + } else { + return false; + } + return true; +} + +/// Floats +template ::value == object_category::floating_point, detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + if(input.empty()) { + return false; + } + char *val = nullptr; + auto output_ld = std::strtold(input.c_str(), &val); + output = static_cast(output_ld); + if(val == (input.c_str() + input.size())) { + return true; + } + while(std::isspace(static_cast(*val))) { + ++val; + if(val == (input.c_str() + input.size())) { + return true; + } + } + + // remove separators + if(input.find_first_of("_'") != std::string::npos) { + std::string nstring = input; + nstring.erase(std::remove(nstring.begin(), nstring.end(), '_'), nstring.end()); + nstring.erase(std::remove(nstring.begin(), nstring.end(), '\''), nstring.end()); + return lexical_cast(nstring, output); + } + return false; +} + +/// complex +template ::value == object_category::complex_number, detail::enabler> = detail::dummy> +bool lexical_cast(const 
std::string &input, T &output) { + using XC = typename wrapped_type::type; + XC x{0.0}, y{0.0}; + auto str1 = input; + bool worked = false; + auto nloc = str1.find_last_of("+-"); + if(nloc != std::string::npos && nloc > 0) { + worked = lexical_cast(str1.substr(0, nloc), x); + str1 = str1.substr(nloc); + if(str1.back() == 'i' || str1.back() == 'j') + str1.pop_back(); + worked = worked && lexical_cast(str1, y); + } else { + if(str1.back() == 'i' || str1.back() == 'j') { + str1.pop_back(); + worked = lexical_cast(str1, y); + x = XC{0}; + } else { + worked = lexical_cast(str1, x); + y = XC{0}; + } + } + if(worked) { + output = T{x, y}; + return worked; + } + return from_stream(input, output); +} + +/// String and similar direct assignment +template ::value == object_category::string_assignable, detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + output = input; + return true; +} + +/// String and similar constructible and copy assignment +template < + typename T, + enable_if_t::value == object_category::string_constructible, detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + output = T(input); + return true; +} + +/// Wide strings +template < + typename T, + enable_if_t::value == object_category::wstring_assignable, detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + output = widen(input); + return true; +} + +template < + typename T, + enable_if_t::value == object_category::wstring_constructible, detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + output = T{widen(input)}; + return true; +} + +/// Enumerations +template ::value == object_category::enumeration, detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + typename std::underlying_type::type val; + if(!integral_conversion(input, val)) { + return false; + } + output = static_cast(val); + return true; +} + +/// wrapper types +template ::value == object_category::wrapper_value && + std::is_assignable::value, + detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + typename T::value_type val; + if(lexical_cast(input, val)) { + output = val; + return true; + } + return from_stream(input, output); +} + +template ::value == object_category::wrapper_value && + !std::is_assignable::value && std::is_assignable::value, + detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + typename T::value_type val; + if(lexical_cast(input, val)) { + output = T{val}; + return true; + } + return from_stream(input, output); +} + +/// Assignable from double or int +template < + typename T, + enable_if_t::value == object_category::number_constructible, detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + int val = 0; + if(integral_conversion(input, val)) { + output = T(val); + return true; + } + + double dval = 0.0; + if(lexical_cast(input, dval)) { + output = T{dval}; + return true; + } + + return from_stream(input, output); +} + +/// Assignable from int +template < + typename T, + enable_if_t::value == object_category::integer_constructible, detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + int val = 0; + if(integral_conversion(input, val)) { + output = T(val); + return true; + } + return from_stream(input, output); +} + +/// Assignable from double +template < + typename T, + enable_if_t::value == 
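// [Editor's sketch, not part of the patch] The complex-number overload above
// parses "a+bi"/"a-bj" forms as well as purely imaginary or purely real input.
#include "CLI11.hpp"
#include <cassert>
#include <complex>

int main() {
    std::complex<double> c;
    assert(CLI::detail::lexical_cast("1.5+2i", c) && c == std::complex<double>(1.5, 2.0));
    assert(CLI::detail::lexical_cast("3j", c) && c == std::complex<double>(0.0, 3.0));
    assert(CLI::detail::lexical_cast("-4", c) && c == std::complex<double>(-4.0, 0.0));
    return 0;
}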
object_category::double_constructible, detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + double val = 0.0; + if(lexical_cast(input, val)) { + output = T{val}; + return true; + } + return from_stream(input, output); +} + +/// Non-string convertible from an int +template ::value == object_category::other && std::is_assignable::value, + detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + int val = 0; + if(integral_conversion(input, val)) { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4800) +#endif + // with Atomic this could produce a warning due to the conversion but if atomic gets here it is an old style + // so will most likely still work + output = val; +#ifdef _MSC_VER +#pragma warning(pop) +#endif + return true; + } + // LCOV_EXCL_START + // This version of cast is only used for odd cases in an older compilers the fail over + // from_stream is tested elsewhere an not relevant for coverage here + return from_stream(input, output); + // LCOV_EXCL_STOP +} + +/// Non-string parsable by a stream +template ::value == object_category::other && !std::is_assignable::value && + is_istreamable::value, + detail::enabler> = detail::dummy> +bool lexical_cast(const std::string &input, T &output) { + return from_stream(input, output); +} + +/// Fallback overload that prints a human-readable error for types that we don't recognize and that don't have a +/// user-supplied lexical_cast overload. +template ::value == object_category::other && !std::is_assignable::value && + !is_istreamable::value && !adl_detail::is_lexical_castable::value, + detail::enabler> = detail::dummy> +bool lexical_cast(const std::string & /*input*/, T & /*output*/) { + static_assert(!std::is_same::value, // Can't just write false here. + "option object type must have a lexical cast overload or streaming input operator(>>) defined, if it " + "is convertible from another type use the add_option(...) 
with XC being the known type"); + return false; +} + +/// Assign a value through lexical cast operations +/// Strings can be empty so we need to do a little different +template ::value && + (classify_object::value == object_category::string_assignable || + classify_object::value == object_category::string_constructible || + classify_object::value == object_category::wstring_assignable || + classify_object::value == object_category::wstring_constructible), + detail::enabler> = detail::dummy> +bool lexical_assign(const std::string &input, AssignTo &output) { + return lexical_cast(input, output); +} + +/// Assign a value through lexical cast operations +template ::value && std::is_assignable::value && + classify_object::value != object_category::string_assignable && + classify_object::value != object_category::string_constructible && + classify_object::value != object_category::wstring_assignable && + classify_object::value != object_category::wstring_constructible, + detail::enabler> = detail::dummy> +bool lexical_assign(const std::string &input, AssignTo &output) { + if(input.empty()) { + output = AssignTo{}; + return true; + } + + return lexical_cast(input, output); +} // LCOV_EXCL_LINE + +/// Assign a value through lexical cast operations +template ::value && !std::is_assignable::value && + classify_object::value == object_category::wrapper_value, + detail::enabler> = detail::dummy> +bool lexical_assign(const std::string &input, AssignTo &output) { + if(input.empty()) { + typename AssignTo::value_type emptyVal{}; + output = emptyVal; + return true; + } + return lexical_cast(input, output); +} + +/// Assign a value through lexical cast operations for int compatible values +/// mainly for atomic operations on some compilers +template ::value && !std::is_assignable::value && + classify_object::value != object_category::wrapper_value && + std::is_assignable::value, + detail::enabler> = detail::dummy> +bool lexical_assign(const std::string &input, AssignTo &output) { + if(input.empty()) { + output = 0; + return true; + } + int val{0}; + if(lexical_cast(input, val)) { +#if defined(__clang__) +/* on some older clang compilers */ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wsign-conversion" +#endif + output = val; +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + return true; + } + return false; +} + +/// Assign a value converted from a string in lexical cast to the output value directly +template ::value && std::is_assignable::value, + detail::enabler> = detail::dummy> +bool lexical_assign(const std::string &input, AssignTo &output) { + ConvertTo val{}; + bool parse_result = (!input.empty()) ? lexical_cast(input, val) : true; + if(parse_result) { + output = val; + } + return parse_result; +} + +/// Assign a value from a lexical cast through constructing a value and move assigning it +template < + typename AssignTo, + typename ConvertTo, + enable_if_t::value && !std::is_assignable::value && + std::is_move_assignable::value, + detail::enabler> = detail::dummy> +bool lexical_assign(const std::string &input, AssignTo &output) { + ConvertTo val{}; + bool parse_result = input.empty() ? 
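// [Editor's sketch, not part of the patch] lexical_assign<AssignTo, ConvertTo>
// parses through ConvertTo and then assigns or constructs AssignTo; an empty
// input value-initializes the target, which is how unset values are reset.
#include "CLI11.hpp"
#include <string>

int main() {
    double d{3.0};
    bool ok = CLI::detail::lexical_assign<double, int>("7", d);  // parsed as int, assigned to double
    bool cleared = CLI::detail::lexical_assign<double, double>("", d);  // d becomes 0.0
    return (ok && cleared && d == 0.0) ? 0 : 1;
}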
true : lexical_cast(input, val); + if(parse_result) { + output = AssignTo(val); // use () form of constructor to allow some implicit conversions + } + return parse_result; +} + +/// primary lexical conversion operation, 1 string to 1 type of some kind +template ::value <= object_category::other && + classify_object::value <= object_category::wrapper_value, + detail::enabler> = detail::dummy> +bool lexical_conversion(const std::vector &strings, AssignTo &output) { + return lexical_assign(strings[0], output); +} + +/// Lexical conversion if there is only one element but the conversion type is for two, then call a two element +/// constructor +template ::value <= 2) && expected_count::value == 1 && + is_tuple_like::value && type_count_base::value == 2, + detail::enabler> = detail::dummy> +bool lexical_conversion(const std::vector &strings, AssignTo &output) { + // the remove const is to handle pair types coming from a container + using FirstType = typename std::remove_const::type>::type; + using SecondType = typename std::tuple_element<1, ConvertTo>::type; + FirstType v1; + SecondType v2; + bool retval = lexical_assign(strings[0], v1); + retval = retval && lexical_assign((strings.size() > 1) ? strings[1] : std::string{}, v2); + if(retval) { + output = AssignTo{v1, v2}; + } + return retval; +} + +/// Lexical conversion of a container types of single elements +template ::value && is_mutable_container::value && + type_count::value == 1, + detail::enabler> = detail::dummy> +bool lexical_conversion(const std::vector &strings, AssignTo &output) { + output.erase(output.begin(), output.end()); + if(strings.empty()) { + return true; + } + if(strings.size() == 1 && strings[0] == "{}") { + return true; + } + bool skip_remaining = false; + if(strings.size() == 2 && strings[0] == "{}" && is_separator(strings[1])) { + skip_remaining = true; + } + for(const auto &elem : strings) { + typename AssignTo::value_type out; + bool retval = lexical_assign(elem, out); + if(!retval) { + return false; + } + output.insert(output.end(), std::move(out)); + if(skip_remaining) { + break; + } + } + return (!output.empty()); +} + +/// Lexical conversion for complex types +template ::value, detail::enabler> = detail::dummy> +bool lexical_conversion(const std::vector &strings, AssignTo &output) { + + if(strings.size() >= 2 && !strings[1].empty()) { + using XC2 = typename wrapped_type::type; + XC2 x{0.0}, y{0.0}; + auto str1 = strings[1]; + if(str1.back() == 'i' || str1.back() == 'j') { + str1.pop_back(); + } + auto worked = lexical_cast(strings[0], x) && lexical_cast(str1, y); + if(worked) { + output = ConvertTo{x, y}; + } + return worked; + } + return lexical_assign(strings[0], output); +} + +/// Conversion to a vector type using a particular single type as the conversion type +template ::value && (expected_count::value == 1) && + (type_count::value == 1), + detail::enabler> = detail::dummy> +bool lexical_conversion(const std::vector &strings, AssignTo &output) { + bool retval = true; + output.clear(); + output.reserve(strings.size()); + for(const auto &elem : strings) { + + output.emplace_back(); + retval = retval && lexical_assign(elem, output.back()); + } + return (!output.empty()) && retval; +} + +// forward declaration + +/// Lexical conversion of a container types with conversion type of two elements +template ::value && is_mutable_container::value && + type_count_base::value == 2, + detail::enabler> = detail::dummy> +bool lexical_conversion(std::vector strings, AssignTo &output); + +/// Lexical conversion of a vector 
types with type_size >2 forward declaration +template ::value && is_mutable_container::value && + type_count_base::value != 2 && + ((type_count::value > 2) || + (type_count::value > type_count_base::value)), + detail::enabler> = detail::dummy> +bool lexical_conversion(const std::vector &strings, AssignTo &output); + +/// Conversion for tuples +template ::value && is_tuple_like::value && + (type_count_base::value != type_count::value || + type_count::value > 2), + detail::enabler> = detail::dummy> +bool lexical_conversion(const std::vector &strings, AssignTo &output); // forward declaration + +/// Conversion for operations where the assigned type is some class but the conversion is a mutable container or large +/// tuple +template ::value && !is_mutable_container::value && + classify_object::value != object_category::wrapper_value && + (is_mutable_container::value || type_count::value > 2), + detail::enabler> = detail::dummy> +bool lexical_conversion(const std::vector &strings, AssignTo &output) { + + if(strings.size() > 1 || (!strings.empty() && !(strings.front().empty()))) { + ConvertTo val; + auto retval = lexical_conversion(strings, val); + output = AssignTo{val}; + return retval; + } + output = AssignTo{}; + return true; +} + +/// function template for converting tuples if the static Index is greater than the tuple size +template +inline typename std::enable_if<(I >= type_count_base::value), bool>::type +tuple_conversion(const std::vector &, AssignTo &) { + return true; +} + +/// Conversion of a tuple element where the type size ==1 and not a mutable container +template +inline typename std::enable_if::value && type_count::value == 1, bool>::type +tuple_type_conversion(std::vector &strings, AssignTo &output) { + auto retval = lexical_assign(strings[0], output); + strings.erase(strings.begin()); + return retval; +} + +/// Conversion of a tuple element where the type size !=1 but the size is fixed and not a mutable container +template +inline typename std::enable_if::value && (type_count::value > 1) && + type_count::value == type_count_min::value, + bool>::type +tuple_type_conversion(std::vector &strings, AssignTo &output) { + auto retval = lexical_conversion(strings, output); + strings.erase(strings.begin(), strings.begin() + type_count::value); + return retval; +} + +/// Conversion of a tuple element where the type is a mutable container or a type with different min and max type sizes +template +inline typename std::enable_if::value || + type_count::value != type_count_min::value, + bool>::type +tuple_type_conversion(std::vector &strings, AssignTo &output) { + + std::size_t index{subtype_count_min::value}; + const std::size_t mx_count{subtype_count::value}; + const std::size_t mx{(std::min)(mx_count, strings.size() - 1)}; + + while(index < mx) { + if(is_separator(strings[index])) { + break; + } + ++index; + } + bool retval = lexical_conversion( + std::vector(strings.begin(), strings.begin() + static_cast(index)), output); + if(strings.size() > index) { + strings.erase(strings.begin(), strings.begin() + static_cast(index) + 1); + } else { + strings.clear(); + } + return retval; +} + +/// Tuple conversion operation +template +inline typename std::enable_if<(I < type_count_base::value), bool>::type +tuple_conversion(std::vector strings, AssignTo &output) { + bool retval = true; + using ConvertToElement = typename std:: + conditional::value, typename std::tuple_element::type, ConvertTo>::type; + if(!strings.empty()) { + retval = retval && tuple_type_conversion::type, ConvertToElement>( + 
strings, std::get(output)); + } + retval = retval && tuple_conversion(std::move(strings), output); + return retval; +} + +/// Lexical conversion of a container types with tuple elements of size 2 +template ::value && is_mutable_container::value && + type_count_base::value == 2, + detail::enabler>> +bool lexical_conversion(std::vector strings, AssignTo &output) { + output.clear(); + while(!strings.empty()) { + + typename std::remove_const::type>::type v1; + typename std::tuple_element<1, typename ConvertTo::value_type>::type v2; + bool retval = tuple_type_conversion(strings, v1); + if(!strings.empty()) { + retval = retval && tuple_type_conversion(strings, v2); + } + if(retval) { + output.insert(output.end(), typename AssignTo::value_type{v1, v2}); + } else { + return false; + } + } + return (!output.empty()); +} + +/// lexical conversion of tuples with type count>2 or tuples of types of some element with a type size>=2 +template ::value && is_tuple_like::value && + (type_count_base::value != type_count::value || + type_count::value > 2), + detail::enabler>> +bool lexical_conversion(const std::vector &strings, AssignTo &output) { + static_assert( + !is_tuple_like::value || type_count_base::value == type_count_base::value, + "if the conversion type is defined as a tuple it must be the same size as the type you are converting to"); + return tuple_conversion(strings, output); +} + +/// Lexical conversion of a vector types for everything but tuples of two elements and types of size 1 +template ::value && is_mutable_container::value && + type_count_base::value != 2 && + ((type_count::value > 2) || + (type_count::value > type_count_base::value)), + detail::enabler>> +bool lexical_conversion(const std::vector &strings, AssignTo &output) { + bool retval = true; + output.clear(); + std::vector temp; + std::size_t ii{0}; + std::size_t icount{0}; + std::size_t xcm{type_count::value}; + auto ii_max = strings.size(); + while(ii < ii_max) { + temp.push_back(strings[ii]); + ++ii; + ++icount; + if(icount == xcm || is_separator(temp.back()) || ii == ii_max) { + if(static_cast(xcm) > type_count_min::value && is_separator(temp.back())) { + temp.pop_back(); + } + typename AssignTo::value_type temp_out; + retval = retval && + lexical_conversion(temp, temp_out); + temp.clear(); + if(!retval) { + return false; + } + output.insert(output.end(), std::move(temp_out)); + icount = 0; + } + } + return retval; +} + +/// conversion for wrapper types +template ::value == object_category::wrapper_value && + std::is_assignable::value, + detail::enabler> = detail::dummy> +bool lexical_conversion(const std::vector &strings, AssignTo &output) { + if(strings.empty() || strings.front().empty()) { + output = ConvertTo{}; + return true; + } + typename ConvertTo::value_type val; + if(lexical_conversion(strings, val)) { + output = ConvertTo{val}; + return true; + } + return false; +} + +/// conversion for wrapper types +template ::value == object_category::wrapper_value && + !std::is_assignable::value, + detail::enabler> = detail::dummy> +bool lexical_conversion(const std::vector &strings, AssignTo &output) { + using ConvertType = typename ConvertTo::value_type; + if(strings.empty() || strings.front().empty()) { + output = ConvertType{}; + return true; + } + ConvertType val; + if(lexical_conversion(strings, val)) { + output = val; + return true; + } + return false; +} + +/// Sum a vector of strings +inline std::string sum_string_vector(const std::vector &values) { + double val{0.0}; + bool fail{false}; + std::string output; + 
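+    // (Editorial descriptive comment:) each element is first parsed as a double; values
+    // that fail numeric parsing fall back to detail::to_flag_value, which maps flag-like
+    // strings such as "true"/"false" to numbers. If any element resists both
+    // interpretations, the inputs are concatenated instead of summed (the `fail` branch below).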
for(const auto &arg : values) { + double tv{0.0}; + auto comp = lexical_cast(arg, tv); + if(!comp) { + errno = 0; + auto fv = detail::to_flag_value(arg); + fail = (errno != 0); + if(fail) { + break; + } + tv = static_cast(fv); + } + val += tv; + } + if(fail) { + for(const auto &arg : values) { + output.append(arg); + } + } else { + std::ostringstream out; + out.precision(16); + out << val; + output = out.str(); + } + return output; +} + +} // namespace detail + + + +namespace detail { + +// Returns false if not a short option. Otherwise, sets opt name and rest and returns true +CLI11_INLINE bool split_short(const std::string ¤t, std::string &name, std::string &rest); + +// Returns false if not a long option. Otherwise, sets opt name and other side of = and returns true +CLI11_INLINE bool split_long(const std::string ¤t, std::string &name, std::string &value); + +// Returns false if not a windows style option. Otherwise, sets opt name and value and returns true +CLI11_INLINE bool split_windows_style(const std::string ¤t, std::string &name, std::string &value); + +// Splits a string into multiple long and short names +CLI11_INLINE std::vector split_names(std::string current); + +/// extract default flag values either {def} or starting with a ! +CLI11_INLINE std::vector> get_default_flag_values(const std::string &str); + +/// Get a vector of short names, one of long names, and a single name +CLI11_INLINE std::tuple, std::vector, std::string> +get_names(const std::vector &input, bool allow_non_standard = false); + +} // namespace detail + + + +namespace detail { + +CLI11_INLINE bool split_short(const std::string ¤t, std::string &name, std::string &rest) { + if(current.size() > 1 && current[0] == '-' && valid_first_char(current[1])) { + name = current.substr(1, 1); + rest = current.substr(2); + return true; + } + return false; +} + +CLI11_INLINE bool split_long(const std::string ¤t, std::string &name, std::string &value) { + if(current.size() > 2 && current.compare(0, 2, "--") == 0 && valid_first_char(current[2])) { + auto loc = current.find_first_of('='); + if(loc != std::string::npos) { + name = current.substr(2, loc - 2); + value = current.substr(loc + 1); + } else { + name = current.substr(2); + value = ""; + } + return true; + } + return false; +} + +CLI11_INLINE bool split_windows_style(const std::string ¤t, std::string &name, std::string &value) { + if(current.size() > 1 && current[0] == '/' && valid_first_char(current[1])) { + auto loc = current.find_first_of(':'); + if(loc != std::string::npos) { + name = current.substr(1, loc - 1); + value = current.substr(loc + 1); + } else { + name = current.substr(1); + value = ""; + } + return true; + } + return false; +} + +CLI11_INLINE std::vector split_names(std::string current) { + std::vector output; + std::size_t val = 0; + while((val = current.find(',')) != std::string::npos) { + output.push_back(trim_copy(current.substr(0, val))); + current = current.substr(val + 1); + } + output.push_back(trim_copy(current)); + return output; +} + +CLI11_INLINE std::vector> get_default_flag_values(const std::string &str) { + std::vector flags = split_names(str); + flags.erase(std::remove_if(flags.begin(), + flags.end(), + [](const std::string &name) { + return ((name.empty()) || (!(((name.find_first_of('{') != std::string::npos) && + (name.back() == '}')) || + (name[0] == '!')))); + }), + flags.end()); + std::vector> output; + output.reserve(flags.size()); + for(auto &flag : flags) { + auto def_start = flag.find_first_of('{'); + std::string defval = 
"false"; + if((def_start != std::string::npos) && (flag.back() == '}')) { + defval = flag.substr(def_start + 1); + defval.pop_back(); + flag.erase(def_start, std::string::npos); // NOLINT(readability-suspicious-call-argument) + } + flag.erase(0, flag.find_first_not_of("-!")); + output.emplace_back(flag, defval); + } + return output; +} + +CLI11_INLINE std::tuple, std::vector, std::string> +get_names(const std::vector &input, bool allow_non_standard) { + + std::vector short_names; + std::vector long_names; + std::string pos_name; + for(std::string name : input) { + if(name.length() == 0) { + continue; + } + if(name.length() > 1 && name[0] == '-' && name[1] != '-') { + if(name.length() == 2 && valid_first_char(name[1])) { + short_names.emplace_back(1, name[1]); + } else if(name.length() > 2) { + if(allow_non_standard) { + name = name.substr(1); + if(valid_name_string(name)) { + short_names.push_back(name); + } else { + throw BadNameString::BadLongName(name); + } + } else { + throw BadNameString::MissingDash(name); + } + } else { + throw BadNameString::OneCharName(name); + } + } else if(name.length() > 2 && name.substr(0, 2) == "--") { + name = name.substr(2); + if(valid_name_string(name)) { + long_names.push_back(name); + } else { + throw BadNameString::BadLongName(name); + } + } else if(name == "-" || name == "--" || name == "++") { + throw BadNameString::ReservedName(name); + } else { + if(!pos_name.empty()) { + throw BadNameString::MultiPositionalNames(name); + } + if(valid_name_string(name)) { + pos_name = name; + } else { + throw BadNameString::BadPositionalName(name); + } + } + } + return std::make_tuple(short_names, long_names, pos_name); +} + +} // namespace detail + + + +class App; + +/// Holds values to load into Options +struct ConfigItem { + /// This is the list of parents + std::vector parents{}; + + /// This is the name + std::string name{}; + /// Listing of inputs + std::vector inputs{}; + /// @brief indicator if a multiline vector separator was inserted + bool multiline{false}; + /// The list of parents and name joined by "." + CLI11_NODISCARD std::string fullname() const { + std::vector tmp = parents; + tmp.emplace_back(name); + return detail::join(tmp, "."); + (void)multiline; // suppression for cppcheck false positive + } +}; + +/// This class provides a converter for configuration files. 
+class Config { + protected: + std::vector items{}; + + public: + /// Convert an app into a configuration + virtual std::string to_config(const App *, bool, bool, std::string) const = 0; /// Convert a configuration into an app virtual std::vector from_config(std::istream &) const = 0; - /// Convert a flag to a bool - virtual std::vector to_flag(const ConfigItem &item) const { - if(item.inputs.size() == 1) { - std::string val = item.inputs.at(0); - val = detail::to_lower(val); + /// Get a flag value + CLI11_NODISCARD virtual std::string to_flag(const ConfigItem &item) const { + if(item.inputs.size() == 1) { + return item.inputs.at(0); + } + if(item.inputs.empty()) { + return "{}"; + } + throw ConversionError::TooManyInputsFlag(item.fullname()); // LCOV_EXCL_LINE + } + + /// Parse a config file, throw an error (ParseError:ConfigParseError or FileError) on failure + CLI11_NODISCARD std::vector from_file(const std::string &name) const { + std::ifstream input{name}; + if(!input.good()) + throw FileError::Missing(name); + + return from_config(input); + } + + /// Virtual destructor + virtual ~Config() = default; +}; + +/// This converter works with INI/TOML files; to write INI files use ConfigINI +class ConfigBase : public Config { + protected: + /// the character used for comments + char commentChar = '#'; + /// the character used to start an array '\0' is a default to not use + char arrayStart = '['; + /// the character used to end an array '\0' is a default to not use + char arrayEnd = ']'; + /// the character used to separate elements in an array + char arraySeparator = ','; + /// the character used separate the name from the value + char valueDelimiter = '='; + /// the character to use around strings + char stringQuote = '"'; + /// the character to use around single characters and literal strings + char literalQuote = '\''; + /// the maximum number of layers to allow + uint8_t maximumLayers{255}; + /// the separator used to separator parent layers + char parentSeparatorChar{'.'}; + /// comment default values + bool commentDefaultsBool = false; + /// specify the config reader should collapse repeated field names to a single vector + bool allowMultipleDuplicateFields{false}; + /// Specify the configuration index to use for arrayed sections + int16_t configIndex{-1}; + /// Specify the configuration section that should be used + std::string configSection{}; + + public: + std::string + to_config(const App * /*app*/, bool default_also, bool write_description, std::string prefix) const override; + + std::vector from_config(std::istream &input) const override; + /// Specify the configuration for comment characters + ConfigBase *comment(char cchar) { + commentChar = cchar; + return this; + } + /// Specify the start and end characters for an array + ConfigBase *arrayBounds(char aStart, char aEnd) { + arrayStart = aStart; + arrayEnd = aEnd; + return this; + } + /// Specify the delimiter character for an array + ConfigBase *arrayDelimiter(char aSep) { + arraySeparator = aSep; + return this; + } + /// Specify the delimiter between a name and value + ConfigBase *valueSeparator(char vSep) { + valueDelimiter = vSep; + return this; + } + /// Specify the quote characters used around strings and literal strings + ConfigBase *quoteCharacter(char qString, char literalChar) { + stringQuote = qString; + literalQuote = literalChar; + return this; + } + /// Specify the maximum number of parents + ConfigBase *maxLayers(uint8_t layers) { + maximumLayers = layers; + return this; + } + /// Specify the separator to use 
for parent layers + ConfigBase *parentSeparator(char sep) { + parentSeparatorChar = sep; + return this; + } + /// comment default value options + ConfigBase *commentDefaults(bool comDef = true) { + commentDefaultsBool = comDef; + return this; + } + /// get a reference to the configuration section + std::string §ionRef() { return configSection; } + /// get the section + CLI11_NODISCARD const std::string §ion() const { return configSection; } + /// specify a particular section of the configuration file to use + ConfigBase *section(const std::string §ionName) { + configSection = sectionName; + return this; + } + + /// get a reference to the configuration index + int16_t &indexRef() { return configIndex; } + /// get the section index + CLI11_NODISCARD int16_t index() const { return configIndex; } + /// specify a particular index in the section to use (-1) for all sections to use + ConfigBase *index(int16_t sectionIndex) { + configIndex = sectionIndex; + return this; + } + /// specify that multiple duplicate arguments should be merged even if not sequential + ConfigBase *allowDuplicateFields(bool value = true) { + allowMultipleDuplicateFields = value; + return this; + } +}; + +/// the default Config is the TOML file format +using ConfigTOML = ConfigBase; + +/// ConfigINI generates a "standard" INI compliant output +class ConfigINI : public ConfigTOML { + + public: + ConfigINI() { + commentChar = ';'; + arrayStart = '\0'; + arrayEnd = '\0'; + arraySeparator = ' '; + valueDelimiter = '='; + } +}; + + + +class Option; + +/// @defgroup validator_group Validators + +/// @brief Some validators that are provided +/// +/// These are simple `std::string(const std::string&)` validators that are useful. They return +/// a string if the validation fails. A custom struct is provided, as well, with the same user +/// semantics, but with the ability to provide a new type name. +/// @{ + +/// +class Validator { + protected: + /// This is the description function, if empty the description_ will be used + std::function desc_function_{[]() { return std::string{}; }}; + + /// This is the base function that is to be called. + /// Returns a string error message if validation fails. 
+ std::function func_{[](std::string &) { return std::string{}; }}; + /// The name for search purposes of the Validator + std::string name_{}; + /// A Validator will only apply to an indexed value (-1 is all elements) + int application_index_ = -1; + /// Enable for Validator to allow it to be disabled if need be + bool active_{true}; + /// specify that a validator should not modify the input + bool non_modifying_{false}; + + Validator(std::string validator_desc, std::function func) + : desc_function_([validator_desc]() { return validator_desc; }), func_(std::move(func)) {} + + public: + Validator() = default; + /// Construct a Validator with just the description string + explicit Validator(std::string validator_desc) : desc_function_([validator_desc]() { return validator_desc; }) {} + /// Construct Validator from basic information + Validator(std::function op, std::string validator_desc, std::string validator_name = "") + : desc_function_([validator_desc]() { return validator_desc; }), func_(std::move(op)), + name_(std::move(validator_name)) {} + /// Set the Validator operation function + Validator &operation(std::function op) { + func_ = std::move(op); + return *this; + } + /// This is the required operator for a Validator - provided to help + /// users (CLI11 uses the member `func` directly) + std::string operator()(std::string &str) const; + + /// This is the required operator for a Validator - provided to help + /// users (CLI11 uses the member `func` directly) + std::string operator()(const std::string &str) const { + std::string value = str; + return (active_) ? func_(value) : std::string{}; + } + + /// Specify the type string + Validator &description(std::string validator_desc) { + desc_function_ = [validator_desc]() { return validator_desc; }; + return *this; + } + /// Specify the type string + CLI11_NODISCARD Validator description(std::string validator_desc) const; + + /// Generate type description information for the Validator + CLI11_NODISCARD std::string get_description() const { + if(active_) { + return desc_function_(); + } + return std::string{}; + } + /// Specify the type string + Validator &name(std::string validator_name) { + name_ = std::move(validator_name); + return *this; + } + /// Specify the type string + CLI11_NODISCARD Validator name(std::string validator_name) const { + Validator newval(*this); + newval.name_ = std::move(validator_name); + return newval; + } + /// Get the name of the Validator + CLI11_NODISCARD const std::string &get_name() const { return name_; } + /// Specify whether the Validator is active or not + Validator &active(bool active_val = true) { + active_ = active_val; + return *this; + } + /// Specify whether the Validator is active or not + CLI11_NODISCARD Validator active(bool active_val = true) const { + Validator newval(*this); + newval.active_ = active_val; + return newval; + } + + /// Specify whether the Validator can be modifying or not + Validator &non_modifying(bool no_modify = true) { + non_modifying_ = no_modify; + return *this; + } + /// Specify the application index of a validator + Validator &application_index(int app_index) { + application_index_ = app_index; + return *this; + } + /// Specify the application index of a validator + CLI11_NODISCARD Validator application_index(int app_index) const { + Validator newval(*this); + newval.application_index_ = app_index; + return newval; + } + /// Get the current value of the application index + CLI11_NODISCARD int get_application_index() const { return application_index_; } + /// Get a 
boolean if the validator is active + CLI11_NODISCARD bool get_active() const { return active_; } + + /// Get a boolean if the validator is allowed to modify the input returns true if it can modify the input + CLI11_NODISCARD bool get_modifying() const { return !non_modifying_; } + + /// Combining validators is a new validator. Type comes from left validator if function, otherwise only set if the + /// same. + Validator operator&(const Validator &other) const; + + /// Combining validators is a new validator. Type comes from left validator if function, otherwise only set if the + /// same. + Validator operator|(const Validator &other) const; + + /// Create a validator that fails when a given validator succeeds + Validator operator!() const; + + private: + void _merge_description(const Validator &val1, const Validator &val2, const std::string &merger); +}; + +/// Class wrapping some of the accessors of Validator +class CustomValidator : public Validator { + public: +}; +// The implementation of the built in validators is using the Validator class; +// the user is only expected to use the const (static) versions (since there's no setup). +// Therefore, this is in detail. +namespace detail { + +/// CLI enumeration of different file types +enum class path_type { nonexistent, file, directory }; + +/// get the type of the path from a file name +CLI11_INLINE path_type check_path(const char *file) noexcept; + +/// Check for an existing file (returns error message if check fails) +class ExistingFileValidator : public Validator { + public: + ExistingFileValidator(); +}; + +/// Check for an existing directory (returns error message if check fails) +class ExistingDirectoryValidator : public Validator { + public: + ExistingDirectoryValidator(); +}; + +/// Check for an existing path +class ExistingPathValidator : public Validator { + public: + ExistingPathValidator(); +}; + +/// Check for an non-existing path +class NonexistentPathValidator : public Validator { + public: + NonexistentPathValidator(); +}; + +/// Validate the given string is a legal ipv4 address +class IPV4Validator : public Validator { + public: + IPV4Validator(); +}; + +class EscapedStringTransformer : public Validator { + public: + EscapedStringTransformer(); +}; + +} // namespace detail + +// Static is not needed here, because global const implies static. 
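+// (Editorial usage sketch, not part of the original header: the constants declared below
+// are attached to options through Option::check, e.g.
+//   app.add_option("-f,--file", filename)->check(CLI::ExistingFile);
+// where `app` and `filename` are assumed to exist in the caller's code.)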
+ +/// Check for existing file (returns error message if check fails) +const detail::ExistingFileValidator ExistingFile; + +/// Check for an existing directory (returns error message if check fails) +const detail::ExistingDirectoryValidator ExistingDirectory; + +/// Check for an existing path +const detail::ExistingPathValidator ExistingPath; + +/// Check for an non-existing path +const detail::NonexistentPathValidator NonexistentPath; + +/// Check for an IP4 address +const detail::IPV4Validator ValidIPV4; + +/// convert escaped characters into their associated values +const detail::EscapedStringTransformer EscapedString; + +/// Validate the input as a particular type +template class TypeValidator : public Validator { + public: + explicit TypeValidator(const std::string &validator_name) + : Validator(validator_name, [](std::string &input_string) { + using CLI::detail::lexical_cast; + auto val = DesiredType(); + if(!lexical_cast(input_string, val)) { + return std::string("Failed parsing ") + input_string + " as a " + detail::type_name(); + } + return std::string(); + }) {} + TypeValidator() : TypeValidator(detail::type_name()) {} +}; + +/// Check for a number +const TypeValidator Number("NUMBER"); + +/// Modify a path if the file is a particular default location, can be used as Check or transform +/// with the error return optionally disabled +class FileOnDefaultPath : public Validator { + public: + explicit FileOnDefaultPath(std::string default_path, bool enableErrorReturn = true); +}; + +/// Produce a range (factory). Min and max are inclusive. +class Range : public Validator { + public: + /// This produces a range with min and max inclusive. + /// + /// Note that the constructor is templated, but the struct is not, so C++17 is not + /// needed to provide nice syntax for Range(a,b). + template + Range(T min_val, T max_val, const std::string &validator_name = std::string{}) : Validator(validator_name) { + if(validator_name.empty()) { + std::stringstream out; + out << detail::type_name() << " in [" << min_val << " - " << max_val << "]"; + description(out.str()); + } + + func_ = [min_val, max_val](std::string &input) { + using CLI::detail::lexical_cast; + T val; + bool converted = lexical_cast(input, val); + if((!converted) || (val < min_val || val > max_val)) { + std::stringstream out; + out << "Value " << input << " not in range ["; + out << min_val << " - " << max_val << "]"; + return out.str(); + } + return std::string{}; + }; + } + + /// Range of one value is 0 to value + template + explicit Range(T max_val, const std::string &validator_name = std::string{}) + : Range(static_cast(0), max_val, validator_name) {} +}; + +/// Check for a non negative number +const Range NonNegativeNumber((std::numeric_limits::max)(), "NONNEGATIVE"); + +/// Check for a positive valued number (val>0.0), ::min here is the smallest positive number +const Range PositiveNumber((std::numeric_limits::min)(), (std::numeric_limits::max)(), "POSITIVE"); + +/// Produce a bounded range (factory). Min and max are inclusive. +class Bound : public Validator { + public: + /// This bounds a value with min and max inclusive. + /// + /// Note that the constructor is templated, but the struct is not, so C++17 is not + /// needed to provide nice syntax for Range(a,b). 
+ template Bound(T min_val, T max_val) { + std::stringstream out; + out << detail::type_name() << " bounded to [" << min_val << " - " << max_val << "]"; + description(out.str()); + + func_ = [min_val, max_val](std::string &input) { + using CLI::detail::lexical_cast; + T val; + bool converted = lexical_cast(input, val); + if(!converted) { + return std::string("Value ") + input + " could not be converted"; + } + if(val < min_val) + input = detail::to_string(min_val); + else if(val > max_val) + input = detail::to_string(max_val); + + return std::string{}; + }; + } + + /// Range of one value is 0 to value + template explicit Bound(T max_val) : Bound(static_cast(0), max_val) {} +}; + +namespace detail { +template ::type>::value, detail::enabler> = detail::dummy> +auto smart_deref(T value) -> decltype(*value) { + return *value; +} + +template < + typename T, + enable_if_t::type>::value, detail::enabler> = detail::dummy> +typename std::remove_reference::type &smart_deref(T &value) { + return value; +} +/// Generate a string representation of a set +template std::string generate_set(const T &set) { + using element_t = typename detail::element_type::type; + using iteration_type_t = typename detail::pair_adaptor::value_type; // the type of the object pair + std::string out(1, '{'); + out.append(detail::join( + detail::smart_deref(set), + [](const iteration_type_t &v) { return detail::pair_adaptor::first(v); }, + ",")); + out.push_back('}'); + return out; +} + +/// Generate a string representation of a map +template std::string generate_map(const T &map, bool key_only = false) { + using element_t = typename detail::element_type::type; + using iteration_type_t = typename detail::pair_adaptor::value_type; // the type of the object pair + std::string out(1, '{'); + out.append(detail::join( + detail::smart_deref(map), + [key_only](const iteration_type_t &v) { + std::string res{detail::to_string(detail::pair_adaptor::first(v))}; + + if(!key_only) { + res.append("->"); + res += detail::to_string(detail::pair_adaptor::second(v)); + } + return res; + }, + ",")); + out.push_back('}'); + return out; +} + +template struct has_find { + template + static auto test(int) -> decltype(std::declval().find(std::declval()), std::true_type()); + template static auto test(...) 
-> decltype(std::false_type()); + + static const auto value = decltype(test(0))::value; + using type = std::integral_constant; +}; + +/// A search function +template ::value, detail::enabler> = detail::dummy> +auto search(const T &set, const V &val) -> std::pair { + using element_t = typename detail::element_type::type; + auto &setref = detail::smart_deref(set); + auto it = std::find_if(std::begin(setref), std::end(setref), [&val](decltype(*std::begin(setref)) v) { + return (detail::pair_adaptor::first(v) == val); + }); + return {(it != std::end(setref)), it}; +} + +/// A search function that uses the built in find function +template ::value, detail::enabler> = detail::dummy> +auto search(const T &set, const V &val) -> std::pair { + auto &setref = detail::smart_deref(set); + auto it = setref.find(val); + return {(it != std::end(setref)), it}; +} + +/// A search function with a filter function +template +auto search(const T &set, const V &val, const std::function &filter_function) + -> std::pair { + using element_t = typename detail::element_type::type; + // do the potentially faster first search + auto res = search(set, val); + if((res.first) || (!(filter_function))) { + return res; + } + // if we haven't found it do the longer linear search with all the element translations + auto &setref = detail::smart_deref(set); + auto it = std::find_if(std::begin(setref), std::end(setref), [&](decltype(*std::begin(setref)) v) { + V a{detail::pair_adaptor::first(v)}; + a = filter_function(a); + return (a == val); + }); + return {(it != std::end(setref)), it}; +} + +// the following suggestion was made by Nikita Ofitserov(@himikof) +// done in templates to prevent compiler warnings on negation of unsigned numbers + +/// Do a check for overflow on signed numbers +template +inline typename std::enable_if::value, T>::type overflowCheck(const T &a, const T &b) { + if((a > 0) == (b > 0)) { + return ((std::numeric_limits::max)() / (std::abs)(a) < (std::abs)(b)); + } + return ((std::numeric_limits::min)() / (std::abs)(a) > -(std::abs)(b)); +} +/// Do a check for overflow on unsigned numbers +template +inline typename std::enable_if::value, T>::type overflowCheck(const T &a, const T &b) { + return ((std::numeric_limits::max)() / a < b); +} + +/// Performs a *= b; if it doesn't cause integer overflow. Returns false otherwise. +template typename std::enable_if::value, bool>::type checked_multiply(T &a, T b) { + if(a == 0 || b == 0 || a == 1 || b == 1) { + a *= b; + return true; + } + if(a == (std::numeric_limits::min)() || b == (std::numeric_limits::min)()) { + return false; + } + if(overflowCheck(a, b)) { + return false; + } + a *= b; + return true; +} + +/// Performs a *= b; if it doesn't equal infinity. Returns false otherwise. +template +typename std::enable_if::value, bool>::type checked_multiply(T &a, T b) { + T c = a * b; + if(std::isinf(c) && !std::isinf(a) && !std::isinf(b)) { + return false; + } + a = c; + return true; +} + +} // namespace detail +/// Verify items are in a set +class IsMember : public Validator { + public: + using filter_fn_t = std::function; + + /// This allows in-place construction using an initializer list + template + IsMember(std::initializer_list values, Args &&...args) + : IsMember(std::vector(values), std::forward(args)...) {} + + /// This checks to see if an item is in a set (empty function) + template explicit IsMember(T &&set) : IsMember(std::forward(set), nullptr) {} + + /// This checks to see if an item is in a set: pointer or copy version. 
You can pass in a function that will filter + /// both sides of the comparison before computing the comparison. + template explicit IsMember(T set, F filter_function) { + + // Get the type of the contained item - requires a container have ::value_type + // if the type does not have first_type and second_type, these are both value_type + using element_t = typename detail::element_type::type; // Removes (smart) pointers if needed + using item_t = typename detail::pair_adaptor::first_type; // Is value_type if not a map + + using local_item_t = typename IsMemberType::type; // This will convert bad types to good ones + // (const char * to std::string) + + // Make a local copy of the filter function, using a std::function if not one already + std::function filter_fn = filter_function; + + // This is the type name for help, it will take the current version of the set contents + desc_function_ = [set]() { return detail::generate_set(detail::smart_deref(set)); }; + + // This is the function that validates + // It stores a copy of the set pointer-like, so shared_ptr will stay alive + func_ = [set, filter_fn](std::string &input) { + using CLI::detail::lexical_cast; + local_item_t b; + if(!lexical_cast(input, b)) { + throw ValidationError(input); // name is added later + } + if(filter_fn) { + b = filter_fn(b); + } + auto res = detail::search(set, b, filter_fn); + if(res.first) { + // Make sure the version in the input string is identical to the one in the set + if(filter_fn) { + input = detail::value_string(detail::pair_adaptor::first(*(res.second))); + } + + // Return empty error string (success) + return std::string{}; + } + + // If you reach this point, the result was not found + return input + " not in " + detail::generate_set(detail::smart_deref(set)); + }; + } + + /// You can pass in as many filter functions as you like, they nest (string only currently) + template + IsMember(T &&set, filter_fn_t filter_fn_1, filter_fn_t filter_fn_2, Args &&...other) + : IsMember( + std::forward(set), + [filter_fn_1, filter_fn_2](std::string a) { return filter_fn_2(filter_fn_1(a)); }, + other...) {} +}; + +/// definition of the default transformation object +template using TransformPairs = std::vector>; + +/// Translate named items to other or a value set +class Transformer : public Validator { + public: + using filter_fn_t = std::function; + + /// This allows in-place construction + template + Transformer(std::initializer_list> values, Args &&...args) + : Transformer(TransformPairs(values), std::forward(args)...) {} + + /// direct map of std::string to std::string + template explicit Transformer(T &&mapping) : Transformer(std::forward(mapping), nullptr) {} + + /// This checks to see if an item is in a set: pointer or copy version. You can pass in a function that will filter + /// both sides of the comparison before computing the comparison. 
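+    /// (Editorial sketch, not part of the original header: a typical use rewrites named
+    /// inputs before conversion, e.g.
+    ///   app.add_option("--level", level)
+    ///       ->transform(CLI::Transformer(std::map<std::string, int>{{"low", 1}, {"high", 2}}));
+    /// so "--level low" stores 1; `app` and `level` are assumed.)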
+ template explicit Transformer(T mapping, F filter_function) { + + static_assert(detail::pair_adaptor::type>::value, + "mapping must produce value pairs"); + // Get the type of the contained item - requires a container have ::value_type + // if the type does not have first_type and second_type, these are both value_type + using element_t = typename detail::element_type::type; // Removes (smart) pointers if needed + using item_t = typename detail::pair_adaptor::first_type; // Is value_type if not a map + using local_item_t = typename IsMemberType::type; // Will convert bad types to good ones + // (const char * to std::string) + + // Make a local copy of the filter function, using a std::function if not one already + std::function filter_fn = filter_function; + + // This is the type name for help, it will take the current version of the set contents + desc_function_ = [mapping]() { return detail::generate_map(detail::smart_deref(mapping)); }; + + func_ = [mapping, filter_fn](std::string &input) { + using CLI::detail::lexical_cast; + local_item_t b; + if(!lexical_cast(input, b)) { + return std::string(); + // there is no possible way we can match anything in the mapping if we can't convert so just return + } + if(filter_fn) { + b = filter_fn(b); + } + auto res = detail::search(mapping, b, filter_fn); + if(res.first) { + input = detail::value_string(detail::pair_adaptor::second(*res.second)); + } + return std::string{}; + }; + } + + /// You can pass in as many filter functions as you like, they nest + template + Transformer(T &&mapping, filter_fn_t filter_fn_1, filter_fn_t filter_fn_2, Args &&...other) + : Transformer( + std::forward(mapping), + [filter_fn_1, filter_fn_2](std::string a) { return filter_fn_2(filter_fn_1(a)); }, + other...) {} +}; + +/// translate named items to other or a value set +class CheckedTransformer : public Validator { + public: + using filter_fn_t = std::function; + + /// This allows in-place construction + template + CheckedTransformer(std::initializer_list> values, Args &&...args) + : CheckedTransformer(TransformPairs(values), std::forward(args)...) {} + + /// direct map of std::string to std::string + template explicit CheckedTransformer(T mapping) : CheckedTransformer(std::move(mapping), nullptr) {} + + /// This checks to see if an item is in a set: pointer or copy version. You can pass in a function that will filter + /// both sides of the comparison before computing the comparison. 
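+    /// (Editorial note, not part of the original header: unlike Transformer, an input that
+    /// matches neither a key nor an already-converted result fails validation, e.g.
+    ///   app.add_option("--mode", mode)->transform(CLI::CheckedTransformer(mode_map, CLI::ignore_case));
+    /// with `app`, `mode`, and `mode_map` assumed.)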
+    template <typename T, typename F> explicit CheckedTransformer(T mapping, F filter_function) {
+
+        static_assert(detail::pair_adaptor<typename detail::element_type<T>::type>::value,
+                      "mapping must produce value pairs");
+        // Get the type of the contained item - requires a container have ::value_type
+        // if the type does not have first_type and second_type, these are both value_type
+        using element_t = typename detail::element_type<T>::type;             // Removes (smart) pointers if needed
+        using item_t = typename detail::pair_adaptor<element_t>::first_type;  // Is value_type if not a map
+        using local_item_t = typename IsMemberType<item_t>::type;             // Will convert bad types to good ones
+                                                                              // (const char * to std::string)
+        using iteration_type_t = typename detail::pair_adaptor<element_t>::value_type;  // the type of the object pair
+
+        // Make a local copy of the filter function, using a std::function if not one already
+        std::function<local_item_t(local_item_t)> filter_fn = filter_function;
+
+        auto tfunc = [mapping]() {
+            std::string out("value in ");
+            out += detail::generate_map(detail::smart_deref(mapping)) + " OR {";
+            out += detail::join(
+                detail::smart_deref(mapping),
+                [](const iteration_type_t &v) { return detail::to_string(detail::pair_adaptor<element_t>::second(v)); },
+                ",");
+            out.push_back('}');
+            return out;
+        };
+
+        desc_function_ = tfunc;
+
+        func_ = [mapping, tfunc, filter_fn](std::string &input) {
+            using CLI::detail::lexical_cast;
+            local_item_t b;
+            bool converted = lexical_cast(input, b);
+            if(converted) {
+                if(filter_fn) {
+                    b = filter_fn(b);
+                }
+                auto res = detail::search(mapping, b, filter_fn);
+                if(res.first) {
+                    input = detail::value_string(detail::pair_adaptor<element_t>::second(*res.second));
+                    return std::string{};
+                }
+            }
+            for(const auto &v : detail::smart_deref(mapping)) {
+                auto output_string = detail::value_string(detail::pair_adaptor<element_t>::second(v));
+                if(output_string == input) {
+                    return std::string();
+                }
+            }
+
+            return "Check " + input + " " + tfunc() + " FAILED";
+        };
+    }
+
+    /// You can pass in as many filter functions as you like, they nest
+    template <typename T, typename... Args>
+    CheckedTransformer(T &&mapping, filter_fn_t filter_fn_1, filter_fn_t filter_fn_2, Args &&...other)
+        : CheckedTransformer(
+              std::forward<T>(mapping),
+              [filter_fn_1, filter_fn_2](std::string a) { return filter_fn_2(filter_fn_1(a)); },
+              other...) {}
+};
+
+/// Helper function to allow ignore_case to be passed to IsMember or Transform
+inline std::string ignore_case(std::string item) { return detail::to_lower(item); }
+
+/// Helper function to allow ignore_underscore to be passed to IsMember or Transform
+inline std::string ignore_underscore(std::string item) { return detail::remove_underscore(item); }
+
+/// Helper function to allow checks to ignore spaces to be passed to IsMember or Transform
+inline std::string ignore_space(std::string item) {
+    item.erase(std::remove(std::begin(item), std::end(item), ' '), std::end(item));
+    item.erase(std::remove(std::begin(item), std::end(item), '\t'), std::end(item));
+    return item;
+}
+
+/// Multiply a number by a factor using a given mapping.
+/// Can be used to write transforms for SIZE or DURATION inputs.
+///
+/// Example:
+/// With mapping = `{"b"->1, "kb"->1024, "mb"->1024*1024}`
+/// one can recognize inputs like "100", "12kb", "100 MB",
+/// that will be automatically transformed to 100, 12288, 104857600.
+///
+/// Output number type matches the type in the provided mapping.
+/// Therefore, if it is required to interpret real inputs like "0.42 s",
+/// the mapping should be of a type <std::string, float> or <std::string, double>.
+class AsNumberWithUnit : public Validator {
+  public:
+    /// Adjust AsNumberWithUnit behavior. 
+ /// CASE_SENSITIVE/CASE_INSENSITIVE controls how units are matched. + /// UNIT_OPTIONAL/UNIT_REQUIRED throws ValidationError + /// if UNIT_REQUIRED is set and unit literal is not found. + enum Options { + CASE_SENSITIVE = 0, + CASE_INSENSITIVE = 1, + UNIT_OPTIONAL = 0, + UNIT_REQUIRED = 2, + DEFAULT = CASE_INSENSITIVE | UNIT_OPTIONAL + }; + + template + explicit AsNumberWithUnit(std::map mapping, + Options opts = DEFAULT, + const std::string &unit_name = "UNIT") { + description(generate_description(unit_name, opts)); + validate_mapping(mapping, opts); + + // transform function + func_ = [mapping, opts](std::string &input) -> std::string { + Number num{}; + + detail::rtrim(input); + if(input.empty()) { + throw ValidationError("Input is empty"); + } + + // Find split position between number and prefix + auto unit_begin = input.end(); + while(unit_begin > input.begin() && std::isalpha(*(unit_begin - 1), std::locale())) { + --unit_begin; + } + + std::string unit{unit_begin, input.end()}; + input.resize(static_cast(std::distance(input.begin(), unit_begin))); + detail::trim(input); + + if(opts & UNIT_REQUIRED && unit.empty()) { + throw ValidationError("Missing mandatory unit"); + } + if(opts & CASE_INSENSITIVE) { + unit = detail::to_lower(unit); + } + if(unit.empty()) { + using CLI::detail::lexical_cast; + if(!lexical_cast(input, num)) { + throw ValidationError(std::string("Value ") + input + " could not be converted to " + + detail::type_name()); + } + // No need to modify input if no unit passed + return {}; + } + + // find corresponding factor + auto it = mapping.find(unit); + if(it == mapping.end()) { + throw ValidationError(unit + + " unit not recognized. " + "Allowed values: " + + detail::generate_map(mapping, true)); + } + + if(!input.empty()) { + using CLI::detail::lexical_cast; + bool converted = lexical_cast(input, num); + if(!converted) { + throw ValidationError(std::string("Value ") + input + " could not be converted to " + + detail::type_name()); + } + // perform safe multiplication + bool ok = detail::checked_multiply(num, it->second); + if(!ok) { + throw ValidationError(detail::to_string(num) + " multiplied by " + unit + + " factor would cause number overflow. Use smaller value."); + } + } else { + num = static_cast(it->second); + } + + input = detail::to_string(num); + + return {}; + }; + } + + private: + /// Check that mapping contains valid units. + /// Update mapping for CASE_INSENSITIVE mode. 
+    template <typename Number> static void validate_mapping(std::map<std::string, Number> &mapping, Options opts) {
+        for(auto &kv : mapping) {
+            if(kv.first.empty()) {
+                throw ValidationError("Unit must not be empty.");
+            }
+            if(!detail::isalpha(kv.first)) {
+                throw ValidationError("Unit must contain only letters.");
+            }
+        }
+
+        // make all units lowercase if CASE_INSENSITIVE
+        if(opts & CASE_INSENSITIVE) {
+            std::map<std::string, Number> lower_mapping;
+            for(auto &kv : mapping) {
+                auto s = detail::to_lower(kv.first);
+                if(lower_mapping.count(s)) {
+                    throw ValidationError(std::string("Several matching lowercase unit representations are found: ") +
+                                          s);
+                }
+                lower_mapping[detail::to_lower(kv.first)] = kv.second;
+            }
+            mapping = std::move(lower_mapping);
+        }
+    }
+
+    /// Generate description like this: NUMBER [UNIT]
+    template <typename Number> static std::string generate_description(const std::string &name, Options opts) {
+        std::stringstream out;
+        out << detail::type_name<Number>() << ' ';
+        if(opts & UNIT_REQUIRED) {
+            out << name;
+        } else {
+            out << '[' << name << ']';
+        }
+        return out.str();
+    }
+};
+
+inline AsNumberWithUnit::Options operator|(const AsNumberWithUnit::Options &a, const AsNumberWithUnit::Options &b) {
+    return static_cast<AsNumberWithUnit::Options>(static_cast<int>(a) | static_cast<int>(b));
+}
+
+/// Converts a human-readable size string (with unit literal) to std::uint64_t size.
+/// Example:
+///   "100" => 100
+///   "1 b" => 1
+///   "10Kb" => 10240 // you can configure this to be interpreted as kilobyte (*1000) or kibibyte (*1024)
+///   "10 KB" => 10240
+///   "10 kb" => 10240
+///   "10 kib" => 10240 // *i, *ib are always interpreted as *bibyte (*1024)
+///   "10kb" => 10240
+///   "2 MB" => 2097152
+///   "2 EiB" => 2^61 // Units up to exbibyte are supported
+class AsSizeValue : public AsNumberWithUnit {
+  public:
+    using result_t = std::uint64_t;
+
+    /// If kb_is_1000 is true,
+    /// interpret 'kb', 'k' as 1000 and 'kib', 'ki' as 1024
+    /// (same applies to higher order units as well).
+    /// Otherwise, interpret all literals as factors of 1024.
+    /// The first option is formally correct, but
+    /// the second interpretation is more widespread
+    /// (see https://en.wikipedia.org/wiki/Binary_prefix).
+    explicit AsSizeValue(bool kb_is_1000);
+
+  private:
+    /// Get <size unit, factor> mapping
+    static std::map<std::string, result_t> init_mapping(bool kb_is_1000);
+
+    /// Cache calculated mapping
+    static std::map<std::string, result_t> get_mapping(bool kb_is_1000);
+};
+
+namespace detail {
+/// Split a string into a program name and command line arguments;
+/// the string is assumed to contain a file name followed by other arguments.
+/// The return value is a pair with the first member containing the program name and the second
+/// everything else. 
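+/// (Editorial illustration, not part of the original header: given the command line
+/// `./myapp --count 3`, the expected split is {"./myapp", "--count 3"}, provided
+/// "./myapp" names an existing file; see the implementation below.)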
+CLI11_INLINE std::pair split_program_name(std::string commandline); + +} // namespace detail +/// @} + + + + +CLI11_INLINE std::string Validator::operator()(std::string &str) const { + std::string retstring; + if(active_) { + if(non_modifying_) { + std::string value = str; + retstring = func_(value); + } else { + retstring = func_(str); + } + } + return retstring; +} + +CLI11_NODISCARD CLI11_INLINE Validator Validator::description(std::string validator_desc) const { + Validator newval(*this); + newval.desc_function_ = [validator_desc]() { return validator_desc; }; + return newval; +} + +CLI11_INLINE Validator Validator::operator&(const Validator &other) const { + Validator newval; + + newval._merge_description(*this, other, " AND "); + + // Give references (will make a copy in lambda function) + const std::function &f1 = func_; + const std::function &f2 = other.func_; + + newval.func_ = [f1, f2](std::string &input) { + std::string s1 = f1(input); + std::string s2 = f2(input); + if(!s1.empty() && !s2.empty()) + return std::string("(") + s1 + ") AND (" + s2 + ")"; + return s1 + s2; + }; + + newval.active_ = active_ && other.active_; + newval.application_index_ = application_index_; + return newval; +} + +CLI11_INLINE Validator Validator::operator|(const Validator &other) const { + Validator newval; + + newval._merge_description(*this, other, " OR "); + + // Give references (will make a copy in lambda function) + const std::function &f1 = func_; + const std::function &f2 = other.func_; + + newval.func_ = [f1, f2](std::string &input) { + std::string s1 = f1(input); + std::string s2 = f2(input); + if(s1.empty() || s2.empty()) + return std::string(); + + return std::string("(") + s1 + ") OR (" + s2 + ")"; + }; + newval.active_ = active_ && other.active_; + newval.application_index_ = application_index_; + return newval; +} + +CLI11_INLINE Validator Validator::operator!() const { + Validator newval; + const std::function &dfunc1 = desc_function_; + newval.desc_function_ = [dfunc1]() { + auto str = dfunc1(); + return (!str.empty()) ? 
std::string("NOT ") + str : std::string{}; + }; + // Give references (will make a copy in lambda function) + const std::function &f1 = func_; + + newval.func_ = [f1, dfunc1](std::string &test) -> std::string { + std::string s1 = f1(test); + if(s1.empty()) { + return std::string("check ") + dfunc1() + " succeeded improperly"; + } + return std::string{}; + }; + newval.active_ = active_; + newval.application_index_ = application_index_; + return newval; +} + +CLI11_INLINE void +Validator::_merge_description(const Validator &val1, const Validator &val2, const std::string &merger) { + + const std::function &dfunc1 = val1.desc_function_; + const std::function &dfunc2 = val2.desc_function_; + + desc_function_ = [=]() { + std::string f1 = dfunc1(); + std::string f2 = dfunc2(); + if((f1.empty()) || (f2.empty())) { + return f1 + f2; + } + return std::string(1, '(') + f1 + ')' + merger + '(' + f2 + ')'; + }; +} + +namespace detail { + +#if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0 +CLI11_INLINE path_type check_path(const char *file) noexcept { + std::error_code ec; + auto stat = std::filesystem::status(to_path(file), ec); + if(ec) { + return path_type::nonexistent; + } + switch(stat.type()) { + case std::filesystem::file_type::none: // LCOV_EXCL_LINE + case std::filesystem::file_type::not_found: + return path_type::nonexistent; // LCOV_EXCL_LINE + case std::filesystem::file_type::directory: + return path_type::directory; + case std::filesystem::file_type::symlink: + case std::filesystem::file_type::block: + case std::filesystem::file_type::character: + case std::filesystem::file_type::fifo: + case std::filesystem::file_type::socket: + case std::filesystem::file_type::regular: + case std::filesystem::file_type::unknown: + default: + return path_type::file; + } +} +#else +CLI11_INLINE path_type check_path(const char *file) noexcept { +#if defined(_MSC_VER) + struct __stat64 buffer; + if(_stat64(file, &buffer) == 0) { + return ((buffer.st_mode & S_IFDIR) != 0) ? path_type::directory : path_type::file; + } +#else + struct stat buffer; + if(stat(file, &buffer) == 0) { + return ((buffer.st_mode & S_IFDIR) != 0) ? 
path_type::directory : path_type::file; + } +#endif + return path_type::nonexistent; +} +#endif + +CLI11_INLINE ExistingFileValidator::ExistingFileValidator() : Validator("FILE") { + func_ = [](std::string &filename) { + auto path_result = check_path(filename.c_str()); + if(path_result == path_type::nonexistent) { + return "File does not exist: " + filename; + } + if(path_result == path_type::directory) { + return "File is actually a directory: " + filename; + } + return std::string(); + }; +} + +CLI11_INLINE ExistingDirectoryValidator::ExistingDirectoryValidator() : Validator("DIR") { + func_ = [](std::string &filename) { + auto path_result = check_path(filename.c_str()); + if(path_result == path_type::nonexistent) { + return "Directory does not exist: " + filename; + } + if(path_result == path_type::file) { + return "Directory is actually a file: " + filename; + } + return std::string(); + }; +} + +CLI11_INLINE ExistingPathValidator::ExistingPathValidator() : Validator("PATH(existing)") { + func_ = [](std::string &filename) { + auto path_result = check_path(filename.c_str()); + if(path_result == path_type::nonexistent) { + return "Path does not exist: " + filename; + } + return std::string(); + }; +} + +CLI11_INLINE NonexistentPathValidator::NonexistentPathValidator() : Validator("PATH(non-existing)") { + func_ = [](std::string &filename) { + auto path_result = check_path(filename.c_str()); + if(path_result != path_type::nonexistent) { + return "Path already exists: " + filename; + } + return std::string(); + }; +} + +CLI11_INLINE IPV4Validator::IPV4Validator() : Validator("IPV4") { + func_ = [](std::string &ip_addr) { + auto result = CLI::detail::split(ip_addr, '.'); + if(result.size() != 4) { + return std::string("Invalid IPV4 address must have four parts (") + ip_addr + ')'; + } + int num = 0; + for(const auto &var : result) { + using CLI::detail::lexical_cast; + bool retval = lexical_cast(var, num); + if(!retval) { + return std::string("Failed parsing number (") + var + ')'; + } + if(num < 0 || num > 255) { + return std::string("Each IP number must be between 0 and 255 ") + var; + } + } + return std::string{}; + }; +} + +CLI11_INLINE EscapedStringTransformer::EscapedStringTransformer() { + func_ = [](std::string &str) { + try { + if(str.size() > 1 && (str.front() == '\"' || str.front() == '\'' || str.front() == '`') && + str.front() == str.back()) { + process_quoted_string(str); + } else if(str.find_first_of('\\') != std::string::npos) { + if(detail::is_binary_escaped_string(str)) { + str = detail::extract_binary_string(str); + } else { + str = remove_escaped_characters(str); + } + } + return std::string{}; + } catch(const std::invalid_argument &ia) { + return std::string(ia.what()); + } + }; +} +} // namespace detail + +CLI11_INLINE FileOnDefaultPath::FileOnDefaultPath(std::string default_path, bool enableErrorReturn) + : Validator("FILE") { + func_ = [default_path, enableErrorReturn](std::string &filename) { + auto path_result = detail::check_path(filename.c_str()); + if(path_result == detail::path_type::nonexistent) { + std::string test_file_path = default_path; + if(default_path.back() != '/' && default_path.back() != '\\') { + // Add folder separator + test_file_path += '/'; + } + test_file_path.append(filename); + path_result = detail::check_path(test_file_path.c_str()); + if(path_result == detail::path_type::file) { + filename = test_file_path; + } else { + if(enableErrorReturn) { + return "File does not exist: " + filename; + } + } + } + return std::string{}; + }; +} + 
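+// (Editorial usage sketch, not part of the original header: AsSizeValue is applied as a
+// transform so the option receives the expanded byte count, e.g.
+//   std::uint64_t size{0};
+//   app.add_option("--size", size)->transform(CLI::AsSizeValue(false));  // "10kb" -> 10240
+// where `app` is an assumed CLI::App instance.)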
+CLI11_INLINE AsSizeValue::AsSizeValue(bool kb_is_1000) : AsNumberWithUnit(get_mapping(kb_is_1000)) {
+    if(kb_is_1000) {
+        description("SIZE [b, kb(=1000b), kib(=1024b), ...]");
+    } else {
+        description("SIZE [b, kb(=1024b), ...]");
+    }
+}
+
+CLI11_INLINE std::map<std::string, AsSizeValue::result_t> AsSizeValue::init_mapping(bool kb_is_1000) {
+    std::map<std::string, result_t> m;
+    result_t k_factor = kb_is_1000 ? 1000 : 1024;
+    result_t ki_factor = 1024;
+    result_t k = 1;
+    result_t ki = 1;
+    m["b"] = 1;
+    for(std::string p : {"k", "m", "g", "t", "p", "e"}) {
+        k *= k_factor;
+        ki *= ki_factor;
+        m[p] = k;
+        m[p + "b"] = k;
+        m[p + "i"] = ki;
+        m[p + "ib"] = ki;
+    }
+    return m;
+}
+
+CLI11_INLINE std::map<std::string, AsSizeValue::result_t> AsSizeValue::get_mapping(bool kb_is_1000) {
+    if(kb_is_1000) {
+        static auto m = init_mapping(true);
+        return m;
+    }
+    static auto m = init_mapping(false);
+    return m;
+}
+
+namespace detail {
+
+CLI11_INLINE std::pair<std::string, std::string> split_program_name(std::string commandline) {
+    // try to determine the programName
+    std::pair<std::string, std::string> vals;
+    trim(commandline);
+    auto esp = commandline.find_first_of(' ', 1);
+    while(detail::check_path(commandline.substr(0, esp).c_str()) != path_type::file) {
+        esp = commandline.find_first_of(' ', esp + 1);
+        if(esp == std::string::npos) {
+            // if we have reached the end and haven't found a valid file just assume the first argument is the
+            // program name
+            if(commandline[0] == '"' || commandline[0] == '\'' || commandline[0] == '`') {
+                bool embeddedQuote = false;
+                auto keyChar = commandline[0];
+                auto end = commandline.find_first_of(keyChar, 1);
+                while((end != std::string::npos) && (commandline[end - 1] == '\\')) {  // deal with escaped quotes
+                    end = commandline.find_first_of(keyChar, end + 1);
+                    embeddedQuote = true;
+                }
+                if(end != std::string::npos) {
+                    vals.first = commandline.substr(1, end - 1);
+                    esp = end + 1;
+                    if(embeddedQuote) {
+                        vals.first = find_and_replace(vals.first, std::string("\\") + keyChar, std::string(1, keyChar));
+                    }
+                } else {
+                    esp = commandline.find_first_of(' ', 1);
+                }
+            } else {
+                esp = commandline.find_first_of(' ', 1);
+            }
+
+            break;
+        }
+    }
+    if(vals.first.empty()) {
+        vals.first = commandline.substr(0, esp);
+        rtrim(vals.first);
+    }
+
+    // strip the program name
+    vals.second = (esp < commandline.length() - 1) ? commandline.substr(esp + 1) : std::string{};
+    ltrim(vals.second);
+    return vals;
+}
+
+}  // namespace detail
+/// @}
+
+
+
+
+class Option;
+class App;
+
+/// This enum signifies the type of help requested
+///
+/// This is passed in by App; all user classes must accept this as
+/// the second argument.
+
+enum class AppFormatMode {
+    Normal,  ///< The normal, detailed help
+    All,     ///< A fully expanded help
+    Sub,     ///< Used when printed as part of expanded subcommand
+};
+
+/// This is the minimum requirements to run a formatter.
+///
+/// A user can subclass this is if they do not care at all
+/// about the structure in CLI::Formatter.
+class FormatterBase {
+  protected:
+    /// @name Options
+    ///@{
+
+    /// The width of the left column (options/flags/subcommands)
+    std::size_t column_width_{30};
+
+    /// The width of the right column (description of options/flags/subcommands)
+    std::size_t right_column_width_{65};
+
+    /// The width of the description paragraph at the top of help
+    std::size_t description_paragraph_width_{80};
+
+    /// The width of the footer paragraph
+    std::size_t footer_paragraph_width_{80};
+
+    /// @brief The required help printout labels (user changeable)
+    /// Values are Needs, Excludes, etc.
+    std::map<std::string, std::string> labels_{};
+
+    ///@}
+    /// @name Basic
+    ///@{
+
+  public:
+    FormatterBase() = default;
+    FormatterBase(const FormatterBase &) = default;
+    FormatterBase(FormatterBase &&) = default;
+    FormatterBase &operator=(const FormatterBase &) = default;
+    FormatterBase &operator=(FormatterBase &&) = default;
+
+    /// Adding a destructor in this form to work around bug in GCC 4.7
+    virtual ~FormatterBase() noexcept {}  // NOLINT(modernize-use-equals-default)
+
+    /// This is the key method that puts together help
+    virtual std::string make_help(const App *, std::string, AppFormatMode) const = 0;
+
+    ///@}
+    /// @name Setters
+    ///@{
+
+    /// Set the "REQUIRED" label
+    void label(std::string key, std::string val) { labels_[key] = val; }
+
+    /// Set the left column width (options/flags/subcommands)
+    void column_width(std::size_t val) { column_width_ = val; }
+
+    /// Set the right column width (description of options/flags/subcommands)
+    void right_column_width(std::size_t val) { right_column_width_ = val; }
+
+    /// Set the description paragraph width at the top of help
+    void description_paragraph_width(std::size_t val) { description_paragraph_width_ = val; }
+
+    /// Set the footer paragraph width
+    void footer_paragraph_width(std::size_t val) { footer_paragraph_width_ = val; }
+
+    ///@}
+    /// @name Getters
+    ///@{
+
+    /// Get the current value of a name (REQUIRED, etc.)
+    CLI11_NODISCARD std::string get_label(std::string key) const {
+        if(labels_.find(key) == labels_.end())
+            return key;
+        return labels_.at(key);
+    }
+
+    /// Get the current left column width (options/flags/subcommands)
+    CLI11_NODISCARD std::size_t get_column_width() const { return column_width_; }
+
+    /// Get the current right column width (description of options/flags/subcommands)
+    CLI11_NODISCARD std::size_t get_right_column_width() const { return right_column_width_; }
+
+    /// Get the current description paragraph width at the top of help
+    CLI11_NODISCARD std::size_t get_description_paragraph_width() const { return description_paragraph_width_; }
+
+    /// Get the current footer paragraph width
+    CLI11_NODISCARD std::size_t get_footer_paragraph_width() const { return footer_paragraph_width_; }
+
+    ///@}
+};
+
+/// This is a specialty override for lambda functions
+class FormatterLambda final : public FormatterBase {
+    using funct_t = std::function<std::string(const App *, std::string, AppFormatMode)>;
+
+    /// The lambda to hold and run
+    funct_t lambda_;
+
+  public:
+    /// Create a FormatterLambda with a lambda function
+    explicit FormatterLambda(funct_t funct) : lambda_(std::move(funct)) {}
+
+    /// Adding a destructor (mostly to make GCC 4.7 happy)
+    ~FormatterLambda() noexcept override {}  // NOLINT(modernize-use-equals-default)
+
+    /// This will simply call the lambda function
+    std::string make_help(const App *app, std::string name, AppFormatMode mode) const override {
+        return lambda_(app, name, mode);
+    }
+};
+
+/// This is the default Formatter for CLI11. It pretty prints help output, and is broken into quite a few
+/// overridable methods, to be highly customizable with minimal effort.
+class Formatter : public FormatterBase {
+  public:
+    Formatter() = default;
+    Formatter(const Formatter &) = default;
+    Formatter(Formatter &&) = default;
+    Formatter &operator=(const Formatter &) = default;
+    Formatter &operator=(Formatter &&) = default;
+
+    /// @name Overridables
+    ///@{
+
+    /// This prints out a group of options with title
+    ///
+    CLI11_NODISCARD virtual std::string
+    make_group(std::string group, bool is_positional, std::vector<const Option *> opts) const;
+
+    /// This prints out just the positionals "group"
+    virtual std::string make_positionals(const App *app) const;
+
+    /// This prints out all the groups of options
+    std::string make_groups(const App *app, AppFormatMode mode) const;
+
+    /// This prints out all the subcommands
+    virtual std::string make_subcommands(const App *app, AppFormatMode mode) const;
+
+    /// This prints out a subcommand
+    virtual std::string make_subcommand(const App *sub) const;
+
+    /// This prints out a subcommand in help-all
+    virtual std::string make_expanded(const App *sub, AppFormatMode mode) const;
+
+    /// This prints out all the groups of options
+    virtual std::string make_footer(const App *app) const;
+
+    /// This displays the description line
+    virtual std::string make_description(const App *app) const;
+
+    /// This displays the usage line
+    virtual std::string make_usage(const App *app, std::string name) const;
+
+    /// This puts everything together
+    std::string make_help(const App *app, std::string, AppFormatMode mode) const override;
+
+    ///@}
+    /// @name Options
+    ///@{
+
+    /// This prints out an option help line, either positional or optional form
+    virtual std::string make_option(const Option *, bool) const;
+
+    /// @brief This is the name part of an option, Default: left column
+    virtual std::string make_option_name(const Option *, bool) const;
+
+    /// @brief This is the options part of the name, Default: combined into left column
+    virtual std::string make_option_opts(const Option *) const;
+
+    /// @brief This is the description. Default: Right column, on new line if left column too large
+    virtual std::string make_option_desc(const Option *) const;
+
+    /// @brief This is used to print the name on the USAGE line
+    virtual std::string make_option_usage(const Option *opt) const;
+
+    ///@}
+};
+
+
+
+
+using results_t = std::vector<std::string>;
+/// callback function definition
+using callback_t = std::function<bool(const results_t &)>;
+
+class Option;
+class App;
+
+using Option_p = std::unique_ptr<Option>;

Enum documentation

-
+

- enum class tf::TaskPriority: unsigned
+ enum class tf::TaskType: int

- enumeration of all task priority values
-
- A priority is an enumerated value of type unsigned. Currently, Taskflow defines three priority levels, HIGH, NORMAL, and LOW, starting from 0, 1, to 2. That is, the lower the value, the higher the priority.
+
+ enumeration of all task types

Enumerators
- HIGH           value of the highest priority (i.e., 0)
+ PLACEHOLDER    placeholder task type
- NORMAL         value of the normal priority (i.e., 1)
+ STATIC         static task type
- LOW            value of the lowest priority (i.e., 2)
+ RUNTIME        runtime task type
- MAX            conventional value for iterating priority values
+ SUBFLOW

-
-
-
-

- enum class tf::TaskType: int
-
- enumeration of all task types

@@ -835,9 +820,17 @@

Enumerators
- PLACEHOLDER    placeholder task type
- STATIC         static task type
- SUBFLOW        dynamic (subflow) task type
+ SUBFLOW        dynamic (subflow) task type

+
+

+ enum class tf::ObserverType: int
+
+ enumeration of all observer types

enum class tf::PartitionerType: int

enumeration of all partitioner types

@@ -861,6 +854,7 @@

enum class tf::PipeType: int

enumeration of all pipe types

@@ -881,171 +875,73 @@

-
-

- enum class tf::cudaTaskType: int
-
- enumeration of all cudaTask types
-
- Enumerators
- EMPTY        empty task type
- HOST         host task type
- MEMSET       memory set task type
- MEMCPY       memory copy task type
- KERNEL       memory copy task type
- SUBFLOW      subflow (child graph) task type
- CAPTURE      capture task type
- UNDEFINED    undefined task type

-
-

Typedef documentation

-
+
+

+ using tf::observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>
+
+ default time point type of observers

using tf::DefaultPartitioner = GuidedPartitioner<>

default partitioner set to tf::GuidedPartitioner

- Guided partitioner can achieve decent performance for most parallel algorithms, especially for those with irregular and unbalanced workload per iteration.
+ Guided partitioning algorithm can achieve stable and decent performance for most parallel algorithms.
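A short sketch of selecting a partitioner explicitly (assuming the partitioner-accepting overload of tf::Taskflow::for_each_index; when the last argument is omitted, tf::DefaultPartitioner is used):

tf::Executor executor;
tf::Taskflow taskflow;
std::vector<int> data(1000);
// run the loop with guided partitioning; drop the last argument to
// fall back to tf::DefaultPartitioner
taskflow.for_each_index(0, 1000, 1,
  [&](int i) { data[i] = i; },
  tf::GuidedPartitioner<>()
);
executor.run(taskflow).wait();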

-
-
-

Function documentation

-
+

- const char* tf::to_string(TaskType type)
-
- convert a task type to a human-readable string
-
- The name of each task type is the litte-case string of its characters.
-
- TaskType::PLACEHOLDER     ->  "placeholder"
- TaskType::STATIC          ->  "static"
- TaskType::SUBFLOW         ->  "subflow"
- TaskType::CONDITION       ->  "condition"
- TaskType::MODULE          ->  "module"
- TaskType::ASYNC           ->  "async"
+ using tf::cudaEvent = cudaEventBase<cudaEventCreator, cudaEventDeleter>
+
+ default smart pointer type to manage a cudaEvent_t object with unique ownership

- template<typename Input, typename Output, typename C>
- auto tf::make_data_pipe(PipeType d, C&& callable)
-
- function to construct a data pipe (tf::DataPipe)
-
- Template parameters
- Input     input data type
- Output    output data type
- C         callable type
-
- tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe) in a data-parallel pipeline (tf::DataPipeline). The first argument specifies the direction of the data pipe, either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, and the second argument is a callable to invoke by the pipeline scheduler. Input and output data types are specified via template parameters, which will always be decayed by the library to its original form for storage purpose. The callable must take the input data type in its first argument and returns a value of the output data type.
-
- tf::make_data_pipe<int, std::string>(
-   tf::PipeType::SERIAL, 
-   [](int& input) {
-     return std::to_string(input + 100);
-   }
- );
-
- The callable can additionally take a reference of tf::Pipeflow, which allows you to query the runtime information of a stage task, such as its line number and token number.
-
- tf::make_data_pipe<int, std::string>(
-   tf::PipeType::SERIAL, 
-   [](int& input, tf::Pipeflow& pf) {
-     printf("token=%lu, line=%lu\n", pf.token(), pf.line());
-     return std::to_string(input + 100);
-   }
- );
+ using tf::cudaStream = cudaStreamBase<cudaStreamCreator, cudaStreamDeleter>
+
+ default smart pointer type to manage a cudaStream_t object with unique ownership

- template<typename T>
- T* tf::cuda_malloc_device(size_t N, int d)
-
- allocates memory on the given device for holding N elements of type T
-
- The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory.
+ using tf::cudaGraph = cudaGraphBase<cudaGraphCreator, cudaGraphDeleter>
+
+ default smart pointer type to manage a cudaGraph_t object with unique ownership

- template<typename T>
- T* tf::cuda_malloc_device(size_t N)
-
- allocates memory on the current device associated with the caller
-
- The function calls malloc_device from the current device associated with the caller.
+ using tf::cudaGraphExec = cudaGraphExecBase<cudaGraphExecCreator, cudaGraphExecDeleter>
+
+ default smart pointer type to manage a cudaGraphExec_t object with unique ownership

-
+
+
+

Function documentation

+

- template<typename T>
- T* tf::cuda_malloc_shared(size_t N)
-
- allocates shared memory for holding N elements of type T
-
- The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory.
+ template<typename T, std::enable_if_t<(std::is_unsigned_v<std::decay_t<T>> && sizeof(T)==8), void>* = nullptr>
+ T tf::next_pow2(T x) constexpr
+
+ rounds the given 64-bit unsigned integer to the nearest power of 2
+
+ rounds the given 32-bit unsigned integer to the nearest power of 2

- template<typename T>
- void tf::cuda_free(T* ptr, int d)
-
- frees memory on the GPU device
+ template<typename T, std::enable_if_t<std::is_integral_v<std::decay_t<T>>, void>* = nullptr>
+ bool tf::is_pow2(const T& x) constexpr
+
+ checks if the given number is a power of 2

@@ -1053,7 +949,7 @@

Template parameters
- T      pointer type
+ T      The type of the input. Must be an integral type.

@@ -1061,25 +957,27 @@

Parameters
- p      device pointer to memory to free
- d      device context identifier
+ x      The integer to check.
+ Returns      true if x is a power of 2, otherwise false.
-
- This methods call cudaFree to free the memory space pointed to by ptr using the given device context.
+
+ This function determines if the given integer is a power of 2.
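Since both helpers are constexpr, their documented behavior can be sanity-checked at compile time. A small sketch assuming the declarations above and the usual round-up semantics (100 maps to 128):

static_assert(tf::is_pow2(64), "64 is a power of 2");
static_assert(!tf::is_pow2(100), "100 is not a power of 2");
static_assert(tf::next_pow2(uint64_t{100}) == 128, "100 rounds to 128");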

-
+

template<typename T>
- void tf::cuda_free(T* ptr)
+ size_t tf::floor_log2(T n) constexpr
-
- frees memory on the GPU device
+
+ computes the floor of the base-2 logarithm of a number using count-leading-zeros (CLZ)

@@ -1087,7 +985,7 @@

Template parameters
- T      pointer type
+ T      integer type (uint32_t or uint64_t).

@@ -1095,100 +993,96 @@

Parameters
- ptr      device pointer to memory to free
+ n        input number.
+ Returns      floor of log2(n)
-
- This methods call cudaFree to free the memory space pointed to by ptr using the current device context of the caller.
+
+ This function efficiently calculates the floor of log2(n) for both 32-bit and 64-bit integers.
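A compile-time sketch of the documented behavior, assuming the constexpr signature above:

static_assert(tf::floor_log2(uint32_t{1}) == 0);    // 2^0 = 1
static_assert(tf::floor_log2(uint32_t{10}) == 3);   // 2^3 = 8 <= 10 < 16 = 2^4
static_assert(tf::floor_log2(uint64_t{1} << 40) == 40);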

-
+

- void tf::cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)
+ template<typename RandItr, typename C>
+ RandItr tf::median_of_three(RandItr l, RandItr m, RandItr r, C cmp)
-
- copies data between host and device asynchronously through a stream
+
+ finds the median of three numbers pointed to by iterators using the given comparator

- Parameters
- stream      stream identifier
- dst         destination memory address
- src         source memory address
- count       size in bytes to copy
+ Template parameters
+ RandItr     The type of the random-access iterator.
+ C           The type of the comparator.
-
- The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap.
-
- void tf::cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)
-
- initializes or sets GPU memory to the given value byte by byte
-
- Parameters
- stream      stream identifier
- devPtr      pointer to GPU memory
- value       value to set for each byte of the specified memory
- count       size in bytes to set
+ Parameters
+ l           Iterator to the first element.
+ m           Iterator to the second element.
+ r           Iterator to the third element.
+ cmp         The comparator used to compare the dereferenced iterator values.
+ Returns     The iterator pointing to the median value among the three elements.
-
- The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value.
+
+ This function determines the median value of the elements pointed to by three random-access iterators using the provided comparator.
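A sketch of the comparator-based selection such a median routine typically performs (an illustrative stand-in, not necessarily Taskflow's exact code):

template <typename RandItr, typename C>
RandItr median_of_three_sketch(RandItr l, RandItr m, RandItr r, C cmp) {
  // nested comparisons return the iterator whose value lies between the other two
  return cmp(*l, *m) ? (cmp(*m, *r) ? m : (cmp(*l, *r) ? r : l))
                     : (cmp(*l, *r) ? l : (cmp(*m, *r) ? r : m));
}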

-
+

- template<typename P, typename C>
- void tf::cuda_single_task(P&& p, C c)
+ template<typename RandItr, typename C>
+ RandItr tf::pseudo_median_of_nine(RandItr beg, RandItr end, C cmp)
-
- runs a callable asynchronously using one kernel thread
+
+ finds the pseudo median of a range of items using a spread of nine numbers

Template parameters
- P          execution policy type
- C          closure type
+ RandItr    The type of the random-access iterator.
+ C          The type of the comparator.
@@ -1196,44 +1090,49 @@
Parameters
- p          execution policy
- c          closure to run by one kernel thread
+ beg        Iterator to the beginning of the range.
+ end        Iterator to the end of the range.
+ cmp        The comparator used to compare the dereferenced iterator values.
+ Returns    The iterator pointing to the pseudo median of the range.
-
- The function launches a single kernel thread to run the given callable through the stream in the execution policy object.
+
+ This function computes an approximate median of a range of items by sampling nine values spread across the range and finding their median. It uses a combination of the median_of_three function to determine the pseudo median.

-
+

- template<typename P, typename I, typename C>
- void tf::cuda_for_each(P&& p, I first, I last, C c)
+ template<typename Iter, typename Compare>
+ void tf::sort2(Iter a, Iter b, Compare comp)
-
- performs asynchronous parallel iterations over a range of items
+
+ sorts two elements of dereferenced iterators using the given comparison function

Template parameters
- P          execution policy type
- I          input iterator type
+ Iter       The type of the iterator.
- C          unary operator type
+ Compare    The type of the comparator.
@@ -1241,55 +1140,44 @@
Parameters
- p          execution policy object
- first      iterator to the beginning of the range
+ a          Iterator to the first element.
- last       iterator to the end of the range
+ b          Iterator to the second element.
- c          unary operator to apply to each dereferenced iterator
+ comp       The comparator used to compare the dereferenced iterator values.
-
- This function is equivalent to a parallel execution of the following loop on a GPU:
-
- for(auto itr = first; itr != last; itr++) {
-   c(*itr);
- }
+
+ This function compares two elements pointed to by iterators and swaps them if they are out of order according to the provided comparator.
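An equivalent one-swap sketch (needs <algorithm> for std::iter_swap; illustrative, not necessarily the exact implementation):

template <typename Iter, typename Compare>
void sort2_sketch(Iter a, Iter b, Compare comp) {
  if(comp(*b, *a)) {
    std::iter_swap(a, b);  // place the smaller element first
  }
}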

-
+

- template<typename P, typename I, typename C>
- void tf::cuda_for_each_index(P&& p, I first, I last, I inc, C c)
+ template<typename Iter, typename Compare>
+ void tf::sort3(Iter a, Iter b, Iter c, Compare comp)
-
- performs asynchronous parallel iterations over an index-based range of items
+
+ Sorts three elements of dereferenced iterators using the given comparison function.

Template parameters
- P          execution policy type
- I          input index type
+ Iter       The type of the iterator.
- C          unary operator type
+ Compare    The type of the comparator.
@@ -1297,69 +1185,69 @@
Parameters
- p          execution policy object
- first      index to the beginning of the range
+ a          Iterator to the first element.
- last       index to the end of the range
+ b          Iterator to the second element.
- inc        step size between successive iterations
+ c          Iterator to the third element.
- c          unary operator to apply to each index
+ comp       The comparator used to compare the dereferenced iterator values.
-
- This function is equivalent to a parallel execution of the following loop on a GPU:
-
- // step is positive [first, last)
- for(auto i=first; i<last; i+=step) {
-   c(i);
- }
-
- // step is negative [first, last)
- for(auto i=first; i>last; i+=step) {
-   c(i);
- }
+
+ This function sorts three elements pointed to by iterators in ascending order according to the provided comparator. The sorting is performed using a sequence of calls to the sort2 function to ensure the correct order of elements.

-
+

- template<typename P, typename I, typename O, typename C>
- void tf::cuda_transform(P&& p, I first, I last, O output, C op)
+ template<typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr>
+ T tf::unique_id()
-
- performs asynchronous parallel transforms over a range of items
+
+ generates a program-wide unique ID of the given type in a thread-safe manner

Template parameters
- P          execution policy type
- I          input iterator type
+ T          The type of the ID to generate. Must be an integral type.
- O          output iterator type
+ Returns    A unique ID of type T.
+
+ This function provides a globally unique identifier of the specified integral type. It uses a static std::atomic counter to ensure thread safety and increments the counter in a relaxed memory ordering for efficiency.
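The description above pins the implementation down almost completely; a sketch under exactly those assumptions:

#include <atomic>

template <typename T>
T unique_id_sketch() {
  static std::atomic<T> counter{0};                        // one counter per type T
  return counter.fetch_add(1, std::memory_order_relaxed);  // relaxed ordering suffices for uniqueness
}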

+
+
+

+
+ template<typename T>
+ void tf::atomic_max(std::atomic<T>& v, const T& max_v) noexcept
+
+ updates an atomic variable with the maximum value

Template parameters
- C          unary operator type
+ T          The type of the atomic variable. Must be trivially copyable and comparable.
@@ -1367,68 +1255,34 @@
Parameters
- p          execution policy
- first      iterator to the beginning of the range
- last       iterator to the end of the range
- output     iterator to the beginning of the output range
+ v          The atomic variable to update.
- op         unary operator to apply to transform each item
+ max_v      The value to compare with the current value of v.
-
- This method is equivalent to the parallel execution of the following loop on a GPU:
-
- while (first != last) {
-   *output++ = op(*first++);
- }
+
+ This function atomically updates the provided atomic variable v to hold the maximum of its current value and max_v. The update is performed using a relaxed memory ordering for efficiency in non-synchronizing contexts.
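A sketch of the usual compare-exchange loop behind such an update (tf::atomic_min below is symmetric with the comparison reversed):

#include <atomic>

template <typename T>
void atomic_max_sketch(std::atomic<T>& v, const T& max_v) noexcept {
  T cur = v.load(std::memory_order_relaxed);
  // stop once v already holds a value >= max_v or the exchange succeeds;
  // compare_exchange_weak reloads cur on failure, so the loop re-checks
  while(cur < max_v &&
        !v.compare_exchange_weak(cur, max_v, std::memory_order_relaxed)) {
  }
}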

-
+

- template<typename P, typename I1, typename I2, typename O, typename C>
- void tf::cuda_transform(P&& p, I1 first1, I1 last1, I2 first2, O output, C op)
-
- performs asynchronous parallel transforms over two ranges of items
+ template<typename T>
+ void tf::atomic_min(std::atomic<T>& v, const T& min_v) noexcept
+
+ updates an atomic variable with the minimum value

Template parameters
- P          execution policy type
- I1         first input iterator type
- I2         second input iterator type
- O          output iterator type
- C          binary operator type
+ T          The type of the atomic variable. Must be trivially copyable and comparable.
@@ -1436,137 +1290,60 @@
Parameters
- p          execution policy
- first1     iterator to the beginning of the first range
- last1      iterator to the end of the first range
- first2     iterator to the beginning of the second range
- output     iterator to the beginning of the output range
+ v          The atomic variable to update.
- op         binary operator to apply to transform each pair of items
+ min_v      The value to compare with the current value of v.
-
- This method is equivalent to the parallel execution of the following loop on a GPU:
-
- while (first1 != last1) {
-   *output++ = op(*first1++, *first2++);
- }
+
+ This function atomically updates the provided atomic variable v to hold the minimum of its current value and min_v. The update is performed using a relaxed memory ordering for efficiency in non-synchronizing contexts.

-
+

- template<typename P, typename I, typename T, typename O>
- void tf::cuda_reduce(P&& p, I first, I last, T* res, O op, void* buf)
-
- performs asynchronous parallel reduction over a range of items
+ template<typename T>
+ T tf::seed() noexcept
+
+ generates a random seed based on the current system clock

Template parameters
- P          execution policy type
- I          input iterator type
- T          value type
- O          binary operator type
+ T          The type of the returned seed. Must be an integral type.
Parameters
- p          execution policy
- first      iterator to the beginning of the range
- last       iterator to the end of the range
- res        pointer to the result
- op         binary operator to apply to reduce elements
- buf        pointer to the temporary buffer
+ Returns    A seed value based on the system clock.
-
- This method is equivalent to the parallel execution of the following loop on a GPU:
-
- while (first != last) {
-   *result = op(*result, *first++);
- }
+
+ This function returns a seed value derived from the number of clock ticks since the epoch as measured by the system clock. The seed can be used to initialize random number generators.
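A sketch that matches the description (clock ticks since the epoch of the system clock):

#include <chrono>

template <typename T>
T seed_sketch() noexcept {
  // tick count since the system clock's epoch, truncated to T
  return static_cast<T>(
    std::chrono::system_clock::now().time_since_epoch().count());
}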

-
+

- template<typename P, typename I, typename T, typename O>
- void tf::cuda_uninitialized_reduce(P&& p, I first, I last, T* res, O op, void* buf)
-
- performs asynchronous parallel reduction over a range of items without an initial value
+ template<typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
+ auto tf::ctz(T x)
+
+ counts the number of trailing zeros in an integer.

Template parameters
- P          execution policy type
- I          input iterator type
- T          value type
- O          binary operator type
+ T          integer type (32-bit or 64-bit).
@@ -1574,206 +1351,132 @@
Parameters
- p          execution policy
- first      iterator to the beginning of the range
- last       iterator to the end of the range
- res        pointer to the result
- op         binary operator to apply to reduce elements
+ x          non-zero integer to count trailing zeros from
- buf        pointer to the temporary buffer
+ Returns    the number of trailing zeros in x
-
- This method is equivalent to the parallel execution of the following loop on a GPU:
-
- *result = *first++;  // no initial values participate in the loop
- while (first != last) {
-   *result = op(*result, *first++);
- }
+
+ This function provides a portable implementation for counting the number of trailing zeros across different platforms and integer sizes (32-bit and 64-bit).
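A 32-bit sketch of such a portable wrapper, using the standard compiler intrinsics (_BitScanForward from <intrin.h> on MSVC, __builtin_ctz on GCC/Clang; both require a non-zero argument):

inline unsigned ctz32_sketch(uint32_t x) {
#if defined(_MSC_VER)
  unsigned long index;
  _BitScanForward(&index, x);                      // x must be non-zero
  return static_cast<unsigned>(index);
#else
  return static_cast<unsigned>(__builtin_ctz(x));  // undefined for x == 0
#endif
}

For example, ctz32_sketch(0b101000) yields 3.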

-
+

-
- template<typename P, typename I, typename T, typename O, typename U>
- void tf::cuda_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)
-
- performs asynchronous parallel reduction over a range of transformed items without an initial value
+ size_t tf::coprime(size_t N) constexpr
+
+ computes a coprime of a given number

- Template parameters
- P          execution policy type
- I          input iterator type
- T          value type
- O          binary operator type
- U          unary operator type
Parameters
- p          execution policy
- first      iterator to the beginning of the range
- last       iterator to the end of the range
- res        pointer to the result
- bop        binary operator to apply to reduce elements
- uop        unary operator to apply to transform elements
+ N          input number for which a coprime is to be found.
- buf        pointer to the temporary buffer
+ Returns    the largest number < N that is coprime to N
-
- This method is equivalent to the parallel execution of the following loop on a GPU:
-
- while (first != last) {
-   *result = bop(*result, uop(*first++));
- }
+
+ This function finds the largest number less than N that is coprime (i.e., has a greatest common divisor of 1) with N. If N is less than 3, it returns 1 as a default coprime.
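A sketch that follows the documented contract directly (std::gcd from <numeric> is constexpr since C++17). Note that consecutive integers are always coprime, so for N >= 3 the loop returns N - 1 on its first iteration:

#include <numeric>

constexpr size_t coprime_sketch(size_t N) {
  if(N < 3) {
    return 1;  // documented default for small N
  }
  for(size_t x = N - 1; x > 1; --x) {
    if(std::gcd(x, N) == 1) {
      return x;  // largest value below N with gcd(x, N) == 1
    }
  }
  return 1;
}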

-
+

- template<typename P, typename I, typename T, typename O, typename U>
- void tf::cuda_uninitialized_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)
-
- performs asynchronous parallel reduction over a range of transformed items with an initial value
+ template<size_t N>
+ std::array<size_t, N> tf::make_coprime_lut() constexpr
+
+ generates a compile-time array of coprimes for numbers from 0 to N-1

Template parameters
- P          execution policy type
- I          input iterator type
- T          value type
- O          binary operator type
+ N          the size of the array to generate (should be greater than 0).
- U          unary operator type
+ Returns    a constexpr array of size N where each index holds a coprime of its value.
+
+ This function constructs a constexpr array where each element at index i contains a coprime of i (the largest number less than i that is coprime to it).
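Given the coprime sketch above, the table itself is a straightforward constexpr loop (C++17 or later, where the std::array accessors are constexpr):

#include <array>

template <size_t N>
constexpr std::array<size_t, N> make_coprime_lut_sketch() {
  std::array<size_t, N> lut{};
  for(size_t i = 0; i < N; ++i) {
    lut[i] = coprime_sketch(i);  // each slot holds a coprime of its index
  }
  return lut;
}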

+
+
+

+ std::string tf::get_env(const std::string& str)
+
+ retrieves the value of an environment variable

Parameters
- p          execution policy
- first      iterator to the beginning of the range
- last       iterator to the end of the range
- res        pointer to the result
- bop        binary operator to apply to reduce elements
+ str        The name of the environment variable to retrieve.
+ Returns    The value of the environment variable as a string, or an empty string if not found.
+
+ This function fetches the value of an environment variable by name. If the variable is not found, it returns an empty string.
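A minimal sketch using std::getenv, the portable C API (an implementation may prefer _dupenv_s on Windows to avoid MSVC deprecation warnings):

#include <cstdlib>
#include <string>

std::string get_env_sketch(const std::string& str) {
  const char* ptr = std::getenv(str.c_str());       // nullptr when undefined
  return ptr ? std::string(ptr) : std::string{};
}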

+
+
+

+ bool tf::has_env(const std::string& str)
+
+ checks whether an environment variable is defined

Parameters
- uop        unary operator to apply to transform elements
+ str        The name of the environment variable to check.
- buf        pointer to the temporary buffer
+ Returns    true if the environment variable exists, false otherwise.
-
- This method is equivalent to the parallel execution of the following loop on a GPU:
-
- *result = uop(*first++);  // no initial values participate in the loop
- while (first != last) {
-   *result = bop(*result, uop(*first++));
- }
+
+ This function determines if a specific environment variable exists in the current environment.

+
+
+

+ void tf::pause()

+

This function is used in spin-wait loops to hint the CPU that the current thread is in a busy-wait state. It helps reduce power consumption and improves performance on hyper-threaded processors by preventing the CPU from consuming unnecessary cycles while waiting. It is particularly useful in low-contention scenarios, where the thread is likely to quickly acquire the lock or condition it's waiting for, avoiding an expensive context switch. On modern x86 processors, this instruction can be invoked using __builtin_ia32_pause() in GCC/Clang or _mm_pause() in MSVC. In non-x86 architectures, alternative mechanisms such as yielding the CPU may be used instead.
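A sketch assembled from the intrinsics the paragraph names (_mm_pause lives in <intrin.h> on MSVC):

#include <thread>

inline void pause_sketch() {
#if defined(_MSC_VER)
  _mm_pause();                // x86 PAUSE via the MSVC intrinsic
#elif defined(__i386__) || defined(__x86_64__)
  __builtin_ia32_pause();     // x86 PAUSE via the GCC/Clang builtin
#else
  std::this_thread::yield();  // non-x86 fallback: give up the CPU instead
#endif
}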

-
+

- template<typename P, typename I, typename O, typename C>
- void tf::cuda_inclusive_scan(P&& p, I first, I last, O output, C op, void* buf)
+ template<typename P>
+ void tf::spin_until(P&& predicate)
-
- performs asynchronous inclusive scan over a range of items
+
+ spins until the given predicate becomes true

@@ -1781,19 +1484,7 @@

Template parameters
- P          execution policy type
- I          input iterator
- O          output iterator
- C          binary operator type
+ P          the type of the predicate function or callable.

@@ -1801,70 +1492,39 @@

Parameters
- p          execution policy
- first      iterator to the beginning of the input range
- last       iterator to the end of the input range
- output     iterator to the beginning of the output range
- op         binary operator to apply to scan
- buf        pointer to the temporary buffer
+ predicate  the callable that returns a boolean value, which is checked in the loop.
+

This function repeatedly checks the provided predicate in a spin-wait loop and uses a backoff strategy to minimize CPU waste during the wait. Initially, it uses the pause() instruction for the first 100 iterations to hint to the CPU that the thread is waiting, thus reducing power consumption and avoiding unnecessary cycles. After 100 iterations, it switches to yielding the CPU using std::this_thread::yield() to allow other threads to run and improve system responsiveness.

The function operates as follows (see the sketch after this list):

  1. For the first 100 iterations, it invokes pause() to reduce power consumption during the spin-wait.
  2. After 100 iterations, it uses std::this_thread::yield() to relinquish the CPU, allowing other threads to execute.
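A sketch of this two-phase strategy, assuming a pause() helper like the one above:

template <typename P>
void spin_until_sketch(P&& predicate) {
  for(size_t i = 0; !predicate(); ++i) {
    if(i < 100) {
      pause_sketch();              // phase 1: cheap busy-wait hint
    } else {
      std::this_thread::yield();   // phase 2: let other threads run
    }
  }
}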
-
+

- template<typename P, typename I, typename O, typename C, typename U>
- void tf::cuda_transform_inclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)
-
- performs asynchronous inclusive scan over a range of transformed items
+ template<typename B, typename E, typename S>
+ std::enable_if_t<std::is_integral_v<std::decay_t<B>> && std::is_integral_v<std::decay_t<E>> && std::is_integral_v<std::decay_t<S>>, bool> tf::is_index_range_invalid(B beg, E end, S step) constexpr
+
+ checks if the given index range is invalid

Template parameters
- P          execution policy type
- I          input iterator
- O          output iterator
+ B          type of the beginning index
- C          binary operator type
+ E          type of the ending index
- U          unary operator type
+ S          type of the step size
@@ -1872,69 +1532,53 @@
Parameters
- p          execution policy
- first      iterator to the beginning of the input range
+ beg        starting index of the range
- last       iterator to the end of the input range
+ end        ending index of the range
- output     iterator to the beginning of the output range
- bop        binary operator to apply to scan
- uop        unary operator to apply to transform each item before scan
+ step       step size to traverse the range
- buf        pointer to the temporary buffer
+ Returns    returns true if the range is invalid; false otherwise.
+

A range is considered invalid under any of the following conditions (see the compile-time checks after this list):

  • The step is zero and the begin and end values are not equal.
  • A positive range (begin < end) with a non-positive step.
  • A negative range (begin > end) with a non-negative step.
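These rules are constexpr-checkable; a few compile-time examples under the signature above:

static_assert(!tf::is_index_range_invalid(0, 10, 2));   // valid forward range
static_assert(tf::is_index_range_invalid(0, 10, -1));   // positive range, negative step
static_assert(tf::is_index_range_invalid(10, 0, 1));    // negative range, positive step
static_assert(tf::is_index_range_invalid(0, 1, 0));     // zero step over a non-empty range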
-
+

- template<typename P, typename I, typename O, typename C>
- void tf::cuda_exclusive_scan(P&& p, I first, I last, O output, C op, void* buf)
-
- performs asynchronous exclusive scan over a range of items
+ template<typename B, typename E, typename S>
+ std::enable_if_t<std::is_integral_v<std::decay_t<B>> && std::is_integral_v<std::decay_t<E>> && std::is_integral_v<std::decay_t<S>>, size_t> tf::distance(B beg, E end, S step) constexpr
+
+ calculates the number of iterations in the given index range

Template parameters
- P          execution policy type
- I          input iterator
+ B          type of the beginning index
- O          output iterator
+ E          type of the ending index
- C          binary operator type
+ S          type of the step size
@@ -1942,70 +1586,51 @@
Parameters
- p          execution policy
- first      iterator to the beginning of the input range
+ beg        starting index of the range
- last       iterator to the end of the input range
+ end        ending index of the range
- output     iterator to the beginning of the output range
+ step       step size to traverse the range
- op         binary operator to apply to scan
- buf        pointer to the temporary buffer
+ Returns    returns the number of required iterations to traverse the range
+

The distance of a range represents the number of required iterations to traverse the range from the beginning index to the ending index (exclusive) with the given step size.

Example 1:

// Range: 0 to 10 with step size 2
+size_t dist = distance(0, 10, 2);  // Returns 5, the sequence is [0, 2, 4, 6, 8]

Example 2:

// Range: 10 to 0 with step size -2
+size_t dist = distance(10, 0, -2);  // Returns 5, the sequence is [10, 8, 6, 4, 2]

Example 3:

// Range: 5 to 20 with step size 5
+size_t dist = distance(5, 20, 5);  // Returns 3, the sequence is [5, 10, 15]
-
+

+
- template<typename P, typename I, typename O, typename C, typename U> + template<typename T, typename... ArgsT>
- void tf::cuda_transform_exclusive_scan(P&& p, - I first, - I last, - O output, - C bop, - U uop, - void* buf) -

-

performs asynchronous exclusive scan over a range of items

+ std::unique_ptr<T> tf::make_worker_interface(ArgsT && ... args) + +

helper function to create an instance derived from tf::WorkerInterface

- - - - - - - - - - - - - - + + - - + + @@ -2013,183 +1638,92 @@

- - - - - - - - - - - - - - - - - - - - - - - - - - + +
Template parameters
Pexecution policy type
Iinput iterator
Ooutput iterator
Cbinary operator typeTtype derived from tf::WorkerInterface
Uunary operator typeArgsTargument types to construct T
pexecution policy
firstiterator to the beginning of the input range
lastiterator to the end of the input range
outputiterator to the beginning of the output range
bopbinary operator to apply to scan
uopunary operator to apply to transform each item before scan
bufpointer to the temporary bufferargsarguments to forward to the constructor of T
-
+
+

+ const char* tf::to_string(TaskType type) + +

+

convert a task type to a human-readable string

+

The name of each task type is the lower-case string of its characters.

+
+
+

+ std::ostream& tf::operator<<(std::ostream& os, + const Task& task) + +

+

overload of ostream inserter operator for Task

+
+
+

+ const char* tf::to_string(ObserverType type) + +

+

convert an observer type to a human-readable string

+
+

- template<typename P, typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C> + template<typename Input, typename Output, typename C>
- void tf::cuda_merge_by_key(P&& p, - a_keys_it a_keys_first, - a_keys_it a_keys_last, - a_vals_it a_vals_first, - b_keys_it b_keys_first, - b_keys_it b_keys_last, - b_vals_it b_vals_first, - c_keys_it c_keys_first, - c_vals_it c_vals_first, - C comp, - void* buf) -

-

performs asynchronous key-value merge over a range of keys and values

+ auto tf::make_data_pipe(PipeType d, + C&& callable) + +

function to construct a data pipe (tf::DataPipe)

- - - - - - - - - - - - - - - - - - - - - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +
Template parameters
Pexecution policy type
a_keys_itfirst key iterator type
a_vals_itfirst value iterator type
b_keys_itsecond key iterator type
b_vals_itsecond value iterator type
c_keys_itoutput key iterator typeInputinput data type
c_vals_itoutput value iterator typeOutputoutput data type
Ccomparator type
Parameters
pexecution policy
a_keys_firstiterator to the beginning of the first key range
a_keys_lastiterator to the end of the first key range
a_vals_firstiterator to the beginning of the first value range
b_keys_firstiterator to the beginning of the second key range
b_keys_lastiterator to the end of the second key range
b_vals_firstiterator to the beginning of the second value range
c_keys_firstiterator to the beginning of the output key range
c_vals_firstiterator to the beginning of the output value range
compcomparator
bufpointer to the temporary buffercallable type
-

Performs a key-value merge that copies elements from [a_keys_first, a_keys_last) and [b_keys_first, b_keys_last) into a single range, [c_keys_first, c_keys_last + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending key order.

At the same time, the merge copies elements from the two associated ranges [a_vals_first + (a_keys_last - a_keys_first)) and [b_vals_first + (b_keys_last - b_keys_first)) into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending order implied by each input element's associated key.

For example, assume:

  • a_keys = {1, 8};
  • a_vals = {2, 1};
  • b_keys = {3, 7};
  • b_vals = {3, 4};

After the merge, we have:

  • c_keys = {1, 3, 7, 8}
  • c_vals = {2, 3, 4, 1}
+

tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe) in a data-parallel pipeline (tf::DataPipeline). The first argument specifies the direction of the data pipe, either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, and the second argument is a callable to invoke by the pipeline scheduler. Input and output data types are specified via template parameters, which will always be decayed by the library to its original form for storage purpose. The callable must take the input data type in its first argument and returns a value of the output data type.

tf::make_data_pipe<int, std::string>(
+  tf::PipeType::SERIAL, 
+  [](int& input) {
+    return std::to_string(input + 100);
+  }
+);

The callable can additionally take a reference of tf::Pipeflow, which allows you to query the runtime information of a stage task, such as its line number and token number.

tf::make_data_pipe<int, std::string>(
+  tf::PipeType::SERIAL, 
+  [](int& input, tf::Pipeflow& pf) {
+    printf("token=%lu, line=%lu\n", pf.token(), pf.line());
+    return std::to_string(input + 100);
+  }
+);
-
+

- template<typename P, typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C> + template<typename T>
- void tf::cuda_merge(P&& p, - a_keys_it a_keys_first, - a_keys_it a_keys_last, - b_keys_it b_keys_first, - b_keys_it b_keys_last, - c_keys_it c_keys_first, - C comp, - void* buf) -

-

performs asynchronous key-only merge over a range of keys

+ auto tf::make_module_task(T&& target) + +

creates a module task using the given target

- - - - - - - - - - - - - - - - - - + + @@ -2197,174 +1731,248 @@

- - - - - - - - - - - - - - + + + + - - + + - - - - - - - - - - - - - +
Template parameters
Pexecution policy type
a_keys_itfirst key iterator type
b_keys_itsecond key iterator type
c_keys_itoutput key iterator type
Ccomparator typeTType of the target object, which must define the method tf::Graph& graph().
pexecution policy
a_keys_firstiterator to the beginning of the first key range
a_keys_lastiterator to the end of the first key range
b_keys_firstiterator to the beginning of the second key rangetargetThe target object used to create the module task.
b_keys_lastiterator to the end of the second key rangeReturnsmodule task that can be used by Taskflow or asynchronous tasking.
c_keys_firstiterator to the beginning of the output key range
compcomparator
bufpointer to the temporary buffer
-

This function is equivalent to tf::cuda_merge_by_key without values.

+

This example demonstrates how to create and launch multiple taskflows in parallel using asynchronous tasking:

tf::Executor executor;
+
+tf::Taskflow A;
+tf::Taskflow B;
+tf::Taskflow C;
+tf::Taskflow D;
+
+A.emplace([](){ printf("Taskflow A\n"); }); 
+B.emplace([](){ printf("Taskflow B\n"); }); 
+C.emplace([](){ printf("Taskflow C\n"); }); 
+D.emplace([](){ printf("Taskflow D\n"); }); 
+
+// launch the four taskflows using asynchronous tasking
+executor.async(tf::make_module_task(A));
+executor.async(tf::make_module_task(B));
+executor.async(tf::make_module_task(C));
+executor.async(tf::make_module_task(D));
+executor.wait_for_all();  

The module task maker, tf::make_module_task, is basically the same as tf::Taskflow::composed_of but provides a more generic interface that can be used beyond Taskflow. For instance, the following two approaches achieve the same functionality.

// approach 1: composition using composed_of
+tf::Task m1 = taskflow1.composed_of(taskflow2);
+
+// approach 2: composition using make_module_task
+tf::Task m1 = taskflow1.emplace(tf::make_module_task(taskflow2));
+
+
+

+ size_t tf::cuda_get_num_devices() + +

+

queries the number of available devices

+
+
+

+ int tf::cuda_get_device() + +

+

gets the current device associated with the caller thread

+
+
+

+ void tf::cuda_set_device(int id) + +

+

switches to a given device context

+
+
+

+ void tf::cuda_get_device_property(int i, + cudaDeviceProp& p) + +

+

obtains the device property

+
+
+

+ cudaDeviceProp tf::cuda_get_device_property(int i) + +

+

obtains the device property

+
+
+

+ void tf::cuda_dump_device_property(std::ostream& os, + const cudaDeviceProp& p) + +

+

dumps the device property

+
+
+

+ size_t tf::cuda_get_device_max_threads_per_block(int d) + +

+

queries the maximum threads per block on a device

+
+
+

+ size_t tf::cuda_get_device_max_x_dim_per_block(int d) + +

+

queries the maximum x-dimension per block on a device

+
+
+

+ size_t tf::cuda_get_device_max_y_dim_per_block(int d) + +

+

queries the maximum y-dimension per block on a device

+
+
+

+ size_t tf::cuda_get_device_max_z_dim_per_block(int d) + +

+

queries the maximum z-dimension per block on a device

+
+
+

+ size_t tf::cuda_get_device_max_x_dim_per_grid(int d) + +

+

queries the maximum x-dimension per grid on a device

+
+
+

+ size_t tf::cuda_get_device_max_y_dim_per_grid(int d) + +

+

queries the maximum y-dimension per grid on a device

+
+
+

+ size_t tf::cuda_get_device_max_z_dim_per_grid(int d) + +

+

queries the maximum z-dimension per grid on a device

+
+
+

+ size_t tf::cuda_get_device_max_shm_per_block(int d) + +

+

queries the maximum shared memory size in bytes per block on a device

+
+
+

+ size_t tf::cuda_get_device_warp_size(int d) + +

+

queries the warp size on a device

+
+
+

+ int tf::cuda_get_device_compute_capability_major(int d) + +

+

queries the major number of compute capability of a device

+
+
+

+ int tf::cuda_get_device_compute_capability_minor(int d) + +

+

queries the minor number of compute capability of a device

+
+
+

+ bool tf::cuda_get_device_unified_addressing(int d) + +

+

queries if the device supports unified addressing

+
+
+

+ int tf::cuda_get_driver_version() + +

+

queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver

+
+
+

+ int tf::cuda_get_runtime_version() + +

+

queries the CUDA Runtime version (1000 * major + 10 * minor)

+
+
+

+ size_t tf::cuda_get_free_mem(int d) + +

+

queries the free memory (expensive call)

+
+
+

+ size_t tf::cuda_get_total_mem(int d) + +

+

queries the total available memory (expensive call)

+
+
+

+ +
+ template<typename T> +
+ T* tf::cuda_malloc_device(size_t N, + int d) +

+

allocates memory on the given device for holding N elements of type T

+

The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory.

-
+

+
- template<typename P, typename K, typename V = cudaEmpty> + template<typename T>
- unsigned tf::cuda_sort_buffer_size(unsigned count) + T* tf::cuda_malloc_device(size_t N)

-

queries the buffer size in bytes needed to call sort kernels for the given number of elements

- - - - - - - - - - - - - - - - - - - - - - - - - - - -
Template parameters
Pexecution policy type
Kkey type
Vvalue type (default tf::cudaEmpty)
Parameters
countnumber of keys/values to sort
-

The function is used to allocate a buffer for calling tf::cuda_sort.

+

allocates memory on the current device associated with the caller

+

The function calls malloc_device from the current device associated with the caller.

-
+

+
- template<typename P, typename K_it, typename V_it, typename C> + template<typename T>
- void tf::cuda_sort_by_key(P&& p, - K_it k_first, - K_it k_last, - V_it v_first, - C comp, - void* buf) -

-

performs asynchronous key-value sort on a range of items

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Template parameters
Pexecution policy type
K_itkey iterator type
V_itvalue iterator type
Ccomparator type
Parameters
pexecution policy
k_firstiterator to the beginning of the key range
k_lastiterator to the end of the key range
v_firstiterator to the beginning of the value range
compbinary comparator
bufpointer to the temporary buffer
-

Sorts key-value elements in [k_first, k_last) and [v_first, v_first + (k_last - k_first)) into ascending key order using the given comparator comp. If i and j are any two valid iterators in [k_first, k_last) such that i precedes j, and p and q are iterators in [v_first, v_first + (k_last - k_first)) corresponding to i and j respectively, then comp(*j, *i) evaluates to false.

For example, assume:

  • keys are {1, 4, 2, 8, 5, 7}
  • values are {'a', 'b', 'c', 'd', 'e', 'f'}

After sort:

  • keys are {1, 2, 4, 5, 7, 8}
  • values are {'a', 'c', 'b', 'e', 'f', 'd'}
+ T* tf::cuda_malloc_shared(size_t N) + +

allocates shared memory for holding N elements of type T

+

The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory.

-
+

+
- template<typename P, typename K_it, typename C> + template<typename T>
- void tf::cuda_sort(P&& p, - K_it k_first, - K_it k_last, - C comp, - void* buf) + void tf::cuda_free(T* ptr, + int d)

-

performs asynchronous key-only sort on a range of items

+

frees memory on the GPU device

- - - - - - - - - - + + @@ -2372,57 +1980,34 @@

- - - - - - - - - - - - - - + + - - + +
Template parameters
Pexecution policy type
K_itkey iterator type
Ccomparator typeTpointer type
pexecution policy
k_firstiterator to the beginning of the key range
k_lastiterator to the end of the key range
compbinary comparatorptrdevice pointer to memory to free
bufpointer to the temporary bufferddevice context identifier
-

This method is equivalent to tf::cuda_sort_by_key without values.

+

This methods call cudaFree to free the memory space pointed to by ptr using the given device context.

-
+

+
- template<typename P, typename I, typename U> + template<typename T>
- void tf::cuda_find_if(P&& p, - I first, - I last, - unsigned* idx, - U op) + void tf::cuda_free(T* ptr)

-

finds the index of the first element that satisfies the given criteria

+

frees memory on the GPU device

- - - - - - - - - - + + @@ -2430,182 +2015,92 @@

- - - - - - - - - - - - - - - - - - + +
Template parameters
Pexecution policy type
Iinput iterator type
Uunary operator typeTpointer type
pexecution policy
firstiterator to the beginning of the range
lastiterator to the end of the range
idxpointer to the index of the found element
opunary operator which returns true for the required elementptrdevice pointer to memory to free
-

The function launches kernels asynchronously to find the index idx of the first element in the range [first, last) such that op(*(first+idx)) is true. This is equivalent to the parallel execution of the following loop:

unsigned idx = 0;
-for(; first != last; ++first, ++idx) {
-  if (p(*first)) {
-    return idx;
-  }
-}
-return idx;
+

This methods call cudaFree to free the memory space pointed to by ptr using the current device context of the caller.

-
+

-
- template<typename P, typename I, typename O> -
- void tf::cuda_min_element(P&& p, - I first, - I last, - unsigned* idx, - O op, - void* buf) -

-

finds the index of the minimum element in a range

+ void tf::cuda_memcpy_async(cudaStream_t stream, + void* dst, + const void* src, + size_t count) + + +

copies data between host and device asynchronously through a stream

- - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + +
Template parameters
Pexecution policy type
Iinput iterator type
Ocomparator type
Parameters
pexecution policy object
firstiterator to the beginning of the range
lastiterator to the end of the rangestreamstream identifier
idxsolution index of the minimum elementdstdestination memory address
opcomparison function objectsrcsource memory address
bufpointer to the buffercountsize in bytes to copy
-

The function launches kernels asynchronously to find the smallest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_min_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
-  return 0;
-}
-auto smallest = first;
-for (++first; first != last; ++first) {
-  if (op(*first, *smallest)) {
-    smallest = first;
-  }
-}
-return std::distance(first, smallest);
-
-
+

The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap.

+
+

-
- template<typename P, typename I, typename O> -
- void tf::cuda_max_element(P&& p, - I first, - I last, - unsigned* idx, - O op, - void* buf) -

-

finds the index of the maximum element in a range

+ void tf::cuda_memset_async(cudaStream_t stream, + void* devPtr, + int value, + size_t count) + + +

initializes or sets GPU memory to the given value byte by byte

- - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + +
Template parameters
Pexecution policy type
Iinput iterator type
Ocomparator type
Parameters
pexecution policy object
firstiterator to the beginning of the range
lastiterator to the end of the rangestreamstream identifier
idxsolution index of the maximum elementdevPtrpointer to GPU memory
opcomparison function objectvaluevalue to set for each byte of the specified memory
bufpointer to the buffercountsize in bytes to set
-

The function launches kernels asynchronously to find the largest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_max_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
-  return 0;
-}
-auto largest = first;
-for (++first; first != last; ++first) {
-  if (op(*largest, *first)) {
-    largest = first;
-  }
-}
-return std::distance(first, largest);
+

The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value.

-
+

- const char* tf::version() constexpr + cudaGraphNodeType tf::cuda_get_graph_node_type(cudaGraphNode_t node) +

+

queries the type of a native CUDA graph node

+

valid type values are:

  • cudaGraphNodeTypeKernel = 0x00
  • cudaGraphNodeTypeMemcpy = 0x01
  • cudaGraphNodeTypeMemset = 0x02
  • cudaGraphNodeTypeHost = 0x03
  • cudaGraphNodeTypeGraph = 0x04
  • cudaGraphNodeTypeEmpty = 0x05
  • cudaGraphNodeTypeWaitEvent = 0x06
  • cudaGraphNodeTypeEventRecord = 0x07
+
+
+

+ const char* tf::version() constexpr +

queries the version information in a string format major.minor.patch

Release notes are available here: https://taskflow.github.io/taskflow/Releases.html

@@ -2615,56 +2110,112 @@

Variable documentation

+
template<typename P>
bool tf::is_task_params_v constexpr

determines if the given type is a task parameter type

-

Task parameters can be specified in one of the following types:

+

Task parameters can be specified in one of the following types:

+
+
+

+ +
+ template<typename T> +
+ bool tf::has_graph_v constexpr +

+

determines if the given type has a member function Graph& graph()

+ + + + + + + + + + +
Template parameters
TThe type to inspect.
+

This trait determines if the provided type T contains a member function with the exact signature tf::Graph& graph(). It uses SFINAE and std::void_t to detect the presence of the member function and its return type.

Example usage:

struct A {
+  tf::Graph& graph() { return my_graph; };
+  tf::Graph my_graph;
+
+  // other custom members to alter my_graph
+};
+
+struct C {}; // No graph function
+
+static_assert(has_graph_v<A>, "A has graph()");
+static_assert(!has_graph_v<C>, "C does not have graph()");
+
+
+

+ std::array<TaskType, 7> tf::TASK_TYPES constexpr + +

+

array of all task types (used for iterating task types)

+
+
+

+ +
+ template<typename C> +
+ bool tf::is_static_task_v constexpr +

+

determines if a callable is a static task

+

A static task is a callable object constructible from std::function<void()>.

+
template<typename C>
bool tf::is_subflow_task_v constexpr

-

determines if a callable is a dynamic task

-

A dynamic task is a callable object constructible from std::function<void(Subflow&)>.

+

determines if a callable is a subflow task

+

A subflow task is a callable object constructible from std::function<void(Subflow&)>.

-
+

+
template<typename C>
- bool tf::is_condition_task_v constexpr
+ bool tf::is_runtime_task_v constexpr

-

determines if a callable is a condition task

-

A condition task is a callable object constructible from std::function<int()> or std::function<int(tf::Runtime&)>.

+

determines if a callable is a runtime task

+

A runtime task is a callable object constructible from std::function<void(Runtime&)>.

-
+

+
template<typename C>
- bool tf::is_multi_condition_task_v constexpr
+ bool tf::is_condition_task_v constexpr

-

determines if a callable is a multi-condition task

-

A multi-condition task is a callable object constructible from std::function<tf::SmallVector<int>()> or std::function<tf::SmallVector<int>(tf::Runtime&)>.

+

determines if a callable is a condition task

+

A condition task is a callable object constructible from std::function<int()>.

-
+

+
template<typename C>
- bool tf::is_static_task_v constexpr
+ bool tf::is_multi_condition_task_v constexpr

-

determines if a callable is a static task

-

A static task is a callable object constructible from std::function<void()> or std::function<void(tf::Runtime&)>.

+

determines if a callable is a multi-condition task

+

A multi-condition task is a callable object constructible from std::function<tf::SmallVector<int>()>.
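A minimal sketch exercising the traits above with lambdas (assuming taskflow/taskflow.hpp is included):

auto subflow_task = [](tf::Subflow& sf){ sf.emplace([](){}); };
auto runtime_task = [](tf::Runtime&){};
auto condition = [](){ return 0; };                                      // returns a branch index
auto multi_condition = []() -> tf::SmallVector<int> { return {0, 2}; };  // returns branch indices

static_assert(tf::is_subflow_task_v<decltype(subflow_task)>);
static_assert(tf::is_runtime_task_v<decltype(runtime_task)>);
static_assert(tf::is_condition_task_v<decltype(condition)>);
static_assert(tf::is_multi_condition_task_v<decltype(multi_condition)>);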

+
template<typename P>
@@ -2718,7 +2269,7 @@

-

Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
Generated by Doxygen 1.9.1 and m.css.

+

Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/observer_8hpp.html b/docs/observer_8hpp.html index 0104d3453..93a0c1fb6 100644 --- a/docs/observer_8hpp.html +++ b/docs/observer_8hpp.html @@ -129,7 +129,7 @@

Classes

-

Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
Generated by Doxygen 1.9.1 and m.css.

+

Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/opentimer.html b/docs/opentimer.html index e2a745e1d..f55fdac9a 100644 --- a/docs/opentimer.html +++ b/docs/opentimer.html @@ -59,438 +59,438 @@

Contents

  • References
  • -


    +

    We have applied Taskflow to solve a real-world VLSI static timing analysis problem that incorporates hundreds of millions of tasks and dependencies. The goal is to analyze the timing behavior of a design.

    OpenTimer: A High-performance Timing Analysis Tool

Static timing analysis (STA) is an important step in the overall chip design flow. It verifies the static behavior of a circuit design and ensures its correct functionality under the given clock speed. However, efficient parallel timing analysis is extremely challenging to design and implement, due to its large irregularity and graph-oriented computing. The following figure shows an extracted timing graph from an industrial design.

    Image

    We consider our research project OpenTimer, an open-source static timing analyzer that has been used in many industrial and academic projects. The first release v1 in 2015 implemented the pipeline-based levelization algorithm using the OpenMP 4.5 task dependency clause. To overcome the performance bottleneck caused by pipeline, we rewrote the core incremental timing engine using Taskflow in the second release v2.

    Programming Effort

The table below measures the software costs of the two OpenTimer versions using the Linux tool SLOCCount. In OpenTimer v2, the large number of exhaustive OpenMP dependency clauses that were used to carry out task dependencies is replaced with only a few lines of flexible Taskflow code (9123 vs 4482). The maximum cyclomatic complexity in a single function is reduced from 58 to 20, thanks to Taskflow's programmability.

Tool          Task Model   Lines of Code   Cyclomatic Complexity   Cost
OpenTimer v1  OpenMP 4.5   9123            58                      $275,287
OpenTimer v2  Taskflow     4482            20                      $130,523

OpenTimer v1 relied on a pipeline data structure to adopt loop parallelism with OpenMP. We found it very difficult to go beyond this paradigm because of the insufficient support for dynamic dependencies in OpenMP. With Taskflow in place, we can break this bottleneck and easily model both static and dynamic task dependencies at programming time and runtime. The task dependency graph flows computations naturally with the timing graph, providing improved asynchrony and performance. The following figure shows a task graph to carry out one iteration of timing update.

[Figure: task dependency graph of one timing-update iteration, with forward-propagation tasks [A0]–[A16] (fprop_tau2015_clk, fprop_f1:CLK, fprop_f1:Q, fprop_u1–u4, fprop_inp1/inp2, fprop_out, fprop_f1:D) followed by backward-propagation tasks [A17]–[A33] (bprop_f1:D through bprop_tau2015_clk).]
@@ -539,7 +539,7 @@

    Contents

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/pages.html b/docs/pages.html index da178b8f2..6d9202fe9 100644 --- a/docs/pages.html +++ b/docs/pages.html @@ -51,7 +51,11 @@

    Pages

    Release Notes @@ -109,33 +111,13 @@

    Pages

  • Parallel Sort
  • Parallel Scan
  • Parallel Find
  • + Module Algorithm
  • Task-parallel Pipeline
  • Task-parallel Scalable Pipeline
  • Task-parallel Pipeline with Token Dependencies
  • Data-parallel Pipeline
  • - cudaFlow Algorithms
  • - CUDA Standard Algorithms
  • Learning from Examples
      @@ -144,9 +126,9 @@

      Pages

    • Flip Coins
    • Graph Traversal
    • Matrix Multiplication
    • - Matrix Multiplication (cudaFlow)
    • + Matrix Multiplication with CUDA GPU
    • k-means Clustering
    • - k-means Clustering (cudaFlow)
    • + k-means Clustering with CUDA GPU
    • Text Processing Pipeline
    • Graph Processing Pipeline
    • Taskflow Processing Pipeline
    • @@ -237,7 +219,7 @@

      Pages

      -

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
      Generated by Doxygen 1.9.1 and m.css.

      +

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
      Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/partitioner_8hpp.html b/docs/partitioner_8hpp.html index 47f7e6b92..6533c2022 100644 --- a/docs/partitioner_8hpp.html +++ b/docs/partitioner_8hpp.html @@ -72,31 +72,31 @@

      Namespaces

      Classes

- struct tf::DefaultClosureWrapper
+ class tf::DefaultClosureWrapper
- default closure wrapper that simply runs the given closure as is
+ class to create a default closure wrapper
      -
      template<typename C = DefaultClosureWrapper>
      +
      template<typename C = DefaultClosureWrapper>
      class tf::PartitionerBase
      class to derive a partitioner for scheduling parallel algorithms
      -
      template<typename C = DefaultClosureWrapper>
      +
      template<typename C = DefaultClosureWrapper>
      class tf::GuidedPartitioner
      -
      class to construct a guided partitioner for scheduling parallel algorithms
      +
      class to create a guided partitioner for scheduling parallel algorithms
      -
      template<typename C = DefaultClosureWrapper>
      +
      template<typename C = DefaultClosureWrapper>
      class tf::DynamicPartitioner
      -
      class to construct a dynamic partitioner for scheduling parallel algorithms
      +
      class to create a dynamic partitioner for scheduling parallel algorithms
      -
      template<typename C = DefaultClosureWrapper>
      +
      template<typename C = DefaultClosureWrapper>
      class tf::StaticPartitioner
      class to construct a static partitioner for scheduling parallel algorithms
      -
      template<typename C = DefaultClosureWrapper>
      +
      template<typename C = DefaultClosureWrapper>
      class tf::RandomPartitioner
      class to construct a random partitioner for scheduling parallel algorithms
      @@ -146,7 +146,7 @@

      Classes

      -

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
      Generated by Doxygen 1.9.1 and m.css.

      +

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
      Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/pipeline_8hpp.html b/docs/pipeline_8hpp.html index 40085f505..440c7449e 100644 --- a/docs/pipeline_8hpp.html +++ b/docs/pipeline_8hpp.html @@ -136,7 +136,7 @@

      Classes

      -

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
      Generated by Doxygen 1.9.1 and m.css.

      +

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
      Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-1-x-x.html b/docs/release-1-x-x.html index c53130462..7b37d7771 100644 --- a/docs/release-1-x-x.html +++ b/docs/release-1-x-x.html @@ -94,7 +94,7 @@

      -

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
      Generated by Doxygen 1.9.1 and m.css.

      +

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
      Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-2-0-0.html b/docs/release-2-0-0.html index b4ca1e22b..993bb9c63 100644 --- a/docs/release-2-0-0.html +++ b/docs/release-2-0-0.html @@ -104,7 +104,7 @@

      Contents

      -

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
      Generated by Doxygen 1.9.1 and m.css.

      +

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
      Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-2-1-0.html b/docs/release-2-1-0.html index 4932bbd5a..5d3030954 100644 --- a/docs/release-2-1-0.html +++ b/docs/release-2-1-0.html @@ -104,7 +104,7 @@

      Contents

      -

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
      Generated by Doxygen 1.9.1 and m.css.

      +

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
      Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-2-2-0.html b/docs/release-2-2-0.html index cabbbdb8c..ae5dfbcaa 100644 --- a/docs/release-2-2-0.html +++ b/docs/release-2-2-0.html @@ -59,17 +59,17 @@

      Contents

Cpp-Taskflow 2.2.0 is the 3rd release in the 2.x line! This release includes several new changes such as tf::ExecutorObserverInterface, tf::Executor, isolation of taskflow graph and executor, benchmarks, and so forth. In particular, this release improves the performance of the work-stealing scheduler.

      Download

      Cpp-Taskflow 2.2.0 can be downloaded from here.

      New Features

      • A new executor class to isolate the execution module from a taskflow
      • A new observer interface to inspect the activities of an executor
      • A decomposable taskflow construction interface
      • A new work-stealing algorithm to improve the performance

      Breaks and Deprecated Features

In this release, we isolated the executor interface from tf::Taskflow and merged tf::Framework with tf::Taskflow. This change largely improved the modularity and composability of Cpp-Taskflow in creating clean task dependency graphs and execution flows. Performance is also better. While this introduced some breaking changes in tf::Taskflow, we have tried to make the transition as painless as possible for users.

Previously, tf::Taskflow was a hero class that managed both a task dependency graph and the execution of all graphs, including frameworks. For example:

// before v2.2.0, tf::Taskflow manages both graph and execution
tf::Taskflow taskflow(4);  // create a taskflow object with 4 threads
taskflow.emplace([] () { std::cout << "task A\n"; });
taskflow.wait_for_all();   // dispatch the present graph

tf::Framework framework;   // create a framework object
framework.emplace([] () { std::cout << "task B\n"; });
taskflow.run(framework);   // run the framework once
taskflow.wait_for_all();   // wait until the framework finishes

      However, this design is awkward in many aspects. For instance, calling wait_for_all dispatches the present graph and the graph vanishes when the execution completes. To reuse a graph, users have to create another special graph called framework and mix its execution with the one in a taskflow object. Given the user feedback and lessons we have learned so far, we decided to isolate the executor interface out of tf::Taskflow and merge tf::Framework with tf::Taskflow. All execution methods such as dispatch and wait_for_all have been moved from tf::Taskflow to tf::Executor.

// starting from v2.2.0, tf::Executor manages the execution of graphs
tf::Taskflow taskflow;      // create a taskflow to build dependent tasks
tf::Task A = taskflow.emplace([] () { std::cout << "task A\n"; });
tf::Task B = taskflow.emplace([] () { std::cout << "task B\n"; });
A.precede(B);

tf::Executor executor(4);   // create an executor of 4 threads
executor.run(taskflow);     // run the taskflow once
      @@ -119,7 +119,7 @@ 

      Contents

      -

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
      Generated by Doxygen 1.9.1 and m.css.

      +

      Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
      Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-2-3-0.html b/docs/release-2-3-0.html index 0c42b1c52..81ad80e34 100644 --- a/docs/release-2-3-0.html +++ b/docs/release-2-3-0.html @@ -58,7 +58,7 @@

      Contents

    • Deprecated Items
    -

    +

    Cpp-Taskflow 2.3.0 is the 4th release in the 2.x line! This release includes several new changes such as conditional tasking, modified scheduling flows, benchmarks, documentation, and so forth.

    Download

    Cpp-Taskflow 2.3.0 can be downloaded from here.

    New Features

    Bug Fixes

    • Fixed the stack overflow problem in zero worker execution
    • Fixed the missing comma in output execution timelines from an executor
    • Fixed the bug in empty taskflow

    Deprecated Items

    • Removed zero worker thread support in execution
    • Removed gather method in task handle
    • Removed std::vector and std::initializer_list support in task's preceed/succeed methods
    • Removed taskflow::silent_emplace method
  • @@ -103,7 +103,7 @@

    Contents

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-2-3-1.html b/docs/release-2-3-1.html index b27893ffe..76c17db7a 100644 --- a/docs/release-2-3-1.html +++ b/docs/release-2-3-1.html @@ -94,7 +94,7 @@

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-2-4-0.html b/docs/release-2-4-0.html index 6608850ad..50bced609 100644 --- a/docs/release-2-4-0.html +++ b/docs/release-2-4-0.html @@ -58,7 +58,7 @@

    Contents

  • Miscellaneous Items
  • -


    +

    Cpp-Taskflow 2.4.0 is the 6th release in the 2.x line! This release includes several new changes such as CPU-GPU tasking, improved scheduling flow, documentation, and unit tests.

    Download

    Cpp-Taskflow 2.4.0 can be downloaded from here.

    New Features

    • added tf::cudaFlow for concurrent CPU-GPU tasking
    • added a new method tf::Executor::num_topologies to query the number of running taskflows in an executor
    • added std::hash support for tf::Task
    • added a new work-stealing algorithm capable of general heterogeneous domains
    • added unittests for CUDA work (enable by -DTF_ENABLE_CUDA during cmake)

    Bug Fixes

    • fixed the bug in nested execution (#152)
    • fixed the nameless union/struct extension warning in MS environment (#153)
• fixed the warning/error by changing the type of join counter to std::size_t (#137)

    Miscellaneous Items

    @@ -103,7 +103,7 @@

    Contents

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-2-5-0.html b/docs/release-2-5-0.html index 67d2b1c5f..337013270 100644 --- a/docs/release-2-5-0.html +++ b/docs/release-2-5-0.html @@ -58,7 +58,7 @@

    Contents

  • Miscellaneous Items
  • -

    +

Starting from v2.5.0, we have renamed Cpp-Taskflow to Taskflow to broaden its impact and support. Taskflow will explore multiple application scopes and language bindings, rather than just C++. This also makes the Taskflow name more succinct.

    Taskflow 2.5.0 is the 7th release in the 2.x line! This release includes several new changes such as CPU-GPU tasking, web-based profiler, documentation, and unit tests.

    Download

    Taskflow 2.5.0 can be downloaded from here.

    To download the newest version of Taskflow, please clone from Taskflow's GitHub.

    New Features

    • enhanced the performance of the work-stealing algorithm
    • enhanced the interface of concurrent CPU-GPU tasking (added tf::cudaFlow::zero, tf::cudaFlow::memset, tf::cudaFlow::memcpy, tf::cudaFlow::fill)
    • enhanced unittests for tf::cudaFlow
    • added per-thread stream to avoid synchronizing with the default stream in running a cudaFlow
    • added tf::cudaFlow::repeat and tf::cudaFlow::predicate for iterative execution of a cudaFlow
    • added Learning from Examples pages
    • made observer a std::shared_ptr object
• enabled multiple observers to coexist in an executor
    • created the TFProf project (image below) to provide visualization and tooling needed for Taskflow programs
    Image

    Bug Fixes

    • fixed the bug in assigning the block pointer before constructor of an object in object pool
    • fixed the namespace conflicting in using MPark.Variant from upstream code

    Miscellaneous Items

    • fixed the warning between unsigned and size_t conversion in tf::Executor
    • submitted the technical paper to arXiv
    @@ -103,7 +103,7 @@

    Contents

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-2-6-0.html b/docs/release-2-6-0.html index 8c8d02091..522424dc3 100644 --- a/docs/release-2-6-0.html +++ b/docs/release-2-6-0.html @@ -59,7 +59,7 @@

    Contents

  • Miscellaneous Items
  • -


    +

    Taskflow 2.6.0 is the 8th release in the 2.x line! This release includes several new changes such as CPU-GPU tasking, algorithm collection, enhanced web-based profiler, documentation, and unit tests.

    We have a new webpage for Taskflow!

    Download

    Taskflow 2.6.0 can be downloaded from here.

    New Features

    Bug Fixes

    • fixed the bug of iteratively detaching a subflow from a run loop or a condition loop
    • fixed the bug of conflict macro with boost (#184)

    Deprecated Items

    • removed two methods, tf::detached and tf::joined, due to the new join/detach behavior

    Miscellaneous Items

    @@ -104,7 +104,7 @@

    Contents

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-2-7-0.html b/docs/release-2-7-0.html index b44e47701..e9b88476d 100644 --- a/docs/release-2-7-0.html +++ b/docs/release-2-7-0.html @@ -59,7 +59,7 @@

    Contents

  • Miscellaneous Items
  • -


    +

    Taskflow 2.7.0 is the 9th release in the 2.x line! This release includes several new changes such as CPU-GPU tasking, algorithm collection, enhanced web-based profiler, documentation, and unit tests.

    Download

    Taskflow 2.7.0 can be downloaded from here.

    New Features

    • added tf::Executor::async to support asynchronously calling a function (see Asynchronous Tasking)
    • added kernel algorithm, tf::cudaFlow::for_each
    • added kernel algorithm, tf::cudaFlow::for_each_index
    • added explicit join method at tf::cudaFlow::join, tf::cudaFlow::join_n, tf::cudaFlow::join_until

    Bug Fixes

    There are no bug fixes in this release.

    Deprecated Items

    • removed redundant methods, tf::Taskflow::broadcast, tf::Taskflow::precede, tf::Taskflow::succeed
    • removed tf::cudaFlow::predicate (replaced with tf::cudaFlow::join_until)
    • removed tf::cudaFlow::stream; the executor automatically determines a local, faster stream

    Miscellaneous Items

    @@ -104,7 +104,7 @@

    Contents

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-3-0-0.html b/docs/release-3-0-0.html index 6d6bf93ef..7cca33348 100644 --- a/docs/release-3-0-0.html +++ b/docs/release-3-0-0.html @@ -78,7 +78,7 @@

    Contents

  • Miscellaneous Items
  • -


    +

    Taskflow 3.0.0 is the 1st release in the 3.x line! This release includes several new changes such as CPU-GPU tasking, algorithm collection, enhanced web-based profiler, documentation, and unit tests.

    Download

    Taskflow 3.0.0 can be downloaded from here.

    System Requirements

    To use Taskflow v3.0.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v7.0 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • AppleClang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17

    Taskflow works on Linux, Windows, and Mac OS X.

    Working Items

    • enhancing the taskflow profiler (TFProf)
    • adding methods for updating tf::cudaFlow (with unit tests)
    • adding support for cuBLAS
    • adding support for cuDNN
    • adding support for SYCL (ComputeCpp and DPC++)

    New Features

    Taskflow Core

    cudaFlow

    • added tf::cudaFlowCapturer for building a cudaFlow through stream capture
    • added tf::cudaFlowCapturerBase for creating custom capturers
    • added tf::cudaFlow::capture for capturing a cudaFlow within a parent cudaFlow
    • added tf::Taskflow::emplace_on to place a cudaFlow on a GPU
    • added tf::cudaFlow::dump and tf::cudaFlowCapturer::dump to visualize cudaFlow
    • added tf::cudaFlow::offload and update methods to run and update a cudaFlow explicitly
    • supported standalone cudaFlow
    • supported standalone cudaFlowCapturer
    • added tf::cublasFlowCapturer to support cuBLAS (see LinearAlgebracublasFlowCapturer)

    Utilities

    • added utility functions to grab the cuda device properties (see cuda_device.hpp)
    • added utility functions to control cuda memory (see cuda_memory.hpp)
    • added utility functions for common mathematics operations
    • added serializer and deserializer libraries to support tfprof
    • added per-thread pool for CUDA streams to improve performance

    Taskflow Profiler (TFProf)

    • added visualization for asynchronous tasks
    • added server-based profiler to support large profiling data (see Profile Taskflow Programs)

    New Algorithms

    CPU Algorithms

    GPU Algorithms

    • added single task
    • added parallel iterations
    • added parallel transforms
    • added parallel reduction

    Bug Fixes

    • fixed the bug in stream capturing (need to use ThreadLocal mode)
    • fixed the bug in reporting wrong worker ids when compiling a shared library due to the use of thread_local (now with C++17 inline variable)

    Breaking Changes

    Deprecated and Removed Items

    • removed tf::cudaFlow::device; users may call tf::Taskflow::emplace_on to associate a cudaflow with a GPU device
    • removed tf::cudaFlow::join, use tf::cudaFlow::offload instead
    • removed the legacy tf::Framework
    • removed external mutable use of tf::TaskView

    Documentation

    Miscellaneous Items

    We have presented Taskflow in the following C++ venues with recorded videos:

    We have published Taskflow in the following conferences and journals:

    @@ -123,7 +123,7 @@

    Contents

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-3-1-0.html b/docs/release-3-1-0.html index 6f960377a..30b2c640f 100644 --- a/docs/release-3-1-0.html +++ b/docs/release-3-1-0.html @@ -70,7 +70,7 @@

    Contents

  • Miscellaneous Items
  • -

    +

    Taskflow 3.1.0 is the 2nd release in the 3.x line! This release includes several new changes such as CPU-GPU tasking, algorithm collection, enhanced web-based profiler, documentation, and unit tests.

    Download

    Taskflow 3.1.0 can be downloaded from here.

    System Requirements

    To use Taskflow v3.1.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v8.4 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • AppleClang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17
    • Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 and SYCL20

    Taskflow works on Linux, Windows, and Mac OS X.

    New Features

    Taskflow Core

    • optimized task node storage by using std::unique_ptr for semaphores
    • merged the execution flow of cudaFlow and cudaFlow capturer

    cudaFlow

    • optimized tf::cudaRoundRobinCapturing through an event-pruning heuristic
    • optimized the default block size used in cudaFlow algorithms
    • added tf::cudaFlow::clear() to clean up a cudaFlow
    • added tf::cudaFlow::num_tasks() to query the task count in a cudaFlow
    • added tf::cudaTask::num_dependents() to query the dependent count in a cudaTask
    • added tf::cudaFlowCapturer::clear() to clean up a cudaFlow capturer
    • added tf::cudaFlowCapturer::num_tasks() to query the task count in a cudaFlow capturer
    • added tf::cudaFlowCapturer rebind methods:
      • tf::cudaFlowCapturer::rebind_single_task
      • tf::cudaFlowCapturer::rebind_for_each
      • tf::cudaFlowCapturer::rebind_for_each_index
      • tf::cudaFlowCapturer::rebind_transform
      • tf::cudaFlowCapturer::rebind_reduce
      • tf::cudaFlowCapturer::rebind_uninitialized_reduce
    • added tf::cudaFlow update methods:
      • tf::cudaFlow::update_for_each
      • tf::cudaFlow::update_for_each_index
      • tf::cudaFlow::update_transform
      • tf::cudaFlow::update_reduce
      • tf::cudaFlow::update_uninitialized_reduce
    • added cudaFlow examples:
      • parallel reduction (examples/cuda/cuda_reduce.cu)
      • parallel transform (examples/cuda/cuda_transform.cu)
      • rebind (examples/cuda/cuda_rebind.cu)

    Utilities

    • resolved the compiler warning in serializer caused by constexpr if
• resolved the compiler error of nvcc when parsing variadic namespaces

    Taskflow Profiler (TFProf)

    No update for TFProf in this release.

    Bug Fixes

    • fixed the macro expansion issue with MSVC on TF_CUDA_CHECK
    • fixed the serializer compile error (#288)
    • fixed the tf::cudaTask::type bug in mixing host and empty task types

    Breaking Changes

    There are no breaking changes in this release.

    Deprecated and Removed Items

    There are no deprecated or removed items in this release.

    Documentation

    Miscellaneous Items

    • removed Circle-CI from the continuous integration
    • updated grok to the user list
    • updated RavEngine to the user list
    • updated RPGMPacker to the user list
    • updated Leanify to the user list
    @@ -115,7 +115,7 @@

    Contents

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

diff --git a/docs/release-3-10-0.html b/docs/release-3-10-0.html new file mode 100644 index 000000000..3205b9b4c --- /dev/null +++ b/docs/release-3-10-0.html @@ -0,0 +1,176 @@

+ Release Notes » Release 3.10.0 (2025/05/01)


    Release Summary

    This release improves scheduling performance through optimized work-stealing threshold tuning and a constrained decentralized buffer. It also introduces index-range-based parallel-for and parallel-reduction algorithms and modifies subflow tasking behavior to significantly enhance the performance of recursive parallelism.

    Download

    Taskflow 3.10.0 can be downloaded from here.

    System Requirements

    To use Taskflow v3.10.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v8.4 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • Apple Clang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17
    • Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17

    Taskflow works on Linux, Windows, and Mac OS X.

    New Features

    Taskflow Core

    • optimized work-stealing loop with an adaptive breaking strategy
    • optimized shut-down signal detection using decentralized variables
    • optimized memory layout of node by combining successors and predecessors together
    • changed the default notifier to use the atomic notification algorithm under C++20
• added a debug mode for the Windows CI in GitHub Actions
    • added index range-based parallel-for algorithm (#551)
// initialize data1 and data2 to 10 using two different approaches
std::vector<int> data1(100), data2(100);

// Approach 1: initialize data1 using explicit index range
taskflow.for_each_index(0, 100, 1, [&](int i){ data1[i] = 10; });

// Approach 2: initialize data2 using tf::IndexRange
tf::IndexRange<int> range(0, 100, 1);
taskflow.for_each_by_index(range, [&](tf::IndexRange<int>& subrange){
  for(int i=subrange.begin(); i<subrange.end(); i+=subrange.step_size()) {
    data2[i] = 10;
  }
});
    • added index range-based parallel-reduction algorithm (#654)
std::vector<double> data(100000);
const size_t N = data.size();
double res = 1.0;
taskflow.reduce_by_index(
  // index range
  tf::IndexRange<size_t>(0, N, 1),
  // final result
  res,
  // local reducer
  [&](tf::IndexRange<size_t> subrange, std::optional<double> running_total) {
    double residual = running_total ? *running_total : 0.0;
    for(size_t i=subrange.begin(); i<subrange.end(); i+=subrange.step_size()) {
      data[i] = 1.0;
      residual += data[i];
    }
    printf("partial sum = %lf\n", residual);
    return residual;
  },
  // global reducer
  std::plus<double>()
);

    Utilities

    Bug Fixes

    • fixed the compilation error of CLI11 due to version incompatibility (#672)
    • fixed the compilation error of template deduction on packaged_task (#657)
    • fixed the MSVC compilation error due to macro clash with std::min and std::max (#670)
    • fixed the runtime error due to the use of latch in tf::Executor::Executor (#667)
    • fixed the compilation error due to incorrect const qualifier used in algorithms (#673)
    • fixed the TSAN error when using find-if algorithm tasks with closure wrapper (#675)
    • fixed the task trait bug in incorrect detection for subflow and runtime tasks (#679)
    • fixed the infinite steal caused by incorrect num_empty_steals (#681)

    Breaking Changes

    • corrected the terminology by replacing 'dependents' with 'predecessors'
    • disabled the support for tf::Subflow::detach due to multiple intricate and unresolved issues:
  • the execution logic of detached subflows is inherently difficult to reason about
  • detached subflows can incur excessive memory consumption, especially in recursive workloads
  • detached subflows lack a safe mechanism for life-cycle control and graph cleanup
      • detached subflows have limited practical benefits for most use cases
      • detached subflows can be re-implemented using taskflow composition
    • changed the default behavior of tf::Subflow to no longer retain its task graph after join
  • default retention can incur a significant memory consumption problem (#674)
      • users must explicitly call tf::Subflow::retain to retain a subflow after join
tf::Taskflow taskflow;
tf::Executor executor;

taskflow.emplace([&](tf::Subflow& sf){
  sf.retain(true);  // retain the subflow after join for visualization
  auto A = sf.emplace([](){ std::cout << "A\n"; });
  auto B = sf.emplace([](){ std::cout << "B\n"; });
  auto C = sf.emplace([](){ std::cout << "C\n"; });
  A.precede(B, C);  // A runs before B and C
});  // subflow implicitly joins here

executor.run(taskflow).wait();

// The subflow graph is now retained and can be visualized using taskflow.dump(...)
taskflow.dump(std::cout);
// programming tf::cudaGraph is consistent with Nvidia CUDA Graph but offers a simpler
// and more intuitive interface by abstracting away low-level CUDA Graph boilerplate
tf::cudaGraph cg;
cg.kernel(...);   // same as cudaFlow/cudaFlowCapturer

// unlike cudaFlow/cudaFlowCapturer, you need to explicitly instantiate an executable
// CUDA graph and submit it to a stream for execution
tf::cudaGraphExec exec(cg);
tf::cudaStream stream;
stream.run(exec).synchronize();

    Documentation

    Miscellaneous Items

    If you are interested in collaborating with us on applying Taskflow to your projects, please feel free to reach out to Dr. Tsung-Wei Huang!

diff --git a/docs/release-3-11-0.html b/docs/release-3-11-0.html new file mode 100644 index 000000000..b42e87bc4 --- /dev/null +++ b/docs/release-3-11-0.html @@ -0,0 +1,122 @@

+ Release Notes » Release 3.11.0 (Master)


Taskflow 3.11.0 is the newest developing line for new features and improvements we continue to support. It is also where this documentation is generated. Many things are considered experimental and may change or break from time to time. While it may be difficult to keep everything consistent when introducing new features, we continue to try our best to ensure backward compatibility.

    Download

    To download the newest version of Taskflow, please clone the master branch from Taskflow's GitHub.

    System Requirements

    To use Taskflow v3.11.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v8.4 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • Apple Clang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17
    • Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17

    Taskflow works on Linux, Windows, and Mac OS X.

    Release Summary

    New Features

    Taskflow Core

    • added examples/task_visitor.cpp to demonstrate how to traverse a taskflow (#699)
• added five benchmarks to showcase the capability of tf::Runtime (a sketch of the recursive pattern follows this list)
      • fibonacci
      • skynet
      • integrate
      • nqueens
      • primes
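A minimal sketch of the recursive pattern these benchmarks exercise, modeled on the fibonacci example; the exact tf::Runtime calls used here (silent_async, corun_all) are assumed from the 3.x asynchronous-tasking interface:

size_t fibonacci(size_t N, tf::Runtime& rt) {
  if(N < 2) return N;
  size_t res1, res2;
  // spawn the first recursive branch as an asynchronous task
  rt.silent_async([N, &res1](tf::Runtime& rt1){ res1 = fibonacci(N - 1, rt1); });
  // compute the second branch inline to keep this worker busy
  res2 = fibonacci(N - 2, rt);
  rt.corun_all();  // join all asynchronous tasks spawned from this runtime
  return res1 + res2;
}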

    Utilities

    Bug Fixes

    • fixed missing exception on thread creation failure in tf::Executor (#693)
    • fixed segmentation fault caused by empty async dependency (#700)

    Breaking Changes

    Documentation

    Miscellaneous Items

    If you are interested in collaborating with us on applying Taskflow to your projects, please feel free to reach out to Dr. Tsung-Wei Huang!

diff --git a/docs/release-3-2-0.html b/docs/release-3-2-0.html index 6593f49e1..9817cec67 100644 --- a/docs/release-3-2-0.html +++ b/docs/release-3-2-0.html @@ -73,7 +73,7 @@

    Contents

  • Miscellaneous Items
  • -

    +

    Taskflow 3.2.0 is the 3rd release in the 3.x line! This release includes several new changes such as CPU-GPU tasking, algorithm collection, enhanced web-based profiler, documentation, and unit tests.

    Download

    Taskflow 3.2.0 can be downloaded from here.

    System Requirements

    To use Taskflow v3.2.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v8.4 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • AppleClang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17
    • Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 and SYCL20

    Taskflow works on Linux, Windows, and Mac OS X.

    Working Items

    • enhancing support for SYCL with Intel DPC++
    • enhancing parallel CPU and GPU algorithms
    • designing pipeline interface and its scheduling algorithms

    New Features

    Taskflow Core

    cudaFlow

    • improved the execution flow of tf::cudaFlowCapturer when updates involve

    New algorithms in tf::cudaFlow and tf::cudaFlowCapturer:

    • added tf::cudaFlow::reduce
    • added tf::cudaFlow::transform_reduce
    • added tf::cudaFlow::uninitialized_reduce
    • added tf::cudaFlow::transform_uninitialized_reduce
    • added tf::cudaFlow::inclusive_scan
    • added tf::cudaFlow::exclusive_scan
    • added tf::cudaFlow::transform_inclusive_scan
    • added tf::cudaFlow::transform_exclusive_scan
    • added tf::cudaFlow::merge
    • added tf::cudaFlow::merge_by_key
    • added tf::cudaFlow::sort
    • added tf::cudaFlow::sort_by_key
    • added tf::cudaFlow::find_if
    • added tf::cudaFlow::min_element
    • added tf::cudaFlow::max_element
    • added tf::cudaFlowCapturer::reduce
    • added tf::cudaFlowCapturer::transform_reduce
    • added tf::cudaFlowCapturer::uninitialized_reduce
    • added tf::cudaFlowCapturer::transform_uninitialized_reduce
    • added tf::cudaFlowCapturer::inclusive_scan
    • added tf::cudaFlowCapturer::exclusive_scan
    • added tf::cudaFlowCapturer::transform_inclusive_scan
    • added tf::cudaFlowCapturer::transform_exclusive_scan
    • added tf::cudaFlowCapturer::merge
    • added tf::cudaFlowCapturer::merge_by_key
    • added tf::cudaFlowCapturer::sort
    • added tf::cudaFlowCapturer::sort_by_key
    • added tf::cudaFlowCapturer::find_if
    • added tf::cudaFlowCapturer::min_element
    • added tf::cudaFlowCapturer::max_element
    • added tf::cudaLinearCapturing

    syclFlow

    CUDA Standard Parallel Algorithms

    • added tf::cuda_for_each
    • added tf::cuda_for_each_index
    • added tf::cuda_transform
    • added tf::cuda_reduce
    • added tf::cuda_uninitialized_reduce
    • added tf::cuda_transform_reduce
    • added tf::cuda_transform_uninitialized_reduce
    • added tf::cuda_inclusive_scan
    • added tf::cuda_exclusive_scan
    • added tf::cuda_transform_inclusive_scan
    • added tf::cuda_transform_exclusive_scan
    • added tf::cuda_merge
    • added tf::cuda_merge_by_key
    • added tf::cuda_sort
    • added tf::cuda_sort_by_key
    • added tf::cuda_find_if
    • added tf::cuda_min_element
    • added tf::cuda_max_element

    Utilities

    • added CUDA meta programming
    • added SYCL meta programming

    Taskflow Profiler (TFProf)

    Bug Fixes

    • fixed compilation errors in constructing tf::cudaRoundRobinCapturing
    • fixed compilation errors of TLS worker pointer in tf::Executor
    • fixed compilation errors of nvcc v11.3 in auto template deduction
      • std::scoped_lock
      • tf::Serializer and tf::Deserializer
    • fixed memory leak when moving a tf::Taskflow

    Breaking Changes

    There are no breaking changes in this release.

    Deprecated and Removed Items

    • removed tf::cudaFlow::kernel_on method
    • removed explicit partitions in parallel iterations and reductions
    • removed tf::cudaFlowCapturerBase
    • removed tf::cublasFlowCapturer
    • renamed update and rebind methods in tf::cudaFlow and tf::cudaFlowCapturer to overloads

    Documentation

    Miscellaneous Items

    We have published tf::cudaFlow in the following conference:

    • Dian-Lun Lin and Tsung-Wei Huang, "Efficient GPU Computation using Task Graph Parallelism," European Conference on Parallel and Distributed Computing (EuroPar), 2021
    @@ -118,7 +118,7 @@
    -Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023. Generated by Doxygen 1.9.1 and m.css.
    +Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025. Generated by Doxygen 1.12.0 and m.css.
    diff --git a/docs/release-3-3-0.html b/docs/release-3-3-0.html



    Taskflow 3.3.0 is the 4th release in the 3.x line! This release includes several new changes, such as sanitized data race, pipeline parallelism, documentation, and unit tests.

    Download

    Taskflow 3.3.0 can be downloaded from here.

    System Requirements

    To use Taskflow v3.3.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v8.4 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • AppleClang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17
    • Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 and SYCL20

    Taskflow works on Linux, Windows, and Mac OS X.

    Release Summary

    1. This release has resolved data race issues reported by tsan and has incorporated essential sanitizers into the continuous integration workflows for detecting data races, illegal memory accesses, and memory leaks in the Taskflow codebase.
    2. This release has introduced a new pipeline interface (tf::Pipeline) that allows users to create a pipeline scheduling framework for implementing pipeline algorithms (a minimal sketch follows this list).
    3. This release has introduced a new thread-id mapping algorithm to resolve unexpected thread-local storage (TLS) errors when building Taskflow projects in a shared library environment.
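
    The following is a minimal sketch of the pipeline interface, based on the documented tf::Pipeline API (the token count and stage bodies are made up for illustration):

    #include <cstdio>
    #include <taskflow/taskflow.hpp>
    #include <taskflow/algorithm/pipeline.hpp>

    int main() {
      tf::Executor executor;
      tf::Taskflow taskflow;

      // a two-stage pipeline running over four parallel lines:
      // a serial producer followed by a parallel consumer
      tf::Pipeline pl(4,
        tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf){
          if(pf.token() == 16) {  // emit 16 tokens, then stop the pipeline
            pf.stop();
            return;
          }
          std::printf("produce token %zu\n", pf.token());
        }},
        tf::Pipe{tf::PipeType::PARALLEL, [](tf::Pipeflow& pf){
          std::printf("consume token %zu\n", pf.token());
        }}
      );

      taskflow.composed_of(pl);  // a pipeline runs as a composable module task
      executor.run(taskflow).wait();
      return 0;
    }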

    New Features

    Taskflow Core

    • Changed all lambda operators in parallel algorithms to copy by default
    • Cleaned up data race errors in tsan caused by incorrect memory order
    • Enhanced scheduling performance by caching tasks in the invoke loop
    • Added tf::Task::data to allow associating a task with user-level data
    • Added tf::Executor::named_async to allow associating a name with an asynchronous task
    • Added tf::Executor::named_silent_async to allow associating a name with a silent asynchronous task
    • Added tf::Subflow::named_async to allow associating a name with an asynchronous task
    • Added tf::Subflow::named_silent_async to allow associating a name with a silent asynchronous task
    • Added multi-conditional tasking to allow a task to jump to multiple successors (see the sketch after this list)
    • Added tf::Runtime tasking interface to enable in-task scheduling control
    • Added tf::Taskflow::transform to perform parallel-transform algorithms
    • Added tf::Graph interface to allow users to create custom module tasks
    • Added tf::FlowBuilder::erase to remove a task from the associated graph
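
    A multi-conditional task returns the indices of the successors to schedule next; a minimal sketch (task bodies are placeholders):

    #include <cstdio>
    #include <taskflow/taskflow.hpp>

    int main() {
      tf::Executor executor;
      tf::Taskflow taskflow;

      // returning {0, 2} schedules the first and the third successors (B and D)
      auto A = taskflow.emplace([](){ return tf::SmallVector<int>{0, 2}; });
      auto B = taskflow.emplace([](){ std::printf("B\n"); });
      auto C = taskflow.emplace([](){ std::printf("C\n"); });  // skipped
      auto D = taskflow.emplace([](){ std::printf("D\n"); });

      A.precede(B, C, D);
      executor.run(taskflow).wait();
      return 0;
    }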

    cudaFlow

    Starting from v3.3, using tf::cudaFlow requires including the header taskflow/cuda/cudaflow.hpp. See Breaking Changes.

    syclFlow

    This release does not have any update on syclFlow.

    Utilities

    • Added tf::SmallVector to the documentation
    • Added relax_cpu call to optimize the work-stealing loop

    Taskflow Profiler (TFProf)

    This release does not have any update on the profiler.

    Bug Fixes

    If you encounter any potential bugs, please submit an issue at issue tracker.

    Breaking Changes

    To improve compilation speed, you will need to separately include the following files when using specific features and algorithms (a usage sketch follows the list):

    • taskflow/algorithm/reduce.hpp for creating a parallel-reduction task
    • taskflow/algorithm/sort.hpp for creating a parallel-sort task
    • taskflow/algorithm/transform.hpp for creating a parallel-transform task
    • taskflow/algorithm/pipeline.hpp for creating a parallel-pipeline task
    • taskflow/cuda/cudaflow.hpp for creating tf::cudaFlow and tf::cudaFlowCapturer tasks
    • taskflow/cuda/algorithm/for_each.hpp for creating a single-threaded task on a CUDA GPU
    • taskflow/cuda/algorithm/for_each.hpp for creating a parallel-iteration task on a CUDA GPU
    • taskflow/cuda/algorithm/transform.hpp for creating a parallel-transform task on a CUDA GPU
    • taskflow/cuda/algorithm/reduce.hpp for creating a parallel-reduce task on a CUDA GPU
    • taskflow/cuda/algorithm/scan.hpp for creating a parallel-scan task on a CUDA GPU
    • taskflow/cuda/algorithm/merge.hpp for creating a parallel-merge task on a CUDA GPU
    • taskflow/cuda/algorithm/sort.hpp for creating a parallel-sort task on a CUDA GPU
    • taskflow/cuda/algorithm/find.hpp for creating a parallel-find task on a CUDA GPU
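
    A minimal sketch of the new include convention, using the parallel-reduction header (the data and the lambda are made up for illustration):

    #include <vector>
    #include <taskflow/taskflow.hpp>
    #include <taskflow/algorithm/reduce.hpp>  // now required for Taskflow::reduce

    int main() {
      tf::Executor executor;
      tf::Taskflow taskflow;

      std::vector<int> data(1000, 1);
      int sum = 0;
      taskflow.reduce(data.begin(), data.end(), sum,
                      [](int a, int b){ return a + b; });

      executor.run(taskflow).wait();  // sum becomes 1000
      return 0;
    }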

    Deprecated and Removed Items

    This release does not have any deprecated and removed items.

    Documentation

    Miscellaneous Items

    We have published Taskflow in the following venues:

    1. Tsung-Wei Huang, Dian-Lun Lin, Chun-Xun Lin, and Yibo Lin, "Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System," IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 6, pp. 1303-1320, June 2022
    2. Tsung-Wei Huang, "TFProf: Profiling Large Taskflow Programs with Modern D3 and C++," IEEE International Workshop on Programming and Performance Visualization Tools (ProTools), St. Louis, Missouri, 2021

    Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects.

    diff --git a/docs/release-3-4-0.html b/docs/release-3-4-0.html


    Taskflow 3.4.0 is the 5th release in the 3.x line! This release includes several new changes, such as pipeline parallelism, deadlock-free execution methods, documentation, examples, and unit tests.

    Download

    Taskflow 3.4.0 can be downloaded from here.

    System Requirements

    To use Taskflow v3.4.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v8.4 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • AppleClang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17
    • Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 and SYCL20

    Taskflow works on Linux, Windows, and Mac OS X.

    Release Summary

    This release enhances our task-parallel pipeline programming model and executor methods, supplied with several new examples and unit tests.

    New Features

    Taskflow Core

    • Improved the pipeline performance using vertical stack optimization
    • Added tf::ScalablePipeline to allow programming variable lengths of pipes
    • Added tf::Runtime::run_and_wait to allow spawning a subflow
    • Added tf::Executor::run_and_wait to allow running taskflows from a worker (see the sketch after this list)
    • Added an example of attaching data to a task (examples/attach_data.cpp)
    • Added an example of text processing pipeline (examples/parallel_text_pipeline.cpp)
    • Added an example of graph processing pipeline (examples/parallel_graph_pipeline.cpp)
    • Added an example of taskflow processing pipeline (examples/parallel_taskflow_pipeline.cpp)
    • Added an example of running a task graph from a worker (examples/run_and_wait.cpp)
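
    A minimal sketch of running a taskflow from inside a worker with tf::Executor::run_and_wait (the inner graph is a placeholder):

    #include <cstdio>
    #include <taskflow/taskflow.hpp>

    int main() {
      tf::Executor executor;
      tf::Taskflow taskflow;

      taskflow.emplace([&executor](){
        // run another taskflow from a worker thread; the worker co-runs the
        // inner graph instead of blocking on a future, which avoids deadlock
        tf::Taskflow inner;
        inner.emplace([](){ std::printf("inner task\n"); });
        executor.run_and_wait(inner);
      });

      executor.run(taskflow).wait();
      return 0;
    }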

    cudaFlow

    • Added tf::cudaStream as a move-only, RAII-styled wrapper over a native CUDA stream
    • Added tf::cudaEvent as a move-only, RAII-styled wrapper over a native CUDA event (a brief sketch of both wrappers follows this list)
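
    A brief sketch of the RAII behavior (requires nvcc and a CUDA device; my_kernel is a hypothetical kernel):

    {
      tf::cudaStream stream;  // acquires a native CUDA stream
      tf::cudaEvent  event;   // acquires a native CUDA event
      // submit work to the underlying native handle, e.g.:
      // my_kernel<<<grid, block, 0, stream>>>(...);
      stream.synchronize();   // block until all work on the stream finishes
    }  // both native handles are released automatically here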

    syclFlow

    There is no update on syclFlow in this release.

    Utilities

    • Removed serializer to improve compilation speed

    Bug Fixes

    • Fixed the compilation error due to non-portable include of immintrin.h (#371)
    • Fixed the compilation error due to using old version of doctest (#372)
    • Fixed the infinite loop bug due to unexpected shared states in the pipeline (#402)

    If you encounter any potential bugs, please submit an issue at issue tracker.

    Breaking Changes

    • Replaced tf::Runtime::run with tf::Runtime::run_and_wait to comply with tf::Executor::run_and_wait

    Deprecated and Removed Items

    There are no deprecated items in this release.

    Documentation

    Miscellaneous Items

    We have published Taskflow in the following venues:

    Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects.

    diff --git a/docs/release-3-5-0.html b/docs/release-3-5-0.html

    Taskflow 3.5.0 is the 6th release in the 3.x line! This release includes several new changes, such as pipeline parallelism, improved work-stealing performance, profiling, documentation, examples, and unit tests.

    Download

    Taskflow 3.5.0 can be downloaded from here.

    System Requirements

    To use Taskflow v3.5.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v8.4 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • AppleClang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17
    • Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 and SYCL20

    Taskflow works on Linux, Windows, and Mac OS X.

    Release Summary

    This release introduces a new data-parallel pipeline programming model, solves the busy-waiting problem in our work-stealing scheduler, and adds a new text-based report feature to the profiler.

    New Features

    Taskflow Core

    cudaFlow

    This release has no update on tf::cudaFlow.

    Utilities

    • Added tf::unroll to unroll loops using template techniques
    • Added tf::CachelineAligned to create a cacheline-aligned object (see the sketch after this list)
    • Replaced std::aligned_union (deprecated in C++23) with a custom byte type (#445)
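
    A small sketch of the utility (the member name data is assumed from typical usage of such wrappers, not verified against the header):

    #include <atomic>
    #include <taskflow/taskflow.hpp>

    int main() {
      // pad the object to a cacheline so neighboring objects do not false-share
      tf::CachelineAligned<std::atomic<int>> counter;
      counter.data.store(0);
      counter.data.fetch_add(1, std::memory_order_relaxed);
      return 0;
    }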

    Taskflow Profiler (TFProf)

    Bug Fixes

    • Fixed the compilation error in taking move-only types for tf::Taskflow::transform_reduce
    • Fixed the compilation error in the graph pipeline benchmark
    • Fixed the compilation error in unknown OS (replaced with TF_OS_UNKNOWN)

    If you encounter any potential bugs, please submit an issue at issue tracker.

    Breaking Changes

    This release has no breaking changes.

    Deprecated and Removed Items

    This release has no deprecated and removed items.

    Documentation

    Miscellaneous Items

    We have published Taskflow in the following venues:

    Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects.


    diff --git a/docs/release-3-6-0.html b/docs/release-3-6-0.html


    Taskflow 3.6.0 is the 7th release in the 3.x line! This release includes several new changes, such as dynamic task graph parallelism, improved parallel algorithms, modified GPU tasking interface, documentation, examples, and unit tests.

    Download

    Taskflow 3.6.0 can be downloaded from here.

    System Requirements

    To use Taskflow v3.6.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v8.4 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • AppleClang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17
    • Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 and SYCL20

    Taskflow works on Linux, Windows, and Mac OS X.

    Release Summary

    This release contains several changes that largely enhance the programmability of GPU tasking and standard parallel algorithms. More importantly, we have introduced a new dependent asynchronous tasking model that offers great flexibility for expressing dynamic task graph parallelism.

    New Features

    Taskflow Core

    cudaFlow

    • removed algorithms that require a buffer from tf::cudaFlow due to update limitations
    • removed support for a dedicated cudaFlow task in Taskflow
      • all usage of tf::cudaFlow and tf::cudaFlowCapturer is standalone now

    Utilities

    • Added all_same templates to check if a parameter pack has the same type

    Taskflow Profiler (TFProf)

    • Removed cudaFlow and syclFlow tasks

    Bug Fixes

    • Fixed the compilation error caused by clashing MAX_PRIORITY with winspool.h (#459)
    • Fixed the compilation error caused by tf::TaskView::for_each_successor and tf::TaskView::for_each_dependent
    • Fixed the infinite-loop bug when corunning a module task from tf::Runtime

    If you encounter any potential bugs, please submit an issue at issue tracker.

    Breaking Changes

    • Dropped support for cancelling asynchronous tasks

    // previous - no longer supported
    tf::Future<int> fu = executor.async([](){
      return 1;
    });
    fu.cancel();
    std::optional<int> res = fu.get();  // res may be std::nullopt or 1

    // now - use std::future instead
    std::future<int> fu = executor.async([](){
      return 1;
    });
    int res = fu.get();

    • Dropped in-place support for running tf::cudaFlow from a dedicated task

    // previous - no longer supported
    taskflow.emplace([](tf::cudaFlow& cf){
      cf.offload();
    });

    // now - users fully control tf::cudaFlow for maximum flexibility
    taskflow.emplace([](){
      tf::cudaFlow cf;

      // offload the cudaflow asynchronously through a stream
      tf::cudaStream stream;
      cf.run(stream);

      // wait for the cudaflow to complete
      stream.synchronize();
    });

    • Dropped in-place support for running tf::cudaFlowCapturer from a dedicated task

    // previous - no longer supported
    taskflow.emplace([](tf::cudaFlowCapturer& cf){
      cf.offload();
    });

    // now - users fully control tf::cudaFlowCapturer for maximum flexibility
    taskflow.emplace([](){
      tf::cudaFlowCapturer cf;

      // offload the capturer asynchronously through a stream
      tf::cudaStream stream;
      cf.run(stream);

      // wait for the capturer to complete
      stream.synchronize();
    });

    • Dropped in-place support for running tf::syclFlow from a dedicated task
      • SYCL can just be used out of the box together with Taskflow
    • Moved all buffer query methods of CUDA standard algorithms inside the execution policy
      • tf::cudaExecutionPolicy<NT, VT>::reduce_bufsz
      • tf::cudaExecutionPolicy<NT, VT>::scan_bufsz
      • tf::cudaExecutionPolicy<NT, VT>::merge_bufsz
      • tf::cudaExecutionPolicy<NT, VT>::min_element_bufsz
      • tf::cudaExecutionPolicy<NT, VT>::max_element_bufsz

    // previous - no longer supported
    tf::cuda_reduce_buffer_size<tf::cudaDefaultExecutionPolicy, int>(N);

    // now (and similarly for other parallel algorithms)
    tf::cudaDefaultExecutionPolicy policy(stream);
    policy.reduce_bufsz<int>(N);

    • Renamed tf::Executor::run_and_wait to tf::Executor::corun for expressiveness
    • Renamed tf::Executor::loop_until to tf::Executor::corun_until for expressiveness
    • Renamed tf::Runtime::run_and_wait to tf::Runtime::corun for expressiveness
    • Disabled argument support for all asynchronous tasking features
      • users are responsible for creating their own wrapper to make the callable

    // previous - async allows passing arguments to the callable
    executor.async([](int i){ std::cout << i << std::endl; }, 4);

    // now - users are responsible for wrapping the arguments into a callable
    executor.async([i=4](){ std::cout << i << std::endl; });

    • Replaced named_async with an overload that takes the name string as the first argument

    // previous - explicitly calling named_async to assign a name to an async task
    executor.named_async("name", [](){});

    // now - overload
    executor.async("name", [](){});

    Documentation

    Miscellaneous Items

    We have published Taskflow in the following venues:

    Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects.


    diff --git a/docs/release-3-7-0.html b/docs/release-3-7-0.html

    Release Notes » Release 3.7.0 (2024/05/07)

    Taskflow 3.7.0 is the 8th release in the 3.x line! This release includes several new changes, such as exception support, improved scheduling algorithms, documentation, examples, and unit tests.

    Download

    Taskflow 3.7.0 can be downloaded from here.

    System Requirements

    To use Taskflow v3.7.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v8.4 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • AppleClang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17
    • Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17

    Taskflow works on Linux, Windows, and Mac OS X.

    Release Summary

    This release introduces a new exception interface to help identify C++ errors in taskflow programs.

    New Features

    Taskflow Core

    • Improved scheduling performance of dependent asynchronous tasks
    • Improved scheduling performance of module task by removing busy looping
    • Improved tf::Executor::wait_for_all using C++20 atomic wait
    • Improved tf::Notifier using C++20 atomic wait
    • Improved worker-thread ID mapping performance using C++20 atomic wait
    • Added -Wshadow to the compilation check
    • Added tf::AsyncTask::is_done to query the completion status of an async task
    • Added tf::Taskflow::remove_dependency to remove dependencies from the graph
    • Added support for exceptions in tf::Taskflow and tf::Executor

    tf::Executor executor;
    tf::Taskflow taskflow;
    taskflow.emplace([](){ throw std::runtime_error("exception"); });
    try {
      executor.run(taskflow).get();
    }
    catch(const std::runtime_error& e) {
      std::cerr << e.what() << std::endl;
    }
    • Modified the CI to exclude exception test under sanitizers
    • Modified the tf::PartitionerBase to allow defining custom closure wrappers
    std::atomic<int> count = 0;
    tf::Taskflow taskflow;
    taskflow.for_each_index(0, 100, 1, 
      [](int i){                 
        printf("%d\n", i); 
      },
      tf::StaticPartitioner(0, [](auto&& closure){
        // do something before invoking the partitioned task
        // ...

        // invoke the partitioned task
        closure();

        // do something else after invoking the partitioned task
        // ...
      })
    );
    executor.run(taskflow).wait();

    Utilities

    Bug Fixes

    Breaking Changes

    • Renamed tf::Runtime::join to tf::Runtime::corun_all
    • Removed tf::WorkerInterface due to the support of exception

    Documentation

    Miscellaneous Items

    We have published Taskflow in the following venues:

    Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects.

    diff --git a/docs/release-3-8-0.html b/docs/release-3-8-0.html
    new file mode 100644

    Release Notes » Release 3.8.0 (2024/10/02)

    Release Summary

    This release (1) enhances the scheduling performance through C++20 atomic notification and a bounded queue strategy, and (2) revises the semaphore model for better runtime control.

    Download

    Taskflow 3.8.0 can be downloaded from here.

    System Requirements

    To use Taskflow v3.8.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v8.4 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • AppleClang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17
    • Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17

    Taskflow works on Linux, Windows, and Mac OS X.

    New Features

    Taskflow Core

    • Enhanced the core scheduling algorithm using a new bounded queue strategy
    • Enhanced the core scheduling performance using C++20 atomic notification
    # compile your taskflow program with C++20 enabled
    ~$ g++ -std=c++20 my_taskflow.cpp 
    • Revised the semaphore programming model for better runtime control through tf::Runtime
    tf::Executor executor(8);   // create an executor of 8 workers
    tf::Taskflow taskflow;
    tf::Semaphore semaphore(1); // create a semaphore with initial count 1
    for(size_t i=0; i<1000; i++) {
      taskflow.emplace([&](tf::Runtime& rt){ 
        rt.acquire(semaphore);
        std::cout << "critical section here (one worker here only)\n"; 
        critical_section();
        rt.release(semaphore);
      });
    }
    executor.run(taskflow).wait();
    • Enhanced async-tasking performance through TLS
    • Added async-task benchmark
    • Added non-blocking notifier and atomic notifier modules
    • Added tf::BoundedTaskQueue and tf::UnboundedTaskQueue
    • Added tf::Freelist module to replace the centralized overflow queue
    • Removed the redundant exception handling in object pool

    Utilities

    Bug Fixes

    • Fixed the compilation error for not finding the C++ atomic library
    • Fixed the missing tf::Runtime in asynchronous tasking
    • Fixed the non-heterogeneity of tf::Taskflow::for_each_index
    • Fixed the bug of UUID unit test in a multithreaded environment

    Breaking Changes

    • Removed the support of object pool by default
    • Removed the support of prioritized tasking due to inconsistency with work stealing

    Documentation

    Miscellaneous Items

    Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects.

    diff --git a/docs/release-3-9-0.html b/docs/release-3-9-0.html
    new file mode 100644

    Release Notes » Release 3.9.0 (2025/01/02)

    Release Summary

    This release improves scheduling performance with a decentralized work-stealing strategy and enhances exception handling across all task types.

    Download

    Taskflow 3.9.0 can be downloaded from here.

    System Requirements

    To use Taskflow v3.9.0, you need a compiler that supports C++17:

    • GNU C++ Compiler at least v8.4 with -std=c++17
    • Clang C++ Compiler at least v6.0 with -std=c++17
    • Microsoft Visual Studio at least v19.27 with /std:c++17
    • AppleClang Xcode Version at least v12.0 with -std=c++17
    • Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
    • Intel C++ Compiler at least v19.0.1 with -std=c++17
    • Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17

    Taskflow works on Linux, Windows, and Mac OS X.

    New Features

    Taskflow Core

    • improved the core scheduling algorithm using a decentralized work-stealing strategy
    • enhanced tf::Runtime to support preemptible execution flows
    • optimized task storage by storing detached tasks in their original subflows
    • optimized the query efficiency for strong dependencies by embedding their values in node states
    • updated tf::Graph to derive from a vector of unique pointers to nodes
    • expanded unit tests to include more exception handling scenarios
    • decoupled tf::Runtime from static task to accommodate distinct execution logic
    • removed the blocking behavior to avoid underutilized threads for the following tasks:
      • module task (#649)
      • subflow task
      • all parallel algorithms (through preemptible async tasks)
    • removed std::bind from asynchronous tasks to ensure proper constexpr switch
    • added compile-time macros to enable specific features
      • TF_ENABLE_TASK_POOL to enable the use of task pool
    • added taskflow execution through asynchronous tasking with tf::make_module_task (see the sketch after this list)
    • added tf::WorkerInterface for users to configure the behaviors of workers
    • added worker interface example and unit tests
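
    A minimal sketch of launching a taskflow through asynchronous tasking, based on the documented tf::make_module_task interface (the task body is a placeholder):

    #include <cstdio>
    #include <taskflow/taskflow.hpp>

    int main() {
      tf::Executor executor;
      tf::Taskflow taskflow;
      taskflow.emplace([](){ std::printf("inside the module task\n"); });

      // run the whole taskflow as one asynchronous task
      executor.async(tf::make_module_task(taskflow));
      executor.wait_for_all();
      return 0;
    }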

    Utilities

    • added tf::pause to relax CPU during busy spinning loop
    • added tf::seed to generate a random seed based on calling time point
    • added tf::atomic_min to update an atomic variable with the minimum value
    • added tf::atomic_max to update an atomic variable with the maximum value (a small sketch of both helpers follows this list)
    • added TF_CPP20 and TF_CPP17 macros for testing C++ versions
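
    For illustration, the two atomic helpers can be used as below (signatures are inferred from the notes above, not verified against the headers):

    #include <atomic>
    #include <taskflow/taskflow.hpp>

    int main() {
      std::atomic<int> lo{100}, hi{0};
      tf::atomic_min(lo, 42);  // lo becomes 42 (42 < 100)
      tf::atomic_max(hi, 42);  // hi becomes 42 (42 > 0)
      return 0;
    }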

    Bug Fixes

    • fixed AppleClang compile error in tsq.hpp (#651)
    • fixed wrong range in uuid test (#632)
    • fixed the exception bug in tf::Subflow::join (#602)
    • fixed the wrong prefix of target when running benchmark.py
    • fixed a bug in the join counter reset logic for scheduling condition tasks (#652)

    Breaking Changes

    Documentation

    Miscellaneous Items

    Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects.

    diff --git a/docs/release-roadmap.html b/docs/release-roadmap.html


    This page describes the upcoming milestones of the Taskflow project.


    Milestone Summary

    The table below summarizes the milestones we plan to achieve in the Taskflow project. Each milestone releases technical items that significantly enhance the capability of Taskflow.

    Milestone | Release
    Migrate the codebase to C++20 | v4.x
    Design a custom thread-creation interface | TBD
    Design a distributed tasking interface with scheduling | TBD
    Design a pipeline scheduling framework with token dependency | Release 3.7.0 (2024/05/07)
    Design a dynamic task graph model | Release 3.6.0 (2023/05/07)
    Design a pipeline scheduling framework | Release 3.3.0 (2022/01/03)
    Integrate thread sanitizer into the CI | Release 3.3.0 (2022/01/03)
    Integrate OpenCL and SYCL to tf::syclFlow | Release 3.1.0 (2021/04/14)
    Integrate cuBLAS into tf::cudaFlow | Release 3.0.0 (2021/01/01)
    Support building cudaFlow through stream capture | Release 3.0.0 (2021/01/01)
    Support profiling large data in tfprof | Release 3.0.0 (2021/01/01)
    Support cancelling Taskflow | Release 3.0.0 (2021/01/01)
    Support limiting maximum concurrency | Release 3.0.0 (2021/01/01)
    Migrate the codebase to C++17 | Release 3.0.0 (2021/01/01)

    Along with the project development, we expect to have multiple releases for feature requests, bug fixes, and technical improvements.


    diff --git a/docs/rules.html b/docs/rules.html


    diff --git a/docs/search-v2.js b/docs/search-v2.js
    diff --git a/docs/searchdata-v2.js b/docs/searchdata-v2.js
%Vjz00DBsApigZX>W)j0LUQ#0RZL#00D5=ApigYaNHpP00C&?ApigYF6bcu00Cw0ApigYXY?Te00DCPApigYWdI@o00nb$bY^YrAOI900AL{i&LIE>A^-pZFAO4p00C_xA^-pZXec5800DF^A^-pZX*41L00DA3A^-pZZ$Kgd00C!4A^-pZY)m2m00LuS!XN-rA^-sZa{&MWDPkf30ReIWC}tvn00CueA^-pZV{{?_00C)wA^-pZc7P%P00Cu&A^-pZWQ-yJ00Cc;A^-pZbeJLl0RXK500AkaA^-satN|#fB7gt^Ypx;y00DEhA^-pZVZ0&$00DHuA^-pZU&tZ=00CvrA^-pZY}6tE00C#(A^-pZZr~yS0RYPZ00AlNA^-sa$^j_wB7gt^W%eQf00DabA^-pZX#^tx00DCfBLDyaUl1bz00D9qBLDyaWgH^_00CwqBLDyaW+)>70RYVb00Aj9BLD#b%mFAkBY*$_c|Icm00Cu2BLDyaYfK{m00Cc8BLDybb$Ko$09Ydc00ClNBLD#a&jA1dDQ+VG0RqkeUnpoJ0CFRM00DG(BLDyaa)cuQ00eYpX>Y0?0G=WM<{|)!BLD#aZvg-SDWW3)0Re6SD5fKT00DEXBLDyaWwavz00CpVBLDyaX}}`@00DN!BLDyaWy~W000CsuBLDyaU)Uo600C{@BLDyab>t%e0RV9U00AlRBLD&bcL8+)00AlbBLD&cb^&w&C;%jY00DUhBme*bbPyx}00D9qBme*bWgH{`00D3!Bme*bZzv=H00D9?Bme;bdjS9eDLNzo0RegeC_W^B00C=6Bme*bV@xCf00D1OBme*bY*-`!00ClNBme*bUt}Zz00DApBme*bWpE?_00D4zBme*bUwkA000DD?Bme*bWQZgH00Ci+Bme*bWt1cU00DBEBme*bXrLqj0RVjg00AkiBme;cd;uu1B!B<`Yqlf+00CpZBme*bZ^R@300C^uBme;bg8={mDb^$a0Re&mDB2`|00Cs+Bme*bY3L*X00DCDBme*cVrS4K0Q4jP00Ce6Bme*bas(v+00CtRB>(^ca1bQ`00CbXB>(^cZ5$;400C(tB>(^cWGE#700CbvB>({ce*pjiDLN$p0ResiC_W{C00C=6B>(^cV@xFg00D1OB>(^cY*-}#0RV*o00AjvB>({dgaIgMC4c|{WNsw@00C)sB>(^ca(pEK00LrXUL^p6B>(^cUx+0D00DB6B>(^cWtb%Z00D5GB>(^cU!)}f00C{PB>(^cX|N>#00CsSB>(^cU%Vv%0RVvk00Ak;B>({dfB`7XC4c|{YtkhE00Cp#B>(^cZ{Q^W00C^~B>({chXDWqDe@%%0Re^qDE1|Q00CtDB>(^cX#^$!00D9eCIA2eVrT3n01zes00CbXCIA2dav&xE00CttCIA2da4aSO00CbzCIA2dZ8#|$(00D9&CjbBeZ!9MO00C+;CjbEej{yJyDLy9v0RfHyC_*QI00DAHCjbBeZ%`)y00C}RCjbBfVQDxg09+>k00C`cCjbBeUu-7;00DV&CjbBeba*EK00C)!CjbBeY=kEO00C)=CjbBeVvr{Q00Ci^CjbBeaGWOq00Cj5CjbBeW2h$p00CdFCjbBeWwa*%00DHmCjbBeb-*V800D5uCjbBeZOkVC00D2(CjbEekpTb!Dc&al0RfN!DB>r800C#{CjbBeZty1n00C+CCjbBebNnX&00DCXC;$KfWe6w$00D9iC;$KfWE3a>00CqgC;$KfVIU{~00CbnC;$KfWGpBE00Ct(C;$KfX*ehV00Cw`C;$KfX+$Uh00C}FC;$KjaBX*Ebu=UZ(k1{DCji(d08l6Z0RnIVC|~p=fL15~00CueC;$KfV|XY400C)!C;$Kfc7!MZ0RWQ$00AkGC;$Ngk^v}`D1ZO~Zki|n00D2JC;$KfX{aaw00DEZC;$Kfa00DHyDF6TgZ_FtG00nMja%ObuBLM0s08}XekSPGtDF6TgU)U)C00DIFDF6TgW%MZk0RVXc00AigDgXficmXH`Du4h1c?v2300DFoDgXchau_NA00CtlDgXcha3m@K00D0*DgXchaxf|Y00D3|DgXchUpy)R00Ct}DgXchV@N6h00C)EDgXchc2p_=00CuMDgXchWMC=)00CcSDgXchaBM0700C`sDgXcib!B)e0Eiy|0sx%>n*jg;DTpcn0s))>ngJ+|Du4h1Wt1ua00Cp3DgXchX{0Iu00DNYDgXchWw0s$0RW!?00AkyDgXfio&hMpDu4h1WX38000CvrDgXcha?~mS00CjzDgXchXy7UU00LxlwkiPTDgXchU+gLX00Cq6DgXchZ~Q6%00C?QD*yoi)d2tjDGnnoD*)Il00JNY0RYDV00AlND*yoj#sMhsD}Vq2dG;#+00CwGD*yliYXmF+00CbLEC2ujc@QiB00CkaEC2xj$pHWXDIzQY0RqSYUnm?b046Me00CtzEC2ujXE-bX00M1t-YWn;EC2xj(*XbhDNZZ^0RhqhC{iqd00DGZEC2ujZeT0`00CuYEC2ujZEP$600CukEC2ujY00CpdEdT%kbI2_K00CdlEdT%kWz;PI00DN|EdT%kY2YmY00DF6EdT%kb?hww00C_BEdT%kWB4rq00D0PEdT)k#Q^{TDGDwC0RhATC=M=w00C|kE&u=lVH_?100CnnE&u=lb0{tV00CbvE&u=lWi&1T00DM7E&u=lX+SOj00DDGE&u=lbxbY*00C@LE&u=lV^}T#00DSjE&u=na%p9xEdT^A0Awxz0s^uDv;inzbS;2tE&u=lZFnvK00DA_E&u=lZ;UPg00Cx_E&u=lbC@mw00C~EE&u=lVWchq00MGhsxAOd9{>RWw*deFDY`BI0RgrFD84R$00CvhE&u=lW6Ul900DK}3>00CtrF8}}mV=ONK00DI}F8}}mWH>JX00Ct_F8}}mazrlx00Cc0F8}}mZBQ=&00DAVF8}}mZ(J_`00CxVF8}}mb7(IB00C}pF8}}mVRSD500DA(F8}}na�g0Dvz500CcyF8}}mWRNca00Cu|F8}}mdYmr+00C*DF8}}mY^W~)00CjHF8}}mX|yi@00DHmF8}}mX}~W400C~sF8}}mY0NJG00C~&F8~1mzX1RNDc&yt0Rg@NDB>@G00D03F8}}mVel^i00Cq6F8}}mbNnv=00CbDFaQ7nWe6|;00DLmFaQ7nX%sL300DCvFaQ7nbs#VR00C?!FaQ7nV=OQL00C|?FaQAn!vO#RDLya&0Rh4RC_*rR00C}DFaQ7nVNfst00CoGFaQ7nb6hY000CcOFaQ7nWoR$}00DMxFaQ7nX>>3E00DD)FaQ7nb$~Dc00C@00DHmF#rGoZon}B00C*nF#rGoZ_F_O0RZ9w00Cv#F#rGoW85(S00C~|F#rGoW#};g00DCDF#rGoW%Mxs00CzDF#rGoWdJe&00MAw(lG!AG5`Sp_yGU`DHbvS0Ri^`C>k<=00DF$G5`PpV<<8J00DI_G5`PqUvdmG05mcH00DG5G5`Ppb3`%#00C}FG5`Yr<^kvd>Hz=&DONH70|Dg$=K<*
fC|)vv00DAjG5`PpZ*VdI00DG%G5`PpVSF+G00CoyG5`PpZ-_Dg00C@{G5`PpY?Lwp00Ci|G5`PpWuP(u00CpBG5`PpX{<5;00DNgG5`PpWwI0MIf30RZCx00Cv-G5`PpaO5%o00DXGG5`Ppbnr3&00Ce2G5`PpW&AP#00DXeGXMYqX$Ug_0RZFy00CtbGXMYqa2PWH00DU(GXMYqbR;tX00CbrGXMYqWiT@U00Cn*GXMYqZagyp00Ct}GXMYqa!4}(00CuAGXMYqW>hl(00CxNGXMbq^Z@_?DP}VO0Ri&?C~7l+00CuiGXMYqba*oW00CisGXMYqV}vsR00D1`GXMYqY>+bm00C^4GXMbq`T+m|DWWp~0Ri~|D5f)j00DWdGXMYqZ?rQ200DBkGXMYqbigwJ00LoioHGE%GXMYrX=PwD0L(K00RZd)00AlBGXMbr>j5a_Gk^dAa_Tbx00D3EGXMYqboes>00CkCGXMYqV+1q+00D0bGynhrY!Ea600C?kGynhrVH`9700CtpGynhrV<GynhrWI!|k00Ch}GynhrWK1*w00DJUGynhrV^}l*0s!s-@Bsh;DP%MN0s-v-?*S-kG=KmBb8s{O00DD)GynhrWq>pQ00DA_GynhrWQ;Tb0RZv=00AkOGynks@c}5DG=KmBWui0y00DHWGynhrVX!m+00CpRGynhrZ@e@B00C^mGynhsWNeZ&0LU}|0RZ*^00Al3Gynks^#Lf{G=KmBW#Tjd00DaHGynhrY49`v00DFMGynhrU;H!x00DRcH2?qsVF)z<0Ra2~00Ai!H2?tt`vE8zHGlvCJsvdx0Ra9100Aj1H2?tt{Q)R2HGlvCJvKD}00KQBCN%&)H2?qsazr%%00D1KH2?qsbW}9}00CiIH2?qsa$q$800CuYH2?qsaBMXI019$;aAR|1eqn8INHG9{F#xJD0M;@94l@AOGXP#R0M0Z34mALBH2?qsI(Rhz1Oop7a+@qBVeCGJpU9a;P-`00D2hH2?qsbig$L00CjfH2?qsW6U)G00D2(H2?qsY}hpb00C^?H2?qsVdOOc00Cv{H2?qsWAHTq00C+CH2?tsUI73Bc>*>700CnLHUIztX$&?100C?gHUIztZx}WJ00D3wHUIztZX`AU00D0*HUIztX)rbb00DG1HUIztbv!lz00Cn{HUIztWk@yv00DSTHUIztWmGl*00DGbHUIztY+yD300DJoHUIztVQe-400MSq{51e_HUIztWq39K0RVOc00C==HUIztbBs0s00Ci=HUIztbeJ{(00AJLHUIztWu!I$00C^OHUIztXRtN^00C~cHUIztX}mT70RVmk00DEzHUIztZp=0S00D2(HUIztY1lRZ00DH~HUIztVdORd00DC9HUIztW$-ot00DINHUI$tfCK;ma{@O200C_VHvj+uatt>B00D0jHvj+uW*9dB00DCzHvj+uZX`DV00ChtHvj+uaxgal00L=r{5AkKHvj+uAUrn!00C@9Hvj+uWlT2!00C@LHvj+uY*;q{00CiMHvj+ua%49E00CiYHvj+vb8x~o0B|<|00AI&Hvj+uyMQ+U00F{=HvjEbCNdz00C{9Hvj+uXrMO$00DHSHvj+uX{-00Al1Hvj?wO8`#*DBL%I00DC1Hvj+uW$ZTq00DCHHvj+ub@(>`00DIVHvj+ua0EC200ChNH~;_vV-Pq100DOvH~;_vZyY!P00C?wH~;_vW+*rS00ChxH~;_vWHdMc0RUJ400AjLH~;|wR{$tPIDh~Fa!NP=00CiAH~;_vWmq@>0sv|NYybcODP%YR0s(0NYXB%}IDh~Fd2l!Y00D4%H~;|vi~s-uDTX)z0Rf8uD2h0M00Cu?H~;_va+o*(00DKLH~;_vbfh={00D5SH~;_yb!=~8IyeAcH~@e+0I)a!0RT<_00Ak;H~;|wO#mp&IDh~Fa?&^e00Cv%H~;_va^N@s00DL8H~;_vbnG|)00D6FH~;_vVfZ)z00CqEH~;_vcLX^A00D0bIRF3wY!EpB00CweIRF3wVH`OC00CqoIRF3wbtpLi00CnzIRF6wQUCw}DLOd-0Rd3}C_XuW00BKjIRF3wa!fe@00D1OIRF3wbXYk600CiMIRF3wa%4FG00CucIRF9xUI1VK00Aj>IRF9yT>xJID114900MAsa5(^iIRF6wQ~&@0DV8|^0RdA0D4IEd00DWRIRF3wbf`H100D5WIRF3wZL~Q60RUD200Ak$IRF6xRRAc&Ie-8GbILgY00C>#IRF3wbJ#fm00Cj%IRF3wbmTbz00Cd>IRF3wZSXk&0svwFWB>pGDf~GA0s&zFV*n@uI)DHHa0og800D0jIsgFxkN^MyDIPii0RfKyC?Yz500C$yIsgCxa400DLeI{*LyX$(6600DFoI{*LyVHi6A00C|uI{*O!SpX;X00LukWIF(cI{*LyZHzkr0RV~s00AkSI{*Ozi2x{|JAeQIa;7@~00CvLI{*LydbB$L00C*bI{*LyZNNJK00C*nI{*Lybj&*d00D5)I{*LyZ`eBk00Cd#I{*LyW#l^m00C?2I{*LzWnq>(0Ps5i0swLVbN~PWDF8eG0s(OVa{wp?Jb(ZJYz#aA00CtdJOBUzZX7%S00D9$JOBUzWhguV0swXZcmMzaDKtC)0s(aZcK|3lJb(ZJYd|~z00DDKJOBUzVNg5(00DGXJOBUzUtBx@00CuUJOBUzY-l_H00C!iJOBUzZge~V00C)wJOBa!dH{R?00Ak6JOBa!egJ>~00AkIJOBa#eE@#|D40Be00DTOJOBUzWvDy=00CsKJOBUzZnQiA00C*bJOBa$c>sF=C|`^`fWSNe00C&mJOBUzW7IqV00CjzJOBUzW#Bvj00Cd-JOBUza_l?+00D3EJOBg$f&hj9gaC*D00AlfJOBg%fdGX7g8+vBCEa-ux|00CvPJpcd!a=1MJ00DKrJpcd!bi_RX00D5yJpcd!VbDDQ0RZCy00Al7Jpcg#;sPk(J%9iKa^^h%00Cw0Jpcd!dh|U200C+GJpcd!Z2&$100C(RJ^%m#bPPTK00D3kJ^%m#Zx}uR00C?sJ^%m#VI)2P00C(#J^%m#bTB>u00C|`J^%m#WjsCr00DJEJ^%m#aY#M@00CuAJ^%m#b5uS600DPeJ^%m#Z(u$E00C@fJ^%m#W^6tH00CigJ^%m#WOzOR00DJ=J^%m#V}w2c00BCRJ^%s&jMeKY#!MavDDX00CtpKL7v$dMG~t00C((KL7v$Z8Sds00C(_KL7v$bU;4<00D4DKL7v$Z%jV`00DARKL7v$VOT!^00CuQKL7v$Ze%|I00C)gKL7v$Y;ZpS00DP)KL7v$Z+t%h00C@*KL7v$W{5uk00Ci+KL7v$WRyPu00DKHKL7v$W1v3(00BCtKL7#(>jLQlIw-6^fD}J~00DBcKL7v$WxziG00DZ&KL7v$Y0N(W00C{%KL7v$Y1lsi00DH~KL7v$aO6J#00D36KL7v$a_~O@00Ck4KL7v$W&A$?00C|SKmY&%X$U|70ssjE3j+WFDHK2e0s#mE3Iiw_K!5-Nav(qe00CtxKm
Y&%dN4o$00C(>KmY&%Z9G5#00C)2KmY&%bVxt|00D4LKmY&%Z&W}400C}VKmY&%X<$GA00ClVKmY&%Z)`vS00DAxKmY&%WOzUT00C}(KmY&%b%a0w00D1`KmY&%a*#j(00DQFKmY&%Z=65?00C^GKmY&%W~e{_00CjHKmY&%WVAp400DKnKmY;&4FeAY00Ak+KmY;(300C_VK>z>&bqqlO00Lufz(4>JK>z>&Iv7C!0|F2O2m=ZOIw&GRfEqx600D9+K>z>&WjH|q00DYFK>z>&X+%K)00C`EK>z>&X;47`00DGXK>z>&a9lwE00D1eK>z>&ZfHRO00C)kK>z>&Vst?O00D1$K>z>&a)3bq00Cr%K>z>&Zj3z>&Z0nDWpLF0s+zk)&nT2L4W`Ocd$VK00D2hK>z>&Y`{SP00CykK>z>&Va!1Q00CsuK>z^&-va;vDc(T<0Ri3vDB?kY00DC7K>z>&Veml!00Cw8K>z{({{sO800AiiLI45*{sRC6CVLVy4PWiUbj00DA3LI3~(bwEM@00DGHLI3~(a7;n~00ebxZ(-O$0Q^A!9zp<8LI42()dK(lDQZFh0RhwlC~iW400DP&LI3~(Z+t=k00C@*LI3~(W{5%n00Ci+LI3~(WRyYx00DKHLI3~(W1vC+0RY(p00AkiLI42)*aIlALVy4PJ+?vs00DBoLI3~(Z^S|X00DH$LI3~(VbDSV00DB=LI3~(W!ypl00D5~LI42(+XDarDegi50Rh?rDDpyp00DXULI3~(bO1vD00D3YLjV8)Z45&I0RY_t00Ai&LjVB*+yf{aLx2DQb0R|k00C<%LjV8)b1*{y00Ch(LjV8)bUZ@<00Cb@LjV8)ZAe1^0RZI#00AjfLjVB*q@(DV#(A0s-a&=mRLCM1TMRbf`oC00DEdL;wH*Z@5GN0s!j+?E?S-Da1qo0s-m+>;ovuM1TMRY|umi00Cv%L;wH*Zs0@!00DC5L;wH*W$Z)%0s!v=@dE$>DfmPH0s-y=@B=9RM1TMRbOc2J00CtVMF0W-^8@t*00Ai)MF0W;@&oh(C?G|E00DU>MF0Q+a4C|E^+00D1aMF0Q+a%e>W00CugMF0W-`vd(000Aj_MF0W;`UCs}D1b$P00C@-6&0ssvJ4+H=KDV#+B0ss*N69fPODX2vN0s#;N5(Fr&MSuVSc(g?T00CvbMF0Q+WW+@P00C~wMF0Q+Y0yOg0s;&K4g@G)qD6q#MF0Q+Xxv2r00Cp_MF0Q+Vemx&00Cw8MF0Q+U;IS?00D9WMgRZ;X>XQA00>3^0RRI800Ai&MgRc;0t6@=Mt}eTZX!ki00DI>MgRf;2?PrS00AjBMgRf<2m}fQC_F}h00C=4MgRZ-b4*4600CiAMgRZ-bXZ0J00CcKMgRZ-Wn@MG00C@jMgRZ-XK+RU00C}xMgRZ-X?#Wi1OOET83Y#u8w3CWDTqb@1OXHT7z7ps8U!ekMt}eTZJ0&?00DBQMgRZ-Z>&ZD00CyQMgRZ-bGSwT00C~kMgRZ-VZ=rN00DB!MgRZ~V}5UCZe(a{ZF**Mb97ij0H#6!=t2MzLjVj!0G31mzC-}>L;y@h0CYtF4n_blMgW3F0MJGN00BDIMgRi{AOz9_)&n48E^};hIw;UL0Mer5J!NjL4W`OcNRwg00D14M*si;Y(z%@0RX=M00DAPM*si;WmHE100C%PM*si;bYMpS00D1iM*si;I&4P(0Rg}OI&w#V00CrpM*si;Zh%Js00C)+M*sl;o&*2^Ws*k#00Co`M*si;Zk$H|00Cv9M*si;a;QfD00CvLM*siqy(k}s0083Dbh#)0|BE1r39x0DB4JX00DB~NB{r00DH4NdN!=UzAAz00Cv1NdN!=W1vX@00C~MNdN!=Wvod600DBcNdN!=Ww=QI00CycNdN!=X2eMV0RXZD00Ak`NdN%>u>>g8Nq_(WW!gyq00DI3NdN!=VdzN!00Cp}NdN!=Z}dq300C_JNdN!=Yye6C0RXrJ00AisN&o=?w*)8awJLs00DF=N&o-?VRHsb05D1b00L=c%1Hn=N&o=>sssQ5DN0HJ0RgE5C{9X%00DATN&o->Z(K?M00DGjN&o->VQ5MK00CoeN&o->Z*)ok00C@zN&o->Y=BAt00Ci!N&o->ZH!6)00DE7N&o->b(l&300Cj1N&o->WTZ*}00DKXN&o->W3Wm90syQ8t^@!9DZEMm0s*T8tpq5-N`L?XbI3{n00DE-N&o->W!Opp00DB|N&o->WaLT!0RXTB00AlRN&o=?uLLOcN`L?XW%^1000DFYO8@`?VF*hA00CnTO8@`?Zxl-a00C?oO8@`@WNhk603b^M0RXfF00Aj5O8@}@vjiwKOMn0YWjad$00DYJO8@`?X-G=|00DDOO8@`?UsOu~00DSfO8@`?VPH!D0RXxL00Aj%O8@}@xdbS1OMn0YJ$6d~0RX%N00Ak4O8@}@y96kROMn0YJ&sEN00KQBeoFwBO8@`?a-2&500D2NO8@`?bgWAN00CjLO8@`?a=1$X00CvbO8@`?aKuXh019$;aAR|1eqn8I#76-BM*t>B06<9qcu4?2N&vP>047TSW=jCdO8@`?I?zi11OmMTqXeY{rvy4E`b&V?NPqwVasW&K00D0jOaK4@bQnwk00ChhOaK4@V3dY00DM#OaK4@Wq3>g00CrvOaK4@WQ0rr00Cu+OaK4@aF9#@00D27OaK4@W1LI?00DENOaK4@VW><100CsKOaK4@b+k+X0RSZd00Ak$OaK7^BmpSIOn?9ZWy(wd00CptOaK4@Y1m8v00DO1OaK4@W#mi%00Cs`OaK4@WbjM?00Cw8OaK4@aQsXF00D0TO#lD^V+c(E00DCjO#lD^VH8aO00CqgO#lD^bs$Xu00CnrO#lD_V}80!04z-a00BBOO#lJ{CIKP=Iw(9%fLKg`00Ct}O#lD^V^B>100C)MO#lD^c3e#W00CuUO#lD^WN1wQ00CrfO#lD^Wpqsd00D4%O#lD^Z-7kz0szDV!UF&SDU3}30s+GV!2>9gO@IIaZJ12}00Cj5O#lD^WvER600DBYO#lD^bhJ$X00DElO#lD^VZcoQ00CsiO#lD^b<9lw00DX88vp?S$O8ZYDc(&00RhJYDB?|k00Cv_O#lD^a_~(600DLOO#lD^bo@;K00D3UP5=M_VF*qD00CnTP5=M_UldLN00C|qP5=M_X&_Dj00CzvP5=M_Wh_nr0RYMa00AjDP5=P`$pa`nPJjRbWkOB>00DAJP5=M_bx=+K00DGXP5=M_a9mCR00CiQP5=M_V`xqQ00CcaP5=M_WOPmd00C}#P5=M_W1tHF00Cu&P5=P_&I14eDUwbA0RhbeD3(ru00DHIP5=M_X{1g700CjDP5=M~V{>9!d&`yAoO@IIaZPrcz00Cj00DCHP5=M`V|0d10QgP-0RU4000DUjPXGV`bPP`b00D3kPXGV`Z5U4g0RU7100DF)PXGV`b0|*$00D0hPyhe{bI4Et00CjnPyhe{bktA)00CjzPyhe{WZ+N$00DL8Pyhh{qyhi|De_PN0Rf`|DE3f*00CwEPyhe{WCT
$F00CtRQ2+n|V-QgQ00CtdQ2+n|avV_r0RW}~00Ai|Q2+q}r2;4{QGfseWHM0z00Ct>Q2+n|Wk68?00Co0Q2+n|V@y#100DJUQ2+t}ssgA200AjpQ2+t~sRE}0C}dH900CucQ2+n|ZFErp0RXH500Ak0Q2+q}s{$y5QGfsebBa*_00DB6Q2+n|Z00Cv@Q2+n|Wb9D@00D0DQ2+n|W%yA500D6RQ2+n|Wdu?H00MJl!chPUQUCw}Ul39N00C_pQUCw~VRd>@03cET0RXT900Aj5QUCz~uL3AEQh)#fc{)-600DGDQUCw}a7a=B00C`IQUCz}vH}1BDOyqh0RgcBC|**400CuWQUCw}aBNZl00DV&QUCz}v;qJDDSlD_0RgiDD1uUe00D4_QUCw}ZIDs`00DKDQUCz}wgLbFDWXyU0RgoFD5g??00DBWQUCw}Z?sYX00DElQUCw}bHGvn00CvjQUCw}W6V+j00CptQUCw}b=Xn>0RXrH00AlFQUCz~w*n~WQh)#fbna3B00D0HQUCw}W&BbA00CqIQvd(~ZU|EV00CtVQvd(~a1>Jj00CthQvd)0b7bC903cHU00CbnQvd(~XfRU%00Cn*Qvd(~VLVd+00Ct}Qvd(~Ur18`00DANQvd(~Z&XtN010nnaC2^DbYy0%Pyp;u03uNUSWy5bQUFv^0C-XWoKgT2v00CvvQvd(~cGyz@00Cv*Qvd(~VdPT)00Cs`Qvd(~b?{RF00Cq6Qvd(~I{Z@r1OhAqC;}z|Dgrtv0#tyyQ-A;gbO=-c00C|qQ~&@0Wgt`l0RZp<0RZv>0Ra3000Aj9Q~&`1`vNF9RDb{hb3Rl600C=8Q~&@0V@y;400D1OQ~&@0Y*t00C*vQ~&@0bktM;00DK{Q~&@0W8hQ(00Cv@Q~&@0cnd+00DJARR911WJFZ}00Cu6RR94100RI4DOObg0RjI4C|Xs300DYnRR911b7)ln00CxhRR911b#zq#00ClpRR911Uw~Br00DG{RR911Zj4m`00Cu^RR911ZJ1R600Cv5RR911Y@}5H00CvHRR911U$9jG00C~cRR9410s{a6DZ*6%0RaI6D8^NQ00DZ;RR911bJSG;00Cy&RR911b>LM100Cm=RR911U+h%?00DIJRR911ZunII00CwGRR911Z3I>T00CtRRsaA2Y!Fre00CtdRsaA2UmR8d00DR+RsaD21_J;ADK1t30RaUAC^A-n00DY9RsaA2b3j%A00Cx3RsaA2bxc+O00ClBRsaA2UszTE00CuQRsaA2XJl3Y00DApRsaA4X<=o&RRAbf0B}|S00wS$V{>wCc2od}Q~(N904P-eP*nhSRsaA2I)qjL0|EyF^8)n(Iw+o2fRt2#00DWVRsaA2W3*NP00C*bRsaA2Y`|6k00D2tRsaA2aLiT!00C~&RsaA2Z`f7<00C*RR{#J3bF^0g00CjTR{#J3bih{t00CddR{#J3W6W0o00C~&R{#J3dDvF~00DE}R{#M3_W%F^aOzh800D6BR{#J3X!KVA00A!gR{#J3cK}!b00D0XSO5S4Yz$Zc00CwaSO5S4YZzDn00DCzSO5S4VI)`p00DF=SO5S4FECgD00Ct-SO5S4ay(c700D18SO5V4dISIga86hN00D4PSO5S4XjoVP00L$%5?BCUSO5S4Yh+je00DDuSO5V4d;|aiaC%q(00D4*SO5S4XoOe*00AzFSO5S4agbO500LoibXWkESO5V4G64VqaHd!Q00D5SSO5S4Xs}oS00AzxSO5S4a=cgo00D2pSO5S4bjVl$00DK*SO5S4W7Jpx00Cv%SO5S4c;Hw70RSTc00D69SO5S4aPU|F00C(BSO5S4F8o*k00D9WSpWb5We8aS00CqUSpWb5Y!q1l00C(lSpWb5bs$*)00CkqSpWb5Uo2Sw00DP0SpWb5Z#Y>100C@1SpWe52x00D5ySpWb5XwX>z00A!6SpWb5a@<(}00Cvjd^T7Uoob!C!Q09slA0RTV%00C!gS^xk6Zg5%v00C)sS^xk6YkXP&00DD?S^xk6VTf7)00DH4S^xk6Ae34F00C^8S^xk6VW3(700C~MS^xk6Z>(AX00C*TS^xk6bhugo00C*fS^xk6WW-tk0RZ{{00C#vS^xk6Zq!-;00C**S^xk6Yv5V{00DF6S^xk6VeDD}00DIJS^xk6AoyAU00CwGS^xk6Yy?{X00CkOTL1t7VGvsY00DCrTL1t7ZyZ|y00D3!TL1t8Zf(k104Q4k00D0k000DQFTL1t7Ae>tO00DHOTL1t7W2jpI00CjHTL1t7a00D1?TmS$8a*SL600C%{TmS$8W0+h300C~ETmS$8d8Awb00DEVTmS$8VR#Y%00DHiTmS$8AiP`v00CpdTmS$8X~zA00D32TmS$8Z0uYB00Cz5TmS$8V)$GD00MM%+FStsTmS(89s&UXBmw{dDH2@(00D9qT>t<9WgJ}q00D9$T>t<9btqi`00DF^T>t<9a5P;200Ch-T>tt<9Z%|zT00C@PT>t<9W?WqW00CiQT>t<9WN2Lg00DJwT>t<9V{}~r00Ag^T>t<9Ab?!}00C!)T>t<9Zj4<300C)|T>t<9YnWXC00DEJT>t<9VWeFE00DHWT>t<9Ah2Bk00DKjT>t<9aJ*dr00D8<3IG5BWyoCs0RZg+00DW_T>t<9W7u5)00C~^T>t<9W#nA|00DC9T>t<9a`0UM00DLOT>t<9WBgqJ00C|SUH||AZwOuh00CnTUH||AAQWBz00C_pUH||Abs$~<00C_#UH||AX)Im<00DS1UH||AVK`m@00C`2UH||AAVgjO00Cu6UH||AXi#1N00DGXUH||AAY5Jm00DGjUH||AX=q*m00C`oUH||AX>?uy0RZj-00C!!UH||AZiHR{00C)=UH||AYmi<500DEBUH||AVVqt700DHOUH||AAgEpd00CsKUH||AWwc%Z00DZsUH||AX~13p00DHyUH||AY0O>#00DB+UH||AZ`fV{0RWi;00C~~UH||AZ|GhC00C+4UH||Abo5>T00Ck8UH||Ab^u=h00C(RUjP6BbPQhr00D0jUjP6BZ5Uqw00AH#UjP6BbR=H@00CnvUjP6BWiVd=0RWo>00DD6UjP6BZ9rcD00Ch}UjP6Ba!g+U00C!GUjP6BZ&+Ud00DAdUjP6BaAaQq00AIsUjP6BcW_?-00D1yUjP6BY!o*0RWQ&00C#%UjP6BZroo000C*@UjP6BZ0KJA00Cs~UjP6BZuDOO00Ck8UjP6BXaHaU00AHdU;qFCZVX@m00D0jU;qFCX&7Jt00DF!U;qFCa3o*=00CtxU;qICxdQ+JayDQ900D10U;qFCbUuU;qFCcvxTn010DhVRL74Y;a{bTL8>k0I*yD1YH2aT>#Eq0D4{k(q8~#U;qFCAZTC!00Fy>U;qFC!jxbD0Rg%LAevx+00C>DU;qFCZ>V4Z00D2VU;qFCVzgiY00C>dU;qIDM*(4=S^xk6ZpL5$00D2#U;qFCY1Ci<00DH`U;qFCY2aW00sx-`;Q;^va_nFL00Cw4U;qFCbogKZ00D0PU
;qFCX#`;a00D0bVE_UEpakLp00CtdVE_ODV;o@s00C|yVE_ODWhh|)00D9?VE_ODWi(*`00Cw?VE_ODWk6v700MAw5McmDVE_UEwgmVA00AjhVE_UFwFLJ8C|qHH00DGjVE_ODV{Bmn00DJ!VE_OEUvf-g0C-^l00DGT}U0GMF_00DG9VgLXEbx2|W00Cl7VgLXEX;fkW00DAZVgLXGWOH=pU;v0=0AOMO1OWd7Y5)NQYybcODRg201Ofg7X#fBOYXB&IVt@bvd4ysB0RX)K00Ci`VgLXEWSn9E00DKPVgLXEW2j;P00AJbVgLXEXtZJg00DHmVgLXEX~1Ft00DQ#VgLXEAk1O_00DQ>VgLXEZ`fi000C^?VgLXEX5?Z300C?2VgLXEbMRsS00Ck4VgLXEbo^of00AHZV*mgFWe8&c00C?cV*mmGQUpE%00Ai)V*mmHQ3O2#C?I2i00CwuV*mgFZ!lv300Cb%V*mgFWISU400LoU5L*C3V*mgFb4X(V00L=m6k`BVV*mgFa9Cpi0RRI600DGpV*mgFV{Bsp00DJ!V*mgFWO!o$00C}(V*mgFZ-iq200Co)V*mgFAdq7K00Cx}V*mgFZ=7QQ00AJPV*mgFWvF8S00(k)Z*FjHWMWzX)E5B2U;t)f0Fq(=Vq*ZVV*myK-T<)!-vibHIs;e$00Ak|V*myL-2kuy-UHPFIRjS!DBxp&00DC5V*mgFVfbSJ0RVyo00CtLWB>pGa0p}o00DUpWB>pGbQEL&0sxW(kOTk$DIjD30s@f)j|5*R8e{+_WPktxax7#300Ct>WB>pGa6n`L00D4DWB>pGVN7HI00DARWB>pGcUWWq00CcKWB>pGWn^Rk00DApWB>pGb#P<=00DD$WB>pHWpDmt0DNQs0|1Nw9Rwc)00AkCWB>#KivS!19t2qdD3oM?00Cv1WB>pGa;Rhg00DKbWB>vHdIEd`00AkwWB>vIc>;R^D8OWZ00DW%WB>pGbkJk~00C*%WB>pGW87o_00MMjv}6F{WB>vH!2p&300DIJWB>vH%K)MP00AlfWB>vJ$^fANUnuxw00d=#00CtRWdHyHY#3z#00CkiWdHyHVI*Y$00C?&WdH#HF#`YrDK=#Q0Rb=rC^}_;00C@3WdHyHWk_WJ00Co8WdHyJaBObqWB@Q_090iF0sxEzjsyS!DQIN?0s@N!jRapP0uTUhWq<$yWprf#00DY}WdHyHX^3S200DE3WdHyHUzBA400C>7WdHyHZlGlV0suGzHUj_wDXe7x0s%JzH3KNJWq<$yaJXdv00C{nWdHyHb;xA^0RTD!00Ak~WdH#IIRhxzWq<$ya^7VC00Cju0RcM$C<11H00DCdW&i*IYY=7t00DCrW&i*IVH{=v00DF&W&i*IUnph(00C_-W&i*Ibu?xG0RTP&00AjLW&i;JJp(92W`FW&i*IbckjE0RTb+00AkKW&i;JK?5k5W`F%O0RYVe00AjfX8-{K%mgS{XMg|!ZeC{q00D1iX8-^JX>4Zz00DGzX8-^JV|Zr(00CuwX8-^JbA)FA00C@@X8-^JVUTA400Co`X8-^JX`E*O00DHOX8-^JX{cua00DBYX8-{J&jbJgDY|C>0RhegD86Ta00CsgX8-^JWXxv(00LuSv}XX)X8-^JI@o6b0s_zk%mg|p;%9(ZXMg|!Zs=zK00D3IX8-^JY5ZpZ00DFYXaE2KV+d#f00CtVXaE2LE^|(202F8d00C?oXaE2KVI*h(00CnvXaE2KX)tI200DG1XaE2KX*_5E00wn$Y+-0}I#&R+V*r9=09s`LG-m)pXaEBMlmLDN0ssI3b7E)!00C}lXaE2KZ*XV;00C)sXaE2KbbM$400CiwXaE2Ka)@XE00Cu=XaE2Kbd+cS0sxc*y8r+IZlGuY00D2NXaE2KX{=}f00DHeXaE2KW4LGl00DKrXaE2KWW;Cy00CvnXaE8Lz5v_-00C~+XaE2KVcci{0RX)N00DI9XaE2Ka_ndT00Lug;%ET!XaEBMGy{MHy8{3La{_4q00C_ZX#fBLau8_%00D0nX#fBLW*lh%00DC%X#fBLZYXI000ChxX#fBLax`fG0sz1Re*^#lWI$;E00C}BX#fBLX-sJV00wDtb982HnrHyfXaM+V06J*^QfUC*8~_0UPyrxVX#fBLY;tJ;00CusX#fBLY=CJ100C@vIiv*1XC~jqd00CtpY5)KMdMs)H00C(-Y5)KMb2w@M00Cb*m00L}i9BKeoY5)NMg9HEpWoBvs00DYxY5)KMX>e))00DD$Y5)KMUwmo+00C=)Y5)KMZis3C00DK5Y5)NMgaiNqa++!Y00Cv5Y5)KMaHMJg00D5SY5)KMVX$fd00DBgY5)KMcf4u<00CdZY5)KMWyop(00DB&Y5)KMb<}DA00DE_Y5)KMZ{TVG00L-ilxhIxY5)KMW9(`G0|15uhy;oR00AldY5)TPg#?ELi3BJFYk&X&WeRHm00DCrYXATNVH|4!00CkmYXATNawuy600Ct#YXATNZZvBE00D10YXATNX+UcL00DGHYXATNX-sPX00DGTYXATNa#(8s00CiMYXATPV_$GwY5?|X0Ay00C*nYXATNa?EQ000D2(YXATNXV_~100C^?YXATNVdQH700AKCYXATNXYgwP00D0HYXATOWofW$0Q_qJ00C|SYybcOZwzbz00C(dYybcObQo*^00C(pYybcPVRVdX03>Vx00D9;YybfO1ONa5a5`)N00D45YybcOXhduP0st}qGynhrDNt+x0s%1qGXN-7Y=8g(Wn63k00C}hYybcOX>4o&00C@rYybcOWq51=00D4*YybcOX@qP50RT1t00AkGYybfPH2^4-Y=8g(bDC@b00Cv9YybcOZm4Vk00C*PYybfOH~;_vDY|R`0RcAvD86if00DEvYybcOWz1{<00D5)YybcOY1nK40RTJz00AlFYybfPI{+x?Y=8g(bM9;a00D0HYybcOW&CUa00CSDG+V|0s+DR!~iH3Zh!y*WgKn*00D3&ZU6uQX)JC40szMV$p8QWDL8Hb0s+PV$N(rlZh!y*WkhZO00D4LZU6uQc~ouy00U!ibP8?&GHw7`ZU6uQI$&-90|3JS!ve(s00AjaLI00C^0ZU6uQVVG_J00DBIZU6uQVWe&V00DA-7ytkPWw34l00MAxs%`+bZU6uQI=pTG0Rd70I>v5*00CvpZU6uQaMW%900DX0ZU6xQzXAXODduhf0Rg@ODC%y200DRKZU6uQZ}@Hi00C_NZU6xQ!vX*SDGF}@0RqATW+()201j_}00CtbZvX%Ra3F6000C(xZvX)R!2$pQDKc*W0Rg}QC^m0^00Ct@ZvX%RZbWYY00C)AZvX)R#R32UDOPU)0RhAUC|YlT00C}bZvX%RWoT~z00C=mZvX%RZ**?}00D325&!@JbbxOF0RYJY00AkCZvX)S$O0&kZ-4*+bCz!a00C^CZvX%RVWe*W00DBUZvX%Ra;BNp5Z~y=SIuLLG0s_nez5+TZDsX`6Zh!y*cQ9}O00D14Z~y=SY(#JX1P06i!T`bo!~kqMC|`7Dz-<6
-Zh(AlfO-=E;BEj)Z~y=SWl(Sc00D4vZ~y@SKmq^(DSmJO0s%h)L;)y*aDV^-a)@vM00Cu^Z~y=SWSDRO00C^CZ~y=SZ=`Sl00C&KZ~y=SWw3An00CpRZ~y=SVZ3kv0sx%^-vIyta>#H100CvrZ~y=SbkuMF00C~=Z~y`TfC7R700AlHZ~y`Ue*%F5DC}^600L=n;BWx)Z~y@SegXgiDF$%>0RepiC<<|a00C$aaR2}Ta~N>|0svG5qyhi|DI{?K0s&J5qXH-@aex2;WiW9700Cq=aR2}TWk7KN00Co0aR34UU<6(S00AjdaR37WUj$tQzXT{)aex2@S_00DA(aR2}TZ-8+C00U)dZ%lCjhH(JCUjP6GVQFk{b#gRp0C;c!!f*imZ~z{00E}?}00AyaYybcOWwdbs00C~gaR2}TX~1y+00C^qaR2}TWz2B^00D5)aR2}TE*5G400DB|aR2}TW#n-H00D05aR2}TZ}4#d00C+CaR2}Tbo_Au00C(NasU7UbO>?)00D9iasU7VX<^uL02Fcn00AyeX8-^JY$S3300ChtasU7UV=!_600C(>asU7UbUbnZ00C)2asU7Ua!7Ij00MAh8gc+masUAVAp|c}a)1BNasU7UbFgv%00LoiwsHWLSO5SCVPj=xZ*zBaXXIA^NLT>oSO7#>0E$@vz*zwLSpd9p04`bp0Rfc&FUE3!00Cv}asU7Ua`bWl00MPyq;dfIasUAU00IC3a0+t(00D3ga{vGVXcTh*00AxVbpQYXZ@6^;00DHqbpQYXAjEY500C&qbpQYXbkKDG00C*%bpQYXcieRV0Re0RAmViZ00Cv_bpQYXZt!&g00C+CbpQYXZ2WZq00CtJb^rhYa0qq)00C(Zb^rhZb8u9302For00AHxb^rhYY$SF700Ctxb^rhYY%q2J00C?^b^rhYVLWyK00DABb^rhYVMulW00D4Lb^rhYEmU>@00C=Sb^rhYb6|D=00CiUb^rkY-2ngrWpZ`^00C}xb^rhYX?%7700C@*b^rhYWr%hF00D50b^rhYX_R&V00D5Cb^rhYAfR>t00C^Kb^rhYWvq4p00C^Wb^rhYY`As+00CjXb^rhYa>RB300Cjjb^rhYaL{%D00Ax5b^rhYVcd2A00DI3b^rhYVd!=M00VGzWbkwVY<2+db^rhYAoO+s00FxKcK`qZ!U%T&0RaXAAP#qc00DCpcK`qZZ5($100C$scK`qZbSQTK00C((cK`qZax`}U00D10cK`qZXFzuV00C@9cK`tZeFOjjWm0zl00C}RcK`qZX@l000D4%cK`qZAb@uO00C!)cK`qZZj5&T00C)|cK`qZbC`Dk00DEJcK`qZWu$ii00CpFcK`qZZ?Ja&00DBgcK`qaVQ@@$0K9hq0|1=^p97l%00Ak?cK`zcoCBT%ngb}*cYpu^ciMLV00D32cK`qZZ0vUc00Cz5cK`qZYxs8n00DFUcK`qZVFY*p0RW)`00AiwcmM$bpaUorcz^%_JsNlb00D9$cmMzaZzy;G00DF^cmMzaVKjIE00DA3cmMzaWk7fU00D4DcmM$aqXPf|DN=X<0Rf@|C{}oY00DPgcmMzaZ)A7?00C@jcmMzaW^i}_00C=ucmMzab9{IJ00CiwcmMzabclEW0syB2r2_x~DU^5s0s*E2qys3Lcz^%_aG-bq00C{PcmMzab+C8<0RX8300AkycmM$br~@d#cz^%_bH;c800C>xcmMzabJTbM00CjzcmMzabl`XZ00Cd-cmMzaZR~ge0syZAtpfl7DfoB*0s*cAtOF?icz^%_WdwNu00C_dc>n+cbz$;&02Fxu0RXE500Ai^c>nn+ba6EYc00C`6c>n$}BDNcC+0RgZBC{lTV00DAXc>n+bVPJUx00CuYc>n+bY;1V|0RXcD00Aj@c>nn+bVUT$M00DHCc>n+bUz~XW00C&Cc>n+bW2kum00CjHc>n+bWwdz!00CdRc>n+ba=>{200D2tc>n+bWLgUV00Cdpc>n+bW!QND00DO1c>n+bZ{&FZ00C|4c>n+bW$<|b0RXoH00Aldc>nrSZe?R;a%T#708DrQj(7mJcmNuC07!WNa(MvEc>wl#03dn*00BBCdH@9lxC531oCBT%ngcH&E;=ZlbAZBk02*@uP7|bCP=i00C{9djJ3dVW4{e00DBQdjJ3ea$z2O0IYie0RTh+00DBmdjJ3dWx#s?00CsidjJ3dY|MKA00D2(djJ3dXxMuI00Cv*djJ3dW8`}P00Cj@djJ3dZ18&k0RTk-00CqCdjJ3dX#jix00DFcd;kCeVGMi#0RTn;00DOxd;kCeZybC800C?wd;kCeW+;3B00Ckyd;kCfbafJZ05p660RTq<00C}9d;kCeZ%BLq00C)Ed;kCebX0r*00C)Qd;kCeWMF&%00C}hd;kCeZ)|)30RTt=00Cuqd;kCeYW14&b00C~Ed;kCed8B**00DEVd;kFeN&x@?WVU<&00CvXd;kCeZoqs100C*nd;kCeX3Ts500Cvvd;kCeWY~NF00C~^d;kCjaC2jAVRiO<06KgCa(n=kd;qX~0OWiC00BDcd;kFfO946reSiP~WeR-&00D3keE00CuQeEVeE?;00AlfeE=NX03dz<00Cj*WB>pGa4>!V00Ch(egFUgaPn{f00MPy9&rFdegFUlbbe)XV{n3f0Lpy;_#xF00D2#egFUgbJTtS00DE_egFUgW#E1Q00Cp>egFUgW9)ta0RSlk00AlZegFXhCP00Cq=e*gdhUqF8V00C!4e*gdhZcKjw00D1Oe*gdha#()=00MP%@_qnbe*gghCje*gdhWsH9S00Cr@e*gdhZkT@n00Cv5e*gdhaHM|#00CvHe*gghD+B-mDYkzA0RbumD7t@u00DEre*gdhbjW`I00C~!e*gdhWz>HF00Cs$e*gdhZs30a00Cv@e*gdhaO{5o00Cw4e*gdhWcYso00CeAe*gdhYXpD*00ChNfB*miWe|V>00VPmcVvG6uzvs+fB*miUmSn{00C_(fB*mjVRf*605E_60RSxo00AjLfB*pjECeV-fPer2a!P;z00D1OfB*mib69`?00DDefB*miWn_Q=00CoafB*miV{m`~00DJ&fB*piF9ZMqDT06i0Rb)qD29N500DH2fB*miZj^uk00Cv1fB*miWT1cm00C~MfB*miWvqYz00D5afB*miWw?L<00MJle1HJHfB*miU&Md_00C&ufB*miW7L2E00CjzfB*miW#E7S00Cd-fB*mia_oQr00D3EfB*piF$4esDgJ-}0Rb=sC<1|i00CtPfdBvja1en200DUxfdByjGXwwuDI$RY0Rb`uC?c0suG!Is^a#DV%`-0s%J!IRq%8fq(!3Yp8($00DEdfdByja0CDWDZYUK0Re9WD8hk&00CvlfdBvjaL|DO00LooxPbuHfdB#kJOn-j00AlHfdByjKm-5*De{2;0RcY*DE5JX0s=b(Jp?E{?16y%fdByjLIeN-DH4JJ0Rce-C>DZ%00BK5f&c&lJs|{w03?C{00D9;f&c&kZ#04c00DG5f&c&kVL*Za00DAFf
&c&kWlVwq00D4Pf&c;lL00Ak0f&c*lMFc2>f`9-4bBclh00DB6f&c&kZlXApw`00C|mg8%>lZybXF00D9$g8%>mb#%Of04Re10RTt@00AjDg8%^mM+7K5gMa`5b3%gv00DGLg8%>lZcu{&00CuIg8%>lWL$#)00C}dg8%>lWoUx{00D4rg8%>lWpsl80RT({00Ak0g8%^mO9Uu{gMa`5bBcoi00DH8g8%>lZkU4r00Cv5g8%>lWTb-t00C~Qg8%>lWw3()00D5eg8%>lWxRs`00Cseg8%>lU&w<100C>xg8%>lVbp^F00Cv%g8%>nb7Xfeg8+Ji0N{fF00Cd-g8%>lZS;cx00LokdV&D@g8%^lPy_%0DGG!D0Rc}0C=P^x00CL00Chlg#Z8nWhjLJ0RUnI00Aj9g#ZBpVFV~&IE8?s2><{AbUuXu00Cu6g#Z8oY;!J!08oVh00CuIg#ZBnS_A+ADQ1NL0RdSAC~Ad(00Cuig#Z8na(IOR00C)!g#Z8nb%ccg00D7|g#Z8nV~~Xa0sveDbOZnaDV&7>0s&hDa|9@&g@6D7VW@=w00DHeg#ZBnas&VYDZYgO0ReFYD8hw+00D5wg#Z8nZP0}P00LokxP<`Lg#ZBnW&{8MDdvR$0Rd$MDC&iP00DXMg#Z8nbohk;00D6Rg#Z8nZ3Kn@0RU(O00Aiwh5!KpX9Oq|hJXM8YZ`_C00D9$h5!HoZzzTU00DO{h5!KoZUg`UDLRG#0Re3UC_aXO00Cu0h5!Hobxei;00C@Lh5!HoVOWL$00DMhh5!HoUu1>=00LiVG=>0bh5!Hob8vm?eE>jx0Fr(HmVp3zhk&AifL>St00C=&hX4QpbFha100DG34gdfFZoG#80RguFI$98b00DByhX4QpWzdHJ00C~+hX4QpZ`_9f00C*@hX4Qpbm)fw00C+4hX4Qpbo7S+00DCPhX4QpVE~8#00D3YhyVZqZ48J200D0jhyVZqWEhA500C|uhyVZrbz#DX03?V20RjI3I-)gz00D9|hyVZqZ#;+q00DGDhyVZqVMvGo00Co8hyVZqZ&Zi?00C@ThyVZqY+#5000CiUhyVZqWo(E500CoihyVZqX?TbL00MSqeuw~aHUIztWrTj@N00CvrhyVZqden#j00C**hyVZqZQzIi00C*{hyVZqbnJ)#00D6FhyVZqZ}^A+00C_NhyVZqVFZZ)00C(Vi2wirbP$OE00C|mi2wirWgLkB00DI(i2wiraVUuZ00Ct#i2wls>jFBgKY#!Mayp3s00Ct}i2wirdPs=?00C)Ei2wirZB&T>00C)Qi2wirbYO`900D4ji2wirZ)}MG00DAxi2wirVR(rE00Cuwi2wirZiI;d00C)=i2wls5Cb|QL4W`Oa+Zkz00Cv5i2wirdZdW}00C*Li2wirZLo;|00C*Xi2wirbi9cG00D5qi2wirZ^(%N00C~!i2wirY1D}T00Cm!i2wirZ{Udl00DC5i2wirWbBCm00D0Di2wirb@+(@00D3Qi2wlwAOvG`Y;rolhyXN+0Fa3Q1d0FfWhivVCP06>cXC00D2pivR!tX~>HJ00DH)ivR!tb<~Rh00Cp#ivR!tW#Eed00MS+kc$B3ivR!zWM*}9bYEqThyX5%0Cg1H00VVsavF>PAdCQZi~s=uDgXcha*B)q00Cu=i~s-uaFmPy00D5Ci~s-uVW5lv00DBQi~s-ucdU#600CvPi~s-ua=45D00DKri~s-ubHt1Q00D2xi~s-uY|xAV0RUYA00DB`i~s-uW#Ehe00D01i~s-uZ|sZ!00C+8i~s-uboh(_00C+Ki~s-ubOen600D9ejQ{`vVGxY~0RWf;00DCxjQ{`vZ6J*R00ChpjQ{`vax9Gi00Ch#jQ{`va5#+s00C=0jQ{`vb3}~*00Ci2jQ{`xV{mlVi~tsm08otp00DGXjQ{`vY-Ei900DJsjQ{`vVQ`HA00CxpjQ{}vmH+?&bb^fl0RW-^00AkCjQ{}xp#WbfhK&G_jer0FWtNQq00C^GjQ{`vVyKM(00CjHjQ{`vY_yF400C^ejQ{`vVZe<50RWc(00DH&jQ{`vU(k&J00DH?jQ{`vb=-{r00D5~jQ{`vZs?5w0RWf)00DILjQ{`vU-*px00DIVjQ{`vbp(z800D3cjsO4wbP$dJ00DItjsOAxngE;t00Ai?jsOAynE;yrC@7AA00Ct#jsO4wa5#bjFPU?u`H(jsRSa0ECVJ00BCRjsO7wi2?utbf%5~0sxBwjRF7xDX@+J0s)Ewi~=aOj(`9GWxS3600C~sjsO4wY0QoQ00C^$jsO4wW!R1Y00D5`jsO4wY2=Oo00Lids*V8ajsO4wVepOs00DIRjsO4wVE~T+0RWEz00Aisj{pGyjshqUkAMIHa~6*P00Ctlj{pDxZX}NY00C(#j{pGxkpch#DK?J)0RfN#C_0aT00DDAj{pDxWk`2d0B&x8;BEjYlK=n#WiFEd00D49lK=q$lLR_=Yk&X&Wloa-00DDWlK=n#VO)~{00MMk0BQhYlK=n#a%htP00CuklK=n#Zg`Ub00D1)lK=n#X@rvi00DH0lK=n#X^@iu00DHClK=n#a-5R@00L=YNRt4flK=q#LjV8)DYBCQ0Rch)D7KS;00CvZlK=n#W5kmH00CjjlK=n#X3&!W00DB=lK=n#W!#eh00DI3lK=n#Zs?N$00C+4lK=n#a`ck`00CwClK=n#b^w$B00D9almGw$Wek)600DCnlmGw$Vi=SF00D0vlmGz$MF0Q+DJqlz0Rcn+C@z$M00D3`lmGw$bv%>+00Cb@lmGw$bV!r{00CuAlmGz$M*si;DO!{O0Rct;C|;C+00DVqlmGw$a%_|U00DGzlmGz$NdN!=DSngy0Rcz=D1wxL00DG}lmGw$X^@lv00M4!c$5H^lmGw$Wt@}%00Cd7lmGw$Zmg6500Vw=ZzPleRFnX+lmGw$I=GYo0Rc(?I>wZM00CvplmGw$W7L!Y00CjzlmGw$X5f?n00DC5lmGw$W$csy00DIJlmGw$Zupb{0RT(^00CtLl>h(%a0rzE00DUpl>h+%Tmb+9bQ+Za00Cbfl>h(%a3qxg00C_(l>h(%VK9{d00DG1l>h(%b3By*00U`sU;dN;6qNu%l>h(%a!8c`00CuIl>h(%c3hPJ00DAhl>h(%WoVTE00DDul>h<&X#uMN00Aj_l>h<(XaTALD1eoK00BIPl>h(%Jdl+D00DBAl>h(%Z=96?00DHOl>h(%VW^b=00DBYl>h(%Wwez500Lrgbd>h?(&H&N?&;S4dDaw@q0|Ctd(E!f?DAJXH00C&&l>h(%aO9N$00Cj@l>h(%a`2S^0RYqh00Aldl>h+&(*P&{mVf{OXa<%500D3gmH+?&VHB1C00D9umH+?&XCRgU0sz|wf00CjrmH+?&W!RPg00C^?mH+?-XMS&Gb7S_E04A0IY?c6)mH@hz0OXbc00BDcmH+_(-vBxUmw*5PXbP7A00D3kmjD0(VHlSH0RY1U00Ai^mjD3)!UQNNmw*5PWiFQh00D3|mjD0(c|4Z@0|3PY#{|g)00AjTmjD9+
#016!$OI@*mw*5Pa#oiB00CuUmjD0(ZfKVP00D1qmjD0(X>^wW00DG*mjD0(X@Hji00DG{mjD0(a*US%00Ci=mjD0(aF~|>00Cs4mjD0(Wu%t?00CsGmjD0(X|R_700DKjmjD0)bZ0`B0KAs~00BC}mjD3)$^<&hmw*5Pa?+Oo00Cv%mjD0(Zs3;y00D32mjD0(Y3!E(00DIJmjD0(Y511_00DIVmjD0(as-$F00ChNm;e9)a1fXP00Cqcm;e9)WgM6Q00Cqom;e9)X(*Tg00MG#9+vPx$DbSbz0s-m(=m03zn1BEQJ=~Z800DC5m;e9)Z|s-=00DIJm;e9)VfdH;00DCTm;e9)WdxZ300D3cnE(L*?EnA)DHfRk0Rij)C>oi700DU*nE(I*bSRks00D3=nE(I*Z8Vtx0RZm+00AjLnE(L+?f@u6nScNRbV`{300CuEnE(I*b6A-G00CuQnE(L*@c;k;DQcMj0Riv;C~ld600CumnE(I*b$pos00C@*nE(I*VThRk00DN6nE(I*UzC{u00C&4nE(I*bD)_300Cj9nE(L*^8f$=DYBUW0Ri#=D7Kk^00DHonE(I*Zp4`Y00DK%nE(I*Z_t?l00CpxnE(I*U)-4h00Cv0RZ&?00AlVnE(L+^Z+RMnScNRW&W7}00C|Wng9R+Zw#6M00CqYng9R+Ul^JI00DCzng9R@VSaCAa%gpFf|vlxm;ex&06LifWSIc0nE>vY03@0K00BBGng9U-_5eCent%WSYf_p300DDang9R+VPKj700DGnng9R+V{DoL00C}tng9R+d3c%t00DD;ng9U+HUR(uDT@U%ngGn2092a*00BB$n*ad-5(5AMDQ=qp0Ra&MC~}*C00Crpn*ad-6axSODTbQ>0Ra;OD2kha00Cu?n*aa-dYGF400C*9n*ad-76SkQDXNE-00DLOn*aa-W&E1}00DIZoB#jov?00^7_00BAS%00D1`oB#j;Y>=D)0S9#iaCBdBW_CKVdjKAN06={Je4GH5oB)=A0D6aj00C>9oB#j;bG)1Y0{~|MYXPYN00Ak;oB#s>W&vser~xR@oPYoUXV#nm00C~|oB#j;Y3Q5)00DCDoB#j;bo87600DFQoB#j;UjUr|0szng&H?}dDGZ$e0s+qg%>pPAoqzxVa2TBc00C_xod5sE00DA%od5sb@%0RY$n00AkOod5v=*8(VNod5s00C_Ro&W#=bqJmS0RY|t00Ai!o&W&>-2x~Wo`3)Wc^;ks00D9)o&W#=bS$0#0RZ3v00AjDo&W&>-vTH+o`3)WbV8m000C)Ao&W#>Zg?`D08pL)00CuIo&W#=Utpd900C}ho&W#^WOHL~Z!VnxWSsz#od6D=0BoKB00BC3o&W&>;Q~5{o`3)Wa*mz=00Cu|o&W#=cATC700DBMo&W#=WvHG200DEZo&W#=VzizB00D2ho&W#=X26~R00D2to&W#=a?G9p00D5)o&W#?VQ*%_oB#%$0N9=Y0RUeC00DOFo&W#=U+|s)00DFMo&W#=ZTy}900ChFp8x;>atNOQ00ChRp8x;>a1@^a00Cbbp8x;>YapKh00DC*p8x;>VJx2j0RUkE00DM5p8x;>Up$`x00C=4p8x;>b4Z^600Ci6p8x;>bX1=J00CcGp8x;>cVM3Y00D1ip8x;>Y;2zZ00Cxlp8x;>VtAhb0RUtH00DM{p8x;>Ux=Ro00C=`p8x;>bCjO|00Ci|p8x;>bfBLA00Cd7p8x;>W2~P500C*Tp8x;>bhw`Y00CjXp8x;?b##880K}gF0RUnF00DN?p8x;>U)Y}j00C>>p8x;>bL5`@00Cj@p8x;>bnu@500Ce2p8x;>Zv3AB00D0Tpa1{?X$YVI00DFkpa1{?X%wIU00Cqgpa1{?ZXlol00D0%pa1~?V*vmGb~2y<00Cb%pa1{?YdoL;00DDCpa1{?VMw3=00DGPpa1{?UsRv~00C}Vpa1{?Z(yJR00C)cpa1{?bZnpi00C)opa1{?WO$$e00C}(pa1{?Z-k%#00Co)pa1{?Uyz^x00C*1pa1{?bey0700C^Gpa1{?b*P{K0RUwI00DNipa1{?U$~$E00DBopa1{?WyGKW00C~wpa1{?Z_uCs00C*%pa1{?blji-00C*@pa1{?bm*V}00DCDpa1{?Vf3H?00ne&V{LFUp8(3A04$&YuAl(=pa1{?UjU&100DCrp#T8@Z2$_o00C#tp#T8@FaiJpDcYd`0Rb-pDBhuf00DC3p#T5@W$d8<00Ct3p#T5@Z1|x700C+Kp#T5@bp)aS00CkOq5uE^cMzfg00D0nq5uH^Mgjl00Aj1q5uH_M*=7?qJRJacs8N{00Ct_q5uE^WJIC>00C}Fq5uE^X;7j70Rlw=C|@R`fL5Xa00C%Rq5uE^V`!oP00Cicq5uE^Wpttd00Ccmq5uE^a)6=$0RUhE00AkCq5uH^VgdjGDVCxD0RdqGD4L>x00DHMq5uE^Z>XXG00C~Uq5uH`Ujis!kfMOJq5uE^X1byP00C*nq5uE^U(BKa00Csuq5uE^ZrGv#00V4qX&j;ehN1x8q5uT|G6FOLHUc;TIsyOzDe|HK1pzSvGXgaNHv%~VDE^{=00CtLqW}N_V-%wR00ChdqW}N_Y#^fm00D3&qW}Q_JOTg#DKet~0RcM#C^n;j00Ct@qW}N_b3~&600Ci2qW}Q_SONe6DORHZ0RdM6C|aX{00C}bqW}N_VQ8ZO00CoeqW}N_b9AEs00CcmqW}N_Wq_jq00DM}qW}N_X^f))00DE7qW}N_b(o_700C^CqW}N_W2B=100V7ucr2p;P@@2qM$00Ch>qyPW`Y(%600suk+L;?T-DNv*U0s%n+LjovPq<{bcWn82H00DYtqyPW`X>6nb00DAxqyPW`VR)nf00CuwqyPW`ZiJ)&00C)=qyPc{N&-v*00AkMqyPc{Py$i{00AkYqyPc|PXbW_D5#`>00CvLqyPW`W4NRM00DKrqyPW`WW=NZ00CvnqyPc{S^``G00Al1qyPc|Spr)EDBPrg00C~|qyPW`VeF&;00Cq2qyPW`bNHkH00CeAqyPW`Wdx-F00DLir2qf{X%M9V00DCrr2qf{bsVJt00C?wr2qf{V<@En00C|;r2qi{UIG9CDLSP90RdeCC_bft00C}9r2qf{VN9g}00CoCr2qf{b6BMS00CcKr2qf{Wn`rQ00DMtr2qf{X>g?g00DD$r2qf{b$q1&00C@*r2qf{V~C{y00DT8r2qf}a%pAIqyRLf0FrT_o|c0i^800C)6rT_o|b4;cH00DJUrT_o|Y*?lM00CoOrT_o|Ze*qa0RUtI00Aj*rT_r}V*)62rhotebb6)$00C}-rT_o|Wr(H#00C`|rT_o|Wt64>00C^8rT_o|WuT@200Cd7rT_o|ZmgyN0RUzK00AkurT_r}WdbO?rhotebi$?p00C~wrT_o|WzePo00C{*rT_o|W!$C!00C^`rT_o|W$2~=00Cd_rT_o|c=VD`Vp#bEf0I;I~2BZK=qyUhl0N|wn2BrXNrT_q^03fFT00BBCrvL!~YXUk{r+@$fa$2VV00
CuUrvLx}WN4=V00C@nrvLx}X>_Lm00DJ+rvLx}Vt}Ur00DP~rvLx}Z;Yn^0{|xjD+4J500AkOrvL*1CIc!1C<7>Py^0WelkR00DLqsQ>^0Zy2cn00C_tsQ>^1b!7ml03@jZ0RUtJ00Aj9sQ>{0W&;2LDL$zH0Rd$LC_<@#00C@BsQ>^0X;7&E00DGXsQ>^0Zd|DV00DJksQ>{2V*@B(IH`bWsQ>^0Zf>an00DJ+sQ>^0a)7A-00D1?sQ>{0XafKNDUzuG0Rd+ND3+;!00C^AsQ>^0Y^13G00CjDsQ>^0U$ChF00DBgsQ>^0Z@j4h00CygsQ>^0U&yHd00DH)sQ>^0Y1F9z00CjzsQ>{0Y6AcPDdwpF0Rd?PDC((z00DFGsQ>^0a`>qL00CwGsQ>^0YXqtQ00D9essI21ZxE^g0RU_R00Ai+ssI52YXc}Cs(=6ib0(?)00Ct#ssI21X*8+;00Cz@ssI21Z$PR500C@9ssI21Z%nEH00D4PssI51ZUX=TDPF1o0Re3TC}OIB00DDossI21cW|lz00D1yssI21Y<#K!00Cx#ssI21Ylx}<00DE3ssI22Z(&%f0F%Lx00CvrssI21bJVH;00Cm!ssI23cXV$SssNg*0N|j00Cbfs{jB2awMw&00Ctxs{jB2YcQ(-00D9~s{jB2Z#=6200DPGs{jB2Ur4I}00DDOs{jB2X;iBK00C%Ps{jE2asvPXDQ2qx0ReFXC~B*K00DAvs{jB2Wq7Lq00DM>s{jB2a)he@00Cu+s{jB2bC9b500Cl_s{jB2Z=9Rs{jB2VYI6N0swjgcmn_dDZr}$0s(mgcLONKtAGFjW6Y}n00C~+s{jB2dEBc200DF2s{jH3egk|100AlPs{jH4eFJ*~DD0359V00BB8tpEW6*a13lV6A}Co&X8}6$F9+N&*=Kh5$?g7X*X=8w7{|00Aj-tpExG6a;|)Ndg!Ig#b$e76gL;8U%*`C@x=!t$->a0Gg!$00C`|tpET4aQ#{h5yo&d4}F$Bo~pa1{?DcY?73jw77Cjr6)`vArOZv>qHu>vpz$N--JDDtg<00CwAtpET4a1^cp0RW@`00Ctnt^fc5ZX~V%00C(#t^fc5Y%s0>00Ct-t^fc5a6GO600C)2t^fc5a7eBI00AIQt^fc5XH>2L00C}Vt^fc5X<)7Z00DDmt^fc5b8M~v00Cukt^fc5V|cCr00D1)t^fc5a)hn`00D4{t^fc5Ads#A00DHCt^ff5GywnsZKAFK00MYnoUQ<-t^fc6a%sSt0930000LiY0G$ACZvX%YVP>vOXApl?@0L~!*1|k4XasUAVAp|e%u7CgmZ3?dd00C$cuK)l6bQrGy00C(puK)l6awM++00D0*uK)r7umth}00AjBuK)r8uLSV{C_Jx#00Ct}uK)l6bWE=R00CiAuK)l6V_2^M00D1auK)l8XK-vVuK=PC0A#NK1ON~KdIJ&wcmn_dDR{2{1OoN|-~i?T@sucLPZQC|_uFg0FzaR{)Y+0H&`100CpHuK)l6Zpg0y00DWt_1J_00Al5uK)xAs|4)_3jjw0C@wlUuzWKP093F500DGbumAu7b!4yr00CoaumAu8Y-OIK04iVr00L!WaIgS&umA}FW(3**umY;Q2BQvh=UQvoQRuz&yod8Dua00MMuzOVowqW}N_aKx|x00ek#a&CaI0M4)gP@@2PP5=P{)B|HE{x|?`LVy4PcjmAF00D3EumAu7Z1}JM00d@pWOe?q01iz6Kr{fDN&o->VFa-N00Cqcu>b%8bsVt(00MDltbG7*cmMzaYbdb*00DC{u>b%8VK}h>00(k!V`pJ>W&8^O9+Utgu>c^J0K}L8KCuAOr~m>4JpmyAC^~j7*bRVan}9N$0M=Om00DAhu>b)APXuKsc(DKugn$46Yksi+00DA}u>b)8>;eD*ZI-bB00DEFu>b%8X`rzH00C^Ku>b%8WvsCP00C^Wu>b%8Y`C!i00CjXu>b%8a>TI!00Cjju>b%8aL};;00AJ@u>b%8blkB300C~|u>b%8Z|JcA00DCDu>b%8X7sTD00CwCu>b)A%>!R3$f*F9PJjRbbOy2j00MSt0I~oQvH$=AZ()$J02s0W0RY+o00Aj1vH$=9cQCR500D0{vH$=9Y&^0600Cw~vH$=9VMww700Cr9vH$=9byTtd00CoKvH$=9C}6Sx0RbfdAZD@v00C!gvH$=9ZgjE$00C)wvH$=9a)7b`00Cu&vH$=9bd0h900DE7vH$=9b(pdM00C^CvH$=9W2CYG00AJXvH$=9bFi`i00C~cvH$=9VZ5>c00CvfvH$=9ZOF0!00Aw|vH$=9c$}^P00D5kcK`qZAmFk900DRAvH$=9Z|t%F00C_BvH$=9X85uI00C?MvH$=9a|E*h00L!U+Ohx&vj71AyZ`_JavHM$00Ctlvj6}AVkENw00C_(vj6}AbuhC400C|`vj6}AAUv}G00C)2vj6}AV@R_A00Co8vj6}AVN|mK00C}Vvj6}AZ(y?k00ClVvj71A8v_6Vb8@o)00C}xvj6}AX?(K)00D1;vj6}AV~Dc=00AJ5vj6}AaFnwE00C*5vj6}BX>4q>0HCt~0RS@r00C^Uvj6}AVYIUV00DElvj6}Aa=^0y00Cvjvj6}AcFeN?00Cjrvj6}Aa@ex~0RWW)00Cv>vj6}AZs@ZB00C+4vj6}AZ1l4L00CwCvj6}AZ~(Lb00C(Rv;Y7Ba168n00AHpv;Y7BXBe~q00C|uv;Y7BX(Y4&00DC@)0RhVdAi}f&00C~uv;Y7BZ_u;=00C*%v;Y7BblkK600Cj*v;Y7BW9YO100C+4v;Y7BZ1l7M00D6Nv;Y7BX#lkV00DFcwEzGCYz(yk00DIpwEzGCZ5Xux00AH#wEzGCcqFv|00C(#wEzGCaxk?300DG1wEzGCVLY_}00wJxW@mS8)Up5&vjC>E0Cuzhg0uiawEzGCAV{?U00FyVwEzGC!f3Ss0RbffAa1pQ00DD!wEzGCWqh>&00C@*wEzGCaEP@400C`|wEzGCVU)E100DTKwEzGCWuUbH00AJTwEzGCZLGBb00D2ZwEzGCa=5hs00CycwEzGCAjGu*00C#pwEzGCZqT&=00C*%wEzGCZrrs100DC1wEzGCVd%900Rq4RVJLW{05)%c00Cw6wEzGDZfyXz0B&9Y00U)jY3{WE#CiY*wg3bG>jQEC?E`cG00Ai;wg3eI>H~2A>;rQER{00DD~wg3YF-U2`aM*si;DVDYX0|DIvKLbVpD4w=}00DWVwg3PDatCUjSby^0okcIe-!X*Z?mBp##|iJOt|i76Lv5q5|mv6arEJ5&;?l90Jw=Edvk%00Aimw*UhGKm@u3`~d&~DKfVJ0|7q-xdi(GC_1-*5(3u%E(4$g*aJHR>Hrl2Jp`cw=l~M}Q2-GE83G#u)c`C54*@7WEVqC_w*UhGX#uMNoB{v=DSEd60|96OssWnyw*Zv40JOIN00DBkw*UYEZ^*X*00DH)w*UYEVbr$(00DB^w*UYEW#G2}00nenZg6!n2>_6`0Qj~5=C=SAjsO4xVP$ZQ0QheJ0syiE^
Z@_?DGay(0s*lE^8qLlxPSlwWf-^s00DF&xBvhFVJNr&00CnzxBvhFZ#1|700C?|xBvhFY(TgG0sy!K`T+m|DNMKk0s*%K`2i?WxPSlwd04mr00D1exBvhFa%i{!00DGvxBvhJW?^%5a00jhMz{cUxBw~{0IIA23IL}A&;mFEv;uMjr32vu&H^?AS^xk6DUi4T0s!L!TmS$8DXh2v0s-O!TL37sxPSlwXt=ll00D5qxBvhFVaT`u00DB&xBvhFXVka=00CdxxBvhFW#G5~00DO9xBvhFY3#TF00DIJxBvhFVfeTJ00D0PxBvhp!0|Z+Ha|FNya4labcDVr7b^xl50HTF}00Ciqxc~qGbdC00C+Cxc~qGZTz_a00ChFx&QzHZV0*n0sx`}r4100C}3x&QzHWkk9F00DAJx&QzHWl*{R00CxJx&QzHW?Z@e00m`rVR>nKxBxD>0Ghb~4!Qthx&Q+JHUN(T6afGMDSo;D0|7Mvjsg<_D2BR#00DE1x&QzHWth4E00C~Ex&QzHX{5RU0{}PxkpdP000Akmx&Q+KHvo_V6#*!?x_|%ybH2I&00Cvnx&QzHaL~E{00C*%x&Q+JJOGsf9svLWDdM^S0|7e#lmZPHJOfq$00Ai&y8s0Nr~}*s*8)2ORRAa?yMO=!b7m^L0D`Ij00C<{y8r+Ib40rU00M7e4!ZzKy8r_KMg*(^B?JHgDO$S#0|7+@s{$khC}O*S00DDoy8r+Ia&)@@00D1$y8r+IbAY=500DD`y8r+IWsJK300Co?y8r+IW0<=D0su+`DFgrkDWtmq0s%<`Cw00DC_yZ`_JbUeHO00C}7yZ`_JWk|dL00Cr9yZ`_JZdAMg00CuMyZ`_KaAsh<0K9Ji00CuYyZ{0KY6IE=00Aj>yZ{0LX#?2;D15wt00DD?yZ`_Ja*VtH00Cu^yZ`_JYnZ$M00DBIyZ{0KOavWB>pGDMq~j0|Dd%VE|(QC{DeA00D4Ry#N3LZ*pM00Iqrf0ssXBkN^MyDR8|20s#aBj{qoky?_7#Xnef@00D4@y#N3KVT`>100DB6y#N9L2LzG;00AkUy#N9M1_Y4+D5SlB00CvHy#N3KY_z=q00CmUy#N3KVZglr00DHyy#N3Kby#N3KW%RuO00DOTy#N3KX#l*|k00Ctpz5oCMbZrK{064w?00nh#VRUvty#Qvt06x9|EK~q6Q~(D6&Hz3Dw*!{~AOX?<&;S4dDPFz+2n62%%>X?BwgZ*|9|6$-&j2VoATDmcfCQI-RI~sPS^xk6Xmq{+00D58z5oCLVVu4I00Cj9t^fc5WvIRY00CsKV*mgFZ?wJu00C^&4gdfFb-=y=00VSma<0Ary1oF$z5oCLAk4l100Fz&z5oCL!r;CD0RbTbAm+Y+00Cv}z5oCLWAwfN00D0Lz5oCLVF14X00C|WzW@LMatyx!00CtZzW@LMb{M|^1Omzg#016!$OJkl%$I;rmw*5PawNY100Ct-zW@LMZalvL00D18zW@LMX-K~S00DGPzW@LMX;i-e00DGbzW@LMa$vsz00CiUzW@LMaBRN-00L}e>R|wKzW@LMWq7{;00CrzzW@LMX^6i700eSxaCM@-03N>pyj=i}zW@XP^8;o8^#f=C00AkczW@aR@&jc6^aE!ARskrmzkmP%dA7d*00D5uzW@XP?*ncC@dI!G00Ak|zW@aR?gMQA@B?oERRJj6zkmP%bmG4N00Cw4zW@LNV{^>E0QkQE00J&w=pg`DRsaA2WeC6k00CzbzyJUOZF2s<02sgk00MAg(mMc_s{jB3Z)7OI0A8U000CbzzyJUNWjw$D0syoG_5lC^DM-Ko0s*rG^#LeOz<>Y&WmLcb00DYlzyJUNX=K0v0Rm+LC||s$fb3NO00DGxzyJUNZhXK100Cu!zyJUNZHT}C00Cu=zyJUNY?QzN00MJmYQO-RzyJXOLIGc(zyJUPYj|`*zyPYi05-J%0RR~T00C#fzyJUNZp6R<00C*rzyJUNZP35~00C{*zyJUNVcfs~00DC1zyJUNXXwBH00D3AzyJUNa`eCe00D6NzyJUNAOOJt00C4&yzyOZH0MNkz1po~MdH_ZO4+MMw00AlJ!2ksS5d?k!NCFcCfB*miDf+%S5(Iw$CUx!omQSc>o0i3p!vFvQbIij400Cjr!vFvQAlSnI00DX4!vFvQY~;fL00DIB!vFvQZt%kZ00Cw8!vFvQb^OBs00D6V!~g&RWeCIo1OhSxCIc!1C<8huFsOi_r+@$fWfa5!00D9)!~g&Rbu7dH00DF|!~g&VZ)t3Dbr8b00VAwWK_fee8d2b!~g*RMFRi-!~g&Ra^S=O00CjSZ5YJ>00D0v#Q*>SZvMmo00Ctx#Q*>TV`3!505HV>00AI2#Q*>SyFkSN00F{A#Q*^Tmjob8#ee_-XHvxg00C}V#Q*>SX<)?w00C@f#Q*>SY;45<00Cig#Q*>Sba=%800DD;#Q*>SZiK}E00C)=#Q*>SAdtlX00Cr{#Q*>SZk)vc00Cj5#Q*>SAgILv00C#N#Q*>SZnVV!00C*b#Q*>SWWdD$00C^q#Q*>SY0Sj{00L!o1jPW-#Q*~UZUi#~@c;k;DdNQd0|9LWG6e7dDC)(400Cw2#Q*>Sb^OHu00C?Q#sB~TVF<00Crr#sB~Vb8d9#nE-&s0FrY60suMz6axSODU`+l0s%Pz69Xuk#()3;WuV3Y00DZc#sB~TX|Tos0suS#76SkQDZIu20s%V#6$2>3#()3;dC0~900DH;#sB~TY1qa900Cp(#sB~TVdTaD00D67#sB~TVerNP00MJkw#ESV#sB~TU;M@Z00CtN#{d8UbqvP<00CtZ#{d8Ubr{D00sx!@-T?psWhBP{00D3+#{d8Uc`(NS00DG1#{d8UUp&VE00Ct}#{d8Ubx6km00C@H#{d8WWnpxzp#W6J0EAQk00L)Y&Y=M8Y5)Kla&u#FbYW$FcWi5KWNv3|W@%z*b#ZoIYI@KR0O%V4NN50>aRAD$03fgc)UW_Vu>c~n01&nSR<;1_w*Y9m07$+7l)nJh!2mwQ0GPx8*u?-y#sG%K03OExTE_rr#{dQy0CYtFM-u>h69B0b0M`=$3lsn@6aZKh0E`p>z!U)U6aXd_04Ws!TonLc6#$kM0Gbs5%@qLA6#xkq01XxZM-~7}766JC0FD*_(iQ;L762a?03#OwRu=$T7XXtN0GAg4%NGF77Xbek00I~QCKv!h7yxG&0FM{|ychuX7yv360Baclr5OOJ835lI0OJ_|3mO168UT740Dc+(%o+gD8UP9#01g`fNE-l58vuM80Dl_*x*Gt#8vyPb0P!0D0vrG`8~|n<0HYiLrW^p>901`Q04N;*EFA!H9RPM60InSX=^X&<9RMgE04p8b60B|Azy&?d?A^;U502w0ycOw9MBLK1^0Jb9l_agxNBLFod068Q8eIx*ZBml%D0LUZ&6(s-}B>-0?09z#hqa^^RB>?Ls0PiILJthD_CIE&e0E#96ye0s@CII;+0R1KaG$#NzCjfIN0Cpz;k0$`ZCjjp!03j#M!7u>CFaQKG00=PvLoonHF#vrr0D&<8wlM&@F#z
f@0PQgV4Ke^hG5|y}0C_S1d@=yQG62Lf00T1s2QvUmGXPIB0FE;Nk~09vGXTsp0Q@rm|1$tOGypy{0C+S2d^7;GGyu3X0N^wL4mAKXH2_>S0E#sLxitX1H30cF0Q@xoKQ;hEHUNt@0FO2R$uSX1H~;}T00lV!KREzHIRJe*0D(CGu{i*=IRNN60PHycC^`TvIsj)n0Bbq`n>qlVIsnu<01`U@7CQh^I{;NX0E;^Sx;p^9I{^7R0Q@@uK0E+IJOGJ20F68V=R5%GJOCp-07g9ka6JHpJpi~p0K7c_^F09fJpeL305(1Vbv^)jJ^-6O0M9-E`aS?BKLAlb0DC_Gxjz8wKL8j&07gInc0d55KmgA`0RKP$IY9tzK>(UT0LVcA|3LsQLI7Jr0F^=j!9oD+LI4><07gRqb3*{3Ljcc10RKY(IYa<)L;$Nq0PREoB}D*NMF5aR0Lw)H3`PJ%MgV+90Juf~yhZ>EM*t5;08d8%Q%3-qM*y8i0Nh6a-$wu$NB|s209i-?T}S}5NC3D<0RBh-0!aXBNdRt10KrKB#Yq4bN&p&40BcGBZ%P2bN&v)402E6A7)tt4#pzO#tyt05DDfG)@3?P5^jL0KiTF#7+PQPXG%~08UQ;QcnPlPXLim0LM=N%TE9=PyjPf0FO`rlTZNDPyp9Z04h-cE>Qq)Q2=vM0Jl*9yHNlLQUDB608~-{SW*C{QUIz_0QXV=`%(Z_Qvh010JKv8xKjWiQ~)GY0C-dYd{hA4Q~=>r05w$rIaL6fRREq<01Z|E5mo?sRselg0L4}S$yNX(R{$qh0CZOXcvk?%R{+Xa03%ocCs+W0SOA1r0Jc~FxmWz|I0I*#E{9OP5UH~~>06ks+m0kduUI6i40QFt~F<$^wUjTex0H$96&tCuoU;sB@09s%GkYE6lU;x=*0Nr2!9bo_=VE}Jo0CQmgyI}yoVE_VR00v?JOkw~~VgQk10F`0@-(mpcVgM^+054+zdt(59V*smT0N`T)6l4H5WB^fQ0DxowtYiT3WB?Uq090iFSY-f=WdM(50MBIr(`5krWdJW`09|GPnPvc;W&qM=0M=#z3ugciX8=TJ07z#5eP;lHX8^lr0KaDd^Jf6}X8O?ri|_Z2&fI06J~}aBcu}ZUCol0IO~Q*KPm~ZvaVe08MWIf^PtYZveY*0Kabl{civPZ~!)N06K60cW?lEZ~(M$0Jv}f`EUUJZ~!-P06TF2X>kB-aR8!m0H$#O)^PyYaR3o=02OioC~^Q?asY~Q0Mc>*)^Y$Sa{w%J0A_OlYI6Xja{#Dw0N8T?+j9UGbO0K307`TKO>_WWbO4=n0K#+t6?Fg^bpT9t08e!Qk#zu-bpY6P0Nix|7j^&}b^um(09tkcn|1)7b^z3N0N8c_7k2;~cK}*<0A6ra30GN3In|T1xc>vUT010{k4SE1WdH_dy0Csu+d3pf1dH}n60P%VN^?CpXdjLp#08D!TfqMXjdjQIN0M2^=27CYtd;mIp06lyFcYFYQd;qt60K0qu@_YdHd;mRt06~2KhkXExeE_|E00Vvi27UlFegIg00Dpb}q<#R{egGPO03Ck-M1KHpe*m+80OfxG5P$$WfBfrS91g#goq0QrRg8HNBzh5$>30DOi3e}(|Fh5)yQ0PuzY^o9T!hX6T;0BVN-ZHEA>hXAjK0O*GR?1um?hyXE&0Bwi>afkq-hybOC0LO>`{D=T9i2!hk0Cb4}sfhrsi2&V+0N{xL8j1iOiU3uL09lFvpo##biU8`00PczaAd3J>ivW6y0IrJw=ZgUAivS;t03wV4S&RT(i~yC40GW&c)rx}^KjQ}=|06LBUe2xHsjsU}s0LP914vzp5j{sVa0A7y(kdFYAj{wb&0MU;C2#^2^kN`)J085YnijV+~kO0h(0ML*C50L;9kpNeb09%m&u8{z;kpS?K0Q8XnERp~)k^p0p0B4c_rjh`vk^tnA0O*ncI+FlClK_>I0GX2jls&0C1H6mX!dpl>pF{00fo*G?oBamH?KP0M3>G_m%(@mjFbU0Ew3Xjh6txmjJ_;005W(1egFqm;gqY0D71JeV72hm;l3=0Qi^y{FneXnE*PO0C1TAbeRCPnE<(&0PdLp@|gf4ngAu50A`v1YMKDBngFz#0OXng=b8ZcngAS|05Y2ZVw(Vxn*g|*0PdRr1)KmYoB(W`0C1cDrJMk$oB-sU0O*_mJDmVOodA2C0DqkT!JPobod5-%012J|GoAoko&buT0KJ|7z@7l~o&fir05qQfIG+G^p8$HF0I;6`w4VU(p8)co03x6OC7=Ljpa5&20J@+6zMuf@paAfo03x9PCZPagp#Wx~0Jfn3x}g9Rq5v4809&E}U!nl4q5!a>0Q;f<|Dpg?qX1Z=0Fk2rm7@UIqX68a02rhI9HanZqyT250I8$^t)u|@qyYY;063)pJf#40r2uxN0J5b3wWR>;r2z1y05PTjHKqV>rT}xM0Jo+9yQTpCrT_w`08gg?Q>OrtrvR0w0Ntkm;imv4r~oOb0C%VWd#C`gr~tI60P&~*^{4<}tE&LKs{r4t01vDHKCA#itN?qg0Dr6izN`SktN;Y900^xBPOSh^tpJ#<0GzD=(X9a0tpF6R02r8=3nt^h8t05Y!tWUl~duK=L00Hm(~->(4UuK*sf03xseUa$aSumGO00HUw}*02EDumBmc03ERaRgv(0FAK#!m$9wu>kwA0ROQ7II;jdvH*gz0EV&v%(4K`vH%OS01vYOM6&=$vjB>-0FJW&!?OU#vjF9@01LDLGPD3@v;da00I{?H&$IyOv;Zr$09LgCd$jA3(Kx&TVL0DHOsu(|-}x&Rov08hIBin{=fy8z$20OPv=IJ^Knya0H-0DHUuw!8qkyZ`~c00q4OPQ3t9y#SfL0G+)6*1Z7Qy#OY@04lx!W4-`qz5u+w0KmQg6~6!(zW`6a0DHdxtG@u(zX1Hd04u-%F2De4zyNK)0I$FRv%moQzySWh05-t@ZNUJo!2q$r0O!E~>%jmY!T=Z;7#NtC7#J9-sHv!_si~={si~={si~={si~={si~-`s7MePn0QziSgEP0sYnp1si~={si~={sHv%`s7M$Xsi>)`si~={NU5l)sHv%`si~={si~-`si~={si~={sHv%`si~={si~={si~={si~={si~={si~={si~={si~={si~={si~-`si~-`s7M%yczAfJsHv%`si~={sYn>9sHv!_si~={si~={si~={sYn)`si~={si{b*sI9H3si~-~t*NQ0si~={s7R@(si>)`si~={si~={si>)`si~={NSK(I7^$eKsHv%`sYrOKsHv%`sjaQ4sjaQ4si~={si~-`si~={si~+)si~=`NU5l)sHv%`si~<*m>3wTsi~={si~<*7}(g@*vQz}*x1)_si~={si~={si~={si~={si~={si~={si~={si~={si>)_si~={si~={si~={si~={si~={si~={si~={si~={si~={si~={si~={si~={si~={si~={si~={si~={si+_r7#J8BczAetczAfJsHv%`si~-`si~={si~={si~={si~={s
YsX@7#J835UHrCsi~={si~={si~={si~={si~<*7#Nrkn3xz4si~=`si{a9=&7lxsi~={si~={si~={si~-`si~={si~={si~={si~={si~={si~={si~={si~={si~={si~={sYn>9si~-`si~={si~={si~={si~-`si~-`sYn=jsi~={si~={si>)`si;Vp7^$hLsHv%`si~={si~={si~={si~={si~={si~<*7#Nrksi>)`si~={si>)`si{b*si~={si~={sHv%`si~={si~<*si;Vpn3x!;s7M$X7#Qg2=&7iwsi~-`si~={si~={si~={sYt1)si~={si{b*si~={sYnnIm>8+3si>)`si{a9si>{3sj024si~=`NEjHYsi~={sHv%`si~={sYsX@7^$hLsHv%`si~={si~={si~={si~={si~={si~={si~-`si~={si~={si~={si~={si~={si~={si~={si~={si~={sYno*m>8HC7#J8Bsi~=`si~={si>)`si~={si~={si~=`si~={si;U85EvL37#J9-si~-`si~={si~={sYsZJsi>)_si~={si~={si>)`si~=`si~={si~={si~={si~={si~={si~={si~={si~=`si>)`si~={si~={si~={si~={si~={si{basi~={si~={si~={si~={si{bqn5n3#si~={si~={si~={si~={si~={si~={si~={si~={si~={si~={sHv!_si~={si~={si~={si~<*7#J8Bm=Ks47#J9sc76?czAetc&VtVsi~={sHv%`si~={si~={si~={si~<*=;-KJsi>)_si~={NEoT9si{bi0{|exyC70!Y-M3{Wgs&yF)lD5C^9fIF)uJQFEKPJ0CHt)Wnpt=Ei)}KEif)0{bO2;>Wnpb_Y+++%E@*UZYy|=UAi}#KP+@XmY;0w0AW3v(a$$67Z*Fq{LsdjUQ&dDoZ*oOpV`wgDbZu;q0{|exyC70!Y-M3{Wgs#xF)lD5C^9fHIWI6WFEKSK0CHt)Wnpt=Eix@JEif)et1X>MgMXmD@GfDF7p3G-YKpIAJ(4WM(mDIX5*pVL3TDWHC5pHZUo6pH#sviG&2nVIRqmB7X>;xVRLzIV<<)@CMf_TVKp{5IASp~Ff%k}F*iA5G+{VmGGZ`gVq#)4F*PtXF%JMa10w(z3OYJrb9ruKC{QLQEFeZECMf_TVKXx`VK+8pHZx^4FlIAkWM*PzVKy*hG&40eVmLNtGZ6qe2qOR&1v)x&X>4U~bYEd}d2VAUMkXdH03%^AV`VZqWoBVAF)%h`IAb+oF*GwWWi(+mHDO{hWiT}p067CA02c~6I&*1kWo~p|VRLzIV<=E2CM+OECMGEWBVjc&GGsVoWH32pVr6ADIWRIaWiw%9HZ^85GBPk`G-VY4IS3;F7X>;xb7^d4ZggK^b9ruKUv+L{Xk}w-Wn?HuCMGEWBVl4QGGR9{IAmfmV`DQiWH~uEVm3HpG-fz7I5K8qH)R(9IRhgA7YaH$b7^d4ZggK^b9ruKUv+L{Xk}w-Wn?H&CMGN(MkXdH03%^#H8C_aW;HWrWiw$lGG#YpH8*5sIb|?0I59P2I59LC067RF02c*1I%98gb#5qBCMGEWBVjRPH#ah6IWRUcG&3+VFfue_W??llW??usWH&WqVPhKrIRhgA7X>;xV{dYGZeMk7bZKlTP$niR03%^AH#lZvG-Wq3FfueUHZn3XWHdEpG&Eu`IALTqFl9C!067CA02cx}I%98gb#7l_Y-}hg03%^$V`DKmG&nS2IW=ZvGGRC~G&wXdWH>iCWnpGCWM*R@068N77XmsucW-iQWpXGf03%^#F=S$6Wj8fuWH&Q6HDP5lVrDZqIW#ynGdN{4G-YEU06EwJ06IESb#8QNZDjysY+-YAbY@>MUold3ZggpFWiDuRZER9yVQe5(b7deyVRL130CjU^V_|b;b1rCfZEPh103gDMl#Yi(s=ZgVbZbZu;v001vxb9ruKUvyz}Yc6PTZ~$R*d2VB0Uvyz}YhO5MaBwbYbZu-X04_Q@Z**v7asVS?He@h2IXPlEGB;yoH#9S5G-hEjH#B25VL3Q3HZnLe{{a9xI&x)VX>Mcyb98cbV{~+8Uol@XV|8RbZ>HBF<&uqWnyV=WG-lQZETPO03gDlW@R;FF*Y$VWMW}6H85l`Fg0Q~Ff%tdW;kPHGAjT(1}^{^2s%1rZ*^{TC@COgZ*FsR03%^IWi&D~Ha9geWn^SIFf(OiFf=(bGBYtSH#TBnGGbvZ06PZQ0RTEWQ)O*oaA@39b1rCfZEQaP7!x`=P-$>wY-w&~D069gWnXkGAW(B4CMF;*E-onmBVl1=I5#sjHa9k9G-f$wWH>Z8G+|^hVPiEhVl_BsGi5OVIuSnr7#%t~P-$>wY-w&~D069gWnXkGAaitNIy!W9aBO8fP;)LWEZ8IAt|rGh;C`HDX~lGXOdpKL8jAIy!E3ZC`9@Ze?>QDIjBSZgX@1BVji=W-vB3Vqs%sHe)eiVq`KnV=^{mI59akIW;w9IW;u^ItD)g7zjE#Zgp*6aA|O5b0{ewV{dMAbO0k^WM*Y%Fk?0_W;kXzVPj=7HaBE9V`MlrG&nRhH)dutHvl>YKL8j4Iy!P?b7gcWDF7p3Gchq_I5#w6IXGlxHfAtnWo9^GH)J?AIb}FxF*RajIRH9802l~5I&O7sUvzJ4Wo~mQDIjBSZgX@1BVjl>VP-ZrF*IT}HZf#nWHmWCH!x#iV>DwoGh#DkW;r_mItD)g7y>#vXL4b1XecQFBVjZ%GB9FfWH@3tV`eZlV>w}AV=-bmW??WeFl97hGc`Q`I?MnMY-w&~0BmV)WiDuRZEU&&0wBV>AVY6%WNCD1Z*F01AXH&4l9Ght;iWnnaCG%;mmWHL5mF=94mHZU|o07FUuAqYA;b7gd2b#N$iX?kT}bSVHMVKgyiHf1(4W-&5iW;tXzWMedDVK+23Gcsm2I5IV4IYaI5{>qH(@blW@9vBF*ag1Gc;pkMgT(;N&q1iIy!G|UuAe{bSPJEa%*LBR%vB-EFe^2b8A*zBIy!!1b!1^iY;Si%VQ_SHa%FNTDF7p3I5asjWH&c5HZx&jGGQ?_V`F79F=RAiW?^MDVly)_O#niZ03Z}PI%9QYVMc6kcSB)tbairNawubUWMM{ZZ+AmsaCCKYWpXAaDF7p3Vr64tHaRmfIX5yhG&Es3H)A+9GGj4jIAk+nVmV@BPXIy^lK>zTIy!G~WpZJ3Z*n~-V|8RYDc_=9$V{dMAbO0k^Gc++XVKOjbH)A+5GC4CgHDxhnWjSVIHDP8nVP-L9QvgB+lK>zHIy!E3ZC`X@b8B-bDIjBSZgX@1BVlD?I5IXjI5uW6H8?P0VP!EhGc{y6G-hUEWnwpFFfvsDLI#rnAObo%V{Bz%awsVPBVjN$W-&20HexnmH8nXfI5RS4Fl8_~VK6i~W;kIsGBQ^HLX!X>6*@X(b!~7cb97`nI&X7ya%Ev{CMh6eZ*FsR03%^JFk~?@HZ@{nV=^~3G&eagHezLDHf1(AH#TEAVPQ8}0
74U!03a1QI%IWia9?g=bZK^FUuSY*aA+uVbYwa@Z*z2VWnpb5DIjBSZgX@1BVjo=Ic8-tVK-uCGGj9~F=H?_Fk>(`G%_(TI59alV>ep>LKBk!AO$))Z*C|qWHDuBUjRZ4lK>zBIy!D|Z*V9n03%^FIWsdeH8(aeWnncjH)CaCH90pjGdD3aH)UoyGcz+`078=hAPPDzlIy!A-ZDVkGC}VYGVN_vrYb+pkZ)s#IEFfcVZgX@Xc5i89Dl8y#X?kT}bSVHMVKOjeVm35oFgP}1W;ii8GGa3|FlAw8HDowAFgIalIcESuB$EIj7CJg(Z*X}iR4ObWV{dMAbRbkJEFg1fdSzd9DF7p3VmB~mF)}bUFl9G5Vl_B8W;J6nGh{e7G+|*hWi>N6X#heLlK>zfIyz%-aCsWo>h1bSQRjX=ExaAZc!NEFg1fdSzd9DF7p3Fk~}0IW{$7GdM72IW#b0W;SDHV>UK6H(@e3F)%eUZ2&?NlK>zdIy!A-ZF6OGC}VYGVN_vrYb+pkZ)s#IEFfuabSxlqX?kT}bSVHMVKiYkV>D!BI5B25WiT*hIAJtnH#0LdF*jmkVKXu?HE#ey9FqVbB|185WpZw1Y$#-DZ8Iz&WNB?PEFg1fdSzd9EFeZKAVG3xb5tNECLk^@E-3&bVK!zkHe)wtHaRskWH@9tH(_KrIc6|rWMpP#GB`D5IdK3&Ba;9iFFHDFWpZw1Y$#)OWMNccb89RhWNB?PEFffQZ8Iz&b7^{IUvw-WMl2vfa%Xc?ASNatE-o%903%^IH85j0V=yu}Wid4~HDqEjWMyGxV`MlnIb=98WoBY?075O303ZW8I&*1mXKZC(bYXLAC_^a#BVl7XG&Ey3WHK-?H8y21HZwD2W;irvF<~+_Vl^@{IW~0wLIINiAPqV?b7^j8Y-L|`VRLIJV|8RosF*##3VK!nkHZ(P2Gh}9FWnyJAFna((5t9HQ3_3byZ*pH{VPj}tX>Md?cqmCMAW1ABNh}~kDF7p3WnwrtH)b?4Wo0vAHZnJ3G%#i{V>x3uG-Wd}Vqr65eE>oVlK>zVIyz==a$jX(V`yJ#Ze(S6C}VYGVN_vrYb+p1EFeiNAW1ABLn#0wVK6mgVKruAG-NO^HD)npW-v8oWHDtlGcht|V`MWmH-7*^6q5iT3_3b=a$#CF0074Lx03aMXI&^YjZgXaDa&0JMb!1^wVRLINAW1PSAW1PSAW1STAWtkHLn#0wVKp{5WMgDxGBP$~H)S+7Hf1tpWic=|W@BVzWi@1CIfnp38j}DZ4mvt*VQXbyZ*X*JZE1RCawtJ^XLD2_CMF;*E-onmBVl1NWH2!{Vl_5nFg9aiGB{;nVrFGxW@KbHWMX16G-QbYLJX4tAObo%V_|S~b#i4WDF7p3Gcjf|IX5>lW;8c3GdMIjV=-bdW@a&AHa1~1WMgA9ivU8C03Z%JI&yVxC}VYGVN-N+Wnpb!bSVHMVL3H2H90nBWH)0pHeq9BWi(?lG&MP7IA&!tIb>lnIE?^843hvL0y;WwVRUJBWnX7V?GE@*UZY?cHBFJo_VWdJfVFk@siWI19uV`4KnV>mQ0Vl*{1WHn?rHDfY0Vqz|6bZu;-02n$tV_|G;VPb4$UvvN?VKrl8Gh#P3W-&H1H#ufBV`4I8V`OD!Ha28sWieu5Got_)Iyz}?aCLNFbO0k^Wi@3vWMnZgFkvtI&XD!aCLNFbO0k^IAJ+CVL3Q3W-w-9HezEnH8LwC{Sr|WmI`^Wh@|LVQg$+Vr*q!bS5S#03%^FG%__aF*rFjV`4coHZx;3GC472Fk@voWHB-}IX5($06G?<02l~5I&^t(Whf~iV{dMAbO0k^I5K5sIAu6yFg7$aG-5I}F=Jz9Wiv7}W?^GuW;Qorod7xpqW~BRIy!WDaAhb^X>es!d2nSZ03%^$H(@hlGGsF`Gi79CG&D0}Ff=)3IWsqAIAvovV`Mm=06GYx02l>2I%8pMY++(-Whhl9CMf_TVKq53F=jAfWin)9IAt&}W;9`BG-EU}VKO#hWMXAuWuX8%1K0roIyyvQbYW0waAg2vY+-YAbY@>MUok{sbYW0waAhuNbZu-U0stVwyC76$cyu68a&Kd0b8~5KXCP2%aAj<1Ze;*eWq5Q@a&Kd0b8~5KXHaQyWo&6~WiDuRZEUvyAp$x&ba`-PC@BCVVKrkmG+{AhH#sz8Wn*PFWn?lkHfCZmV=*)~GcsdhVx<5>w*VmmIyzEeZe(w5P+@X(X>@6CZe?;PDF7p3HDO{hIAJ$2Vl+2nWiVu9Wo9uqHZ(LgIb=3BF*P$}rvO8@03irEI#OY7WN&RyVRCe7bZKvHWpXHUX?kT}bSVHMVK*{nVK-$sG&yEwVlZPlVm382H)UjFH!(S5Vq{@8FsT4T2Dbnq4LUkfVQyq^ZBSuybZK;HZ*FCBD069gWnXkGAVVf5DF7p3Gch$dVP-OBF*7qbH8y55HZm|~VPrBeWMyMyIAk<1s{lg_w*VmyIyzEeZe(w5P+@X(X>@6CZe?;PW^8X^bSxldY;R$7DF7p3H8?hAG%_@6CZe?;PW^8X^bSxldY;R$7EFeQBCMf_TVKX*nV`DWqF*Z3dHfA<5F=jAkVlri7IWsdjVly*0Hm?9f5w`##2s%1pY;b5{C@COgZ*FsR03%^GH)UmEGh#GjVKiYfIWjO|Gcqt_H!(M6W-~E0Ghtz|07C}103irEI$~vXVJImeV{dMAbO0k^IA$|9I5adfVlg!`VK-(lVmCB5Heon2IXE(8F*0Lfvj9T|w*Vm)Iyz%$b#7~4b7^{IUvgn?XJsgJX?kT}bSxlqX?kT}bSWTXZ*FsR03%^HVlp^0V=^&gVK*{1Ha257VKFjhFl9J0Vlp^pWMwh607Dem0RTEWQekdnZ*5Rva&&2QX>V?2asXp&VRLhIW?wO1F;Zb}WN&RyVRCe7bZKvHWpXZPbZu+~0stVwyC76yb89VdVRB(?Y-Ma9Q)6LlVPb4$AW&&=Wo&6~WdKxRb8Apxa$#(2Wo%PpVQgVyY-LbsaAj<1Ze=cLbZu<90{|exyC6w!bY*g3V{{;QX>@2HbZBKDQgv>0X>DZyQgv>0X>DaxVRLI~Zf7oNbZu<90{|exyC6Yxd2VB9a&K;Lb#ow8VRLI~Zf5{Nb9ruKRAF;#X>MmOXmo9C1p)vd!n+_)VRB(?Y-Ma9Qe|XyV{~b6ZU947L_t$jL{epBbz@~NXmo9CB?15-!n+_wX<~0~VPj)yAWn5{Vr6mwW@%z?Zee3%X)b7VZETkW03gD4UwVRLI{Y;Sj0X>@2qRYXB9Xmo9C1_A&e!n+_)VRB(?Y-Ma9Q)6Ln08n9aVQg$=Y*S-lZZ2qaZEU~*D>^!FbZBLA03%^!F)=VbZ>HBF<&uqWnyV=WG-lQZEVZ{6goO%VQg$+Vr*q!bO0k^VKZbfGc;s5Fk)jcF=R9{WI1GHH8o~9GGt{jWMML4%m5SuIyz8kaAhbd03%^!GGjA0GB7tZVqrC8VPRupH8eOeF=JvhI59LaH8(KA05;436c0K&P-$>wC{Sr|WmI`^Wh@{=CMGEWBVlD?Wid21GB7kaW??ZjH(_KrGcq`4Vlgu{H#cKsW-`
P8HVw=G6bL#xba`-PC@COgZ*FsR03%^AVK`xAIXPrvVm3BmWH4f5VK+85WMMQnWo9ukVlpwt05%5902B&3I&^t(WhhW-aAj0^aAhd~BVlAYGdVUfH)A+7VL4?vH)UoqWMnlqVPrKnVmLHqGc(8lHVDiB6a_juV_|G;VPb4$C{-pVDF7p3G&5vjHDP5mVqrBlWivHpG-NPgH!wA3H)dfnH8wal$^bS4*Z}}KI#6kFWdLJrVRLhIW?wO1F;Ho6WiDuRZEW8F6#_asM{;3sXecQFBVjRPWHVx6WjJGDVq!EkI599{GBsmlVq|1AWjHx7FgDEqH{Spi4mvtVa$#_2C}VGKb95j_a$#_2CMf_TVPs`FH90djW;8irW;iioIXE*kWiUBqI50ObGc#p2G0y-u4Br412|7AQa$#_2C`WQ(aA+nbDF7p3HD+UBV_`KjHf1(9HZwA2W;SGGGh=2lV=yu}G-GCD(Ev9G-vAW?Iy!zwa$#_2C@BCVVKg~2VPj)AFk@q3IAbtjFgav5V>B=~WHLBoH92KvGSdJz-vAX3Iy!G~WpZJ3Z*n~-V{dMAbRb7^VQ^?BDF7p3IW#q8Vlp%^WMnmDHaIjeWimK2V>L20VKHT4H#B85)c`jP-vAW}Iy!G~WpZJ3Z*n~-M{;3sXeK5p03%^JF)%b`GG%0BIb|?1V>UEoFflMPGdDP9I5A~pIbk{105=EU02K&2I%REeba^N#AY*TCb94YBVKXo{H85o{Vr4QmH8*86IWagjG&eahV`MX7W@ctEHrW6-2HyY`2s%1*X?kTSDIjBSZgX@1BVji=FgGzUGB`OkH8VIlH)dftI5syjWH~V~G-NnsWHH+SHwNDT6#_asV{Bz%awsVPBVjl)F*9W}GGb;kW;Zl8FgGwV{Bn_b9823F<&u9a$#_2E@*UZZ1w;d0y;WDb9ruKRAF;#C@BCVVKz86I5}fCIb%0uH#sz6IAUcoWiewhI5A^5F*qUNAVK+B3WMMUBG&MIdG-fw3V>mH0H)c05tYbGWs03%^FVKXsiI5lK4Wo0!qGG;b1Wj8o9WHmK0Wn(uuI5{@v06Poz02vZGI&W}ga$$6Day=+xZ*FsRAVG6^Zevtob899k03%^BG-hEjIAk(6VK6ddG%;l`F=8}0WnnNkWjHrxHDfjC06P%&02vKBI&W}ga$$6Day=+Pb9ruKRAF;#CMGEWBVjmXV=*{nF*jy2VKrnoGG%5oWI1CpFlAz9H(_LDG-2uhI}7#z83;N$Wo>YDc_=9$V{dMAbO0k^Wi&ElH!x&lFk>}kWMejDH8^H8H)1n1HZd|aW@BM9>;O9k_5c|IIy!P?b7gcWDF7p3H)Am}I5RahH#cTtWMeU6V_`EfIWS{pIWl86I50Rk?f^UX02v56I%r{YXkT_=Y;|QQDIjBSZgX@1BVjl)HeoVlVL324FgY}4HDNL~V=`oBI5#$AWi>c3H8$`7I|lXu83;N$b#rB3V{dhCbSNnxV{dMAbO0k^HeobjH8e0YFlIG3WHDuCFg0T^G+|{hF*q_cF*Y@1@&G#q_5c|OIyz}{Uu17?Whf~iV{dMAbO0k^WoBh%W??fhWH)1BW@Kh!VmD-CIAmpJH8e71IAt|4^Z+{s*Z}}KIze-JZevtob87%&Y+-YAbY@>MUok;*d2VA=VRLIPXmo9Cl>h)QV{dY0FLYsZYi4Y3cP?mfZ~$X(a%EpKbYXLAW^8YFUpQ!Ra4u+cZEVp4054%|XK!+8bZBiab7Ns{E@*IY0CQtuZeKWPaBwbYbZu@39FLGsMbz@~NXmD@(*Z*yT{Y-Lnob8BgCXD(=TZEOPpA_6)(ba`-PC@BCVVPY{hWMgD9H#In6WH2x_IbmTnVPiKqGBRc}F=jYnH~RoY0|6oeIyyvoZeeX{V^CpobZK;HZ*FCBC@BCVVKQcBGdDOiHD)w4I5}iyH85r|IXPx9Heoq7H90V2IQ;-b0|6okIyyvoZeeX{V^CpobZK;HZ*FCBD069gWnXkD03%^zF*!0aWimH8He+I8G&5v4F=aG0HaBBAWin$hV>dSc07M1@0U`}LIz)MHVQpz+P+@X(X>@6CZe?;Pb7^{IUvw-WLnbCE03%^IGGj7jWj8o7Ib}68Vl+22WjSIoF=a40G&EvkGGk-`0YnSf0RTEWM0svuZE0grVRCe7bZKvHWpV&xY+-YAbY@>MUok{^ZeeX{V^CpobZK;HZ*FCBE@*UZY#{+4Iy!J^aAjX~03%^$Fg9ajHZ(P6IAJ$qG&eanV_`KjWimH3VlZPjH8eLN0U!c8I#Xj|Y++(-Wl(8wWo&6~Whf~CBVjT&W;JCuWj8iBVmCKtIb}IDF)%P=HDoq4HaBEqH!%ePLLmVl2s%1bV_|GzVr*qlX>et1X>MgGb7^{IUvwz|BVjORIXPirGC4P4GB#p4H)J(5Ibt?4H!?D0IWc88V_^pYLIxoLAPzb@Q)6LlVPb4$P-$>wY-w&~D069gWnXkGAW$qIP$>W-VK_K6I59Y8HZnM3Vq`R7VmLQ8WHmN6VliYgWHvHnV+jF53?Ts^89F*sV_|GzVr*qlX>et1X>MgGV{dMAbRbh>VQgVyY-LbsaAj<1Ze=DZ03%^BVK_KBIAmrtWHK@`G-NeqIAUZnW;0=AF*IW^VPZK80YVoc0U#7QI#Xj|Y++(-Wl(8wWo&6~Whhf)VQgVyY-LbsaAj<1Ze=DWDF7p3F=1jiG-hEnIXPl5W-&B4Ib>!tIXGixFf%z~H#2254FN(DApsy6Iy!G~WpZJ3Z*n~-V{dMAbRbh>VQgVyY-LbsaAj<1Ze=DZ03%^DF*i3bGGSvdVq-XCI5se2HaKB1W-&QsH8(OhG-EXn0YVoc0U#7QI&W}ga$$6Day=+hV_|GzVr*qlX>et1X>MgECMf_TVK*{iF)}k^W;8Q0H#Re4WiUB4V=^!?V=^{1H83?XV-W#D5+MN~2s%1$b!}g4X>Mh6C@COgZ*FsR03%^FVKii9HZ?XgIAu6xFg9f~HDh8lV>LH9GGs6`G-G2E0YU~L0U!uEI&O7sUvOz~WpgMgAY*TCb94YBVK*>iG&yE9GBG(gG&eRlF)=eUWHMngGB#s4Ff?RgF%LA}IWc57IAb_CW)}fMApsx;Iy!P?b7gcWP%I!&DF7p3WH>lDHe@s~GBq|hVL3TCFgZ0cW??j8Vl_EoWMg4s8395BApsx`Iy!P?b7gcWb7^{IUvw-WP%I!&DF7p3I59RgI5{*mF*sy4G%{mlVPP;aG%;l|VKz2oGd4A08v#NLApsx=Iy!E3ZC`Y6Yh`Y8C@COgZ*FsR03%^GIWu80WnnclV`gDvIb$$lGBPk^HDWViGB{;6GchwA0YU~L0U!c8I%jfWaA+tg03%^CW;8WnG-F~nF)=n~Vr4mEVq-NlGcYw~F*9Q|GGRC$0YcaT06IETV_|GzVr*qlX>et1X>MfzV{Bn_b9823F<&uLV_|GzVr*qlX>et1X>MgMXmo9CCIKJ?Iyz%@WMNZdZ*XN~L}hkqV`V65ZgeRCBVjNzWH&WsGGj1eFgP@0IAu99HeoO@VL37~GcYnXH8UasLIWlNAObo%eq(iHVN+voaAjmfWp-&}
Whf~CBVlARV>vQ0GG;U~WH>NnF=jV3Gd5ykH8M6eF)%PTWo0A*Lf8QSIyz%@WMNZdZ*XN~L}hkqV`TtiY+-YAbY@>MUom5KWMNZdZ*XN~L}hkqV`VOAbZu-g0U-i9I&^t(Whf~CBVlAVH#sq5F)}tcH8N&0WH>cuGchnXV`VsEFgap3I5Z~#Loop%0y;WVbYXO9V^CpobZK;HZ*FCBC@BCVVKZVlVmL4~FgP@6CZe?;Pb7^{IUvwz|BVl1SWMN}sWHd2mH!?9ZFf(ChWiU87W;QiuIc7FBG&w5)Lk2McAq_e@Q*>c;X=6}fa&&2QX>V?2awv0YdSzd9EFeQBCMf_TVPiCAVl+5nWnnQgH8L_ZHaImmWic^1IWuBqWi~itHZ1`|3o!v99y&Tc;X=6}fa&&2QX>V?2asXp&VRLhIW?wO1F;jG5bZKKyVRCe7bZKvHWpXZPbZu;v001v#cx7XCbZ>GlXmD@QXmo9CAp!s(!n+_;WnpaqbY)>}E@*UZY&`)M2s%1xWGE>hV{dMAbO0k^FflkVWnncjV`DTnH8n9YH8*8pVP!NmF=90^V>e_tH32vVJpmR7Iy!V{a%Ev;C@COgZ*FsR03%^FHDhIAWHd2hIAdlpVKHMdW@I-qV`MltIWS^nHZ^890XPOd0Tu{4I&pPnb!A_3X?kTSDIjBSZgX@1BVjdVF=b;kIA&xrGC4J5GBz+}IA$|5HD+cqGGaM3VP!c1I0iid76>{zadl;NWnW`qaA9LxsVl-knF=I4lVPj@tV>vK3Wn^YDIbmgEIAcQrGCDd=Pf|@mOaNgqWim53H8MCdFk(3|H8e9ZHe)wqFg9f}VmD%8Ff}nl0Wvx|Oixz;VKgyfV>vWqVq`dFWnyJvF*9RjFf=(aVlrW4F)?B|I70z4Iyy~3SO8%%HeobkG&Es2GdE&pIX7ZCH!?OdI5jXiIWaO~G+{M$1OQNSX>W3Aba?=7VQpn|aA9L*bY?DSbZu-SVPiRFG+|_9IA%97Hf3ZpGBaW@Vq{`sV=-hnIWsgiW=jDwIyz8HK|@7IPfSEbQUGCPH8f>5W;SABFk?4kVq!F9VmDzjFfle|GG;M1I5aX#0Wvx|Q&d4zNkaf(W@R>JIXPu!F=b;pIb~*IWMeToVL3NAHe+TnW?^GvO93)EI#X3bModpv0AVyXVr4ctH#s*sIWRM1H#jt5H!xvkWn(loWnwWjVPs1IGCDd#PfkQhR7p=x0AV?1H#Rt9Fk)gcV>4wqG-F{fI5uH8H8L?VWjQxEVPi`HGCDdT3Gczz^WnwpCGh{hAG&VIhWiVkhWi~crWH~lV0Wvx|RZc`jMoCUZL;zthWMVWmWHMniF*GqTFfn3eW@R)uWivA|G-GCBV_`FO1OQZdaAg2)VQpn|aA9L*bY?DSbZu-SVKFl`H83Iyz5cb7gXNWpY$`aAg1^VKF%}W;bJEFlJ$5GG$^mW-((pG-ft1G&VOdHZ(IeWKRJyIyzHSK~zaY0AXfjHfA|FWo9vDV>vlxW@2PxF*#v5H#jz9W-(@AV`EPNGCDd$Sx!MsNkaf(FlJ+8IWS~lG&nIaV>mZAVqrOEIb$(0W;bLuFk)jl*Z~kaI#6M9bZK;HZ*FCBRC#b^03%^DGGsL|G%_`2VKOsgWH4l7Gc_?dV=*>kW;0@9He)bS0Wvx|P(e~bOiV>g0AXZiF*9K~Ibtv~HDoqsFk)k6G-fn3VPY{jWo2VHWnxkRGCDd_MN&yYOaNgwVmL4~VKO*mHZe6cVPZHiFgG$dH#IXdFf%u1F=Q~z002~ZaAg2)VQpn|aA9L*bY?DSbZu-SVPax8VKFgfG%;pmG&MJ7HaTTCW@a=sWHe(pHaRsmG+O~OIyyy7P*hm}VPau2Vl*{kWMeTgWinwjVK!wrHeq36GGSvjIWuEoVp{<+Iyy*CQ&a$9VmUK5F)%SPI5{ygI51%}G-Nn6VmUT9H#0e7V>V$mTLCgUI!#4QQ$d8kV`VuuGB!11G&D74F*Id2W;J1AVKiF-GCDd-MN&>hOaNgsHDfU_GBG)0G&N&mW@0sBFgG+bG-hNsWidB0F*IXa0Wvx|Q&mDnOixz;VKg>kWi~lCIX5{uFf(K~I5c55Fkxk7V>C5oVlgyfWLp6;Iyyr^P*hb?MF3%9H!@~0IAt(5Gcq$aHDh68IW;*sVl`tlGBGtbVrDs80Wvx|RZc`jMoCUZL;zthWMVWmWHMniF*GqTFfn3eW@R)uWivA|G-GCBV_`G70svHbaAg2)VQpn|aA9L*bY?DSbZu-SVP<7wV>LEmWj16rH#RjeF)(B>WMnZjVKrtjG%+(WW!M1_Iy!G+b7gXNWpZD0bYX39UvvN?VK`%EW;9}AWH&iCWMpMDWM*MvFgZ43Gh<^mH8(gjGuQzTIyyvUW?^+~bWmY(bZK;HZ*FCB03%^GHexq2H#at8H#j&rIXGcuWjSRrGcsZ@WnnpDG%z*T0T4PmV|8RD+cx7XCbZKvHP;YE$V|f50VK8MfHaB5nGi5PiVP#@5IW{t8Gh#DgGdDD3WjQr9GuQzTIyz}{Uvyz}YhQ3-a$#+AUv>Z^VPrF5G%;ahV=^>3Ibk_9F*#&mGi5h7WMeaBIXE;qW7q)@IyzKAQ%he|Sx`k&03%^IH!@>pGBz_|Vm3HoVq-UBGch(t5bz)|0Z+BmGVRLI=b^s$`WoBh!IW{mjHfA(oIXPx9Wi>cqFlI9{F)%SQVKp*i*Z~kaI%#uXV{dL`X>@6CZeMg^b8BCA03%^AFk@jdW@9pBWiVwnH#uQ9WI19uFf%zbG%zzXGh}Ai0T4PmX>(s~b!>ELUt@1>WNCD1Z*E_7VRLI=b^s$`H#lQ7Fl1vmH#asnGc{r`W-?VRLI=b^s$`F)?OiIb%6EWnwd7Fk~}fIbmTnH8nXnVliekWHvE0*Z~kaI%#uXaA9(EX>@6CZe?;`b^s$`H#1{0FgGwYW@9xpH#Re7F*!A1Ffd{^Wi~Q4Wiv1_*Z~jMmIRAF;#RC#b^DF7p3Ib&fpIAmisV>UK6HaR$BV>dH5Gi72hGBz?XG%#gmW&tt?*Z~k6Iy!G~WpZJ3Z*n|5D06gVIy!H2baG{3Z6+)rV{dMAbRbk=b899k03%^!GBGw`WnnaAFlIJ1IAt+3Ib}3AGdDD0WH2!`VlifD0Wuoc0T2#4I&^Peb98cPZf7V@VsmA3c4cx@d2nSZ03%^xGiGBjHaR(BGGj7gIA$_tH#TBoGdX5CF=AygVPj%y0Wu8O0T2&5I&EQVWnW}rbYWj`X>esIP-$>wRC#b^EFeQBCMf_TVK_NAHD+NoH#RabI5;;rIbmdCFk&_^W-~Z4FgPZgp*6WMy_~V`XzFDF7p3VrFJxH!v|ZHDNVoF*Yz_H)CaDI5uTvH8NphGBGnYZUHjb0T2Q@I%9QYVP9uubYEm;c4=c}C@BCVVKOr{W;J0sV`VZhGh{V0IAvvHF=jL|F*GtVF=RA1GjIVi*Z~j)Iyz%@WMN-(WprO;Wp-&}WhiNGbSVHMVPs`8IWssiI5}ovG&x~mW@9;GGGaGkFgY+}Wnp15GI9Yj1K0r&7CJg(b!1^*XJvF>WMy_~V`X1(a&K^Da&&npX>N2ZAY*l8VMJ
wiX=7zja&K@ZDF7p3G%zz_Ha9soIW%T8V>mTqI5jshHZx*0H)S|9WHmN6bOACH*Z~j)Iyz%@WMN-tWprO;Wp-&}WnXY|Z*XODba^OgZgeRCBVjOQI5J{5VL386Wic`{G&wRfFfunbWivEoV>xCxV>osJG6UEF5GFc0V|8RdWtVPQ8kF=S>pHexk1FkvudHZU?WI5&6!G9=gm5Cu9qV|8RUu0!=X=7zyZDDv{bZByAVPtb(aAk5|Vr*|?Yba@MbSVHMVPa-DF*9a2VPiBhG&y83VmCE1WHDklGBjp6H!x&nVtN5G1K0r&1v)xob!1^*XJvF>WMy_~V`X1$VR&D7Uu0=*UvOn|Ut(-;V{0gBZgeRCBVlGZF*r10Vlgr}V>C22W@R%tGcz$YW;0}CF*RXhW;T2QG6UEF5Cu9qV|8RUu0!=X=7zyZDDv{d0%8{ZC`L@a$jO>Z)0mHX>N2W03%^JVPau5W->W3Gc{s4WMMKnFflb_GGQ||H(_EiWH&H=0Wt&F0T2Z`I%9QYVP9uubYEm;c4=c}Uu|J{UwU6;X>DI{WpZC)Y;R+0C~0nVDF7p3H8nUeW;HowHZwCeGBsi{W-~N2Vq#`AH#cN4H)AqpfB`ZC*Z~j)Iyz%@WMN-tWprO;Wp-&}WnXP!cwcy5WNB?*aAk5|XL4y|C~0nVDF7p3H90pmH#uWCVL4%9GchtWG&C_dWiv2gHDO^lG&DIef&nrE*Z~j)Iyz%@WMN-tWprO;Wp-&}WnXP!cwc#6WNB?*aAk5|XL4y|C~0nVDF7p3IWc54V=*=`Ibk<7G%`6fIb>utHe@h3VmD(nV>UE3gaI-G*Z~j)Iyz%@WMN-tWprO;Wp-&}WnXP!cwc&7WNB?*aAk5|XL4y|C~0nVDF7p3Ff%tZWiv8iGB9H|W@9-pV>D(kG&V71Wi&BjVmUE5h5<4I*Z~j)Iyz%@WMN-tWprO;Wp-&}WnXP!cwcj9ZC`L@a$jO>Z)0mHX>N2W03%^#V_`HlVPi2jF=I42G%;Z}HDO_9WMpP#H#IaiVrF580Wt&F0T2Z`I%9QYVP9uubYEm;c4=c}Uw2`0a9?w2dSxhSZgeRCBVlD>F=8@dW-%}~Gc-9iW-v52WHx4EIXO5qV`gSFW-*EZG6UEF5Cu9qV|8RUu0!=X=7zyV{dJ6b#!H4V_|S%Vrgt?ba`KGVQO!3C~0nVDF7p3F=k>gGdDRiVK!noH!w9YGB#p3FlA;qH#IpqWHe+oi~%wO*Z~j)Iyz%@WMN-tWprO;Wp-&}WnW`&ZE$sTWnW`qaA9I;Y-x0PUu|h_Z*nMUZgeRCBVji&W;HQ4H#RmkW;A9uWidE8I5cBpV>LHqHZnCfFfxt-G6UEF5Cu9qV|8RUu0!=X=7zyb#7^9X=P+zVPs@-Wpi_BZf7WIZgeRCBVlASI5{#oG-5PlF=H__GBG)1IAkz6IA%9FHf1t5V>OTgG6UEF5CS?nV|8RUu1G=c4cy3c4cyNX>V>QDF7p3G&5miH8*B3Wn*D8VPQBkV=y#gWn(jAG-P2pVPZHkk^wT<0T2Q@I%9QYVP9uubYF6HZggpFWnXq>a&u{KZYU`LBVjW!GBr41WH2~6V>V_uG&f^7V`MfnHe+NrGGsP6G&z(3GS~qS1v)xob!1^*XJvF>W^!d^Uu|V=C~0nVDF7p3F=b|EV`e!rIbt_uG&ndpGBsv7Wic{jH#B27V>U8kmH{#Y*Z~j)Iyz%@WMN-tWprP3Z**a7Uu|V=C~0nVDF7p3H8^5oV>o0rWH~i8H8?V4HexkqH)dpEV>K~kFflMI0T2i}I%9QYVP9=wY;131Uvp?-a%E&Fb7^{IUvwz|BVjmYWn?flHaIn6Ff%Y_V>C23Ff%eSGhsAkW;ixxF)^M2G6vWI5D7XuV|8R7~H8f;lV=*>3Fg9i|WH&OR0Wt#E0T3!WI%9QYVP9=!ZDVkGUtx23Zeu88b!1^vbaG{3ZC`XOAa-wQWGXBmV{dMAbRc$bX=ExaAaiMYWnXkD03%^xG%_|eG&eM2IWjS3IAvrvWI1JHI50OcW@Ke#GGR2N0Wv7q0T3cOI%9QYVP9=!ZF6OGUtx23Zeu88b!1^vbaG{3ZC`XOAa-wQWGXBmX>N2ZAaiMYWnXkD03%^GHZe6dH8f;0IW;t8I5{;pH8nIbF=b^mG&E#6H#RV)0Wu)i0T2#4I&^Peb98cPZf7WCb!1^wVRLI#d2nSZ03%^$GBIXlF=b(9HaIY3Vq|1vFf}kUWo9*GF=07jHa0Y<0Wu8O0T3WMI&W}ga$$6Day&dJb97`nI&X7ya%Ev{CM+OhZ*FsRAY*l8VN_vrYbGfGBVjpXV_`NqW;QWoH#st4W@0m8WHm4`VmB}|WH&iAV=<}$G9K6g5C}RtV|8RI0T2#4I%9QYVP9r%a$jX(V`wN)CMGN(Nh}~qEFeQE03%^HV>C25V`XD7VmUK4Ff(E}GB_|~Fg0OhWHU8oIc7Aj0Wu8O0T2;7I%9QYVP9r%a$jX(V`yJ#Ze(S6C{QLQEFeiNAW1ABNh}~kDF7p3Ffn0dH)CZsGG<{mW;8NpGG;PnVrFJDHZ?h7H#j&numLg;*Z~j`Iyz%@WMN-)a$#vZ8vH>y=*Z~k1Iyz%@WMN-)a$#V`4R8H#Ie8H(_ICGGSpfH8M3cF=RDjW;rurHDMtBX<=+>dSzr^a%E(7V`V5%CMGN(Nh}~qEFe@WEFe!TAa-wQWGX2DBVjZ-GG#b4F)(CeFf%e?FfunYVL3TuWH~T3G&nXfHZiyXG8fnZ5F9!>V|8Rlya6&A*Z~k2Iyz%@WMN-vZewh9b7^*EUvpz&ZYWSDCM+OHEFeiNAWtkHLo6V6Z)s#IDF7p3GG#GhG&Ey4G+{6~WiVklG&wb4HZd_{VmVdH3GdMXmH(_QlHZWshH!(CbHa5foG8xzb5H&hFV|8RgOCM+OfUu$J~b6;t6EFfWDYh`(JUukqKAYos2VQh0>X>=?gVqa@zd2?TBbS&b4rYQg;VPQBlWHe(oIAk(oVr4mIHf3gGG+{AiVliPcF*P(bIK}}oGuQzTH99(Db!1^*ZDn$2WhhW4CM+OfUu$J~b6;t6EFfWDYh`(JUukqKAYxx@WqEU7X>=?gVqa@zd2?TBbS&b4rYQg;VKX;nV>CE1F*G@2GG;I_Ha0mkH8U_XGh{b4Wn?!pVaNe8GuQzT3OYJtb!1^*b8m8VUt)D;W@U0;b7^{ID0OagX=iR_WGMh6VL4+qIXN&kVKiiBWHDtjWH4i7H)UirIW}Z6Ib>mGWXb_D2-pD-Av!u^b!1^*b8m8VUt)P*Yh`&TP$niUAWL6qbSxlCUukqKAXZ;#bSxl4EFgAoX=ExX03%^DG&V6}IWadmGBPn_WH&N1Fk@vhVK8MhH8C|dV>f2Z0Wu%h0T3HHI%9QYVPA7^a&#zACMGN(OJ8YpEFeo?X>=?gLo6V6Z)s#IDF7p3Fg7?aG-F~hH90tCIW{$BWnyDDVq`NlW@9jBVq#)2&H*wR*Z~k3Iyz%@WMN-sX>Md+X=W%;CMGN(Nh}~qEFg7mb7^O8Wn?NWAXO;mCWH)0qWH4r7H8D0eV`eutWI1MGHDNF_IAzcQG8fnZ5Fk1_V|8
R0{bSO|JCM+OHEFeiNAa!nYX=iR_WGXBmPb?sIZ)s#IDF7p3GiGBoH#cTvFkv`mF)}wdH!xyjW;QrtV>LEoFgG)0)B!Rc*Z~j%Iy!b`a&u{KZYU`LBVjWzW??WlIAmjHHZnD3Ib~sFVKrenHDP8kG&VOiI5^e;GIVADZeeX@b8ul}WprjPXmo9CkOKf9!n+_+Wo%_(b7deiE-)@IASg00GBGbOF)uJNDFAY1Y-M3{Wi2x;FfA}HXmo9CB>)B>C}VYGVMc6kcPRj0V|8R)#0Ai}#KQe|vqVRL05GA=eQFd!&0FfuSNFgPzVH7NjcWo%_(b7d_uEjBGME@*UZYy|=UAi}#KP+@XmY;0w0AX8&uZU947L_t$jL{np7ZZ2qaZEOYt03gDet1X>MfzL}7GcP+@XmY;0w0P-$>wY-w&~E@*UZZ2tixIy!b?Y;|Q{ba`-P03%^DH!w3fWMVXCGB;&7G&y55G-5M0F=IAlF)%kdFf(KS0V6s(aBpdDbY*e?BVjf;V=_5nWMMNiH!wLgW@BPGW@9#HGi5n6G&N&kIb;6;BRV>AWoBh^Wo~0-03%^#W;tXzHZn3WH#A{pIbt(oWi>EjHDNY0Gc-0iIXGhf0V6s(V{dMAbYF09X>N37asVS?Gh<_AF=a6{VPRxAGGjS3G&EsiGcY$cVK`DthH#jk4GG;L`HaIpmIWjjlFfn8~GcjUjI5;&j{{bU9I%H{PW@U0^ZewL%ba`-P03%^xVPj-7Vl^=~V`X7sIW#snWi&8kV`ed7Ff=$}G&M8-0V4uBI%9QYVMJwiX=7zUY;131VRUbDC@BCVVKro8GG=8qFk>}2W@R}wV`XJ!IW#jlGBq$bW;td!GUWk9{{bT$Iyz%@WMM>Qc4=c}L2PVqV_|e}awubOZgX@XV|8RbZ>GdDF7p3IAbQc4=c}L2PVqV_|e}awubOZgX@XV|8RbZ>GzRX!#u03%^yVK-)DGi5R}GGsJ0GGjDjG&eLdGBsjkW;9`BVPazG0Yx4E0V4uBI(}nyWMM>Qc4=c}L2PVqV_|e}awsVPBVjRPHDqKkWnnaBHDfh5G-Wk6VmL4~V`gS%GBjmlFkb8{$iWoBh^Wo~0-DF7p3Ght<6Wj8f8He)w2Vlp*mV>MzoGBRd2G-5V5VK`;VZ)0I}WhirLdSzd9d2nScAY*TCb95kfZ)s#IDF7p3Ib&opG-hQpHDhIEW;0|pV`MWpHZnFdVlpsfGcY+b@c~5^{{bTuIyz)!VQg$~V_|ejGB!D4Ib<|2H8MG2Fkv_I0Ywu30V4=HI&EQiUvp`CWhf~iV{dMAbO0k^FgZ13Ha9ghFf%nWW@0yDWivD|H#aqAWiVqmVlgr>^#Mf&{{bT!Iyz%-ZgX^Ubz^iWaBpdDbY*fZAY*TCb97&FWoBh^Wo~0-DF7p3GB_|eI5K5sWHmW9Ghs0?VPQ37H#ucwG&wP2Ff}%1_W?y1{{bTjIyz)!b98cVc_?siX>N37awz~KVP-WjH#aw9F=jF~VKp~4HaR#jWH@GIVq|18H85g0Gx-5U2mb*hB|189aAk5~bZ>G!Jt$*uZgX@XV|8RbZ>GzRX!#uAY*TCb94YBVKFc>IASw0VmLK3IW=Q7W-?_mFg7wWF=Q}iI5at8H~RraBmV&-B|189aAk5~bZ>GYJt$*uZgX@XV|8RbZ>GzRX!#uAY*TCb94YBVP!KhI5RY3F=1jbVK`wqV>mc6H#BBuH8M0~WnyAsWBma|BiI1|Iyz%@WMM>Qc4=c}L2PVqV_|e}asXp&VRLhIW?wO1F=KUPVMJwiX=7zUY;131VRUbDE@*UZY?cH7FLYsZYi4Y3cP?mfZ~$~+b8BX7Z+BlfXmD^YXmo9Cx&r_p!n+_-bYXO9V<1#vb8BgCX8==lVRUI@RAF;#X>MmOXmo9Cx&r_p!n+_-bz)|0Z+9S6VRLI~Zf5{fbz)|0Z+BE-b8BgCXD(=TZEPU|03gDc&WMOh-AVF+rZ*pmLXl-)c&WMOh-L2PGla%psEZF4SYbZu-@VRLI{Y;SiUL2PGla%psEZF2xYY-ewBX>@39b1rCfZEPI^8v;5yQ*B{vY*uAsbZ>GfDF7p3H#syoVrDopWH>otIX5>nFlJ*mVP!XEVl+53VKFph2Le4E0vi%KI#X?7Y;0C#V{~tFD069gWnXkGAY*TCb95k7CMf_TVK6x|F*Yz|VP-WnWH326GB;#lVPQBpH#K80I5{#hHVFbf5FG*=3_3bfZDDL|R%K&!Z*nL}bX0jPAW3voc_{!RVK8MdV>xAEGh#6}HDobmG&p58G%z$XG-f$CGdVLjFbe`b3LOF)8ag^tZDDL|R%K&!Z*nMebYwa@X>MtBX<=+>dS!B7Y-w|JJXAg@03%^!G&Ny3VP!F8G-5DhGchw|H8x~8G&eXmWiwvMm0zDWV0vi-MI#X?7Y;0C#V{~tFC}VGKb95k6ZDDL|R%K&!Z*nFn03%^HVKZZ4H8eP5GGZ|~V`VirFfukZVm4%9WHMxCW@a}J0zDEP0viuHI#X?7Y;0C#V{~tFC{t}=Y;0C#V{~tFCMGEWBVl7^H#B20HDobjVK6ynIb=66Wj8Q3I5jyoF*9Q@Ff|bZJq;ZK8x%S^Z*XODVRUbDJt$*uZgX@XQ*B{vY*uAsbZ>GdDF7p3VKQMeH8eS4G%{o`H8N#mVmW3xV>2{3W;HZ7G%z-069PRF9ReEGdCMf_TVKz2nGBzbYwa@X>MtBX<=+>dS!B7Y-w|JJXAg@03%^BI5IajVqs=AVmMG1V{Bn_b9823F<&uLZDDL|R%K&!Z*neZbZu<90{|exyC6qURUlMhb8BgCXCNqJb!1^iY;Sid07p<&RAF;#X>Mm@b!1^iY;SiiXmo9CkOKf9!n+_+Wo%_(b7deiE;KGMASg00GBPhPH7_zVDFAY1Y-M3{Wi2x;G%YYLXmo9C(E|W4V|8Rh)QW^8YFUt)D>Y-D9}E@*IY0A_4&cVAy(b!lv5WpZCQXmD^YXmo9Cx&r_p!n+_xP*osQVRLI~Zf77UV|8RnHZ*21V>e-AWo0ukHfA?wWi&ZqGh;F{CIUMPECLwN2W03%^$He@zoIb$?tIXGcuI5jykH#s;pVKHVnIb&pHWo0oc0y_-Y0RTEWV|8R|bFEThO0CHt)Wnpt=Eix@IEif)Y5VPj<|LnbCE03%^GFl0AjHa0e6VPZEiWn*SxGcYkQVlriBVlZSjVl_800zCt30viQ7I%REeY+++%C_^SDDF7p3HZWv2VKz24V_{-9F=bFk&)gW@0d8Hexk5F#nW??fhF)(5>WoBY9WHw?oH!%V|18V{s3OYJvZE$R1V`V5qASNatE-o%903%^BW;QV~I5RRfVPrHjG-ftqI5RmgFgIZ_VK+HuWieto0zC+80viT8I%RTUb7d%0VRLIK03%^FHZnJ6H#s&kFgZD4V>e?oIXE^sIWuEsW??sCIWRdp0zCw40viN6I%98baBp*EWM6M)C{!jX03%^zHfAz1GBP-1G-EPlVKQYlG&5r~VlZShGGj1hIWl2A0zCq20viH4I&f@ZV`XS>Y-D9}C@BCVVPj!pFk&=eV>mK0HZW#1H)dpFGd5u}GBh-0Wiw%6W
GzRAF;#J|-yuBVjo(W;0|tVmV|sW@BbFWM(*JH)J)WuRAF;#J}CerVKil9I5}c9H8f-~HDP2mW@0%uW;AAGVlgmeVq#)3HbVkE8*2g^3_3byZ*pH{VPj}0LM$LfEFeQHAW$g)BVl1>Gi79EVKO&3Vqsx7HDWM0G%++~FgP<~F*IX1Gd4v6Jql|A8xJ}W-VL3NqWjAAGWo9;JVKg!~Hf1wnFgG+~F)(8>Gh#S0Hc0|K4Qm1$6*@X}a$#2Fl9G3F*P%0W@IrkIb>yGVKgyM0zDOL0vjPZI&^YjZgXaDa&2F7Wn^_@Whg>1EFeWOEFeNMEFe@SEFeNpP+w9kAVN=2UsNn0P$>W-VPs=uIWT24Vl*}~IXPxDV`E`rGc#jhW;8ZsWMg7uFi`?MA8P^|5jr|)Zewh9b7^*EUvpz&ZYV-5AVn-7L@XdePf#o%P$>W-VPa!uWH~c9I506bIAS+rH!v|{G-WwtVKQYnHZ?b7Wm5t@4{HJ&6goO-Zewh9b7^*EUvpz&ZYV-5AVn-7L@XdePf#o%R4gD+DF7p3FlIO~V>2^mFgP<}G%_<}G-NS3VlrVfW->N4Fl070RRTQ{YXTb-Iyz-|V{CPEX?A5_b7Ns{C_*eCMJymhEFe@YAVN=2EFe%R03%^HVqrC6IW;gcFlJ>lHD)m~Wo0&EWMVltVPiNnF*P<<0zDFI0vi@OI&^YjZgXaDa&2E}Zewh9b7^*EUvpz&ZYV-5AVn-7L@XdePf#o%RZmbXAW$g)BVl4OVP;}5GG;Y3GBq`1Vly~4GBssnWn?yDW;S6lWHeaT8GdMFiIAb(1IWRXkTLL{7YXTb?Iy!W6VQzC~Z*py4Wq4z3b#rNUWnXh+VQwfwEFeWJAVe%6R4gDuPf#o%RZmbXAW$g)BVjUPH#s&nW-~T1F*h?eH83+_H#1>kG%+_ZG&VFcVlZ6-Jr`>N8xlG?W@&C@UukA2LM$LfEFe@SEFe`+P%I!&DF7p3G&W&5HfA+AI5}lDVPiNqVlifBIAk_6F*9N{W;8c3UjjW4YXTb*Iyz=)Ze(9+W?yb^bSOeBAVn-7R3_Wo%__Wo~pRLM$LfEFe@SEFeQHAW$g)BVjgTW;8QpWnp1BF*z_WFgP?}G&eG0WiezkHe@t5VK!p|Jq~LE8xT4=ZDDv{Wo%__Wo~pRLM$LfEFe@SEFeQHAW$g)BVjgTWi@0sW-v7~VK_E8HZ(V5Gh#VqFl0GnWM(#EHaKMhJq~LE8wol(b8m8VC_*eCMJymgDF7p3Gc{!~I51^mHZx>pHe)$vGB{;4GchZ5WjQ!CHe)wqHZ@{0He+LCVmLEqW@KSvF=RGr0zCxS0RTEWMr?0)LUn0uWMy&yV{Bn_b9823F<&u8Y;Si$b!lv5WpXZPbZu+~0stVwyC76yb89VdVRB(?Y-Ma9P-$>wY-w&~090XfYfxcwVQg$=Y*1-%Wo&6~WiDuRZES}E83{T%RAF;#QFUc?WhiNGbT%|!bSVHMVK*@~H8(V7H8o^2VK6miV>B={FlJ*jIWS^oV`MfnGHn7o2ZsU~0y;W=RAF;#QFUc?Whf~CBVjQ$WHn+mHD)w8GGk(4H#0ahH8C<_Ib$_qV`5}CV`gsxJBI=p2s%1tZE$pXC@COgZ*FsR03%^!W@KhJFfuhYWim8mHZo&0Hexd|VlX*lG&f{7IA%9-0y_qW0vQlGI%REeba^OsZgXj8Ze?UCAY*TCb94YBVP-WqWnwaAVq-L9IWjRaFl0AnVP-fsIAvosH)1t1G;;zw4u=962s%1*X?kTSDIjBSZgX@1BVlAUIXGfvW->A{F=A#kV`DivWHvTrI51%`GG$_7Heq!FI|hdW84x-;b7^{ID0OagX=iR_WGNtHZ*FsR03%^AGdMP3WH~TnWn?)sF)%VUWHB^lVL3H5V`VfjWH@7Y0y_?e0vQN8I%8pQVPk1@c_=9$V{dMAbO0k^WHdKoGB!6cVPQ2gH!xyjWH2>0Fg0T|Fg7zXI51{rc>+5IhXNT8Iyz%vaA9L7F*GwcHDod~Wo0%nGdVOdGdVG2W`6=ZhXNT2Iy!J~a42W{VJBI=p3OYJ-bY)>|D0OagX=iR_WGMh6VPj!9IW}T7WHmWqH#lK1F*spmVly!_H#RV1VK*~1VTA%a2z3MxQFUc?WdKoiWp!mPXmo9CkOKf9!n+_+Wo%_(b7dehE;BAMASg00GB7VNGcPeSDFAY1Y-M3{Wi2u-Gc7SLXmo9Cm;xL+I%HvVVPA9rBVjaVVP<1yWHvEjF*!CeIASz7Ff=k^H#uc6WHV+sHf5Ls91}V^L}7GcP-$>wY-w&~D069gWnXkGAW(B4CMF;*E-onmBVjQ$H92N7F*svkIWb~1F=RGlI5#n2I5;slGhr|^Wi^TdJ`tD#93473L}7GcP-$>wY-w&~D069gWnXkGAaitNIy!W9aBO8fP;)LWEFzm;xLKIy!E3ZC`9@Ze?>QDIjBSZgX@1BVl1-H#A{2GGt+6HaIagHDNhoH#256W;ZZoIb<<3VPTE}J_eWq90)o(Zgp*6aA|O5b0{ewV{dMAbO0k^I5#pdVKgu_WivQ2F=8?}Wiw=CHDWn9V>vK1H#9jmkODpim;xLEIy!P?b7gcWDF7p3WHvK3HDNI^H)3LAG-NYmHZV74Vm4+uHD+dHIXE~uk^(-M0vrfBI&O7sUvzJ4Wo~mQDIjBSZgX@1BVjUQGB7wWGcq$VGchnVWHB@{Win$pV`XDDH85dgW@VHDJ_eWq90EEzXL4b1XecQFBVlA_V>d8lVK!!DW@ckFVK8AeHezHlW@RvDFl1veH)NIqKBE8=Y-w&~0BmV)WiDuRZEVp4054;8WMN-rcx7XCbZKvHUvO`1X=8aVXmD@V>{UvO`1X=8a`IB0NiE@*UZYHV0Ap`%baH88b#!lXb1rCfZEU#$7$7=2Z*XODVRUbDJUl3KbYwa@Z*z2VWnpb5EFfcVZgX@XV|8R6WMwmBV>4tiodPG!C}VGKb95kMb!1^wVRLIHDF7p3W-&KoH8(J0VL4$mGht*mF=I4hHZnFgGc{#lIb&jCp#nM&xdIpqIy!K2Wn*PzWhhj0ASNatE-o%903%^yWMnotGB`J7V>D!DW;8M}HeqHnG-NSkVl*^lF=8~M0y+t~0vHQAI&*bnV`XJzC{%MGCMF;*E-onmBVjaQIb&flVKX#hVPiBmWi(?sVlXsiVl-xAGG;e7H#MaKItjS~7!*1>ZeeX@C}VGKb95kcbYwa@b98cPZf7Pb03%^yI5A>0W;ZcmG&EsmIAk_4H#ImlGBjo|V=^~1IW{t<0y+}80vHH7I&NWYWhf~iV{dMAbO0k^VK^{mVmDzhHexb3GB{#nIW=Q6GiG8mHZ^52Fkxb4sRB9%xdIpnIy!E3ZC`VBV`F7=b8m8UC@COgZ*FsR03%^FI59RiGht*nGhsA1VKQVhVl-wjHfAw0H832?V0y+k{0vHH7I&O7sUu0!)Wo~3;Zgg`fDIjBSZgX@1BVlG`GBG(bGdM9dHZeJ3IA%35W-w)DVKO%`Heq2gHf5~>ItIA{7zjE#Wo>YDc_=9$V{dMAbO0k^F=8-eVL4?sG&5p2FfcGUFg7zZFg9joF=A
#iFgId3uL3#-xdIpnIy!WDaAhbdAY*TCb94YBVK-%AVlXmgV>UK3GhsDjW-&KoV=*;2IXGluGGsP1II#jc2Dt(l3OYJub!~7cR3<4PV{dMAbO0k^W-?=CH8?P0Fk)t6H#ae5G&VD3Ha1~qHZ)^0V>LH1vjRE@xdIpqIyz==a$jX(V`yJ2}{G&y25GiGCEw*oo|*Z}}KI%9QYVN_vrYXDWoBh^Wo~0-a{y9hW@U0^ZewM0E@*UZY|H`}0y;WSX>es`Y;SicDF7p3I506dH#amiGiEn1G&W*6Wi&M~F*Yz`Fg0T`He@$sz5+VT0vHH7I&5ifWhf~iV{dMAbO0k^W@TkCH8f>5VL3TDHZ^24Vl!nkHfClyFf%eTH)S?szydl3%mNq)Iy!J^aAhbdAY*TCb94YBVKg~0G-EMlGdM9}GcYkWWjQ!9I5lE0F*##qHZWv1WWoYE2FwB&2s%1+Z);_4C@COgZ*FsR03%^CIW=W9WH>l9Ha0b5FgG@4G-YNrWH&f5GdX5AV>L0v0y+lF0vG~1I&*Yya40DNBVjl*Fk&}mGB7k_VmLQuW@a%rWjJIsGBi0cIAl3AH8aKnI?MtX2s%1$b!}f{WoBh^a$#(9C@COgZ*FsR03%^CW;rl9V>dNrF*RZ~Vq|7GGiG6CGGj1eH#B26VK+3$0y+lF0vHH7I%H*LWpXHUX?kT}bSVHMVKruAWHK`{Hexn7VKzB9GBz|aGh${iH8MD1H)AqCZ;Q}5AIyz)^ZEz?lAY*TCb94YBVKq5rI5J~mGiEtrFkxh2GdN*kF=S+AVKFpjF*IZ_GS31)2H^r86*@X|b!}~7a(O6obYwa@Z*z2VWnpb5DIjBSZgX@1BVjf%FfwChWMW}vGG#O{W;kLvW@b4tHaIdjGC4OfIXKY*Koj8t9tb)*b9HTPVRCsWDIjBSZgX@1BVjgXF*0E}GB;r}GB{#nH90h6W;SCpV>4kkVrFGIIWp4%KnCFg9s)W#V{Bz%awsVPBVjdVF=jS1Ff%k^IW}Q6VK+HrWo9;HVKgx^WoBeDFf-KxK;Z%&2s%1$b!}gCVRLJ9C@COgZ*FsR03%^IVl!eeVr6AGH#Rb1W;Qi6V`gPyWHMr6GG%2sIAvtl0zd}g0v-rDI&O7sUw3bEYh`kCC@COgZ*FsR03%^GGGk+AGB`0gIb}07H#9goFf%nZIXE;pFgG%0V=_6}0zd}g0v-rDI&)=oUv+ROb7^{IUvwz|BVl1MGcaU0F*P(~WH2*fWnnX7IA&ouV`VflWic@|F=E>SKnCFg9u_(}Z*E^@Zgg^aC|7TCYh`j)X=QgTAXH&=%8Z*ps8a#m?&cPt=OVRLI%X=QgQ03%^$Ghs0{GcaN`VliZ5H8V6ZVKOs0VKrnlV=*#gI5{!j0zeel0RTEWR7Oy8Z)Q(ob7gXNWpV&xY+-YAbY@>MUolihP;zf(PhxXra&~2ME@*UZY~%tf0y;Wlb!1^iY;Si{WpQ<7Zggp3Y)^1>X>DnGWpXGf03%^IGh;YpHZeG2Fl94kVKiYlGdMUjHDY8qF*z_bH)S#60!<(U6H{ezb!BdJX<=+naCB*HX?kUH08?dgb!BdJX<=+naCB*HX?kUHE@*UZY?S~2FK=RVWpZ|9axQ3aZ~$*&b7gXNWpZCQXmD^YXmo9C(E|W4VQgn_a%psEZ7*$Qa%W{OXmD@jEbNIyz%@WMM{ZZ+A>-Ze?L|PjGZ;ZE1RCawsVPBVjc-Wic=^F=aO}GG#S2GdMFeV>UK9Ha0UiVmC83W-;jkNgxChOlfXqVRBD!bZKpAdS!9|OlfXqVRBD!bZKpAdS!AhXmo9CB?15-!n+_>VRmI^a&K;QAW&g)VQg$=Y-w|C0C!<_WoB}3ZgehabZu<90{|exyC6(yZE18MbZBKDO<{OxZFOxRLvLOlfUtbW~_%O<{OxZFOx!Z*F6Ca&l#EV|gxUbZu<90{|exyC6_W3AbZL5JWFS;wb8BgCX8=%gX>W3AbZL5JWK>~uYiVw0E@*UZY>)#0Ai}#KQe|vqVRL05Qg2~oZDDW#a%F5~VRL0Ia&KW|ZDDXOXmo9C2LmHII&O3TBVl4EbO0k^IW#daW-($jVKrl7IbkznV>M+sWHvM~Vq-UCGdVdh2LmHII&O9VBVjo*VPiF6GcqwWH#ImkH#0b9GcsW}GB9OfFgRs1W;6!_BLX@)V|8RbaG{3Z75@PWMNZua%Ev{Uvwz|BVjdWGGR1iHexe1VP-TrG%zzaH8y2sGBP_Wo%__Wo~p|Vs&P7dMI^nb7^O8Wn?J;BVl4=VP;}5Ffn6fHaKH3F*7wYIWlH6Vq;`sHDNMgFg5=IMFDtgH#Rk7W-~A|HDxg}Vr4WrVP-fuVPOFSMFQ8WI1DEWi&M~GGR4hHa0OfH(_EqH8nE@14RmW3GdDLgVqs=DH)duxG%#c`V`XD8G%++e3Ik0B5Cba+Iy!E3ZC`VAa%Ev{b0{ewV{dMAbO0k^GBRdjIW{&oV`VcrGh;I`WH~T6V>4r8G-fbjVKy}|3DnGWpV&gZ*^{DQg32uZclJ@X>DnGWpXZPbZu-G0~!cAI%#AmDIjBSZgX@1BVl4OGGjR~F=8-cWH@0tW-(@AGc#l}Fg0N_GGS#hH8~LjJO&p78VEW%adl;NWnXh?dSxgnAY*TCb94YBVP;}EIb$+6V>mK0G&eIcIWjS3VPs-CIAdjIW;8NoG!p|n1{VVw2s%1(b!ByBUt?i#VPk1@c_=9$V{dMAbO0k^Wn?rkIAJ+sVrDiqVlXv1IA&ooH)CWsHDozrIWaY56$3m5JpmC`X=Qf+R%vB-E@*UZY({cradl;GbZmJbL33+mWFS#>Wpi|CZ*Fq{MnO?7Xmo9CO>bmna&90)D=Q#SVRB(?Y-Ma9RAF;#AW(8|XL4a}ZE0?20BLSyWq2-VbZu(tAVY6yZgT);Y-w;~Z)t9GE@*UZY$O900y;WsZ)t8QDF7p3H92NuVPi2fGBag2Fl0G0FfcecWMnfpVq`F4Wim8190NHd0~Z21I%H*ZVPj}0DF7p3V`gM!WMemFIW{*oF)}t~Ibkz6Wn*SHVl`niH#jyl9s@Zf0~ZE5I&x)mWppTFZ*OcV03%^FG%#XiH83+|W;8Z3F=AyiWjHf5GGaM7Vlg)|Ibkp$133gF0~ZK7I%;oeZee037~HDx$CIAS(oH8o;3Wi>H3Gh;V2V>LB3HDxt7A_F-F*Z}}KI#YFGW^8YF0Ap-nb8~cNUol@XQ*~lyY;SiiXmo9CdjtR=!n+_xb!lW}Y-w&~a{y;`X=G(=X>Mh6E@*UZY%&8D0y;WIb#!%dWhf~CBVjc%FgG|gGd4A0F)}$YIW%N9Wn*SBF=a9_H#IOhWMU=*I5Gnk4>~$Vb#!%dWhi5BZgX@XMs;*`a%Cnd03%^!GBP$aHD)qmV`5}7VP-TmIb&vHWHUBpVPj=EVKHJT12_#b0~QK8I!1MLb#i4WMs;*`a%CnaDF7p3Wj13qW;ru8VKOi_GG;Y6Ff}w{Fg7=2GdX58Wj8osDg!tOG6NP5Iy!G~WpZJ3Z*n~-V{dMAbRb4`bairNCMf_TVP-QkHZ(V8IW#agHDh98VPj)5HZW#0HaRgjWj1DJGAs
i)4Kf233OYJ(aAk5~bZ>G!C`NU3b#i4UCMf_TVP-RCGh;V2HDoz6H90glIAt+1H8wC~WnwZhH8*BsHZB7=2r>f}0y;WlVQyn(Y$z!JBVjXQW;JFpFf=$eG+|;gH90V1HZ(9fH!(6~Ghs9~IbtvaIM@LIIyy#mbairN0Ap-nb8~cNUol@XMs;*`a%C=PbZu;q0{|exyC70!Y-M3{WgsywcrJJ#C}L%1Z*pZIGB7bXDFAY1Y-M3{Wi2r+crAD?Xmo9C1_A&e!n+_)VRB(?Y-Ma9RB~Z%b7pUHZF2xnVRB(?Y-Maza$#biKn6nt9tb)*b7gd2b#N$iX?kT}bSVHMVPQCAHDozsGh{GiWjAFvFfcY`GB+?cIb=9xVly~6I6VVE215fL7CJg_ZeL|?baHtpS8sA_WpY+&Wp^wfRAF;#R%vB-DF7p3HfCWpFfwCdHDzWtVrF7FWHK=}IAmfmIWRFYH8?djKLbD%LjxWbIy!G|UuAe{bSPJEa%*LBR%vB-EFe^2b8A*EiGcqzXHZfs1Ff%~|Kor;k06IEDXmW3DWlv&rWpZ|9asXp&VRLhIW?wO1F+*r_Z*65yVsmA3c4cxdXmo9C(E|W4V|8R3IXGfBIW=WsG+{YM13Cte0~i82I(|iXWn*=8Z*nLp03%^FVKp%^VlZQ4GBaW_HZ(XgHZwOfWHV%8Vq;}yHa9Ry13Hfb7z;W&a&>MfRAF;#W^8YFCMf_TVKp&1H#awCH8(gjIXE$7H8V72Ib}67VmUcCF)}bVIZFdN36BF93_3b;b#5qBVRLI{Y;SiaCMf_TVKic5Wo9)qHZeG4Vlg;1GB7YRVK8H4F=a7jG%#Z|FiitG3XcOA5IQNBZYWe?b8BX7Z+9jvAaiMYWnXkD03%^GWH2@{H)UmFV>vW3F=jDhVKHVnGGk>nWMnWhV=ypN13D3p0~i!KI&yVxUv4N=VRLI{Y;SiaCM+OxX?kT}bSVHMVPr8dVKFjeIb|_4V>mH0GG#O|G-EkoG&nRgGh{h4I8_5W5|0BI7&oW@BVm13DIu0~i@PI&yVxUv4N=VRLI{Y;SiaCM+OxX?kT}bSxl4CMGEWBVlAQG&D0XVPiHjW-w(oG-WwuGBGzXGGbvfHDfX|WnozZIv0-v7!W!-a&>NBb#8QNY$#M=b8BX7Z+9jvAW$YIDF7p3FlIF}WjSPIHZ)>3HDO|5GcsdgFl8|_V=-l9GcYh=TLU@{j{_JHIy!Q7ZeMk7bZKlTRAF;#W^8YFCMGN(P$niR03%^yF)(IfV=_24Ib=96Ff=)3HDNb3WHK-~G+{S5Wn^Jp13C|n0~i%LI&yVxUv+MDX>2G|VRLI{Y;SiaEFe%OCM+OBCMGEWBVlGZG&nUjWi?|oV>U1~W@BPAV`64vV_{@4HaTH7VK`p{Iunlr7#2D@a&>NBb#8QNY$#M=b8BX7Z+9jpEFe%OCM+OBCMGEWBVlGUFk?13G&g22HZWy0Vq|7BG&E&7Fk&`hG&E*oH!xuXIuwrs7z8>xV{dYGZYWeHDF7p3IA&vHIb|?4H8(jjGGbylF*!6jIXPx9V`MZlGi6~oV*@$@j{_J5Iyz%-a&>NBb#8QNY$#ABCMf_TVK8Q6Hf3TlIc707IAmiqVL4-DFkv@iHZeFiV=-hhWMu<71CIk30y;W(VQF+GB9B=Wi??qIb<|AHDWk5X9GHq0~iQ8I&O7sUw3bEYh`kCC@COgZ*FsR03%^JWHMwlHZ);kGGRD0Wn*G9Vl!o7H#amlWjJLrH#TBv13Cte0~iQ8I&O7sUvzJ9Z)|U8X=QULDIjBSZgX@1BVjgVHe)h4Wn?utGBGh_G-EP1VL3T9H!)>3VrDR6He+i8ItGse7zjE#Zgp*6bYXLAW^8YFb0{ewV{dMAbO0k^HD+QoGdN^pFlIMmIb|_nWin=EGiEY0Fk?6}W-&K3Z38+6j{_J8Iy!V{X>(t9Z*ps8a$jj=C@COgZ*FsR03%^GG&nb8H8?qAVliemVlX#2Vm3KuWHVx7F)%nXGc+}C13Cte0~iiEI&EQVWnXV%b7gXNWpXG%a%Xc?ASNatE-o%903%^$W;ZuvWo9`xVP!WoFl06~IAmlvG-Wk6F)}kZV>mN$13CV(nVq-TkW;A9sGB#!~HDqQtcLO>Ij{_J5IyzxdEkW-vBkW;tUwFfn@rItY&g7zH{yb7^d4ZggK^b9ruKC`KkGDF7p3I5#+DV=-c5Gcz}1H)UpJGBP$XIbt#~Vqr67V`OGCeFHiJj{_JJIy!S{Y-MhAUu0!)Wo~3;ZggK^b9ruKC`KkGEFe^2b8B-TCMF;*E-onmBVjOQGGsS7GG;P3F*7z|I5ILfVl*)`WH4hgH)J(qH)ekWIuVZp7#KP_b7^d4ZggK{WpHI~WMyu2Utx23Zeu7=CMGN(MkXdKAXH&4U~bYEm;aAj^}Wo~p|VRLzIV<<)@CM+OHEFeiK03%^xIbtw4Wi&TsHaKBrF=aJ2IA$?8VPQ8hFf?OgIbtz`13C$h0~ioGI&*1kWo~p|WMyz=Ze(R{bYEd}d2VAUP$niUAVwx8EFeiNAW10zBVlG_W@0d9W;SLyIW;mkI5IgoW;bSKH)S<5V=!Z8Gh&4UIu4Hm7!x`=WMyz=Ze(R{bYEd}d2VAUMkXdKAXH&@rnb8TU4Y+rU|V{~tFE@*IY0CR0&Y;0d&c4cF9Z*pHaXmD^YXmo9Cx&r_p!n+_vcw=R7bZKvHAV^_uWNc|}X8=WbV`Xr3X>V>wVQyq>X>MmOXmo9C(E|W4VQgn_a%psEZ7*|ga&#_eaBu)~Z*p{BIB0NiE@*UZY$XB!Ai}#KM{;3sXdqB>Z)0V1b7^j8AW&&=Wo&6~WdKKVVQ^?ra&Kd0b8~5KXHaQyWo&6~WiDuRZEOYt03gD@6CZfS03L2PGla%psEZ7yhZZEOVs03gDXKZC4RAF;#07F$oK~q#jQ)zBzY-Lnob89YWbZuV>WXmo9CxC0mzIyzKgb8BX7Z+9qTZ*FsRAaitNIy!T7a%pa7CMf_TVK*`~GGR9~WMg76Ha9adVl-tmFgP(bHZ(V4IWsdbHJSrD61W2x0y;WWVRLI{Y;SicDF7p3WMwyIV>UD~WHB=~VKZhpV>vc9I5J}}I50FYHZnG0oC7+z0~icCI#gkEYi4Y3cPLb0b8BX7Z+9jpDF7p3W@I-xFl1#mWMVdCF*ammW;8c4IXN~rV=-l9WnyAAo&!1xxC0mrIy!G~WpZJ3Z*n~-RAF;#W^8YFCMGEWBVl1QIW;$7G%;m1Gi5PkF<~}lH#cK7G%#d0Fk~<^W?`TMItsW07y>#vepF#|Yi4Y3cPJ?UBVl1VHa1~4IW##lWn*JFWn*GFF*7k`W;0_oGc`0~IAx*(I=BNE6*@X(b!~7cb97`nI&X7ya%Ev{CMh6eZ*FsR03%^zG&3_WF*q_dGG#P0VlgsfG-G2qW-wzkH#9h8H#In<13DA90~iQ8I%IWia40DtV{dMAbO0k^Ib$?AFfe3cF*q|ZWMgGFFgP|gG+{6{Vl!nkVKZi8rUN<#xC0mnIy!E3ZC`X@b8B-bDIjBSZgX@1BVlGVW-v2iV_`5jG-PFBHDfX|GB{ylI5{&mWHU21G&!gPI
tI7{7zjE#Wo>YDc_=9$V{dMAbO0k^GBPwXFflkZGGsY4V>C2oWH)AIIXPrFVPi64VKOu^sslO(xC0mzIy!D)ZDlB9Z*FsRAaitNIy!T7a%pa7CMf_TVPrKoFg9g1I59b4VK6aaHZx;1VPP_5Gc-6gH8eJ2IjjRZ61W2x2s%1$VQpn7DIjBSZgX@1BVjo)F=b?AIW=KpV=*%_GBh(YVq|1yV`gPxW-?=AVlb`)ItI7{7y>#vV{Bz%awsVPBVjRSFfd|bWj8WmF*P+aV>vc8Hf3XBWi(_tIAUTsVP>!cI=BNE3pzSxZ*pH{VPj}tbYXLAC{`vWDIjBSZgX@1BVjf(H85l_WHvT6Ght;pVP!XBIW%K4HDxq1W@ct(FgLOTItjP~7z{c(a%F9Ac4c2=WpHI~WMyt+c_>t2b89RhRAF;#DF7p3Gc`0}H#RpjW?^DwVP-R8Wi~cBGB{ymHexk5GdVCYv;#T{xC0mhIyz@^VQ^?DDF7p3IAmctI5cE2G&y24GdX2AI5ajmVrDWnWiT>jV`efpwgWnK1P^9xZ+8G@Y;SiiXmo9Cx&r_p!n+_vcx7XCbZ>G1MR;Xnb#!G^VRLI{Y;SiiXmo9CLvL?uVsCG207GwYYhrJ2Yc6PXZEOVs03gDx}VQzC~Z*pyO07F$oK~q#jRB~Z%b7pUHZ7yhZZEUdg1bZ8(%WpHI~WMyt+X=QT&L}hSgZe(R{bU|}@Zevtob8BgCXD(=TZEOYt03gDu_a&!PtVRB(?Y-MayZ*p`lXmo9CYyuP@cWHEJAXIN_Wo{ruWpHI~WMyt+X=QT&S7~%;RBvl#ZbW5pWo~3;ZewX>b1rCfZEPU|03gDb!=sGE@*UZYz6`VAi}#KP+@XmY;0w0AVz6!WB^cMa$#(2Wo$-iZe%WKbZu)D&jT6)Iyz%@WMNZua%Ev{C@BCVVL37~Wn?xpGh=2jHa9S0GBROfVKFo_Fl9GkI599{F~b8q&jT6=Iy!TCZewV2Z*FONWhf~iV{dMAbO0k^F=03{WHvA`HDx$tHZWmiFgP+~Vqr2fFf(B|VPP{c#REJB&jT7AIyz!yXK8LCE3I5;pfF*Y?dV`DZsF=8?{H)LfwHZ?IeVKz3&13U)L0~!-LI&x)WZ*pWPV|8R7VKq1~Vr64CF)%eYIWc23Gh;S6In4t+5!e9$Iyz%@WMNZua%Ev{0Ap-nb8~cNUol@XV|8RKNZEPR}7y>#vV|8RFgRp0H_`(-AOsi!Iy!!1b!1^iY;SicDF7p3I5Ie6Ght#fH#IdfF*Y|cWi(}DHe)w9WHVtoWo0%p)B`#o1Q-lDI%9QYVMc6kcPL|ZWMM{ZZ+9jpDF7p3Ha9n6G-5R|Fkxb4Winx7Ha0aeIWsdjIbt#}W;Zf5)&n{UAOsi;Iy!G~WpZJ3Z*n~-V|8RYDc_=9$V{dMAbO0k^F*!0eW;8WnFf%x0IA&pCIb$(mF=b^mGiGEmIbk_7+5#vV{Bz%awsVPBVl1=H!?P4W@R!lWjQx7Ib=CDV>UH4GcaK_He@+DH8I`;Iv@lX6*@X(b!~7cb97`nI&X7ya%Ev{CMh6eZ*FsR03%^HW;r)vHaKQsH)A+BIb>oqIbt$4VPQ8hVKg>jH)Cet13D8R1Q-=MI%IWia9?g=bZK^FUuSY*aA+uVbYwa@Z*z2VWnpb5DIjBSZgX@1BVjZ%H83}3GBIUhIb&isH#RmmW@cqDVPiLCHeqEpGd1D^IujrS7y>#vZf|dJC@BCVVKXpdGG#F^H)CQlV>vKkGdE^5HaB82IAk$mG%{f~G~@$1AOsi%Iyz`?b95*}CMGEWBVjN$Fl8}gIW{pdF=H|@Gc;pmFlIP8HZ(FYVKg#hWH9CfIs+gC7!Nu+Xm4|LC}VYGVN_vrYb+o`CMGEWBVjNyWi(|qV`evGFf(E~WH2~4WHx4EW;HZrVmLNpVrJ+AIt?HM7$G`3Yh`k7Wo#&9X>BtsAY^H6Gb|u;X?kT}bSxl7EFeL0XLD38E-onmBVjf;W;Ql9H83+|F*9K|VmL5nVmC86Ic8!qW-wzhH8bi1Iv*ee7%MtDYh`k7Wo#&8b!1^wVRLINAY^H6Gb|uvX>BtsAaiMYWnXkGAVw@8L2_qvR4y(qDF7p3I5IIYF*jsnHZ(9eFgIdiH)UfnF*IT{HaIvqWM*P9>;pO}AOsi`Iy!A-ZF6OGD0XjYWGXBmX>N2ZAaiMYWnXkD03%^AH#uWrHZ^2nGc_|XF=aJjVP<4BH8?joVKF&nIb<^K13D5Q1Q;GVI&Ecbb7gcWV|8RdH5W@0fn@B=y=AOsj3Iy!A-ZDVkGD0XjYWGXBmV{dMAbRc$bX=ExaAaiMYWnXkD03%^!GdDIfH#RUfW-vB1GiEb3Fl9MqWMMI8Ibk+sH8nZ%13DTY1Q;edI&EcbV{myWV|8RC22G-ftrI50J4H#IpjF=b&hV=`egV)g?%3Lpd+7CJh5WpZyQV|8RnF=l3EHZ(Uk_yal=AOsi>Iyz=)Y-}i0Dl8yWEFg1fdSzd9DF7p3GBIQ{G&DD2V>mN3W;A3oWieu6G&eUgV>mT1WH31}`U5%*AOsj0Iyz=)Y-}iFb!1^wVRLINAXF+WAXF?Mb7^{IUvwz|BVjW+Wn?)wV>vN2GC4IlV>dHoG-EO~V>2{lH!)&lHD&w*Iu{@W7#2D@V{dSIC{!veAY*TCb95k7Dl8y#X?kT}bSVHMVP-HhWiT{1H!)&1H8o~AWi~Q7WnwgAGBRZ`Ibtz6X8r>@6d(i`AUZl@Z*X}iV|8RMfV|8R#vZeet3c4c2>cx7XCbYWs_Whf~CBVjdSW@KSzHZn0`Vm2{1VPZ9AFk@jhGc-0fH8(IaH8utWIv@lX13Ef$X>Mn1WnXk*b89F=DF7p3V=_4~He+HiFfcM^V>B@_VL3Q6WHB%>VKZbkIAk_52n0F-AOsiW;iuAGG#F^W;tPDWjJCqG-hNrVPj-vV-^HD6d(i`5;{6`a$#V-9H)3TnVm3K2IAu6yH!(FgFk>||GcYq^VL4?Q1UecZ1Q-Q6I%8pQbairNC_^SDDF7p3I5}f8I5}o!W;QfoF*ap2WMnvFHZV9hHDohlH8C_~90WQ8AOsivM~I5ROZIWje7WMpGuH68>y3)lewIyz%@WMM{ZZ+8G=Y+-YAbY@>MUom5KWMM{ZZ+9+ebZu;w1OzW(Y-ewBX>@3905CLTF*7h`WMXAyV`FAvFf%haWMVmHH#cE5WMeX8GcIU!ZEQ3I7zjE#ZeeX@C@COgZ*FsR03%^zIAdfrI59UdWMw#EVl+55HDxnnGiEU~GGt;hWMVNQ1Ud#Z1Q-Z9I&O7sUvqV1V`X!5Z*p@eDIjBSZgX@1BVlG{F*al}HaTHqH#K8sW??loVKXr@H90e8GB9K_Fk~bIItDZZ7zjE#Zgp*6WMyz=Ze(R{baN;vAY*TCb94YBVPj)7Win!AW;8Z5WjJPHVl_7~He+RGWM(utGGQ__W+ntW1~dd12s%1$b!}gBbaHQQXJ2GxaAj^}Wo~qHC@COgZ*FsR03%^zH)3OAIb&imWo0;KFfuS?Wi>WtWMXDGWi~Z2GdMUX1Ud#Z1Q-Z9I&O7sUw36;YhPq#aAj^}Wo~qHC@COgZ*Fs
R03%^!F=S+9IbkmZ5GB#s0H)c3yI4lG@2{Z&43pzSxZ*pH{VPj}tWMyz=Ze(R{bSPFPCMh6eZ*FsR03%^FH8wRdWimE2Gd5yoV=^>3Ib&gDVPj!5VKOvjG&p1~1Ud;c1Q-Z9I&^t(Whf~iV{dMAbO0k^VKOjeH)1t6IW{mhG-ESiVlg>1Wj16tWnyJAVPs-4Fa$aVGz1t3Iyz`!b7)_7VQh6}C@COgZ*FsR03%^yWjQrvVmW0vHa25qH)1$3GdE;4WjJGDHDqOBV`Vln1Ud$F1P@keWp@BpX=QgVXmo9CAp`&~V{&P9X=7n*E@*IY0Aq4#bZKK@Y+pENaBwbYbZumEjVKg^1VqrG~HgyCP4LUkhVRLIJV{dMAbRbk=b899k03%^FGh}6EW@R=oWH)0sH8^BuG%_|dVP#@CF)%hWIWjmo1U3tG1QZQAI&W}ga$$6Day=+xZ*FsRAXH&N0WMN@qHa9b2IX5vhH8?v2HVbtG6c9Q(Z*XODVRUbDJt%W@WI8%-b!=>KbaG#GDF7p3G-5DiGh{YlF*!IjVKX%|HZnG2F*P|iFk>|_V>mJ}Jp?umbp#X>Iy!G~WpZJ3Z*o07C}VGKb95k7VRLIHDIjBSZgX@1BVlAXH)3KmIXGf4VKil7WiVqiH#K5oH92B7He_OAVK_eoHW76M6cai+Z*XODVRUbDAw4K#Z*FsRAXH&K}`Wj8crI5K2rF*7w}MFchmbp#X$Iy!E3ZC_+%aAj^}Wo~qHC@COgZ*FsR03%^JH#B5pF=RJ0IAS(3VmLB6VPrBkF=Az9VP;}vHaBBO1U3eB1QZB5I&O7sUvqSFZ*FH_WMyz=Ze(R{baN;vAY*TCb94YBVPrHpVL38oI5;&pV>4qiIXGfvWH2%@HDxn2F=940Vo3xx26Y4!2s%1$b!}gFWnpVyWMyz=Ze(R{baN;vAY*TCb94YBVP-R8W;i%DGC46}W;bRtIXN{qWHd8mF*h(cGiGHlH%kOI26Y4!6goO?VQpn7V{dMAbRctdWI8%?baH8KXC^5CBVjo(H8*8rWH4qkI5RV1H!(LfI51^7F*Ig4H8wbAFfvU9HWGCN6a_jucW-iQC_^SDDF7p3GG;brG-G9BG%{o`F*ap2WnwrvGdVD3Gd412W@R}!PXsmtbp#XyIyz%-ZE$aMWn^D(W++r9DF7p3Vly~mWi>J|W@R@pFk@mgV_`97GchtVFgRg3H8nUjQ3N&ubp#X(Iy!K2Wn*PzWhhj0ASNatE-o%903%^IV>dWsG&M6aIW=K5H8W!|V=yr`Wi&80GGt*mH8?m^1U3nE1QZK8I&*bnV`XJzC{%MGCMF;*E-onmBVjW$F=8=dH!?OgH8n6dGG#YoH)Js|IW#dcH8*B9HZ)ZPHVJhE6bw2#a%F5~VRL0DQ)O*oaAw&KQ)O*oaAV_pVK6f^Hf1q3V`gGCW@TkJWMedAWjI*`HVSnF6bCvwWMOn+D0XjYWGX2DBVlG_I5J^kHe@z6F*sx~F*Gt^H!@>1WHMtpH)durF*I8QHU)JA6b?E%aB^vHa%psVC{$r{Yfy4&Z*pmLc_{!RVKy~lVKFu^V=*)~G-5O`I5cKBHDY8rWHV%BG%+$cF@rgDIjBSZgX@1BVl4WFk&-hI5{|EW@0p9IAk(3V`F1EIbt?3HDY39IbmM}HU@PB6aqRra%FR6bSNnRBVjWzGG;X~Wn?!oH)JqjG-WwCWn^PDFlIS2VKp}~W;0;~HgyCP0y;W!Wpib8Uw3bEYbYrIBVlD@Ght#kF*0Q_GBGnZH8e6}H)S+pV>2-^Wo0>LGB9HVHgyCP2s%1tZE$pXC@COgZ*FsR03%^IF*G@4WMgJIWn(g8W?^G7IAmloH!)^6VKp^lWiU2n1U3eB1QZB5I%r{YUw3bEYbYroV{dMAbO0k^W@9vEV`4ZtWMnjDG&VFfVl!o9GiG8pV=-c4FlA#kX9P9|bp#X(Iyz==a$jX(V`yJW3HZwP6G+{S3Gc-3gVlrhkYXmk4bp#X$Iyz`!b7)_7VQh6}C@COgZ*FsR03%^BV>vT0F=jGhGczz_V=^;kWidEqIA%66HaIvjG%;pv1U3eB1QZB5I&^t(Whf~iV{dMAbO0k^W-?@EIW=WAV>n|qI5#k3Ff%h`HaRk7I5=TuFk)jeZv-|5bp#X@Iyz)^ZEz@abYwa@Z*z2VWnpb5DIjBSZgX@1BVjW$F*so~Ic8=#WH4bcF=J#gWjJMJHa9ggH8Eu~VmEOFHWPIO6bL#xWMOn+C@COgZ*FsR03%^DGB7wYH)CQqFfutcVK!!EGGk*iH#RvjG%z(ZFgas$1U3fP0RTEWRAF;#0Ap-nb8~cNUol@XRAF;#E@*UZY%l--Ai}#KQ)zBzY-J!+VRLH$Q)zBzY-Lnob8ACYL_scSbZu+}0stVwyC6_ua$#(2Wo#f#WpZa_07F$oK~q#jO=WUtWiDuRZETPO03gDet1X>MfzRAF;#W^8YFP;zf$Wpi_BZf8(waAj<1Ze=cLbZu-f001DuyC6_ua$#(2Wo#fxbY*g3bZKvHa{xwfaz$ZdXhT&*K`v-?ZEOVs03gDHyZe##MRYXBkR76H;Ze%WKbZu-f001DuyC6_ua$#(2Wo#f+a$#VAd0#kaaBwbYbZu;t1Rpv&V{C78b#i52cXDBHaAk5|ba`-P03%^CVm2`~GG;O~V=`egWHx0tH83+YIb<+0Wn*GAH90kt1Rpv&Ut?%>Zfjq2X?kS1H85o|Vq-aEF=CSh9|Af$P+@X(X>@6CZe?;pVRL0DDF7p3WH2x@H)S_4H!?P1Ff}waGB;s4GBY?bF=b;iWH~iqg#VUBoG-fhlVPP^bGBso~I53X{K?aiq9|bx(V{C78b#i52cXDBHaAk5RMkXdH03%^JIb||4V`VpEFlJ$9GhsP1GGSseVmD!9H8Ev1IA%1F1VIDX0RTEWP+@X(X>@6CZe?;pVRK~wV{Bn_b9823F<&uIVRCe7bZKvHWpYAcb7d}QbZu+~0stVwyC6_ua$#(2Wo#f)Wn^_@bZKvH08n9aVQg$=Y*J-pbz^jCZ*DGVbZu;v001v%a$#_2E@*IY0B3SxaA;pRXmD^YXmo9CbYXLAW^8YF0Ay)$UpP2qVPrHhWnncnV_`BeH#cQ5G%;i|GGRG0WjQl3V=icPZEQkyX>4R^Zf782Ze$=yZgX^DY;0+6X8>t#b97;BY%XYYZEV;906IEEWoBV@Y;;s%b8Apxa$#+A0CRM5bz^jNW?wO1F+^o%VRdYDRAF;#P+@XmZF4SYbZu@39FJ^CYUu9uqXf9}QZ~$g+a$jF%VPj}tIB0NiE@*UZY|#S%FJpCNVP9imaCCKYWpXZPaBu))b!1^*Ut?i#bairNa$h)TaBwbYbZu<90{|exyC6_4V3E@*UZY`p{^Iy!b?Y;|Q{ba`-P03%^EVPZKrF=R71V>31}F*0H`I5}Z6Vm3EpH8C(|V>K|n1Ry#(aBpdDbY*e?BVjc-Fl9F|G&L|dWimA|Gchq{G-P9CIb$|7F*9U2FfhFYAUZm7WoBh^Wo~0-03%^#H!v_|V`VooVl-uCVlZYgGiEk1GBhN37asVS?F=Aq5H8y5BH#KHtG&3_}F
=k{qHe@w0FgPI1Rw%BI%9QYVO3L2L2PVqV_|e}awsVPBVjQxV>2*iIW;k2Gh|_6I59FdF=H||WjQfoFlA#lGGe0yLcIha89F*+b!1^xQ%yl^Y;R*>bZ>GfV{dMAbRc7OWMNfPO+jpIZ)0I}Z*nFn03%^IVmLT4V>31|IAt<8I5ssgGh;9_HZ)~9W@KYDF*jzV1VR_R1RxwbI%9QYVO3L2L2PVqV_|e}awubOZgX@XV|8RDF7p3F*!LnF*G?gVlgygHDNM2V`DOAH!);pV=^^mWn*M9rvyS8y#ycvIy!!1b!1^xQ%yl^Y;R*>bZ>GfDF7p3IAUgHVrFADG-6{nH#jjkH8wamFlI3{GhsM1W-v2isRTm31Rx7KI$>mFa%FRKC~{?HWpZV1V`V7-BVjQ&G-NX}Gd4A0HaIe5VKOl^HaTT6H8o;6HDog~HZZFMLJ7SDAQw71VPs@-Wpi^VV{dMAbYF61W@U0^ZewLBAY*TCb94YBVP!dGIXPisF=b;gIAk<2IX5{tWi>EnIAk?qG&wjhGOYwc6}bY&=TZ)t9HWpXSab7^{IUvznJWhnq7VK8N5F*rG6IW#e1GdMXgHZn0VHeoY2W@KeFHa0gmVX*{461@Z<2s%1#VR&D2X?kTSDIjBSZgX@1BVl4>W;0_lFfue`Wi>fBGG#YpF*9T_HDok7HZWs3I5e{aLI%AAAR0P4V{dMAbaHiLbSQ9dX>N37ax5TYZ*FsRUvgz;WpZV1V`V7-BVjo*F*sy5GGt>pFk)jkH)1hiV>UE6W;J4FIWjXeGdZ;cLKwXSAPG7;WMy-7a&LJkaBpdDbY*fW03%^DGi6~nIXEyeF=IG0H)A$6GBq(`H#Ih4WiwmLV{dMAbRc7OWMNfPO+jpIZ)0I}Z*n|UJ|-z3V{dMAbO0k^WnnjCIAb|7Gcsm1I5=iuVP-NnG-EV0GdW{nIWsn7xdcKWy#yd4Iy!G~WpZJ3Z*n0$C}VGKb95kMb!1^xQ%yl^Y;R*>bZ>GzRX!#uAY*TCb94YBVPa-DVl_54VmLT9Gh#JiG&V40I59S1HD)tpHD+WtV!H%FAlLx_Iyz%@WMNfPO+jpIZ)0I}Z*l-*Y+-YAbY@>MUom5KWMNfPO+jpIZ)0I}Z*neZbZuNHaKN6Ha0fF1RFX!aB^vHa%psV03%^CVKOyjH)Aq1F*0T|V`4F4IAl06Wi(?lW?^ABG-P7I1RFX!WMOn+03%^yFk?7mH8x^pVK_1{W@I%nGBsglWH>f7WMncqIbvpY1OQNBa$#+A0CRM5bz^jNW?wO1F;ro5YfxcwVQq6RXmo9C$^;<-Iy!WDaAhbd03%^#G%++bI5=TtHDzOEIW%QBVlrbfFgRpoWi&E2Hf1ux1VhRMAp$x&M|EjrWn@rca&&2QX>V?2awsVPBVlAYGBRUkHZ*2qGc`3eF*9Q~H8wIdH)J$rGGaD4WHZDBL&^jp2s%1Pb!lW}WKdypbZK;HZ*FCBD069gWnXkD03%^yH8nIaWHmQ7F=b>nWMW}vFf%t}WHd7|Gd4G3Ic74(1VaYO1R)JNI!ASBWMyPfVRCe7bZKvHWpXHUX?kT}bSxl4CMGEWBVjgSG%{mmH83|ZW@9!sH8wXfGc+}3WoBe$GGjGmV=%}BLkrje06IEHb!lW}WKdypbZK;HZ*FCB0Ap-nb8~cNUol@XM|EjrWn@rca&&2QX>V?2axQ3eZEPh103gDe?5V_|e@Z*Bl>VRUk7cwcRGY;dTuV>UH3HOvG-2G9f_3OYJrWMn8*VRLJ9E-o%903%^yW;tVGWMw%sH8f!`Hf1noWo0#HWo0sDVKQQ7V>C6+1VISc0RTEWLvm?!X=7n*Q)OdxX>V=-V{Bn_b9823F<&u5a%psFV_|GlWn*+{Z*DGVbZu+`9swc(D*-S8H32&TKm$huN(D{@QUO>1TmWPMXK-O>Wo}_@Wpi+0V`XP@Z*_2EY+-YAb98cbV{~V?Hd2nT9WoBe)a%O34WoC75V`OD!X>Mg@Zgp*CZgp)Sc42IGVR8Tf'); 
+Search.load('O+!-x00000uL1x77P|ldzHb2ln7RM}0RR9100CzN00001ZU_JX00C(Z00001YZL$g00DCv00001VITki00DF+00001AS?g?00C?=00001VK@K)00C}300001Z$tn900C)A00001bWi{Q00C)M00001WLy9M0RU(N00C!c00001ZfpPm00C)o00001Yj^+v00DD;00001VT1qx00DH000001Admn600Cu|00001Y@7f900Cm600001VW00D1?0RR92bc_K200DK90RR92W0(N|0RTz@00C#D0RR92Zm0nO00C*P0RR92Y_tIY00CsW0RR92ZomNm00Cjf0RR92Xv_fs00AJ<0RR92ZrA|;00D2_0RR92Y2*O_00DIB0RR92aPR>D00Cw80RR93Wn-QJ0Q>;}00DRc0ssL3N&x@?ZW00j00D0n0ssI3X&eFo00DF&0ssI3VJHFs00DL`0ssI3X*2==00DG50ssI3Z$JV700C`A0ssI3AWQ-P00DGT0ssI3V^{(J00CuQ0ssL3H3R?wb7}$r00C`o0ssI3VRQlj00DA(0ssI3XMh3#00D1?0ssI3a*P5100D540ssI3AeaIG00DQN0ssI3Z=?bM00C^O0ssI3X0QSP00C>Z0ssI3bG!lo00Cjb0ssI3bjSh#00AJ*0ssI3Wz+%y00C^;0ssI3Y2X3?00L@eWC8%@0ssI3Z|nj900DCL0ssL3O921@ZUO@U00D0X0{{R4X$%7Z00DFo0{{R4VHg7d00C?s0{{R4Y$O8!00Ctx0{{R4V=w~%00C|`0{{R4VLSr>00Cn{0{{R4AV>oM00DGP0{{R4b5sKW00CuM0{{R4bzlPk00D7k0{{U4X9EBMXL17o00C}x0{{R4X?z0!00C=)0{{R4bBF@~00Ci+0{{R4bd&=C00AJH0{{R4WuOB900C{L0{{R4X{-YP00DHe0{{R4Zny&g00L!oYy$wk0{{U4>;V7)XU+ov00C~&0{{R4Y1jh*00C>>0{{R4bL0a600Cj@0{{R4bnpWJ00AKO0{{R4b^HSW00D3U1ONd5?EwG*XAT4a00C|i1ONa5X&3|m00C>~!~*~b1OTiA0F(s)00AJH1poj6ySxPe00F|n1pom7dITWI1%LnnYt97#00D2-1poj6Z`=g{00Cm+1poj6Yv=_40Re{tVK@K)00D0F1poj6Z~O%S00C(N1^@s7bO;6j00C(Z1^@v7C;Y^^0A200Cnv2LJ#8WiST-00DS52LJ&8Gynhra6$(F00D4D2LJ#8XiNtH00Ayi2LJ#8WmpFQ00DAd2LJ#8Z)67m00C%f2LJ#8aBv3z00Cik2LJ#8ZF~m+00Cu!2LJ&8@c;k;aE=E600D542LJ#8XqX2800AzZ2LJ#8a-;_U00CvH2LJ#8Yp@3Z00DBg2LJ#8Z@dQp0RXcB00D5w2LJ#8aLfk)00C&y2LJ#8F4zYE00C&;2LJ#8aO4L700Cj@2LJ#8a_|QL0RZv=00D6P2LJ#8Z~zDZ00C$Q2mk>9c?19fa1sar00D3o2mk;9XdDOt00Ax{2mk;9cPIz|00D0<2mk;9Y%~Y}00J&%39{s901a8?Ka00D4X2mk;9XkZ8c00Ay$2mk;9acl?x00LoiPzV5W2mk>9^#K3@aDoT`00D4@2mk;9Xp9H|00AzN2mk;9a+nAJ00Cv52mk;9WTXfH00C^O2mk;9X|M00D5|2mk;9aO4O800C&~2mk;9F7OBd00C?E2mk;9bNmPZ00ChF2><{AbO;Fm00CbP2><{AV-yJh00C|q2><{Ac_0Y@00DC*2><~AgaiNqa54!100D3|2><{AXgmo300AyS2><{Aa!3gP00CuA2><{Ac2o%f00DAZ2><{AWnc*a00DDm2><{AVr&Tj00?DscV~2FVQ)MK0Eh<~B4goKC34j9tA^~XuAOQdYDWVAg0RSWc00Akm2><~BBLOJ134j0rY`O^m00C^m2><~AYXJZODb5K10Rd_ODAEal00C^+2><{AY2XO}00DI72><{AZtMvF00Lok$O!=Q2>=5EApvLs9|0&|stJJj2><{AZUPDb00L!onh5|B3IG8BTmS$8bRr4>00Cnr3IG5Bbu0=100Cq&3IG5BZa4}600D143IG5BV?+u700AIM3IG5BW>5+M00D1S3IG5BAY2Ln00CuU3IG8Bb^rhYVQvZl00Crj3IG5Bb$AK@00Cou3IG5BAcP7400C%<3IG5BbdU-F00C*13IG5Bcbp0U00AJP3IG5Bcc=;g00D2V3IG5BY_tjh00CyY3IG5BYrqNs00DEx3IG5BVay5u00DH;3IG5BAlM2300Cv*3IG5BY~%_60stBU2m=5CDewva0s$EU2LmYf3V;9sX8Z~O00D0X3jhECUknQX00CqY3jhEDWnmZ#0H^=}00DCz3jhEDX>aNZ04NIp00D3=3jhKDTLiZN00AjJ3jhKES_HNLC`1c@0RX)N00Ajf3jhHDyaOm$3xEItWnK#a00C`g3jhECX>1Ds00Mb(N(%sT3jhEHZ*FpAZE)@d0P+R^7zzMr3II3@0C)=k0ssU61^@s7DVz%c0s#X61pp|b3xEItDX0qo00Ahn3jhECa<~fs00D2l3jhECbi@k)00Cjj3jhECa?lF^00Cvz3jhECaNG+300D2}3jhECI_L`k0Raa9I_?XA00DCJ3jhECZ~O}Y00DFY3;+NDVF(NW00CtV3;+NDauf^z0RW=_00Ctn3;+NDa3l->00DU_3;+NDbTAA600Cb%3;+NDWjqW300DDC3;+NDVMq)B0sy4|rvLx}DO3yq0s*7|rT{2f41fRuWnc^d00DDq3;+NDVQ>rp00Cll3;+NDV|)w%00Cu!3;+NDc!&%D00Cu=3;+NDXp{^900D5C3;+NDVW12E00DBQ3;+NDXRHhW00CjL3;+NDWVj3f0RXoE00Ak)3;+QEwg4!|41fRudCm*~00D5;3;+NEb#J~50Ne}!0RX8000AlN3;+QEr~oMN41fRuJ@yO$00DCT3;+NDZv+hh00DFg4FCWEVGs=f00D9q4FCWEWgHCv00D3!4FCZEs{jB2DJ~5F0RgH2C^8Lz00DG34FCWEb3hFM00D1C4FCZEtpET4DN+pp0RgN4C{_)C00C@V4FCWEWn>Ki00C}l4FCWEa&Qd*00Cuo4FCZEuK)l6DS{0E0RgT6D25Gy00DH24FCWEWt0s70RXW800AkW4FCZFumC8e4S)avd8!Qn00D5a4FCWFb7Ptf0JseR00C{j4FCWEWylQx0RXcA00Ak~4FCZFvH&R94S)avZ{7_600DC54FCWEW$X5fJzPk00C%F4gdfFV_XgZ00CiQ4gdfFWoQln00Cca4gdfFa&!&=00L=m5Dox(4gdiFxc~qGDT)pN0RgxGD2@(*00C>14gdfFbDRzU00Cj54gdfFbf^vh00CdF4gdfFWwZ_e00C^e4gdfFXTS~s00C~s4gdfFY0M4)0sy}N!2kdODcBAG0s+1NzyK)T4uAjwZR8FB00DCD4gdfFZ}biT00CzD4gdfFa{vzj00C|W4*&oGVGIud00D9m4
*&oPVqg0RZ9v00Aj*4*&rH;Q%Of4}bsxDS8h800Ah14*&oGa)=KA00D1~4*&oGbd(PO00Ci|4*&oGa-a_Y00CvD4*&oGaI6mi00D2Z4*&oGI=Bx20RiIxI=&Bp00DBu4*&oGWy}u%00DH;4*&oGW!Mh@00C^?4*&oGW#kV41OR9RYy@fqZUg`UDew;f1OaCRYXoToZ3HO#4}bsxDF6@v00Aft5C8xHaug5%00D0r5C8xHbRZA_00Chp5C8xHax4%400Ct(5C8xHa5xYE00D145C8xHIz$iv0Re9WI!X|L00DAP5C8xHZ&(lj00DGf5C8xHVPp^h00Cuc5C8xHa&Qm;0Rxx=Vq{}#4*=>90Co@n00Cos5C8xHWsDF200DTC5C8!H?*ae;DV`7j0Rip;D54O600Ake5C8xHD6kL!00DBg5C8xHZ@dry00DHu5C8xHVaN~w00DB&5C8xHWz-M=00D5?5C8xHZ{QFB00BDY5C8!I@B%vQ5P$#ya`F%W00CwC5C8xHbN~?m00CtN5dZ)IYzz?q00CtZ5dZ-IJp=#&WgZa#00D3!5dZ)Ic_5dZ)Ib3hRQ00Ch}5dZ=JKLkMp00Ajd5dZ=KJ_JAnC|D7I00CuQ5dZ)Ib7&C&00Cic5dZ)IVssGz00C%v5dZ)IaDWj200Ci!5dZ)Ia*PoG00C!`5dZ)IVVDsB00Cs45dZ-ITLb_BDXI|w0RdVBD6SEJ00DWl5dZ)IaJ&%!00MPyq!9qZ5dZ-ILj(W;Dbf)D0Rch;DAo~x00BMR5dZ)Ia^w*J00D365dZ)Ibnp=X00Ck45dZ)Ia{Lhh00CtJ5&!@Ja0n6r0RTk=00Ai!5&!`KL5`X{!aM}_800C{{5&!@Jb?6cR0RT?~00AlV5&!`KP6R0U5`X{!bp8?m00DCb6951KZwwOv0RT}100Ai&6954LPy{F(6Mz5#Y$6i?00Ctx6951KZZH!700D9~6951KWjqrA0RU4300AjT6954LQUoYW6Mz5#bW#%l00CuM6954KRRjP5DP|J@0RdD5C~6ac00DV$6951KaCj2{00MJkU=sj-6951KZG;m500Cu=6954KR|Eh7DVh@i0RdJ7D4r9500D2L6951Ka;y^o00CvP6954KSp)z9DZUc`0RdP9D8dtf00C^s6951KY|s+`0RUeF00Al76954KVFUmHDdrOZ0RdnHDC!e{00DUL6951KW%v^S00CtF6951KZUhtn00C(V6aWDNUIZv#;1hrl6aWALXciOz00Cnn6aWALVJH*;00Ct#6aWALUo;c|00DA36aWAMX>ZmO06-J~0RUYD00Ajb6aWDMTm&do6o3E$Yg!Zl00DDi6aWALVQ3Tp00DGv6aWALUvv}z00Cus6aWALY=9I100C!)6aWALZj2NF00C)|6aWGMV+3Uc00AkU6aWGNVgzIaD5MmC00C{P6aWALa$01grWoDu-a5&-TJ01^`bLK6U#69Bjq07eu5m=pls6aWALI^+}p0|I6QJ_JAnIw%?yfLIZL00Ctn6#xJMb1)SE00Ch(6#xMQmjh#EWMUc&0GJQ}7!d$G6#xJMXhIbL00D4T6#xJMVO$je00DAh6#xMMNB{r00Az36#xJMZG;s700Ci&6#xJMWsnsB00DBA6#xJMbet6c0RRsH00D5Q6#xJMaI6&o00C&S6#xJMF1Qr{00DWv6#xJMa>Nw?00D2x6#xJMZO|0}00Cvz6#xMMv;hDCaN-pJ00D636#xJMXzUdL00A!Y6#xJMW%v~U00CqE6#xJMX#^Gk00DLi761SNWe^qs00VPvWTF)S+!X*8761VNQvd(~a4Hr600D3=761SNXfzf800AyK761SNcR&^Z00D1C761SNY)lpa00LiT92Njl761SNVOSOb00CrT761SNb!Zj<0RZp;00D4x761SNaCjB~00C%z761SNE`$~U00C`^761SNa*!4P00D27761SNW}FrP00DEN761SNZm1Rj00CjH761SNa00C&q761SNF3=VL00C&$761SNW84-100Cj*761SNW#|?F00Cd_761SNa`YAe00D3M761SObY{91000*N00A!s7k~f(Z4eg#00C$g7XSbObQ~7|00C(t7XSbOawr!700D0<7XSbOXEYZ800C?|7XSbPV_|L<06-T20RZ0v00Ajb7XSeP-UBF97k~f(Wm*>i00CoS7XSbOX=oP!00DMx7XSbOWpoz+00Crr7XSbOUw{_?00DG{7XSbPbY+|s0E`y^0RRyM00AkS7XSeP5CkZo7k~f(W2P4X00C~U7XSbOd9)V*00DEl7XSbOVZavv00Cdd7XSbOdCV6800D5)7XSeO69fPODc%4@00Cp@7XSbOZtxcX00DXS7XSbObNm+o00ChF7ytkPUkDfg00DFk7ytkPWfT|y00VDhbIKI}*cSjA7ytkPZ6Fu`0ssjF2Lu2CDKr=W0s#mF1_UTN7=Qo)Wk47J00Co47ytkPX;2sd00DMZ7ytkPWn35l0RRgG00Ajz7ytnQ3Ir%@7=Qo)WO5h)00Cus7ytkPa)1~B00Ci!7ytkPXp9&D00LxlVi*9D7ytkPUzivG00Cp77ytkPZ>Sgm00C^S7ytkPY_u2v00L!UE*Jp17ytnPuu00Coa82|tQX>b_;00DM(82|tQWqcU`00Crz82|tQUx*n100C`|82|tQb(9$Z0RY_t00AkW82|zR;{)LX00Akg82|zS;sf9VD6koT00DWn82|tQbif$^00DBw82|tQWy~1>00D5)82|tQZ`c_C00DB|82|wQ=K}x%De4&j0RiR%DDD}600C?C82|tQWBeHa00D0T8UO$RYzP_v00CkS8UO$RUlbYu00D9u8UO$RWgr>=00D3&8UO$RUo08`00DC{8UO$RWH=fC00Ch>8UO$RWkebP00DAJ8UO$RXiype0RZU(00Ajn8UO(S=mRKV8h`)+Yi1e%00Coe8UO$RZ*&>}00C@z8UO(R?*jk8UO$RX_y)S00DBI8UO$SVrPIF0HhiK00CdB8UO$Ra00Cdd8UO$RZOj?~00C*z8UO$RWY`)200Cd#8UO(R>jMA*De4*k0Rid*DDE1700C?C8UO$RWBeKb00D0T8vp00Ai!8vp?T@B=6q8-M@-WF8v;00C(x8vp;ou#8-M@-Yl0g900Co)8vp0Ri*_C<+{a00CRJ00C;^8~_0T`vU+0DefEq0Rj30DDoVD00DCN8~^|TZvY(t00C|W9RL6VVQJ_b01O=f00C_h9RL6UUmP6(00DU-9RL6UbSNDF00C((9RL6UY&0DJ00C(_9RL6UVn7`L00Ch}9RL6Ua7-Nl00CiA9RL6UV^|#k00CcK9RL6UWn>)y00DGr9RL6Ub#NU300D4z9RL6UZG0U700D1;9RL9U{R032DUKZg0Rj92D3Tq300C#19RL6UZlE0i00C*H9RL6UbF3Wz00DEd9RL6UWw;#x00DBo9RL6UWW*f+00Csm9RL6UVbC1_00Cdt9RL6UWZWG900Cv<9RL6UY3LmQ00Cz19RL6UY4jZc00D0L9RL6YaBX*Eb>tZUf*b(E8~}(N0015U0Rr3uC|{
%*fCe4_00Ctj9smFVV<;W~00C((9smFVb~GLU0RaC400AjL9smIW{sSmP9)JJ=Zb}{i00D1O9smFVX;>Zr00DDe9smFVa%3I=00Cuc9smFVc5og500Cci9smFVa(o^D00Cu!9smFVc8DGT00C)^9smFWWpX+m0F)j80RRC600Aka9smIW00bzg9)JJ=Zmu2x00D2d9smFVX}lf)00DEt9smFVa>yP400Cvr9smFVcGMmK00Cdx9smFVW#AqF00C|09smFVY3v>V00DIJ9smFVZulMm00DLW9smIV0|Wp8DGDC|0RaL8C=MTh00C_j9{>OWWgH&>00C_x9{>OWUnm~{00Ct#9{>OWWi%fE0RS5W00AjL9{>RX8U!drAAkS>b4nio00C`M9{>OWa#$Y#00CiM9{>OWaAY3<00CcW9{>OWY;Yd{00C@v9{>OXa%nmr0DK<+0RROA00AkC9{>RX1OzCMAAkS>ZI&MZ00Cv59{>OWZKNLn00CdB9{>OWY_J~y00CjP9{>OWbi5w`00D2p9{>RW6$AhQDb61N0Ra>QDAFH*00DE@9{>OWZQvgO00DC59{>OWVeB6O00D6F9{>OWU-%yY00DXa9{>OWa0DO#00D0bAOHaX7X$zSDHb3A0Ra{SC>kJu00DC#AOHXXZ73iB00D9?AOHXXVKg8B00D41AOHXXUqB!L00DVMAOHXXa7-Wo0RR~U00AjjAOHaY7z8L>Ab00C`kAOHXXa&RC300CikAOHXXaC{&D00CcuAOHXXbci4T00Cu=AOHXYV{=j<0F)pA00C{9AOHXXWuzbg0RSBY00AkmAOHaY90Vw|AbyV600CjnAOHXXaMU0G00CdxAOHXXZ{Q#R00DC5AOHXXW$YjT0RSrm00AlZAOHaYDg-F}Abc00DChA^-pZWfUR+00CzjA^-pZWFR5{00VMvWzr!4{2>4)A^-pZUo0X300C_}A^-sZB?JHgDMlgy0RbcgC`uxL00DDQA^-pZWmqBr00CrPA^-pab#Fi-0AwNn0RSfi00AjBLDyaWke$Y00DAJBLDyeb8Td2W|9{Gz!(5X82}_B08k?U0RUJ400Aj%BLD#bR{$t*BY*$_Wp*O~00CouBLDyaX@nyH00DN2BLDyaWsoBP00Cr{BLDyaWSk=a00Cv9BLDyaaHt~y00D2VBLDyaW3(dx00DElBLDyaVZb8*00CsiBLDyab<86G0RUP600Al3BLD#bSpX>9BY*$_W#S_M00Cp_BLDyaY49Te00DOPBLDyaW&9%m00CqIBme*bWC$bx00CtVBme*ba1eTL4!8Iw({mfN&##00CuMBme*bV`wA*00C)kBme*bc61~F00CusBme*bWPl_900Cr%Bme*bWsD>M00D54Bme*bZB!B<`DXt^{00AhrBme*ba=at}00D2pBme*bbjTzC00CjnBme*ba?~UM00Cv%Bme*baNr~W00D32Bme*bI_x9>0RhnfI`Sld00DCNBme*bWdJ1r00DFcB>(^cWeg<%00C?gB>(^cWf&y@0RU+O00CtrB>(^ca401J00DU}B>(^cbTlOZ00Cb*B>(^cWk4kW00DDGB>({ca{~YYDN-c>0ReIYC{`ta00CuOB>(^ca%3d{00DJsB>(^cbZ{jA00D4zB>(^cVSFV300CoyB>(^cUx+0D00C}}B>(^cX_O@Z00C#3B>(^dVP#Au0H7rR0sw0RZ36%SDX=8~0s(3RYy&8^C4c|{WxOQ-00DExB>(^cVaz1}00CmsB>(^cZP+CM00Cj%B>(^cW#lCQ00DC9B>(^cbnqnr00DFMB>(^cVf-Zk00CqICIA2dbqFQ^0RV3U00Ai!CIA5eZUZP7CV&6|Jsu_i00D9)CIA2dZ!9JN00DF|CIA2dVK^oL00DA7CIA2dWke@3eDMBXz0ReaeC`KoM00DGNCjbBeX;dcw00CiICjbBkVqB>)a408l0Xo+be7CIC1m0AMEo00BB?CjbKiegkR)Yy&zdgeQQuC4c|{Wr`;N00DEFCjbBeVW1}f0swIYa|8eZDXb>|0s(LYas(){Cx8F}DYz#900Ah%CjbBea>yqD00D2#CjbBebkrvR00CjzCjbBea^NQb00Cv@CjbBeaO@`l00D3ECjbBeI`}650ReObI{qhs00D9YC;$KfZwx2^00DFoC;$KfVHhX?00CtlC;$KfawI4K0Rxr;WMX3;B><);04gW|00C_E00DGzC;$KfUw9}000CuwC;$KfbA%`W00Ci&C;$QgodBNz00AkMC;$QhoB*BxD4ZyO00Cv9C;$KfbF3%;00CjLC;$KfVz?*(00DHqC;$KfZp0`600CvnC;$KfcF-sQ00CvzC;$KfVcaMH00Cs;C;$Kfb?7Jn0RW)@00AlVC;$Ngpa3ZND1ZO~J^m;F00D9aDF6TgZwx5_00DFoDF6TgVHha@00D9yDF6TgWh5y800D3+DF6TiVq00Aj%DF6cj_5k_-_y8z$DS!Y0DS9aY00Ah5DF6Tga*Qbe00D23DF6TgbeJgs00Cj1DF6Tga-=B$00CvHDF6TgaIh%=00D2dDF6TgI=m?W0Rj8~I>IS{00DByDF6TgZ_p_K00DH?DF6TgVcaPI00Cv0Rj5~IzlRd00DAHDgXchWl$;r00DGXDgXchWn3x%00C@bDgXchWoRk@0Rxo-Vq;`xDFE&%0B$M(00DG#DgXchZiFfT00Cu+DgXchc91Fn0RZj+00DBGDgXchWuPhm00C&GDgXchbgU`>00D2ZDgXchI=CtT0Ris;I=(7^00CsgDgXchZp00C}pD*yliX>=<900ClpD*yliWq>OH00DA_D*yliI*cm-00BCZD}Vq2a+oUs00D2FD*ylibfhZ)00CjDD*yliW3Vd#00D2dD*yliY`iM~00C^mD*yliVaO{000CvrD*yliW7I1E00C**D*ylicHk=j00Cv@D*yoiI|Bd#De@}-0RcJ#DE2FW00BJwD*yoiK?48*DGDqA0Rcb*C=M)u00KNc1S|j)EC2ujavUrG00D0%EC2ujbSx|Y00Ch#EC2ujayTpi00Ct_EC2uja6~Ks0suV&KLY>(DNrl`0s%Y&J_9ILEPwz3Yg{Y<00DDmEC2ujVQee_00DGzEC2ujVR$S600CrvEC2ujb%ZPc0RTe-00AkGEC2xkLIWt2EPwz3WtuDi00Cs8EC2ujWvDCw00CpJEC2ujWwa~+00DBkEC2xjMFRi00AldEC2xkMgu4SEr0+4a|SH{00D9iEdT%kZxk&600DCvEdT%kb094M00CttEdT%kV=OHI00Cn%EdT%kbvP{m0RTw@00AjPEdT)lNCPNHEr0+4b51P)00DAVEdT%kZ(J<^00DDiEdT%kb7(C900CugEdT%kV{|P500CusEdT%kWPmLI00Cu&EdT%ka*Qni00MJxJ}m%}EdT%kUzjZb00C{HEdT%kb*L=>0RT$_00AkqEdT)lN&_gkEr0+4Wxg!{00D5uEdT%kdCV;U0RT+{00Al3EdT)lOamy~Er0+4aN;ch00C|4EdT%kb?_|!00(blaC2^SWJ)XmiYx%YECBW_0In?n(k%e?
EdT%kI{YmF0|HJ1JOe%hIw%@0fL1Jk00C_QWgMMC;*l!05~fE>?;5~E&u=lVL~nd00CrbE&u`mUjTLj00DG%E&u`mX#j!(00Ak2E&u`oXaIo%UnqPo0EjMt00Cu=E&u=lY@99t00Cm6E&u=lVW=(u00C^SE&u`mz5#3k00AkwE&u`ny#Z?iD8Md&00DW%E&u=lbkHsU00C*%E&u=lW85wP00Cj*E&u@lTmk?ADef)+0RdYADDp0V00C_HE&u=lWdJV#00CnLF8~1mHUt0xVG=I@00DFsF8}}mVH__20suG!Is^a#DJU-h0s%J!IRq#!FMt36bTlsj00Ct_F8}}nWM?8T07NeU00BBmF8~1nI|MpZFMt36WLhr(00CuUF8}}mZfGw600C!iF8}}mX>>0D00C@zF8}}mVSq0H00Cu&F8}}mZj3Jg00C)|F8}}mY?vnCjlK=n#Dc&yt0|Jx)mH?6fUntlw0Ol`%00DCBF8}}mW&AGy00D3UFaQ7na0oB}00ChRFaQ7nauhHC00DOzFaQ7nUm!3500CttFaQ7nax5?a00DI}FaQ7nb2u;n0{}n)1Obl%00AjPFaQGqKLG;)jshr5Fn|C7a#An=00CiMFaQ7oZ)H9(0Aw%#0sxNykpKVzDReLZ0RY_r00Ak0FaQAo+yN+rFn|C7Wr{EW00C!`FaQ7nZkR9t00Cj1FaQ7na-=W-0s@WzkN_xOdN6>hFaQ7nWw0;+00DZwFaQ7nX~Zx900DE#FaQ7nU(hfB00C>(FaQ7nZrm^c0ssO500962Dd;c&0s#R5{{SfNFn|C7aP%+$00C|OFaQ7nbp$a00RRL600AiwF#rJp0|6)$F@OL8avCuJ00ChlF#rGoWhgNK00C?+F#rJo1_1y8DLOF#0RaU8C_XWO00DDEF#rGoYfLc!00DDSF#rGoVOTK$00DGfF#rGoUt}==00C`kF#rGob#O5N0RRXA00Aj{F#rJp2LUL6F@OL8aE37e00DK5F#rGoUz9Nb00DHGF#rGoWuP$t0RRdC00AkiF#rJp2>~duF@OL8dA2bC00DBoF#rGobi^?L0RRjE00Ak`F#rJp3jrw9F@OL8blNch00C*@F#rGpZg|Qu0O&CQ00Cw0F#rGoU-&Tq00D0PF#rGsWMgh~ZwfI0G%)~nF#x7900c4s00BA*G5`Sq4FNhJGJpU9awakW00Ct#G5`Ppb~G{o00DA3G5`PpWk50j00DDGG5`PpVoWjs00D1OG5`PpWmqx*00C`YG5`PpZ)7q600MP#;xGVeG5`Spy8{3LDS9#h0Rg%LD1I`400DZ0G5`PuZ*^g8Xmo-L0B|k<{Ab1pLg00Ct(GXMYqX*e?g00Cn@GXMYqZbUNx00Cu6GXMYqWKc5z00C}RGXMYqWn41=00D4fGXMYqWoR=100CrfGXMYqAapYT00C}#GXMYqWq>mP00C=;GXMYqZ;Uel00DH8GXMYqAeb`%00C&8GXMYqbfhx?00C*LGXMYqcd#=60Ri6uAht6A00CvZGXMYqZp1SH00C*rGXMYqY|t|R00CvzGXMYqaNIKh00C*@GXMbqL;?T-W$rTo00D0DGXMYqY4|e$00C_NGXMYqWdt+;00D3cGynhrX%I9300D3oGynhrARIIR00CtpGynhrY$!AU00CkyGynhrVKg)V00C?|GynhrVL&th00Co0GynhsaC7K008BIh00AIUGynhrY+N(|00CuUGynhrY-lt900C@nGynhrVRSSA00DA(GynhrVSqFM00D4@GynhrEsQh(00C=~GynhrbC@&$00Cj1GynkrGXMYqWU4d(00C~UGynhrX|yx|0RU7000DErGynhrZp1VI00D2xGynhrY0xwP00DH?GynhrVcawT00DC1GynhrW#}{j00DIFGynkr!vFvQbow*^00DCTGynkr%K-oZZVEL300ChRH2?qtZ({^C02DO<0RYqj00C|!H2?qsZzweY00C((H2?qsbTl;p00Cnb00C*%H2?qsa@;il00D2}H2?qsXXrHm00C_7H2?qsVe~Zs00AKSH2?qsX8<+;00C|WHUIztX$&?100C|iHUIztZx}WJ00C(pHUIztbR;$a00C(#HUIzuVRXPX05CQH00D9~HUIztVL&zj0RZd*00C`GHUIztXizo)00DGXHUIztXUi00Cu&HUIztY>YMl00DK9HUIztWSBMp00D2FHUIzwWOQ(CC^G=0Gyq060Hihm00AJXHUIztySz3400F|nHUI$u@d6;oHh=&DbIvvZ00C{*HUIztXxugc00DI3HUIztY3Mcp00DCDHUIztZ}c_*00C$EHUIztYydX^0RZy=00CtTHvj+uZV)#B00C(hHvj+uY#cWL00CtpHvj+ua40tb00C((Hvj+ua5Ogn00AI6Hvj+uXFxXq00C}BHvj+uX-qc&00DDSHvj+ub67V300CuQHvj+uV`Mh~00D1mHvj+ua&R{Q00Lof1~&k9Hvj<{Aa$q+ts00Cx}IRF3wYn(X%00DENIRF3wVW>F(0RS)o00AkqIRF6xF99gHIe-8GJ-#^r00DBwIRF3wZ_GIW00DH;IRF3wVc0nU00DB|IRF3wW#l;k00D67IRF6wG64VqDfT%40Rb@qDEc{o00DOZIsgCxZwNX700C?cIsgCxW)wOA00C00CtRI{*LyY!EvD0RTb)00Ai+I{*OzK>;WrJAeQIYbHAY00DC@I{*LyVKh4c00DG5I{*LyUqCwm00C%5I{*LyV@x{$00CiAI{*LyWmr1^00CcKI{*Lya%4LI00D1mI{*OyL;(N+DRw&m0Rck+D0(}900DV`I{*LyV~9Hd00C}}I{*LyWt2Mr00Cs0I{*LyZlF5=00CvDI{*LyaI8B300CvPI{*LyWVky300CdVI{*LyWyCuG00DN&I{*LyZ_qmc00C{*I{*LyW!yUe0RTn;00AlJI{*OzMFA-6JAeQIX!1J%00D6NI{*LyVE{Y;00D9aJOBU+bZ>rSZe?R;a%ZkN0Pr~gGCBZSIsm>p0PH#d7CQiNI{@N401P|;00BA@JOBa!PXc)a00D49JOBUza7a7=00C%DJOBj+M*;T%D*-J5DFGlaIw&rDHvnEafIK{acsYPnJOBUzcUn9E00D1$JOBUzY=Arf0RT1t00AkCJOBX!H2^4(Jb(ZJWtKbu00C{DJOBXzH~;_vDXKgG0RcAvD6Tw!00DEfJOBUza=bhM00D2pJOBUzbI3dZ00DE(JOBUzWz;+X00Cp#JOBUzW8gdh0RTJz00AlNJOBX!I{+x~Jb(ZJbM`y{00CwGJOBUzX#_n000CnPJpcd!ZV){H00CtdJpcd!WE?#J00C|yJpcd!WhgxW00D3=Jpcd!Wi&ki00Cq=Jpcd!UqC$o00C!4Jpcd!ZcIG@00D1OJpcd!a#%e800MP%<~#siJpcg!IsgCxDQ-Og0RcGxC~`f300DD&Jpcd!a)3Pm00D1?Jpcd!bBsLz00DE7Jpcd!Wtcqx00Cp3Jpcd!Wu!d-00CsGJpcd!Ww1Q}00DBgJpcg!J^%m#DZ)Jf0RcS#D8@a200DE%Jpcd!Wz;&B500DE
dJ^%m#bGSYL00CvbJ^%m#W5hlH00CvnJ^%m#WY9hU00CvzJ^%m#a@;-u00MJxls*9BJ^%m#U+6vn00C(7J^%m#WB5J*00CkCJ^%m#WduI}00CbLKL7v$au7cN00D0nKL7y$L;wH*DIz}r0Rck*C?-FE00CtzKL7v$a5O&v00DVAKL7y$MgRZ-DMmj40Rcq-C`vzo00CuCKL7v$byzKL7y$J^}y%DdIl>0RcS%DCR$a00Cv}KL7v$aP&U_00Loo+&=*NKL7#%1Of&E00AiqKmY*%2m$~BDHcEg0RaaBC>lV30s;d91p+8N5I}$+KmY*%3IYHDDKz>&We`CC00CqcK>z>&ZX7`X00CtpK>z>&a410l00Ct#K>z>&WHdnl00Cb*K>z>&XFx#!00C}BK>z>&Z%jb|00DARK>z>(b#&lB09Zi)0RRvJ00AjzK>z^(4+1D`L4W`Ob8z>&Z-7An00DD`K>z>&bBsX%00Cu^K>z>&W0*kz00Cv5K>z>&WTZg=00CvHK>z>&az^(69Oo}L4W`ObH+gc00CvrK>z>&Y1Bag00Cp#K>z>&Zs0)x00Cv@K>z>&Wb8oz00D0DK>z>&W%xk=00D6RK>z>&WduS100CqQLI3~(Ul2k700C)Tv04PEL00CbvLI3~(Z9GB%00Lokol#00CqKLjV8)Weh_A00CtZLjV8)V;DmK00CnjLjV8)btFRo0RSKZ00Aj5LjVB*9|9;eLx2DQb2>u+00DABLjV8)Z%9J`00DDOLjV8)b5uhB00CuMLjV8)V_-u700CuYLjV8)WNbqK00CukLjV8)a(F`k0RSQb00Ak4LjVB*Ap$6fLx2DQbB;p*00DBALjV8)Z=6E_00DENLjV8)bErcA00CvLLjV8)W3)p600CpVLjV8)b-+Ua00MAxenSApLjV8)U(7=Q00CvzLjV8)cHBb%00D2}LjVE*CITb^00AlPLjVE+B?2P?DD*>s00CwCLjV8)a|A>H00ChNL;wH*We`LF0RS!n00Ai+L;wK*FaiJpDJDb!0Rb-pC@MsN00C<-L;wH*ayUc)00D14L;wH*cSJ-00Rk-oC|@8%fJ#IF00DGRL;wH*Wn4r600V7obLc|=7DNDIL;wH*WoSeI0stxkC;|WhDSSi#0s$!kCjuyfM1TMRWr#!o00DB6L;wH*X_!O+00DKLL;wH*ail~500CpFL;wN+ECNCT00AksL;wN-D*{0RD7-{~00CjbL;wH*bj(Bm0RTV(00Al3L;wK+KLRM+M1TMRaNMF0Q+ay&%<00D18MF0Q+cSuD50RTJ#00AjfMF0T-I|3+JMSuVSWnM)900DJoMF0Q+Y-~jU00CigMF0Q+c6dbq00CcqMF0Q-UuaH60E9&V00DD~MF0Q+VU$Gx0RT1v00AkWMF0T-H3BH4MSuVSa;ilD00D2ZMF0Q+bGStS00DEpMF0Q+WyD1Q00CplMF0T+IsyOzDb_^*0RcGzDB4AU00C>@MF0Q+bLd3?00Cj{MF0Q+bo50400Ce6MF0Q+cK}8J00D0XMgRZ-Yz#&K00CwaMgRZ;V`9)n02oF900DI#MgRc-I066xDK16;0RcAxC^ANX00DA1MgRZ-Z$L%>00DDGMgRZ-b4*4600CuEMgRZ-V^~H200CuQMgRZ-WMoDF00CucMgRZ-a&Sff00MJxC`JHwMgRZ-UwlRY00C%*MgRZ-V~j=s00Ci=MgRZ-Wtc_)00Cc~MgRZ-a->E800D2RMgRZ}bZ>5VV{mhFVPs`!W&l6{d_Vw5LI7|=0KP&1^g;kCLjZO}0I);=_Cx?AMF5&b0I)^?00BC-MgRc-U;+REW%5P<00C|GMgRZ-Vf;n_0RUkF00ChLM*si;bPPuT00ChVM*si+0RV#o00DATM*si;Wmrc500CrPM*si;Y-C3O00D1mM*si;XmCdW00CuoM*si;V|+&d00CiwM*si;Y=}ny0RV&p00Co^M*si;X_!X<00DHKM*si;VWdX@0RV;r00DQfM*si;Z?s1M00C^eM*si;X23@P00CmgM*sis00C|UNB{rL00C(ZNB{r!T>r*4}bsxWkN{+00DDKNdN!=VNgi`0Rxx=Vq{}hNdW2(0Co@n00CoMNdN!=Wo$_R00DS%NdN%>W&}DK6@UN%WqwHj00DD`NdN!=VT?%t0R)!=V`XGw8VmqH00CtPN&o->a}Y`Z00ChZN&o=^mIGvBV;&^{97+HxC;$KfZ6Znl00Ch#N&o->WjIOz00DA7N&o-?V{`&a07Oav0Rf@_Iz%ae00CuGN&o->b6iRQ00CiQN&o=^l>=g9WN1nN?kNCnDgXchbZ$xj00C}(N&o->WrRuq00DN2N&o=?P6Ij`E`R_5YnDm?00DEJN&o->VWdg`00Cu;7XSbObwU{c00Luk*cSjA7ytkPZM;eV00LoUx)=b)N&o-_XLDq2W+Wp3uu1^7N&w7C08k?U00(DtWp!k9U%p8IN=g8bN&q-30PHIOs!9OXN&o->VeCo(00CqQO8@`?a%wUF00DAx3jhECa2!hj00CpFF8}}qb!c>NVGv6I7E1t(G5{h=04PfU0RRmE00AjPO8@}@3;-xdOMn0YcTP(H00D1SO8@`?Y+OqK0RRyI00AjzO8@}^5CCQ00C~IO8@`?X{bv80RR&K00AkqO8@}@5&$T;OMn0YZoW$Z00CvjO8@`?Ys^ak00D2(O8@}?6#xJMDc(x}0Ra>MDB??i00D65O8@`?Z}3Y100DINO8@}?7XSbODFRFY0Ra{OCOn?9Za%M~b00CugOaK4@W^_yd00CusOaK4|aBOsQZe)r}0Io{_*h>KXO8{0(0Dw#Y00BCNOaK7^8UQ+=On?9Zccx4L00D2VOaK4@Y_v=O0RUeB00DHsOaK7@X#fBLDauR$0Rm_MUns;(0M1N+00CvxOaK4@Y~V})00Cm=OaK4@VeCu*00C_BOaK4@Z1_w700CkCOaKA^VE|(Q00AiqO#lJ`U;ttOC=gA600CtdO#lD^a3D-U=00Cu|O#lD^aGXs500C*DO#lG^*8l(kDXvWb0Rh$kD6&m}00DEjO#lD^Wx!1U00C~sO#lD^Y0OOk0RY(m00Al3O#lG_*Z?TpO@IIabK*?^00Cv{O#lD^aPUn400C+CO#lG^-2eaqDFRLa0Rh|qC7PXGV`ZlF&900DKTPXGV{Y-n;%0IW{{0RV*n00DNqPXGV`U%*cQ00DBwPXGV`Wz0_i00D5)PXGV`aM(`(00Cj%PXGV`a^z0{00DREPXGV`U+_-=00DINPXGV`Z2V6E00DIZPyhe{VF*wF00CwWPyhe{WfV{V00CqgPyhe{Um#Eb00DC*Pyhh{h5!HoWin6z00DY7Pyhe{X*^H>00DDCPyhe{Ur0~@00C=GPyhe{Zd6bJ00DJcPyhh{hX4Qpa%NBf00CucPyhe{aBxrn00D4zPyhe{VSG>k00DA>Pyhe{cZg5`00Cc)Pyhe{Wt30=00DBEPyhe{b)ZlH00DERPyhe{Z>&%N00L-iU{C1Y00DH$Q2+q|jspMzW7bgs00C**Q2+n|ZQxM=00CjiE00CugQUCw}b9hn!00MG#Tv7mjQUC
z}tpfl7DUMPA0RgN7D3Vfu00Cu~QUCw}ZlF>C00C*HQUCw}Zmd!O00CdJQUCw}X1G!S00D2lQUCw}U&K-X00C~wQUCw}VbD?l00C*%QUCw}WZY5!00Cv70PXL5c01Q(A0{|QVA^;!&00Ai=Qvd@28vr2y9{?yQQ-A;gYc5j&00DD4Qvd(~VL(#=00DGHQvd(~V@y*300C}NQvd(~d00~b00DDeQvd+~9smFVDQZ&y0RbHVC~i}L00C=sQvd(~b9_?(00CiwQvd(~bcj;`00Co;Qvd(~Zj@6100DWLQvd(~bD&cI00Cj9Qvd=0CIBP=00AkoQvd=1B>*D;D7aIA00BL|Qvd(~a>!Ev00D2#Qvd(~bktJ-00CjzQvd(~a^O<{00Cv@Qvd(~aO_h60RSif00AlZQvd-0Cjcn?Q-A;gc>+`b00DFgQ~&@0a1c}g00C_lQ~&`0DgXchDI!z=0RbrhC?-^Z00DF?Q~&@0Wi(U(00DD4Q~&@0Wk6H_0RSuj00AjXQ~&`1D*z}^RDb{hWmZ%G00DJgQ~&@0Y-Cgb00CiYQ~&@0c5qYx00CciQ~&@0Xna%v00DD?Q~&@0VTe=!0RS!l00AkKQ~&`1EdVH(RDb{hbe>cI00C~MQ~&@0b*xkX00D2ZQ~&@0W4KfR00CdVQ~&@0WyDkf00DE#Q~&`0FaQ7nDb`c~0Rb-nDB4tj00Cv-Q~&@0Zs=4100D3AQ~&@0Wb{-300Ce6Q~&@0a{yHU00?1zZ)I|5b!lW%0IX90@>2jDQ~*X)0FG1u&{O~hRR911It*0+0Rb@pIxba!00C<>RR911b39c500Ch_RR911bVyYI00Co8RR911Zd6qO00DVgRR941wF3YFDP~mw0RglFC~8%J00DS#RR941w*vqHDSlM|0RgrHD1udh00C}@RR912VQF|(0FYGx00C{5RR911U!YY000CpBRR911X{=QM00C{XRR911Z@5(e00MJ#U{wIVRR942M*%uKJb(ZJcg|G+00D2-RR911Y}{1<0R}??c4j(obYHqZ0OC~u>_&i2M*!kT00c<@00C?0RR911a|BiZ0sx5uhXMcrDG*iw0s)8uh5{%SR)7Eja2!?u00C_#RsaA2bu3l@0sxHyivj=vDL7UD0s)KyiUKG;R)7Ejc|=wK00DANRsaA2VN_NC00C`URsaA2ZD3Xa0RWW(00Aj%RsaD3lmaMlR)7EjaCTMz00DJ=RsaA2UxZcw00DH0RsaA3b!BE&0FYJy0RWEz00AkWRsaD3jshs8R)7Eja;jDU00CjLRsaA2Ww=%V00C^iRsaD2kpch#Dauv=0RfN#D9%=Z00DE0RWQ%00AlVRsaD3k^(6BR)7EjbN*HU00D9aR{#J3WeisU00C)Q00CvHSO5S4cCc6g00DBgSO5S4WxQAb00DEtSO5S4V#rtk00D2#SO5V4O#lD^DcV>70Rc<^DBf6r00D61SO5S4b?jIG00Cd}SO5S4bof{R00CwGSO5V4PXGV`DGFHt0Rc_`C=OYG00DUvSpWb5avWIz00DF&SpWe5Q2+n|DK1$60Rd0|C^A`q00DG3SpWb5X+T*300M4!C|LkTSpWb5WlUKB00CcCSpWb5Zd_Ra00Vw=Z`4=-1X%!LSpWb5I%rt{0Rd6~I(Aur00CuuSpWb5V}w}%00Ci&SpWb5W{_C`00DBASpWb5Wt>?600DHOSpWb5Zm3xR0RW5x00CvRSpWb5aJX3j00DWvSpWe5lLG(&bjn!(00CdlSpWb5aMW1<00C{Y<0ReFUC>~pY00D9&TL1t7Z!B8?00DF|TL1t7bvRo900Cn@TL1t7Wkg#500DSPTL1w7bN~PWDOOtm0ReLWC|X;900CrRTL1t7VQ5nzAj9LH`TL4g70DxNn00BCNTL1w7%mM%bDV|#Z0RhVbD56_{00CsETL1w7&H?}dDYjbx0RhbdD7ssK00CvdTL1t7ddOP<00C*vTL1w7&;kGfDcV~A0RhhfDBfFu00DXATL1t7bnIII00C+8TL1t7WB6MD00CkCTL1t7a0FZc00ChNTmS$9b7Rz701#XN00CbXTmS$8WguJt00DI-TmS$8Wh`6(00DI}TmS$9X>qVy061I#00BBaTmS(9(E>V1Tz~)pcTQXY00CuITmS$8XZTmS$8a=css00D2pTmS$8cgS1-0RZR%00Ak~TmS(9=K?6$Tz~)pW!_u>00DL8TmS$8XY5=600D3ETmS$8Z1`LN00D3QTmS$8as*uf0RZX(00AiwT>t?A=>jMeU4Q@qWg1-o00DI(T>t<9XDD3&00D0t<9Y&2Z}00C(_T>t<9a6nxE00MAj3S9t3T>t<9Urb#900DAVT>t<9Wn5hV00C@bT>t<9b!c4x00CrfT>t<9WprHt00C%vT>t<9V}M-%00Mq<&RhV7T>t<9I*eTa0Rig*I+k6400Cv3T>t<9W29XG00CjDT>t<9X0TlV00DBgT>t<9WxQPg00DHuT>t<9Zpd8#0RwdaI(BK8TYzF*0M1t<9W#nA|00C?2T>t<9a`0UM00D3IT>t|BdH{X^d;kCeDFR*q0|9veeE@p^C<tUVs1rJwRRn00DAJUH||AZ%|$U00DGXUH||AVO(AS00DAhUH||AWoTXi00D4rUH||BXKyZE0CZje00BCBUH}0Bg8({&UVs1rXo_9{00D54UH||AVVGV30RX`R00AkaUH}0Bzym0#UVs1rcCKCk00C~cUH||AWxQSh00CdZUH||AbjV%+0RVgi00Ak~UH}0Bdju%hUVs1rW!_!@00D63UH||AdF);Q0|0&mfCPdB00AlZUH}6DeFT35fdnW3Uw{Asat2=j00CtZUjP6BZWvzx00D0vUjP6BX(V3&00DF=UjP6BX)s>^00DG1UjP6Bay(xE00Ch_UjP6Ba7bSO00Cr9UjP6BWmI1P00CrLUjP6BX<%Of00DJoUjP6CbZ7Ej0Bm0X00BC3UjP9Cg9JK!Uw{Asa)Ms~00Cu+UjP6BZjfI900D27UjP6BX`EjG00DHOUjP6BX{cWS00DHaUjP6BaV!aDHdP=0|CeY$^gs&C>~&d00D9&U;qFCWiVg>00C|`U;qFCZ#-ZC00C)2U;qFCbVy(T00C)EU;qFCbW~sf00DAZU;qFCVPIeY00D4jU;qFCV{Bjm00C)oU;qFCZFpb+00CisU;qFCZiHX}00DW5U;qFDbYu`<0FYn+00BCdU;qID&Hy@~V1NJta;9Ja00CvLU;qFCZnR(k00D2hU;qFCX~19r00DHyU;qFCY0O{%00DH;U;qFCa@b%100Cj%U;qFCaO7YB00Cp_U;qFCY4BhG00C|GU;qFCVfht00CtZVE_ODa2R0#00D3wVE_ODVI*My00D9;VE_ODcQ9c900Ct-VE_ODay(%G00DJEVE_ODb4XzT00D1KVE_ODY*b+Y0RWT(00DAfVE_ODWn^Ih00C}lVE_ODZ*XA%00C)sVE_ODbbMg|00C)&VE_ODbckU900DB2VE_ODVU%G20RV;s00DELVE_ODZKPoU00CjDVE_ODalVE_ODbI4%;00CjnVE_OFV{mj@VE~$80Muas00D
H`VE_ODZ0KPC00DLGVE_ODVf0}D00CzDVE_RDb^-tabOvGo0RVym00AiwVgLaGfdXGB3Ss~hVt@bvWg21t00C?!VgLXEVk}|+00Ch#VgLXEY&c>700C@1VgLXEVMJm80RVRb00DGRVgLXEUsPfM00DGbVgLXEbzouu00D4jVgLXEZfs%z0RVUc00DG(VgLXEUwmQ!00DG@VgLXEb%;R^D6C?D00CvPVgLXEaJ*sw00C*jVgLXEaL8f+00CjnVgLXEbkt%100CjzVgLdFegc3300AlHVgLdGeFA?1DC}Z@00Cw4VgLXEaQtEb00DUdV*mgJV`*<>bV_0Xa$*3WVgTS`00?6M00BAbSh&20sx-^p#lH_DKujM0s)=^paLj5V}JkwWk6#900C}FV*mgFX;5PT00C@PV*mgFWn5zb00D4fV*mgFX=q~r00LidE@J?0V*mgFVRT~v00DG$CV}JkwbDCoS00Cv9V*mgFZm44b00C*PV*mjFr2+r}DY|0-0Rf}}D86HW00DEvV*mgFWz1s$00D5)V*mgFY1m@`0RX8200AlFV*mjGr~)YHV}JkwbM9jR00D0HV*mgFW&C3R00CpGZwO=n00V4rbhKjt-eUj`WB>pGUle2j00C_xWB>pGbtq&20RX2000Aj9WB>sHrUEE9WPktxbUtJN00Cu2WB>pGb4+9a00CuEWB>sGs{#N4DPCj%0RgH4C}L!Q00C%dWB>pGaByS*00CikWB>pGa(rX}00d-ia%YNT04`(zSY!Z#WB>pGI*4Qd0RgN6I+|pF00Cv7WB>pGZm47c00C*PWB>sIg918iBx8WIWB>pGWx8Yl00D5uWB>pGY0P8*00D5)WB>pGVc28<00DH~WB>sG%>w`dDe7bZ0RhYdDDGr{00Cw6WB>pGWBg4Tx00D1uWdH#H=>Px$DSl-D0RiX$D1v2x00DA{WdHyHZ;)jG00DHCWdHyHb)01Y00Cp7WdHyHWvFEU00DTeWdH#H>i_@&DY|6<0Rid&D86NY00C&kWdHyHaLi=@00CjrWdHyHa@b`60s!p*?*IS+Ddc4U0s-s*?f@w2Wq<$yZt!IQ00C+GWdHyHVE|?T00DFcW&i*IWejEj00e4fXL5LD0JLQQ-emw1W&i*IIv8dE0Riv;Ix1#>00DO}W&i*IZ#ZTE00C@1W&i*IW<+KH00Cl3W&i;I;Q;^vDOP3x0Ri9vC|YKK00CuSW&i*IaA;-#00DV!W&i^K;{oLX=K%l#DSBoA0|DXz$t00D5yW&i*IW6)**00C*%W&i*IblheD00Cj*W&i*Ibm(RP0RZU%00AlVW&i;J=m9ABW`FX8-^JXfS6000Cn*X8-^JUp!|300Cq|X8-^JWk_cL00DGPX8-^Jb5v&l00DJcX8-^JYG7vo00CrXX8-^LbaP>JW&rMH0BmOf00BC3X8-{K>H#`{XMg|!a)xIB00Cu=X8-^JZj@&L00D2BX8-^JX`p8S00DHSX8-^JX{=`e00DHeX8-^Ja=2#z00CjXX8-{JsRIB3DavO60RgB3D9&eq00A=8X8-^JXWVB100D2}X8-^JZ0KhI00Cd_X8-^Ja`a~a00D3MX8-^JZvbcj00C?UXaE2KW(;Tm00MAd#Ag5!XaE2KV;E=v00C(tXaE2KbSP*500ChxXaE5KtpWf6bUJ7N00Cb!LKw*s{S00AjnXaEHPt^%+EvI4dOv;rt-Xn+6#Wo~Ey00C}(XaE2KX@qD100C@@XaE2KWsqn900D58XaE2KX`E;P00D5KXaE2KWvFNX00C^SXaE2KVzg)g00CjTXaE2KY`|y$00CjfXaE2KW6Wp(0sy-LxdH$IDcEQL0s*=LxB@8NXn+6#J>+Nr00DCDXaE2KZ}eyY00DIRXaE2KVE}0W00D9aX#fBLWejNm00D3kX#fELy#fFMDIRG60Rg-MC?aWq00DC-X#fBLWiV*~00C|`X#fBLX*_8F0RX=O00AjTX#fEMz5*yrX@CF$b5dyl00CuMX#fBLaA0Ww00C)cX#fEL#{vKWDROB50RhGWD0XRp00DD+X#fBLZiHz700Cu+X#fBLYmjLG00D27X#fBNY;bf!X#i|#0Gw$600Cd3X#fBLZLnzo00DKjX#fKN!2-hq#R32UDZ*(00|CGS!UDtsD9UMo00DH+X#fBLW!z~100DF2X#fBLW$0-D0RYJY00AlVX#fEM$O0(%X@CF$X#Qyc00D3YY5)KMVGL>j00D9mY5)KRaC2{Na%Wa(0Mcjx7-;~!X#nnN02pcj00BB4Y5)NN$^tq#YJdO%Wj<;E00C}BY5)KMX-sMW00C@LY5)KMWmswe00D4bY5)KMX=G{u00D4nY5)KMWpHW$00C@vY5)KMVti@<00CiwY5)KMY=~+A00Ci+Y5)QN*aF%D00AkQY5)QO*8j~00DS9YXATNVL)pD0s!Cw;sO8xDNJhs0s-Fw;Q}a9Yk&X&bXaQu00CuUYXATNb7*S-00CugYXATQb9QZV+G+p($p00DHeYybcOW4LSp00CvbYybcOcEoG|00C^uYybcOY|v}~00CjvYybfOuLA%9DdKDZ0RgT9DCTT{0RXWB00AlVYybfPumdRgY=8g)GBfOK0RC(M00DFcZ2$lPau96*0RYJZ00Ai+Z2$oQ$O9-KZGZp)Y$k0000C((Z2$lPbTn-M00C|~Z2$lPbwF(Z00Cb{Z2$lPZcJ?e00C)IZ2$lVb#!B8ZEtW;W&kv30F-I~)@uOVYycK*09b7R0RR#K00Aj{Z2$oQ5dkQGZGZp)a)xaH00D1~Z2$lPbd+rX00DKHZ2$lPW1wvS00CvDZ2$lPc&u#z0RR*M00AkuZ2$oQ69Fi^ZGZp)a>8u@00CvnZ2$lPYtU@~00DB=Z2$lPZ`^GF0RR>O00AlJZ2$oQ6#*#hZGZp)W%6wR00C_JZ2$lPbpUPv00CqMZU6uQWejcr00C$cZU6!R9048y00Ai;ZU6!S8vz{wC?sxx00CnvZU6uQZZvKH00DVAZU6uQb3kqY00Ch}ZU6uQUrcTQ00DGTZU6uQZdh&r00CuQZU6uQY-DZ#00Lua7;XS+ZU6!R7y%jq00Aj_ZU6!S7XcXoD1dH&00Co$ZU6uQZjf#O00DWHZU6uQbDVAf0stZbAOQdYDX4A$00Cjh2>=2DApsu&C||B_fV6G^00C~gZU6uQb;xc200DB&ZU6uQZ`5u80RSce00AlBZU6xRB>^bpZh!y*Wa@4J00Cw4ZU6uQZ1`>f00C_NZU6uQWdv^k00CnPZvX%RZV+z(00ChZZvX%RV;pY)00CbjZvX%Rb0}{B00w1ub75m?vTXq3Z2)j?0HSUH+HL?YZvX%RIy7$p0RblgI!e)bDSB@J0RhYbD1L8%00Cu$ZvX%RaExyN00DWDZvX`V&jHZ^(*e~1*8uj4~00CvrZ~y=SZq#r900D2>Z~y=SY2a`G00DI7Z~y=SY3y(S00DIJZ~y=Sa`Y0Rc<_C^m6`00DA5aR2}TZ$xnb00DGLaR2}Tbx?5t00CoGaR2}TWn6Ip00DSnaR31TPXPb{DQ0R{;P4DX?(>1OQn9TLE1GUjYCCDZp_61OZq9S^
-=EUI8e|aexE>VF6Hd0RemgC^~Y000DA9asU7UWk_-W00DMRasU7Ua#V5v00CuMasU7Ub6|1+00ClVasU7UZ)|b^00CceasU7UWq5J`00DM>asU7UZ-jCH00C`^asU7Vb!9Me0FZJ30RU+M00AkWasU7Ub@B-S0Rm_NC|{&bASK=b7FG<00DApa{vGVWpHx<00DG%a{vGVX?$}400M7eQgZ-;a{vJVa{&MWDUx#l0ReIWD3)`800DEHa{vGVWu$We00DKXa{vGVWw3Jq00DKja{vJVbpZeYDZ+CA0ReOYD8_Su00DE%a{vGVWz=&300C**a{vGVXW(-H00D32a{vGVZ0vIY00D3Ea{vGVaQJfo0RVRa00AikbN~SXb^#~|bbtT>a}IO>00DOvbN~PWZya<000C?wbN~PWW+-$300C<*bN~PWb2M}S00M7e{&N62bN~SWe*pjiDN1wz0ResiC{A>M00DDUbN~PWa$Ix(00CuUbN~PWc4%|}00DAtbN~PWWps1^00DD)bN~PWVt{l200eh&bZ>}r0K9VmKy(0xbN~PWUyO7B00C{DbN~PWb)<9v0RVXc00AkmbN~SXcmXK1bbtT>WV&-bN~PWa^Q3T00D32bN~PWckFZk00Cd}bN~PWbNF-s00C+KbN~PWXasct0RVde00AiwbpQbYdI2aDb$|c?avF6200CtpbpQYXb|`fK00D9?bpQYXWi)jF00DD4bpQYXVnB5O00D1CbpQYXUrcoX00CuEbpQYXYglyv00CiMbpQeYg8_j700Aj#bpQeZf&qX5C~$Rv00CombpQYXZh&wIlkpYtd00Ajxb^rtclmV6jkO7hbC~$Uw00Comb^rhYZiIFK00DW5b^rhYbC7lb00Ci^b^rhYUz~OT00DHOb^rhYZm4zu00CvLb^rhYWVChw00C~gb^rhYWx#d-00D5ub^rhYWz2Q}01IV)a$|RHbZudCWN2{!s&N38asbY903ve$s&oJfbpT{_0Fre8V0Hk~b^rhYI@opq0Rfl+ItF)u00D9gcK`qZZxnX`00DFwcK`qZbs%>D00CnrcK`qZWh{3900DS1cK`tZn*jg;Wj=QR00D49cK`qZc}RBv00DGPcK`waodKT#00AjlcK`wboB^HzC}4Mh00CuYcK`qZXK;4_00C}xcK`qZVSIN000LifRCfS^cK`qZc!+lZ00Cu^cK`qZWSDmV00C~EcK`wasR5w@00AkgcK`wbr~#k>D6n^c00C~ccK`qZX~1^?00C#lcK`qZWz2T~0syN4qX7T`DcE-a0s*Q4q5&x0cYpu^WaM`M00D09cK`watpTM000AlbcK`wbtO29}C;)hX00CtNcmMzadJuR300C(hcmMzaa~ya800CbjcmMzaa42{H00Ct#cmM$auK@r7DLQxn0RgT7C_Z?A00Cu0cmMzadQ5l#00MMrG~_d4K=`WfFM+00Czjc>n+bZXkI800Chpc>n+bax8fO00DS1c>n+bWjJ{N00Cq^c>nn+bb5wZ%00C`Uc>n+bVPJUx00DAlc>n+bVQhH-00D4vc>n+bUwC-{00C=$c>n+bbA)*S00Ci&c>nc9wYn00Cc`c>n+bYoK`m00DERc>n+bVXS!o00DHec>n+bU$}Vy00CpZc>n+bX~cN|00DH$c>n+bVbFO10RWu?00DN~c>n+bU*LHF00C>}c>n+bbL@El00Ck0c>n+bbohAy00CeAc>n+bcLaI>00D0bdH?_cY!G??00CwedH?_cVjOw^00MM%)_DLTdH?|co&x{@b~1VZ00Cb%dH?_cYdm@Y00DDCdH?_cVMuxa00DGPdH?_cUsQSk00CuMdH?_cZD4u;00C)cdH?_cbZmM600C}tdH?_cb$EIJ0RW!^00DM{dH?_cUx<1D00C=`dH?_cbCh}j00Ci|dH?_cbf9_w00Cd7dH?_cZmfC$00D2ZdH?_cX}Ed-00DHqdH?_cX~cQ}00CsmdH?_cZqRxF00D2-dH?|cpaTE_cH(*f00Cd-dH?_cYwUUe00DFIdH?_cVfcCg00DIVdH?_cUj%yq00C|adjJ3dZxDL`00C(hdjJ3dbR2sC00C(tdjJ3dWGH(800C|;djJ3dZ!~)V00CnXdH~#d0A70l$a?@vd;kCeUr>Ai00M4vM0o&id;kFewgCVEDSmta0RgoED1v-|00Cu)d;kCeb&z}j00Cu|d;kCeb)0+v00D8Ld;kCeYp8qx00DEZd;kCeVYGYz00DHmd;kCeWWam?00Cvjd;kCeWXya300C~&d;kCeb=Z6W00D2_d;kFexB&nGDe8Ox0RguGDDHfK00Cw6d;kCeb^Lq)00CtJeENWDXx700RhJWD6)Nk00DHkeE800CdpeEegFUgV?2HU00Ch_egFUga7cat00Ci6egFXgzySaODO!F20Rg`OC|-Vm00C%ZegFUgb8LP900DJ!egFUgaCm+I00CcqegFUgd4zrd00DA}egFXg!T|sQDVBZ!0Rh1QD4KqN00C&AegFUgbEtj*0RY4S00AkqegFXh!vQF`et-Z0aK3&300MPyu6_W>00BDIegFXh$pJd_et-Z0W%_;q00DIZe*gdhWe9%&00DIle*gdhaTI?500C?7ve}Di1W%_>r00D9WfB*mibqIg}00DFkfB*pi_5uI_Xc~Y300D3wfB*miVI+V600D9;fB*miXE1;O0RZ>{00AjHfB*pj_W~$DfPer2az=mv00CuAfB*miWK@6v00C@TfB*miX<&c=00DJofB*miVr+l_00DP$fB*miZ+L(J0RRvK00Ak4fB*pi5(5AMDUN^u0Ra&MD3XAH00DTIfB*miWuSln00CsCfB*miZmfU+0RR*O00AkufB*pj69Xu`fPer2c*1}H00CvnfB*miWYB;B00C~+fB*miY21JS00Cd(fB*midFX%u00L=ZvVZ{YfB*pk4+AJ)h=73ffB*miX#Riz00CnPfdBvjVGw};00CtdfdBvjUmSq|00D9$fdByjE&~7oDK3Ek0RS)q00AjDfdBykF9Rq%fq(!3bV7ju00D1GfdBvjZcu>$0Rk-pC|@#xfL4J300CxPfdBvjX=s4}00CcafdBvjWORW500C}#fdBvlY;S3PfB-0g0Dyr21_1g3`~v<000RO81Oos8DU^W#1_Aj3`vUy}{{jI60|O|cfq(!3WvGDw00CpVfdBvjVZea^00C^qfdBvjaLj=K0RRRA00Al3fdByk1p_GDfq(!3W#WMV00DFAfdBvjVeo+f0RScg00AldfdBykB?Bk`f`9-4ZU%w?00ChRf&c&kV-$h_00DCvf&c&kUm$`200Cttf&c&kb}WJb00C(-f&c&kb2x$k00DJAf&c&kY(#ex00D2Nf&c*k3IhNEDYAk90RagED7J!t00DBmf&c&kWyFF200Csmf&c&kY|w%L00D2-f&c&kXxxGT00CvlX&i$900D9$g8%>lVJL$D00Ct#g8%>lZZv}c00C(_g8%{m76TXq00AjRg8%{m9s?i)00Ajdg8%{n9RnW&C|HAl00CuQg8%>lV`zf_00DJwg8%>lWORc700Cusg8%{mC<7`30
[Unreadable base85-style encoded payload omitted from the docs diff: several thousand characters of binary/minified data that did not survive extraction. The readable diff resumes with the tail of the affected hunk:]
Codestin Search App - + @@ -121,7 +121,7 @@

    Classes

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/small__vector_8hpp.html b/docs/small__vector_8hpp.html index 7902fff88..f1ba2e691 100644 --- a/docs/small__vector_8hpp.html +++ b/docs/small__vector_8hpp.html @@ -5,7 +5,7 @@ Codestin Search App - + @@ -122,7 +122,7 @@

    Classes

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/sort_8hpp.html b/docs/sort_8hpp.html deleted file mode 100644 index 9df40b942..000000000 --- a/docs/sort_8hpp.html +++ /dev/null @@ -1,120 +0,0 @@ - - - - - Codestin Search App - - - - - - - -
    -
    - - - - - - diff --git a/docs/structtf_1_1cudaDeviceAllocator_1_1rebind.html b/docs/structtf_1_1cudaDeviceAllocator_1_1rebind.html index ed689c3fb..c4e1fc5a6 100644 --- a/docs/structtf_1_1cudaDeviceAllocator_1_1rebind.html +++ b/docs/structtf_1_1cudaDeviceAllocator_1_1rebind.html @@ -5,7 +5,7 @@ Codestin Search App - + @@ -46,6 +46,7 @@

    +
    template<typename U>
    tf::cudaDeviceAllocator::rebind struct

    @@ -65,7 +66,7 @@

    Contents

    Public types

    - using other = cudaDeviceAllocator<U> + using other = cudaDeviceAllocator<U>
    allocator of a different data type
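For readers unfamiliar with the rebind idiom: a container parameterized on one value type uses rebind to obtain the matching allocator for another type. A minimal sketch of what this member typedef implies (the include path is an assumption, and the static_assert is illustrative rather than part of the generated docs):

#include <type_traits>
#include <taskflow/cuda/cudaflow.hpp>  // assumed header exposing tf::cudaDeviceAllocator

// rebind<U>::other maps cudaDeviceAllocator<T> onto cudaDeviceAllocator<U>
static_assert(std::is_same_v<
    typename tf::cudaDeviceAllocator<int>::template rebind<double>::other,
    tf::cudaDeviceAllocator<double>>,
    "rebinding yields the same allocator template specialized for the new type");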
    @@ -114,7 +115,7 @@

    Public types

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/structtf_1_1cudaUSMAllocator_1_1rebind.html b/docs/structtf_1_1cudaUSMAllocator_1_1rebind.html index e1a34884c..7dfd8c391 100644 --- a/docs/structtf_1_1cudaUSMAllocator_1_1rebind.html +++ b/docs/structtf_1_1cudaUSMAllocator_1_1rebind.html @@ -5,7 +5,7 @@ Codestin Search App - + @@ -46,6 +46,7 @@

    +
    template<typename U>
    tf::cudaUSMAllocator::rebind struct

    @@ -65,7 +66,7 @@

    Contents

    Public types

    - using other = cudaUSMAllocator<U> + using other = cudaUSMAllocator<U>
    allocator of a different data type
    @@ -114,7 +115,7 @@

    Public types

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/task_8hpp.html b/docs/task_8hpp.html index 1eca97238..f7b2ce516 100644 --- a/docs/task_8hpp.html +++ b/docs/task_8hpp.html @@ -5,7 +5,7 @@ Codestin Search App - + @@ -74,7 +74,7 @@

    Classes

    class tf::Task
    -
    class to create a task handle over a node in a taskflow graph
    +
    class to create a task handle over a taskflow node
    class tf::TaskView
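For orientation, a minimal sketch of how a tf::Task handle is obtained and chained using the public Taskflow API (the task bodies and names here are placeholders):

#include <taskflow/taskflow.hpp>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  // emplace() creates a node in the graph and returns a tf::Task handle over it
  tf::Task A = taskflow.emplace([](){ /* work */ }).name("A");
  tf::Task B = taskflow.emplace([](){ /* work */ }).name("B");

  A.precede(B);                  // A must finish before B starts
  executor.run(taskflow).wait(); // submit the graph and block until done
}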
    @@ -125,7 +125,7 @@

    Classes

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/taskflow_8hpp.html b/docs/taskflow_8hpp.html index d27839c36..5d10cb479 100644 --- a/docs/taskflow_8hpp.html +++ b/docs/taskflow_8hpp.html @@ -5,7 +5,7 @@ Codestin Search App - + @@ -56,6 +56,7 @@

    Contents

    Reference @@ -67,6 +68,37 @@

    Namespaces

    taskflow namespace
    +
    +

    Defines

    +
    +
    + #define TF_VERSION +
    +
version of Taskflow (currently 3.11.0)
    +
    + #define TF_MAJOR_VERSION +
    +
major version of Taskflow, which is equal to TF_VERSION / 100000
    +
    + #define TF_MINOR_VERSION +
    +
    minor version of Taskflow, which is equal to TF_VERSION / 100 % 1000
    +
    + #define TF_PATCH_VERSION +
    +
    patch version of Taskflow, which is equal to TF_VERSION % 100
    +
    +
    +
    +

    Define documentation

    +
    +

    + #define TF_VERSION +

    +

version of Taskflow (currently 3.11.0)

    +

    The version system is made of a major version number, a minor version number, and a patch number:

    • TF_VERSION % 100 is the patch level
    • TF_VERSION / 100 % 1000 is the minor version
    • TF_VERSION / 100000 is the major version
    +
    +
    @@ -111,7 +143,7 @@

    Namespaces

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/team.html b/docs/team.html index 42f96d7b6..88fe9bf01 100644 --- a/docs/team.html +++ b/docs/team.html @@ -5,7 +5,7 @@ Codestin Search App - + @@ -57,7 +57,7 @@

    Contents

  • Freelance Developers
  • -

    Taskflow consists of a multidisciplinary team with different areas of expertise. We adhere to our Code of Conduct.

    Core Members

    Core members provide the essential development, maintenance, and support of Taskflow in all aspects.

    • Principal Investigator: Dr. Tsung-Wei Huang
    • Software Developers: Tsung-Wei Huang, Dian-Lun Lin, Cheng-Hsiang Chiu
    • Financial Manager: Aidza Cruz (aidza dot cruz at utah dot edu)
    • Ombudsperson: Jennifer Hoskins (jennifer dot hoskins at osp dot utah dot edu)
    • Diversity, Equity, and Inclusion: Tsung-Wei Huang
    • Outreach and Education: Tsung-Wei Huang

    Alumni

Taskflow would not have come this far without the work of these individuals who have participated in its development.

    • Guannan Guo
    • Martin Wong
    • Chun-Xun Lin
    • Yasin Zamani

    Freelance Developers

    Taskflow is contributed by a distributed set of Contributors all around the world.

    +

    Taskflow consists of a multidisciplinary team with different areas of expertise. We adhere to our Code of Conduct.

    Core Members

    Core members provide the essential development, maintenance, and support of Taskflow in all aspects.

    • Principal Investigator: Dr. Tsung-Wei Huang
    • Software Developers: Tsung-Wei Huang, Cheng-Hsiang Chiu, Boyang Zhang, Chih-Chun Chang
    • Financial Manager: Jessica Murnane
• Ombudsperson: Jessica Murnane
    • Diversity, Equity, and Inclusion: Tsung-Wei Huang
    • Outreach and Education: Tsung-Wei Huang

    Alumni

Taskflow would not have come this far without the work of these individuals who have participated in its development.

    • Dian-Lun Lin
    • Guannan Guo
    • Martin Wong
    • Chun-Xun Lin
    • Yasin Zamani

    Freelance Developers

    Taskflow is contributed by a distributed set of Contributors all around the world.

    @@ -102,7 +102,7 @@

    Contents

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/transform_8hpp.html b/docs/transform_8hpp.html index fccfb7f14..3612c8ba4 100644 --- a/docs/transform_8hpp.html +++ b/docs/transform_8hpp.html @@ -5,7 +5,7 @@ Codestin Search App - + @@ -111,7 +111,7 @@

    Namespaces

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/tsq_8hpp.html b/docs/tsq_8hpp.html index 776366106..d5fcdb43c 100644 --- a/docs/tsq_8hpp.html +++ b/docs/tsq_8hpp.html @@ -5,7 +5,7 @@ Codestin Search App - + @@ -57,6 +57,7 @@

    Contents

    @@ -72,12 +73,45 @@

    Namespaces

    Classes

    -
    template<typename T, unsigned TF_MAX_PRIORITY = static_cast<unsigned>(TaskPriority::MAX)>
    - class tf::TaskQueue +
    template<typename T>
    + class tf::UnboundedTaskQueue
    -
    class to create a lock-free unbounded single-producer multiple-consumer queue
    +
    class to create a lock-free unbounded work-stealing queue
    +
    +
    template<typename T, size_t LogSize = TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE>
    + class tf::BoundedTaskQueue +
    +
    class to create a lock-free bounded work-stealing queue
    +
    +
    +
    +

    Defines

    +
    +
    + #define TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE +
    +
    +
    + #define TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE +
    +
    +
    +

    Define documentation

    +
    +

    + #define TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE +

    +

This macro defines the default size of the bounded task queue in log2 (i.e., the queue holds up to 2^LogSize tasks). The bounded task queue is used by each worker.

    +
    +
    +

    + #define TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE +

    +

This macro defines the default size of the unbounded task queue in log2 (i.e., the initial capacity is 2 raised to this value). The unbounded task queue is used by the executor.

    +
    +
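A hedged sketch of how these defines are typically consumed: assuming both macros are guarded with #ifndef (so a prior definition wins), you can select different queue capacities at compile time. The values below are illustrative, not recommendations.

    // 2^10 slots per worker's bounded queue; the executor's unbounded queue
    // starts with 2^12 slots and grows on demand (assuming #ifndef guards)
    #define TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE 10
    #define TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE 12
    #include <taskflow/taskflow.hpp>

    int main() {
      tf::Executor executor;  // workers are created with the sizes chosen above
    }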
    @@ -122,7 +156,7 @@

    Classes

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/usecases.html b/docs/usecases.html index 604dbb5bb..5e3a310b3 100644 --- a/docs/usecases.html +++ b/docs/usecases.html @@ -5,7 +5,7 @@ Codestin Search App - + @@ -93,7 +93,7 @@

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/uw-madison-ece-logo.png b/docs/uw-madison-ece-logo.png new file mode 100644 index 000000000..42258c755 Binary files /dev/null and b/docs/uw-madison-ece-logo.png differ diff --git a/docs/wavefront.html b/docs/wavefront.html index 4610d6eb8..a566a7a2e 100644 --- a/docs/wavefront.html +++ b/docs/wavefront.html @@ -5,7 +5,7 @@ Codestin Search App - + @@ -56,20 +56,20 @@

    Contents

  • Wavefront Task Graph
  • -

We study wavefront parallelism, a common pattern in dynamic programming that sweeps elements in a diagonal direction.

    Problem Formulation

The computation starts at a single point at a corner of a data plane (e.g., a grid) and propagates its effect diagonally to other elements. This sweep of computation is known as a wavefront. Each point in the wavefront can be computed in parallel. The following example shows wavefront parallelism in a 2D matrix.

    Image

We partition the 9x9 grid into 3x3 blocks and assign one task to each block. The wavefront propagates task dependencies from the top-left block all the way to the bottom-right block. Each task precedes two tasks, one to the right and one below.

    Wavefront Task Graph

We can describe the wavefront parallelism in a simple two-level loop. Since we need to refer to the tasks above and to the left of a task when creating its dependencies, we use a 2D vector to pre-allocate all tasks via tf::Taskflow::placeholder.

    #include <taskflow/taskflow.hpp>
    +

We study wavefront parallelism, a common pattern in dynamic programming that sweeps elements in a diagonal direction.

    Problem Formulation

The computation starts at a single point at a corner of a data plane (e.g., a grid) and propagates its effect diagonally to other elements. This sweep of computation is known as a wavefront. Each point in the wavefront can be computed in parallel. The following example shows wavefront parallelism in a 2D matrix.

    Image

We partition the 9x9 grid into 3x3 blocks and assign one task to each block. The wavefront propagates task dependencies from the top-left block all the way to the bottom-right block. Each task precedes two tasks, one to the right and one below.

    Wavefront Task Graph

We can describe the wavefront parallelism in a simple two-level loop. Since we need to refer to the tasks above and to the left of a task when creating its dependencies, we use a 2D vector to pre-allocate all tasks via tf::Taskflow::placeholder.

    #include <taskflow/taskflow.hpp>
     
    -int main() {
    -  tf::Executor executor;
    -  tf::Taskflow taskflow;
    -  int num_blocks = 3;
    -  std::vector<std::vector<tf::Task>> node(num_blocks);
    +int main() {
    +  tf::Executor executor;
    +  tf::Taskflow taskflow;
    +  int num_blocks = 3;
    +  std::vector<std::vector<tf::Task>> node(num_blocks);
       
       // create num_blocks*num_blocks placeholder tasks
    -  for(auto &n : node){
    -    for(int i=0; i<num_blocks; i++){
    -      n.emplace_back(taskflow.placeholder());
    +  for(auto &n : node){
    +    for(int i=0; i<num_blocks; i++){
    +      n.emplace_back(taskflow.placeholder());
         }   
    -  }
    +  }
       
       // scan each block and create dependencies
       for( int i=num_blocks; --i>=0; ) { 
    @@ -78,217 +78,217 @@ 

    Contents

      node[i][j].work([=]() { printf("compute block (%d, %d)", i, j); });
      // wavefront dependency
      if(j+1 < num_blocks) node[i][j].precede(node[i][j+1]);
      if(i+1 < num_blocks) node[i][j].precede(node[i+1][j]);
    }
  }

  executor.run(taskflow).wait();

  // dump the taskflow
  taskflow.dump(std::cout);
}

The figure below shows the wavefront parallelism in a 3x3 grid:

[Figure: the dumped taskflow graph of the wavefront, with blocks B_0_0 through B_3_3, each preceding the block to its right and the block below it]
@@ -337,7 +337,7 @@

    Contents

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/work-stealing.png b/docs/work-stealing.png new file mode 100644 index 000000000..95bf39ff8 Binary files /dev/null and b/docs/work-stealing.png differ diff --git a/docs/worker_8hpp.html b/docs/worker_8hpp.html index c3c3a35fe..7c530f422 100644 --- a/docs/worker_8hpp.html +++ b/docs/worker_8hpp.html @@ -5,7 +5,7 @@ Codestin Search App - + @@ -78,7 +78,11 @@

    Classes

    class tf::WorkerView
    -
    class to create an immutable view of a worker in an executor
    +
    class to create an immutable view of a worker
    +
    + class tf::WorkerInterface +
    +
    class to configure worker behavior in an executor
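For orientation, a sketch under the assumption that tf::WorkerInterface exposes the scheduler_prologue/scheduler_epilogue hooks and the tf::make_worker_interface helper described in the Taskflow documentation:

    #include <taskflow/taskflow.hpp>
    #include <cstdio>

    struct MyWorkerBehavior : public tf::WorkerInterface {
      // invoked by a worker thread right before it enters the scheduling loop
      void scheduler_prologue(tf::Worker& w) override {
        std::printf("worker %zu enters the scheduler\n", w.id());
      }
      // invoked by a worker thread right after it leaves the scheduling loop
      void scheduler_epilogue(tf::Worker& w, std::exception_ptr) override {
        std::printf("worker %zu leaves the scheduler\n", w.id());
      }
    };

    int main() {
      // construct an executor of 4 workers with the custom interface
      tf::Executor executor(4, tf::make_worker_interface<MyWorkerBehavior>());
    }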
    @@ -125,7 +129,7 @@

    Classes

    -

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
    Generated by Doxygen 1.9.1 and m.css.

    +

    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
    Generated by Doxygen 1.12.0 and m.css.

    diff --git a/docs/xml/Algorithms.xml b/docs/xml/Algorithms.xml index 01ccc1e18..21c0b69cb 100644 --- a/docs/xml/Algorithms.xml +++ b/docs/xml/Algorithms.xml @@ -1,5 +1,5 @@ - + Algorithms Codestin Search App @@ -10,6 +10,7 @@ Parallel Sort Parallel Scan Parallel Find + Module Algorithm Task-parallel Pipeline Task-parallel Scalable Pipeline Task-parallel Pipeline with Token Dependencies @@ -26,6 +27,7 @@ Parallel Sort Parallel Scan Parallel Find +Module Algorithm Task-parallel Pipeline Task-parallel Scalable Pipeline Task-parallel Pipeline with Token Dependencies @@ -33,6 +35,6 @@ - + diff --git a/docs/xml/AsyncTasking.xml b/docs/xml/AsyncTasking.xml index db3fb33d5..e5165aedf 100644 --- a/docs/xml/AsyncTasking.xml +++ b/docs/xml/AsyncTasking.xml @@ -1,5 +1,5 @@ - + AsyncTasking Codestin Search App @@ -7,34 +7,29 @@ Launch Asynchronous Tasks from an Executor AsyncTasking_1LaunchAsynchronousTasksFromAnExecutor - - - Launch Asynchronous Tasks from a Subflow - AsyncTasking_1LaunchAsynchronousTasksFromAnSubflow - + Launch Asynchronous Tasks from a Runtime AsyncTasking_1LaunchAsynchronousTasksFromARuntime - + + + Launch Asynchronous Tasks Recursively from a Runtime + AsyncTasking_1LaunchAsynchronousTasksRecursivelyFromARuntime + This chapters discusses how to launch tasks asynchronously so that you can incorporate independent, dynamic parallelism in your taskflows. -Codestin Search App -Taskflow executor provides an STL-styled method, tf::Executor::async, for you to run a callable object asynchronously. The method returns a std::future that will eventually hold the result of that function call. -std::future<int>future=executor.async([](){return1;}); +Codestin Search AppTaskflow's executor provides an STL-style method, tf::Executor::async, that allows you to run a callable object asynchronously. This method returns a std::future which will eventually hold the result of the function call. +std::future<int>future=executor.async([](){return1;}); assert(future.get()==1); -Unlike std::async, the future object returned from tf::Executor::async does not block on destruction until completing the function. - -If you do not need the return value or use a future to synchronize the execution, you are encouraged to use tf::Executor::silent_async which returns nothing and thus has less overhead (i.e., no shared state management) compared to tf::Executor::async. -executor.silent_async([](){ -//dosomeworkwithoutreturninganyresult -}); +If you do not need the return value or do not require a std::future for synchronization, you should use tf::Executor::silent_async. This method returns nothing and incurs less overhead than tf::Executor::async, as it avoids the cost of managing a shared state for std::future. +executor.silent_async([](){}); -Launching asynchronous tasks from an executor is thread-safe and can be called by multiple threads both inside (i.e., worker) and outside the executor. Our scheduler autonomously detects whether an asynchronous task is submitted from an external thread or a worker thread and schedules its execution using work stealing. +Launching asynchronous tasks from an executor is thread-safe and can be invoked from multiple threads, including both worker threads inside the executor and external threads outside of it. The scheduler automatically detects the source of the submission and employs work-stealing to schedule the task efficiently, ensuring balanced workload distribution across workers. 
tf::Taskmy_task=taskflow.emplace([&](){ //launchanasynchronoustaskfrommy_task executor.async([&](){ @@ -45,104 +40,93 @@ If you do not need the return value or use a future to synchronize the execution executor.run(taskflow); executor.wait_for_all();//waitforalltaskstofinish -Asynchronous tasks created from an executor does not belong to any taskflows. The lifetime of an asynchronous task is managed automatically by the executor that creates the task. +Asynchronous tasks created from an executor do not belong to any taskflow. Their lifetime is automatically managed by the executor that created them. -You can name an asynchronous task using the overloads, tf::Executor::async(const std::string& name, F&& f) and tf::Executor::silent_async(const std::string& name, F&& f), that take a string in the first argument. Assigned names will appear in the observers of the executor. -std::future<void>fu=executor.async("asynctask",[](){}); -executor.silent_async("silengasynctask",[](){}); - - - -Codestin Search App -You can launch asynchronous tasks from tf::Subflow using tf::Subflow::async. Asynchronous tasks are independent tasks spawned during the execution of a subflow. When the subflow joins, all asynchronous tasks are guaranteed to finish. The following code creates 100 asynchronous tasks from a subflow and joins their executions explicitly using tf::Subflow::join. -tf::Taskflowtaskflow; -tf::Executorexecutor; - -std::atomic<int>counter{0}; - -taskflow.emplace([&](tf::Subflow&sf){ -std::vector<std::future<void>>futures; -for(inti=0;i<100;i++){ -futures.emplace_back(sf.async([&](){++counter;})); -} -sf.join();//allofthe100asynchronoustaskswillfinishbythisjoin -assert(counter==100); -}); - -executor.run(taskflow).wait(); - -If you do not need the return value or the future to synchronize the execution, you can use tf::Subflow::silent_async which has less overhead when creating an asynchronous task compared to tf::Subflow::async. -tf::Taskflowtaskflow; -tf::Executorexecutor; - -std::atomic<int>counter{0}; - -taskflow.emplace([&](tf::Subflow&sf){ -for(inti=0;i<100;i++){ -sf.silent_async([&](){++counter;}); -} -sf.join();//allofthe100asynchronoustaskswillfinishbythisjoin -assert(counter==100); -}); - -executor.run(taskflow).wait(); - -You should only create asynchronous tasks from a joinable subflow. Launching asynchronous tasks from a detached subflow results in undefined behavior. - -You can assign an asynchronous task a name using the two overloads, tf::Subflow::async(const std::string& name, F&& f) and tf::Subflow::silent_async(const std::string& name, F&& f). Both methods take an additional argument of a string. -taskflow.emplace([](tf::Subflow&sf){ -std::future<void>future=sf.async("nameofthetask",[](){}); -sf.silent_async("anothernameofthetask",[](){}); -sf.join(); -}); - + -Codestin Search App -The asynchronous tasking feature of tf::Subflow is indeed derived from tf::Runtime. You can launch asynchronous tasks from tf::Runtime using tf::Runtime::async or tf::Runtime::silent_async. The following code creates 100 asynchronous tasks from a runtime and joins their executions explicitly using tf::Runtime::corun_all. +Codestin Search AppYou can launch asynchronous tasks from tf::Runtime using tf::Runtime::async or tf::Runtime::silent_async. The following code creates 100 asynchronous tasks from a runtime and joins their executions explicitly using tf::Runtime::corun. 
tf::Taskflowtaskflow; tf::Executorexecutor; -std::atomic<int>counter{0}; +std::atomic<int>counter{0}; taskflow.emplace([&](tf::Runtime&rt){ for(inti=0;i<100;i++){ rt.silent_async([&](){++counter;})); } -rt.join();//allofthe100asynchronoustaskswillfinishbythisjoin +rt.corun();//allofthe100asynchronoustaskswillfinishbythisjoin assert(counter==100); }); executor.run(taskflow).wait(); -Unlike tf::Subflow::join, you can call tf::Runtime::corun_all multiple times to synchronize the execution of asynchronous tasks between different runs. For example, the following code spawn 100 asynchronous tasks twice and join each execution to assure the spawned 100 asynchronous tasks have properly completed. +Unlike tf::Subflow::join, you can call tf::Runtime::corun multiple times to synchronize the execution of asynchronous tasks between different runs. For example, the following code spawn 100 asynchronous tasks twice and join each execution to assure the spawned 100 asynchronous tasks have properly completed. tf::Taskflowtaskflow; tf::Executorexecutor; -std::atomic<int>counter{0}; +std::atomic<int>counter{0}; taskflow.emplace([&](tf::Runtime&rt){ //spawn100asynchronoustasksandjoin for(inti=0;i<100;i++){ rt.silent_async([&](){++counter;})); } -rt.join();//allofthe100asynchronoustaskswillfinishbythisjoin +rt.corun();//allofthe100asynchronoustaskswillfinishbythisjoin assert(counter==100); //spawnanother100asynchronoustasksandjoin for(inti=0;i<100;i++){ rt.silent_async([&](){++counter;})); } -rt.join();//allofthe100asynchronoustaskswillfinishbythisjoin +rt.corun();//allofthe100asynchronoustaskswillfinishbythisjoin assert(counter==200); }); executor.run(taskflow).wait(); -By default, tf::Runtime does not join like tf::Subflow. All pending asynchronous tasks spawned by tf::Runtime are no longer controllable when their parent runtime disappears. It is your responsibility to properly synchronize spawned asynchronous tasks using tf::Runtime::corun_all. -Creating asynchronous tasks from a runtime allows users to efficiently implement parallel algorithms using recursion, such as parallel sort (tf::Taskflow::sort), that demands dynamic parallelism at runtime. +By default, tf::Runtime does not join like tf::Subflow. All pending asynchronous tasks spawned from a tf::Runtime become uncontrollable once their parent runtime goes out of scope. It is user's responsibility to explicitly synchronize these tasks using tf::Runtime::corun. +Creating asynchronous tasks from a runtime enables efficient implementation of recursive parallel algorithms, such as tf::Taskflow::sort, that require dynamic task creation at runtime. + + +Codestin Search AppAsynchronous tasks can take a reference to tf::Runtime, allowing them to recursively launch additional asynchronous tasks. Combined with tf::Runtime::corun, this enables the implementation of various recursive parallelism patterns, including parallel sort, divide-and-conquer algorithms, and the fork-join model. 
For instance, the example below demonstrates a parallel recursive implementation of Fibonacci numbers using recursive asynchronous tasking from tf::Runtime: +#include<taskflow/taskflow.hpp> + +size_tfibonacci(size_tN,tf::Runtime&rt){ + +if(N<2)returnN; + +size_tres1,res2; +rt.silent_async([N,&res1](tf::Runtime&rt1){res1=fibonacci(N-1,rt1);}); + +//tailoptimizationfortherightchild +res2=fibonacci(N-2,rt); + +//usecoruntoavoidblockingtheworkerfromwaitingthetwochildrentasks +//tofinish +rt.corun(); + +returnres1+res2; +} + +intmain(){ + +tf::Executorexecutor; + +size_tN=5,res; +executor.silent_async([N,&res](tf::Runtime&rt){res=fibonacci(N,rt);}); +executor.wait_for_all(); + +std::cout<<N<<"-thFibonaccinumberis"<<res<<'\n'; + +return0; +} + +The figure below shows the execution diagram, where the suffix *_1 represent the left child spawned by its parent runtime. + + - + diff --git a/docs/xml/BenchmarkTaskflow.xml b/docs/xml/BenchmarkTaskflow.xml index 6630bb20b..0eda5da29 100644 --- a/docs/xml/BenchmarkTaskflow.xml +++ b/docs/xml/BenchmarkTaskflow.xml @@ -1,5 +1,5 @@ - + BenchmarkTaskflow Codestin Search App @@ -7,40 +7,39 @@ Compile and Run Benchmarks BenchmarkTaskflow_1CompileAndRunBenchmarks - + Configure Run Options BenchmarkTaskflow_1ConfigureRunOptions - - - Specify the Run Model - BenchmarkTaskflow_1SpecifyTheRunModel - - - Specify the Number of Threads - BenchmarkTaskflow_1SpecifyTheNumberOfThreads - - - Specify the Number of Rounds - BenchmarkTaskflow_1SpecifyTheNumberOfRounds - - - + + + Specify the Run Model + BenchmarkTaskflow_1SpecifyTheRunModel + + + Specify the Number of Threads + BenchmarkTaskflow_1SpecifyTheNumberOfThreads + + + Specify the Number of Rounds + BenchmarkTaskflow_1SpecifyTheNumberOfRounds + + + -Codestin Search App -To build the benchmark code, enable the CMake option TF_BUILD_BENCHMARKS to ON as follows: -#under/taskflow/build +Codestin Search AppTo build the benchmark code, enable the CMake option TF_BUILD_BENCHMARKS to ON as follows: +#under/taskflow/build ~$cmake../-DTF_BUILD_BENCHMARKS=ON ~$make After you successfully build the benchmark code, you can find all benchmark instances in the benchmarks/ folder. You can run the executable of each instance in the corresponding folder. -~$cdbenchmarks&ls -black_scholesbinary_treegraph_traversal... -~$cdgraph_traversal&./graph_traversal +~$cdbenchmarks&ls +bench_black_scholesbench_binary_treebench_graph_traversal... +~$./bench_graph_traversal |V|+|E|Runtime 20.197 8420.198 @@ -52,10 +51,10 @@ 66477177.436 71120083.957 -You can display the help message by giving the option help. -~$./graph_traversal--help +You can display the help message by giving the option --help. +~$./bench_graph_traversal--help GraphTraversal -Usage:./graph_traversal[OPTIONS] +Usage:./bench_graph_traversal[OPTIONS] Options: -h,--helpPrintthishelpmessageandexit @@ -64,64 +63,107 @@ -m,--modelTEXTmodelnametbb|omp|tf(default=tf) We currently implement the following instances that are commonly used by the parallel computing community to evaluate the system performance. - +
    Instance Description -binary_tree +bench_binary_tree traverses a complete binary tree -black_scholes +bench_black_scholes computes option pricing with Black-Shcoles Models -graph_traversal +bench_graph_traversal traverses a randomly generated direct acyclic graph -linear_chain +bench_linear_chain traverses a linear chain of tasks -mandelbrot +bench_mandelbrot exploits imbalanced workloads in a Mandelbrot set -matrix_multiplication +bench_matrix_multiplication multiplies two 2D matrices -mnist +bench_mnist trains a neural network-based image classifier on the MNIST dataset -parallel_sort +bench_parallel_sort sorts a range of items -reduce_sum +bench_reduce_sum sums a range of items using reduction -wavefront +bench_wavefront propagates computations in a 2D grid -linear_pipeline -pipeline scheduling on a linear chain of pipes +bench_linear_pipeline +performs pipeline parallelism on a linear chain of pipes + + +bench_graph_pipeline +performs pipeline parallelism on a graph of pipes + + +bench_deferred_pipeline +performs pipeline parallelism with dependencies from future pipes + + +bench_data_pipeline +performs pipeline parallelisms on a cache-friendly data wrapper + + +bench_thread_pool +uses our executor as a simple thread pool + + +bench_for_each +performs parallel-iteration algorithms + + +bench_scan +performs parallel-scan algorithms + + +bench_async_task +creates asynchronous tasks + + +bench_fibonacci +finds Fibonacci numbers using recursive asynchronous tasking + + +bench_nqueens +parallelizes n-queen search using recursive asynchronous tasking + + +bench_integrate +parallelizes integration using recursive asynchronous tasking + + +bench_primes +finds a range of prime numbers using parallel-reduction algorithms -graph_pipeline -pipeline scheduling on a graph of pipes +bench_skynet +traverses a 10-ray tree using recursive asynchronous tasking
    -Codestin Search App -We implement consistent options for each benchmark instance. Common options are: +Codestin Search AppWe implement consistent options for each benchmark instance. Common options are: option value @@ -130,50 +172,47 @@ -h none -display the help message +displays the help message -t integer -configure the number of threads to run +configures the number of threads to run -r integer -configure the number of rounds to run +configures the number of rounds to run -m string -configure the baseline models to run, tbb, omp, or tf +configures the baseline models to run, tbb, omp, or tf
    You can configure the benchmarking environment by giving different options. -Codestin Search App -In addition to a Taskflow-based implementation for each benchmark instance, we have implemented two baseline models using the state-of-the-art parallel programming libraries, OpenMP and Intel TBB, to measure and evaluate the performance of Taskflow. You can select different implementations by passing the option -m. -~$./graph_traversal-mtf#runtheTaskflowimplementation(default) -~$./graph_traversal-mtbb#runtheTBBimplementation -~$./graph_traversal-momp#runtheOpenMPimplementation +Codestin Search AppIn addition to a Taskflow-based implementation for each benchmark instance, we have implemented two baseline models using the state-of-the-art parallel programming libraries, OpenMP and Intel TBB, to measure and evaluate the performance of Taskflow. You can select different implementations by passing the option -m. +~$./bench_graph_traversal-mtf#runtheTaskflowimplementation(default) +~$./bench_graph_traversal-mtbb#runtheTBBimplementation +~$./bench_graph_traversal-momp#runtheOpenMPimplementation -Codestin Search App -You can configure the number of threads to run a benchmark instance by passing the option -t. The default value is one. -#runtheTaskflowimplementationusing4threads -~$./graph_traversal-mtf-t4 +Codestin Search AppYou can configure the number of threads to run a benchmark instance by passing the option -t. The default value is one. +#runtheTaskflowimplementationusing4threads +~$./bench_graph_traversal-mtf-t4 Depending on your environment, you may need to use taskset to set the CPU affinity of the running process. This allows the OS scheduler to keep process on the same CPU(s) as long as practical for performance reason. -#affinetheprocessto4CPUs,CPU0,CPU1,CPU2,andCPU3 -~$taskset-c0-3graph_traversal-t4 +#affinetheprocessto4CPUs,CPU0,CPU1,CPU2,andCPU3 +~$taskset-c0-3bench_graph_traversal-t4 -Codestin Search App -Each benchmark instance evaluates the runtime of the implementation at different problem sizes. Each problem size corresponds to one iteration. You can configure the number of rounds per iteration to average the runtime. -#measuretheruntimeinanaverageof10runs -~$./graph_traversal-r10 +Codestin Search AppEach benchmark instance evaluates the runtime of the implementation at different problem sizes. Each problem size corresponds to one iteration. You can configure the number of rounds per iteration to average the runtime. +#measurethe%Taskflowruntimebyaveragingtheresultsover10runs +~$./bench_graph_traversal-r10-mtf |V|+|E|Runtime 20.109#theruntimevalue0.109isanaverageof10runs 8420.298 @@ -184,6 +223,6 @@
    - +
    diff --git a/docs/xml/CUDASTDExecutionPolicy.xml b/docs/xml/CUDASTDExecutionPolicy.xml deleted file mode 100644 index 411ecb91e..000000000 --- a/docs/xml/CUDASTDExecutionPolicy.xml +++ /dev/null @@ -1,67 +0,0 @@ - - - - CUDASTDExecutionPolicy - Codestin Search App - - - Include the Header - CUDASTDExecutionPolicy_1CUDASTDExecutionPolicyIncludeTheHeader - - - Parameterize Performance - CUDASTDExecutionPolicy_1CUDASTDParameterizePerformance - - - Define an Execution Policy - CUDASTDExecutionPolicy_1CUDASTDDefineAnExecutionPolicy - - - Allocate Memory Buffer for Algorithms - CUDASTDExecutionPolicy_1CUDASTDAllocateMemoryBufferForAlgorithms - - - - - -Taskflow provides standalone template methods for expressing common parallel algorithms on a GPU. Each of these methods is governed by an execution policy object to configure the kernel execution parameters. - -Codestin Search App -You need to include the header file, taskflow/cuda/cudaflow.hpp, for creating a CUDA execution policy object. -#include<taskflow/cuda/cudaflow.hpp> - - - -Codestin Search App -Taskflow parameterizes most CUDA algorithms in terms of the number of threads per block and units of work per thread, which can be specified in the execution policy template type, tf::cudaExecutionPolicy. The design is inspired by Modern GPU Programming authored by Sean Baxter to achieve high-performance GPU computing. - - -Codestin Search App -The following example defines an execution policy object, policy, which configures (1) each block to invoke 512 threads and (2) each of these 512 threads to perform 11 units of work. Block size must be a power of two. It is always a good idea to specify an odd number in the second parameter to avoid bank conflicts. -tf::cudaExecutionPolicy<512, 11>policy; - -By default, the execution policy object is associated with the CUDA default stream (i.e., 0). Default stream can incur significant overhead due to the global synchronization. You can associate an execution policy with another stream as shown below: -//createaRAII-styledstreamobject -tf::cudaStreamstream1,stream2; - -//assignastreamtoapolicyatconstructiontime -tf::cudaExecutionPolicy<512, 11>policy(stream1); - -//assignanotherstreamtothepolicy -policy.stream(stream2); - -All the CUDA standard algorithms in Taskflow are asynchronous with respect to the stream assigned to the execution policy. This enables high execution efficiency for large GPU workloads that call for many different algorithms. You can synchronize the stream the block until all tasks in the stream finish: -cudaStreamSynchronize(policy.stream()); - -The best-performing configurations for each algorithm, each GPU architecture, and each data type can vary significantly. You should experiment different configurations and find the optimal tuning parameters for your applications. A default policy is given in tf::cudaDefaultExecutionPolicy. -tf::cudaDefaultExecutionPolicydefault_policy; - - - -Codestin Search App -A key difference between our CUDA standard algorithms and others (e.g., Thrust) is the memory management. Unlike CPU-parallel algorithms, many GPU-parallel algorithms require extra buffer to store the temporary results during the multi-phase computation, for instance, tf::cuda_reduce and tf::cuda_sort. We DO NOT allocate any memory during these algorithms call but ask you to provide the memory buffer required for each of such algorithms. 
This decision seems to complicate the code a little bit, but it gives applications freedom to optimize the memory; also, it makes all algorithm calls capturable to a CUDA graph to improve the execution efficiency. - - - - - diff --git a/docs/xml/CUDASTDFind.xml b/docs/xml/CUDASTDFind.xml deleted file mode 100644 index 944edeae7..000000000 --- a/docs/xml/CUDASTDFind.xml +++ /dev/null @@ -1,180 +0,0 @@ - - - - CUDASTDFind - Codestin Search App - - - Include the Header - CUDASTDFind_1CUDASTDFindIncludeTheHeader - - - Find an Element in a Range - CUDASTDFind_1CUDASTDFindItems - - - Find the Minimum Element in a Range - CUDASTDFind_1CUDASTDFindMinItems - - - Find the Maximum Element in a Range - CUDASTDFind_1CUDASTDFindMaxItems - - - - - -Taskflow provides standalone template methods for finding elements in the given ranges using GPU. - -Codestin Search App -You need to include the header file, taskflow/cuda/algorithm/find.hpp, for using the parallel-find algorithm. -#include<taskflow/cuda/algorithm/find.hpp> - - - -Codestin Search App -tf::cuda_find_if finds the index of the first element in the range [first, last) that satisfies the given criteria. This is equivalent to the parallel execution of the following loop: -unsignedidx=0; -for(;first!=last;++first,++idx){ -if(p(*first)){ -returnidx; -} -} -returnidx; - -If no such an element is found, the size of the range is returned. The following code finds the index of the first element that is dividable by 17 over a range of one million elements. -constsize_tN=1000000; -autovec=tf::cuda_malloc_shared<int>(N);//vector -autoidx=tf::cuda_malloc_shared<unsigned>(1);//index - -//initializesthedata -for(size_ti=0;i<N;vec[i++]=rand()); - -//createanexecutionpolicy -tf::cudaDefaultExecutionPolicypolicy; - -//findstheindexofthefirstelementthatisamultipleof17 -tf::cuda_find_if( -policy,vec,vec+N,idx,[]__device__(autov){returnv%17==0;} -); - -//waitforthefindoperationtocomplete -stream.synchronize(); - -//verifiestheresult -if(*idx!=N){ -assert(vec[*idx]%17==0); -} - -//deletesthememory -cudaFree(vec); -cudaFree(idx); - -The find-if algorithm runs asynchronously through the stream specified in the execution policy. You need to synchronize the stream to obtain the correct result. - - -Codestin Search App -tf::cuda_min_element finds the index of the minimum element in the given range [first, last) using the given comparison function object. 
This is equivalent to a parallel execution of the following loop: -if(first==last){ -return0; -} -autosmallest=first; -for(++first;first!=last;++first){ -if(op(*first,*smallest)){ -smallest=first; -} -} -returnstd::distance(first,smallest); - -The following code finds the index of the minimum element in a range of one millions elements using GPU computing: -constsize_tN=1000000; -autovec=tf::cuda_malloc_shared<int>(N);//vector -autoidx=tf::cuda_malloc_shared<unsigned>(1);//index - -//initializesthedata -for(size_ti=0;i<N;vec[i++]=rand()); - -//createanexecutionpolicy -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//queriestherequiredbuffersizetofindtheminimumelementoverNelement -autobytes=policy.min_element_bufsz<int>(N); -autobuffer=tf::cuda_malloc_device<std::byte>(bytes); - -//findstheminimumelementusingthelesscomparator -tf::cuda_min_element( -policy,vec,vec+N,idx,[]__device__(autoa,autob){returna<b;},buffer -); - -//waitforthemin-elementoperationcompletes -stream.synchronize(); - -//verifiestheresult -assert(vec[*idx]==*std::min_element(vec,vec+N,std::less<int>{})); - -//deletesthememory -cudaFree(vec); -cudaFree(idx); -cudaFree(buffer); - -Since the GPU min-element algorithm may require extra buffer to store the temporary results, you need to provide a buffer of size at least larger or equal to the value returned from tf::cudaDefaultExecutionPolicy::min_element_bufsz. -You must keep the buffer alive before the tf::cuda_min_element completes. - - - - -Codestin Search App -Similar to tf::cuda_min_element, tf::cuda_max_element finds the index of the maximum element in the given range [first, last) using the given comparison function object. This is equivalent to a parallel execution of the following loop: -if(first==last){ -return0; -} -autolargest=first; -for(++first;first!=last;++first){ -if(op(*largest,*first)){ -largest=first; -} -} -returnstd::distance(first,largest); - -The following code finds the index of the maximum element in a range of one millions elements using GPU computing: -constsize_tN=1000000; -autovec=tf::cuda_malloc_shared<int>(N);//vector -autoidx=tf::cuda_malloc_shared<unsigned>(1);//index - -//initializesthedata -for(size_ti=0;i<N;vec[i++]=rand()); - -//createanexecutionpolicy -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//queriestherequiredbuffersizetofindthemaximumelementoverNelement -autobytes=policy.max_element_bufsz<int>(N); -autobuffer=tf::cuda_malloc_device<std::byte>(bytes); - -//findsthemaximumelementusingthelesscomparator -tf::cuda_max_element( -policy,vec,vec+N,idx,[]__device__(autoa,autob){returna<b;},buffer -); - -//waitforthemax-elementoperationtocomplete -stream.synchronize(); - -//verifiestheresult -assert(vec[*idx]==*std::max_element(vec,vec+N,std::less<int>{})); - -//deletesthememory -cudaFree(vec); -cudaFree(idx); -cudaFree(buffer); - -Since the GPU max-element algorithm may require extra buffer to store the temporary results, you need to provide a buffer of size at least larger or equal to the value returned from tf::cudaDefaultExecutionPolicy::max_element_bufsz. -You must keep the buffer alive before tf::cuda_max_element completes. 
- - - - - - - diff --git a/docs/xml/CUDASTDForEach.xml b/docs/xml/CUDASTDForEach.xml deleted file mode 100644 index e4296b052..000000000 --- a/docs/xml/CUDASTDForEach.xml +++ /dev/null @@ -1,80 +0,0 @@ - - - - CUDASTDForEach - Codestin Search App - - - Include the Header - CUDASTDForEach_1CUDASTDParallelIterationIncludeTheHeader - - - Index-based Parallel Iterations - CUDASTDForEach_1CUDASTDIndexBasedParallelFor - - - Iterator-based Parallel Iterations - CUDASTDForEach_1CUDASTDIteratorBasedParallelFor - - - - - -Taskflow provides standard template methods for performing parallel iterations over a range of items a CUDA GPU. - -Codestin Search App -You need to include the header file, taskflow/cuda/algorithm/for_each.hpp, for using the parallel-iteration algorithm. -#include<taskflow/cuda/algorithm/for_each.hpp> - - - -Codestin Search App -Index-based parallel-for performs parallel iterations over a range [first, last) with the given step size. The task created by tf::cuda_for_each_index represents a kernel of parallel execution for the following loop: -//positivestep:first,first+step,first+2*step,... -for(autoi=first;i<last;i+=step){ -callable(i); -} -//negativestep:first,first-step,first-2*step,... -for(autoi=first;i>last;i+=step){ -callable(i); -} - -Each iteration i is independent of each other and is assigned one kernel thread to run the callable. The following example creates a kernel that assigns each entry of data to 1 over the range [0, 100) with step size 1. -tf::cudaDefaultExecutionPolicypolicy; -autodata=tf::cuda_malloc_shared<int>(100); - -//assignseachelementindatato1overtherange[0,100)withstepsize1 -tf::cuda_for_each_index( -policy,0,100,1,[data]__device__(intidx){data[idx]=1;} -); - -//synchronizetheexecution -policy.synchronize(); - -The parallel-iteration algorithm runs asynchronously through the stream specified in the execution policy. You need to synchronize the stream to obtain correct results. - - -Codestin Search App -Iterator-based parallel-for performs parallel iterations over a range specified by two STL-styled iterators, first and last. The task created by tf::cuda_for_each represents a parallel execution of the following loop: -for(autoi=first;i<last;i++){ -callable(*i); -} - -The two iterators, first and last, are typically two raw pointers to the first element and the next to the last element in the range in GPU memory space. The following example creates a for_each kernel that assigns each element in gpu_data to 1 over the range [data, data + 1000). -tf::cudaDefaultExecutionPolicypolicy; -autodata=tf::cuda_malloc_shared<int>(1000); - -//assignseachelementindatato1overtherange[0,1000)withstepsize1 -tf::cuda_for_each( -policy,data,data+1000,[]__device__(int&item){item=1;} -); - -//synchronizetheexecution -policy.synchronize(); - -Each iteration is independent of each other and is assigned one kernel thread to run the callable. Since the callable runs on GPU, it must be declared with a __device__ specifier. 
- - - - - diff --git a/docs/xml/CUDASTDMerge.xml b/docs/xml/CUDASTDMerge.xml deleted file mode 100644 index 25bdb3be9..000000000 --- a/docs/xml/CUDASTDMerge.xml +++ /dev/null @@ -1,133 +0,0 @@ - - - - CUDASTDMerge - Codestin Search App - - - Include the Header - CUDASTDMerge_1CUDASTDMergeIncludeTheHeader - - - Merge Two Sorted Ranges of Items - CUDASTDMerge_1CUDASTDMergeItems - - - Merge Two Sorted Ranges of Key-Value Items - CUDASTDMerge_1CUDASTDMergeKeyValueItems - - - - - -Taskflow provides standalone template methods for merging two sorted ranges of items into a sorted range of items. - -Codestin Search App -You need to include the header file, taskflow/cuda/algorithm/merge.hpp, for using the parallel-merge algorithm. -#include<taskflow/cuda/algorithm/merge.hpp> - - - -Codestin Search App -tf::cuda_merge merges two sorted ranges of items into a sorted range. The following code merges two sorted arrays input_1 and input_2, each of 1000 items, into a sorted array output of 2000 items. -constsize_tN=1000; -int*input_1=tf::cuda_malloc_shared<int>(N);//inputvector1 -int*input_2=tf::cuda_malloc_shared<int>(N);//inputvector2 -int*output=tf::cuda_malloc_shared<int>(2*N);//outputvector - -//initializesthedata -for(size_ti=0;i<N;i++){ -input_1[i]=rand()%100; -input_2[i]=rand()%100; -} -std::sort(input_1,input1+N); -std::sort(input_2,input2+N); - -//createanexecutionpolicy -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//queriestherequiredbuffersizetomergetwoN-elementsortedvectors -autobytes=policy.merge_bufsz(N,N); -autobuffer=tf::cuda_malloc_device<std::byte>(bytes); - -//mergeinput_1andinput_2tooutput -tf::cuda_merge(policy, -input_1,input_1+N,input_2,input_2+N,output, -[]__device__(inta,intb){returna<b;},//comparator -buffer -); - -//synchronizestheexecutionandverifiestheresult -stream.synchronize(); - -//verifytheresult -assert(std::is_sorted(output,output+2*N)); - -//deletethebuffer -cudaFree(input1); -cudaFree(input2); -cudaFree(output); -cudaFree(buffer); - -The merge algorithm runs asynchronously through the stream specified in the execution policy. You need to synchronize the stream to obtain correct results. Since the GPU merge algorithm may require extra buffer to store the temporary results, you need to provide a buffer of size at least larger or equal to the value returned from tf::cudaDefaultExecutionPolicy::merge_bufsz. The buffer size depends only on the two input vector sizes. -You must keep the buffer alive before the merge call completes. - - - - -Codestin Search App -tf::cuda_merge_by_key performs key-value merge over two sorted ranges in a similar way to tf::cuda_merge; additionally, it copies elements from the two ranges of values associated with the two input keys, respectively. 
The following code performs key-value merge over a and b: -constsize_tN=2; -int*a_keys=tf::cuda_malloc_shared<int>(N); -int*a_vals=tf::cuda_malloc_shared<int>(N); -int*b_keys=tf::cuda_malloc_shared<int>(N); -int*b_vals=tf::cuda_malloc_shared<int>(N); -int*c_keys=tf::cuda_malloc_shared<int>(2*N); -int*c_vals=tf::cuda_malloc_shared<int>(2*N); - -//initializesthedata -a_keys[0]=8,a_keys[1]=1; -a_vals[0]=1,a_vals[1]=2; -b_keys[0]=3,b_keys[1]=7; -b_vals[0]=3,b_vals[1]=4; - -//createanexecutionpolicy -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//queriestherequiredbuffersizetomergetwoN-elementsortedvectorsbykeys -autobytes=policy.merge_bufsz(N,N); -autobuffer=tf::cuda_malloc_device<std::byte>(bytes); - -//mergekeysandvaluesofaandbtoc -tf::cuda_merge_by_key( -policy, -a_keys,a_keys+N,a_vals, -b_keys,b_keys+N,b_vals, -c_keys,c_vals, -[]__device__(inta,intb){returna<b;},//comparator -buffer -); - -//waitforthemergetocomplete -stream.synchronize(); - -//now,c_keys={1,3,7,8} -//now,c_vals={2,3,4,1} - -//deletethedevicememory -cudaFree(buffer); -cudaFree(a_keys); -cudaFree(b_keys); -cudaFree(c_keys); -cudaFree(a_vals); -cudaFree(b_vals); -cudaFree(c_vals); - -Since the GPU merge algorithm may require extra buffer to store the temporary results, you need to provide a buffer of size at least larger or equal to the value returned from tf::cudaDefaultExecutionPolicy::merge_bufsz. The buffer size depends only on the two input vector sizes. - - - - - diff --git a/docs/xml/CUDASTDReduce.xml b/docs/xml/CUDASTDReduce.xml deleted file mode 100644 index e58819bc4..000000000 --- a/docs/xml/CUDASTDReduce.xml +++ /dev/null @@ -1,211 +0,0 @@ - - - - CUDASTDReduce - Codestin Search App - - - Include the Header - CUDASTDReduce_1CUDASTDParallelReductionIncludeTheHeader - - - Reduce a Range of Items with an Initial Value - CUDASTDReduce_1CUDASTDReduceItemsWithAnInitialValue - - - Reduce a Range of Items without an Initial Value - CUDASTDReduce_1CUDASTDReduceItemsWithoutAnInitialValue - - - Reduce a Range of Transformed Items with an Initial Value - CUDASTDReduce_1CUDASTDReduceTransformedItemsWithAnInitialValue - - - Reduce a Range of Transformed Items without an Initial Value - CUDASTDReduce_1CUDASTDReduceTransformedItemsWithoutAnInitialValue - - - - - -Taskflow provides standard template methods for reducing a range of items on a CUDA GPU. - -Codestin Search App -You need to include the header file, taskflow/cuda/algorithm/reduce.hpp, for using the parallel-reduction algorithm. -#include<taskflow/cuda/algorithm/reduce.hpp> - - - -Codestin Search App -tf::cuda_reduce performs a parallel reduction over a range of elements specified by [first, last) using the binary operator bop and stores the reduced result in result. It represents the parallel execution of the following reduction loop on a GPU: -while(first!=last){ -*result=bop(*result,*first++); -} - -The variable result participates in the reduction loop and must be initialized with an initial value. 
The following code performs a parallel reduction to sum all the numbers in the given range with an initial value 1000: -constsize_tN=1000000; -int*res=tf::cuda_malloc_shared<int>(1);//result -int*vec=tf::cuda_malloc_shared<int>(N);//vector - -//initializesthedata -*res=1000; -for(size_ti=0;i<N;i++) -vec[i]=i; -} - -//createanexecutionpolicy -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//queriestherequiredbuffersizetoreduceNelementsusingthegivenpolicy -autobytes=policy.reduce_bufsz<int>(N); -autobuffer=tf::cuda_malloc_device<std::byte>(bytes); - -//*res=1000+(0+1+2+3+4+...+N-1) -tf::cuda_reduce(policy, -vec,vec+N,res,[]__device__(inta,intb){returna+b;},buffer -); - -//synchronizetheexecution -stream.synchronize(); - -//deletethememory -cudaFree(buffer); -cudaFree(res); -cudaFree(vec); - -The reduce algorithm runs asynchronously through the stream specified in the execution policy. You need to synchronize the stream to obtain correct results. Since the GPU reduction algorithm may require extra buffer to store the temporary results, you need to provide a buffer of size at least larger or equal to the value returned from tf::cudaDefaultExecutionPolicy::reduce_bufsz. -You must keep the buffer alive before the reduction completes. - - - - -Codestin Search App -tf::cuda_uninitialized_reduce performs a parallel reduction over a range of items without an initial value. This method represents a parallel execution of the following reduction loop on a GPU: -*result=*first++;//noinitialvaluestoparticipateinthereductionloop -while(first!=last){ -*result=bop(*result,*first++); -} - -The variable result is directly assigned the reduced value without any initial value participating in the reduction loop. The following code performs a parallel reduction to sum all the numbers in the given range without any initial value: -constsize_tN=1000000; -int*res=tf::cuda_malloc_shared<int>(1);//result -int*vec=tf::cuda_malloc_shared<int>(N);//vector - -//initializesthedata -for(size_ti=0;i<N;i++) -vec[i]=i; -} - -//createanexecutionpolicy -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//queriestherequiredbuffersizetoreduceNelementsusingthegivenpolicy -autobytes=policy.reduce_bufsz<int>(N); -autobuffer=tf::cuda_malloc_device<std::byte>(bytes); - -//*res=0+1+2+3+4+...+N-1 -tf::cuda_uninitialized_reduce(policy, -vec,vec+N,res,[]__device__(inta,intb){returna+b;},buffer -); - -//synchronizetheexecution -stream.synchronize(); - -//deletethebuffer -cudaFree(res); -cudaFree(vec); -cudaFree(buffer); - - - -Codestin Search App -tf::cuda_transform_reduce performs a parallel reduction over a range of transformed elements specified by [first, last) using a binary reduce operator bop and a unary transform operator uop. It represents the parallel execution of the following reduction loop on a GPU: -while(first!=last){ -*result=bop(*result,uop(*first++)); -} - -The variable result participates in the reduction loop and must be initialized with an initial value. 
The following code performs a parallel reduction to sum all the transformed numbers multiplied by 10 in the given range with an initial value 1000: -constsize_tN=1000000; -int*res=tf::cuda_malloc_shared<int>(1);//result -int*vec=tf::cuda_malloc_shared<int>(N);//vector - -//initializesthedata -*res=1000; -for(size_ti=0;i<N;i++){ -vec[i]=i; -} - -//createanexecutionpolicy -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//queriestherequiredbuffersizetoreduceNelementsusingthegivenpolicy -autobytes=policy.reduce_bufsz<int>(N); -autobuffer=tf::cuda_malloc_device<std::byte>(bytes); - -//*res=1000+(0*10+1*10+2*10+3*10+4*10+...+(N-1)*10) -tf::cuda_transform_reduce(policy, -vec,vec+N,res, -[]__device__(inta,intb){returna+b;}, -[]__device__(inta){returna*10;}, -buffer -); - -//synchronizetheexecution -stream.synchronize(); - -//deletethebuffer -cudaFree(res); -cudaFree(vec); -cudaFree(buffer); - - - -Codestin Search App -tf::cuda_transform_uninitialized_reduce performs a parallel reduction over a range of transformed items without an initial value. This method represents a parallel execution of the following reduction loop on a GPU: -*result=*first++;//noinitialvaluestoparticipateinthereductionloop -while(first!=last){ -*result=bop(*result,uop(*first++)); -} - -The variable result is directly assigned the reduced value without any initial value participating in the reduction loop. The following code performs a parallel reduction to sum all the transformed numbers multiplied by 10 in the given range without any initial value: -constsize_tN=1000000; -int*res=tf::cuda_malloc_shared<int>(1);//result -int*vec=tf::cuda_malloc_shared<int>(N);//vector - -//initializesthedata -for(size_ti=0;i<N;i++){ -vec[i]=i; -} - -//createanexecutionpolicy -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//queriestherequiredbuffersizetoreduceNelementsusingthegivenpolicy -autobytes=policy.reduce_bufsz<int>(N); -autobuffer=tf::cuda_malloc_device<std::byte>(bytes); - -//*res=0*10+1*10+2*10+3*10+4*10+...+(N-1)*10 -tf::cuda_uninitialized_reduce(policy, -vec,vec+N,res, -[]__device__(inta,intb){returna+b;}, -[]__device__(inta){returna*10;}, -buffer -); - -//synchronizetheexecution -stream.synchronize(); - -//deletethedata -cudaFree(res); -cudaFree(vec); -cudaFree(buffer); - - - - - - diff --git a/docs/xml/CUDASTDScan.xml b/docs/xml/CUDASTDScan.xml deleted file mode 100644 index 664e2316e..000000000 --- a/docs/xml/CUDASTDScan.xml +++ /dev/null @@ -1,171 +0,0 @@ - - - - CUDASTDScan - Codestin Search App - - - Include the Header - CUDASTDScan_1CUDASTDParallelScanIncludeTheHeader - - - What is a Scan Operation? - CUDASTDScan_1CUDASTDWhatIsAScanOperation - - - Scan a Range of Items - CUDASTDScan_1CUDASTDScanItems - - - Scan a Range of Transformed Items - CUDASTDScan_1CUDASTDScanTransformedItems - - - - - -Taskflow provides standard template methods for scanning a range of items on a CUDA GPU. - -Codestin Search App -You need to include the header file, taskflow/cuda/algorithm/scan.hpp, for using the parallel-scan algorithm. -#include<taskflow/cuda/algorithm/find.hpp> - - - -Codestin Search App -A parallel scan task performs the cumulative sum, also known as prefix sum or scan, of the input range and writes the result to the output range. Each element of the output range contains the running total of all earlier elements using the given binary operator for summation. 
- - - - -Codestin Search App -tf::cuda_inclusive_scan computes an inclusive prefix sum operation using the given binary operator over a range of elements specified by [first, last). The term "inclusive" means that the i-th input element is included in the i-th sum. The following code computes the inclusive prefix sum over an input array and stores the result in an output array. -constsize_tN=1000000; -int*input=tf::cuda_malloc_shared<int>(N);//inputvector -int*output=tf::cuda_malloc_shared<int>(N);//outputvector - -//initializesthedata -for(size_ti=0;i<N;input[i++]=rand()); - -//createanexecutionpolicy -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//queriestherequiredbuffersizetoscanNelementsusingthegivenpolicy -autobytes=policy.scan_bufsz<int>(N); -autobuffer=tf::cuda_malloc_device<std::byte>(bytes); - -//computesinclusivescanoverinputandstorestheresultinoutput -tf::cuda_inclusive_scan(policy, -input,input+N,output,[]__device__(inta,intb){returna+b;},buffer -); - -//synchronizesandverifiestheresult -stream.synchronize(); - -for(size_ti=1;i<N;i++){ -assert(output[i]==output[i-1]+input[i]); -} - -//deletethedevicememory -cudaFree(input); -cudaFree(output); -cudaFree(buffer); - -The scan algorithm runs asynchronously through the stream specified in the execution policy. You need to synchronize the stream to obtain correct results. Since the GPU scan algorithm may require extra buffer to store the temporary results, you need to provide a buffer of size at least larger or equal to the value returned from tf::cudaDefaultExecutionPolicy::scan_bufsz. -You must keep the buffer alive before the scan call completes. - -On the other hand, tf::cuda_exclusive_scan computes an exclusive prefix sum operation. The term "exclusive" means that the i-th input element is NOT included in the i-th sum. -//computesexclusivescanoverinputandstorestheresultinoutput -tf::cuda_exclusive_scan(policy, -input,input+N,output,[]__device__(inta,intb){returna+b;},buffer -); - -//synchronizestheexecutionandverifiestheresult -stream.synchronize(); -for(size_ti=1;i<N;i++){ -assert(output[i]==output[i-1]+input[i-1]); -} - - - -Codestin Search App -tf::cuda_transform_inclusive_scan transforms each item in the range [first, last) and computes an inclusive prefix sum over these transformed items. The following code multiplies each item by 10 and then compute the inclusive prefix sum over 1000000 transformed items. -constsize_tN=1000000; -int*input=tf::cuda_malloc_shared<int>(N);//inputvector -int*output=tf::cuda_malloc_shared<int>(N);//outputvector - -//initializesthedata -for(size_ti=0;i<N;input[i++]=rand()); - -//createanexecutionpolicy -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//queriestherequiredbuffersizetoscanNelementsusingthegivenpolicy -autobytes=policy.scan_bufsz<int>(N); -autobuffer=tf::cuda_malloc_device<std::byte>(bytes); - -//computesinclusivescanovertransformedinputandstorestheresultinoutput -tf::cuda_transform_inclusive_scan(policy, -input,input+N,output, -[]__device__(inta,intb){returna+b;},//binaryscanoperator -[]__device__(inta){returna*10;},//unarytransformoperator -buffer -); - -//waitforthescantocomplete -stream.synchronize(); - -//verifiestheresult -for(size_ti=1;i<N;i++){ -assert(output[i]==output[i-1]+input[i]*10); -} - -//deletethedevicememory -cudaFree(input); -cudaFree(output); -cudaFree(buffer); - -Similarly, tf::cuda_transform_exclusive_scan performs an exclusive prefix sum over a range of transformed items. 
The following code computes the exclusive prefix sum over 1000000 transformed items each multipled by 10. -constsize_tN=1000000; -int*input=tf::cuda_malloc_shared<int>(N);//inputvector -int*output=tf::cuda_malloc_shared<int>(N);//outputvector - -//initializesthedata -for(size_ti=0;i<N;input[i++]=rand()); - -//createanexecutionpolicy -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//queriestherequiredbuffersizetoscanNelementsusingthegivenpolicy -autobytes=policy.scan_bufsz<int>(N); -autobuffer=tf::cuda_malloc_device<std::byte>(bytes); - -//computesexclusivescanovertransformedinputandstorestheresultinoutput -tf::cuda_transform_exclusive_scan(policy, -input,input+N,output, -[]__device__(inta,intb){returna+b;},//binaryscanoperator -[]__device__(inta){returna*10;},//unarytransformoperator -buffer -); - -//waitforthescantocomplete -stream.synchronize(); - -//verifiestheresult -for(size_ti=1;i<N;i++){ -assert(output[i]==output[i-1]+input[i-1]*10); -} - -//deletethedevicememory -cudaFree(input); -cudaFree(output); -cudaFree(buffer); - - - - - - diff --git a/docs/xml/CUDASTDSingleTask.xml b/docs/xml/CUDASTDSingleTask.xml deleted file mode 100644 index 726ac06b9..000000000 --- a/docs/xml/CUDASTDSingleTask.xml +++ /dev/null @@ -1,45 +0,0 @@ - - - - CUDASTDSingleTask - Codestin Search App - - - Include the Header - CUDASTDSingleTask_1CUDASTDSingleTaskIncludeTheHeader - - - Run a Task with a Single Thread - CUDASTDSingleTask_1CUDASTDSingleTaskRunATaskWithASingleThread - - - - - -Taskflow provides a standard template method for running a callable using a single GPU thread. - -Codestin Search App -You need to include the header file, taskflow/cuda/algorithm/for_each.hpp, for creating a single-threaded task. -#include<taskflow/cuda/algorithm/for_each.hpp> - - - -Codestin Search App -You can launch a kernel with only one GPU thread running it, which is handy when you want to set up a single or a few variables that do not need multiple threads. The following example creates a single-task kernel that sets a device variable to 1. -tf::cudaStreamstream; -tf::cudaDefaultExecutionPolicypolicy(stream); - -//launchthesingle-taskkernelasynchronouslythroughthepolicy -tf::cuda_single_task(policy,[gpu_variable]__device__(){ -*gpu_Variable=1; -}); - -//waitforthekernelcompletes -stream.synchronize(); - -Since the callable runs on GPU, it must be declared with a __device__ specifier. - - - - - diff --git a/docs/xml/CUDASTDTransform.xml b/docs/xml/CUDASTDTransform.xml deleted file mode 100644 index fbbe767c3..000000000 --- a/docs/xml/CUDASTDTransform.xml +++ /dev/null @@ -1,72 +0,0 @@ - - - - CUDASTDTransform - Codestin Search App - - - Include the Header - CUDASTDTransform_1CUDASTDParallelTransformsIncludeTheHeader - - - Transform a Range of Items - CUDASTDTransform_1CUDASTDTransformARangeOfItems - - - Transform Two Ranges of Items - CUDASTDTransform_1CUDASTDTransformTwoRangesOfItems - - - - - -Taskflow provides template methods for transforming ranges of items to different outputs. - -Codestin Search App -You need to include the header file, taskflow/cuda/algorithm/transform.hpp, for using the parallel-transform algorithm. -#include<taskflow/cuda/algorithm/transform.hpp> - - - -Codestin Search App -Parallel-transform algorithm applies the given transform function to a range of items and store the result in another range specified by two iterators, first and last. 
The task created by tf::cuda_transform(P&& p, I first, I last, O output, C op) represents a parallel execution for the following loop: -while(first!=last){ -*output++=op(*first++); -} - -The following example creates a transform kernel that transforms an input range of N items to an output range by multiplying each item by 10. -tf::cudaDefaultExecutionPolicypolicy; - -//output[i]=input[i]*10 -tf::cuda_transform( -policy,input,input+N,output,[]__device__(intx){returnx*10;} -); - -//synchronizetheexecution -policy.synchronize(); - -Each iteration is independent of each other and is assigned one kernel thread to run the callable. The transform algorithm runs asynchronously through the stream specified in the execution policy. You need to synchronize the stream to obtain correct results. - - -Codestin Search App -You can transform two ranges of items to an output range through a binary operator. The task created by tf::cuda_transform(P&& p, I1 first1, I1 last1, I2 first2, O output, C op) represents a parallel execution for the following loop: -while(first1!=last1){ -*output++=op(*first1++,*first2++); -} - -The following example creates a transform kernel that transforms two input ranges of N items to an output range by summing each pair of items in the input ranges. -tf::cudaDefaultExecutionPolicypolicy; - -//output[i]=input1[i]+inpu2[i] -tf::cuda_transform(policy, -input1,input1+N,input2,output,[]__device__(inta,intb){returna+b;} -); - -//synchronizetheexecution -policy.synchronize(); - - - - - - diff --git a/docs/xml/CompileTaskflowWithCUDA.xml b/docs/xml/CompileTaskflowWithCUDA.xml index 4b83ed6c7..ba6ad8621 100644 --- a/docs/xml/CompileTaskflowWithCUDA.xml +++ b/docs/xml/CompileTaskflowWithCUDA.xml @@ -1,5 +1,5 @@ - + CompileTaskflowWithCUDA Codestin Search App @@ -7,72 +7,60 @@ Install CUDA Compiler CompileTaskflowWithCUDA_1InstallCUDACompiler - + Compile Source Code Directly CompileTaskflowWithCUDA_1CompileTaskflowWithCUDADirectly - + Compile Source Code Separately CompileTaskflowWithCUDA_1CompileTaskflowWithCUDASeparately - - - Link Objects Using nvcc - CompileTaskflowWithCUDA_1CompileTaskflowWithCUDANaiveLinking - - - Link Objects Using Different Linkers - CompileTaskflowWithCUDA_1CompileTaskflowWithCUDADifferentLinkers - - - + + + Link Objects Using nvcc + CompileTaskflowWithCUDA_1CompileTaskflowWithCUDANaiveLinking + + + Link Objects Using Different Linkers + CompileTaskflowWithCUDA_1CompileTaskflowWithCUDADifferentLinkers + + + -Codestin Search App -To compile Taskflow with CUDA code, you need a nvcc compiler. Please visit the official page of Downloading CUDA Toolkit. +Codestin Search AppTo compile Taskflow with CUDA code, you need a nvcc compiler. Please visit the official page of Downloading CUDA Toolkit. -Codestin Search App -Taskflow's GPU programming interface for CUDA is tf::cudaFlow. Consider the following simple.cu program that launches a single kernel function to output a message: +Codestin Search AppTaskflow's GPU programming interface for CUDA is tf::cudaFlow. 
Consider the following simple.cu program that launches a single kernel function to output a message: #include<taskflow/taskflow.hpp> #include<taskflow/cudaflow.hpp> -#include<taskflow/cuda/for_each.hpp> intmain(intargc,constchar**argv){ -tf::Executorexecutor; -tf::Taskflowtaskflow; - -tf::Tasktask1=taskflow.emplace([](){}).name("cputask"); -tf::Tasktask2=taskflow.emplace([](){ -//createacudaFlowofasingle-threadedtask -tf::cudaFlowcf; -cf.single_task([]__device__(){printf("hellocudaFlow!\n");}); - -//launchthecudaflowthroughastream -tf::cudaStreamstream; -cf.run(stream); -stream.synchronize(); -}).name("gputask"); +//createaCUDAgraphwithasingle-threadedtask +tf::cudaGraphcg; +cg.single_task([]__device__(){printf("helloCUDAGraph!\n");}); + +//instantiateanexecutableCUDAgraphandrunitthroughastream +tf::cudaStreamstream; +tf::cudaGraphExecexec(cg); -task1.precede(task2); +stream.run(exec).synchronize(); -executor.run(taskflow).wait(); return0; } The easiest way to compile Taskflow with CUDA code (e.g., cudaFlow, kernels) is to use nvcc: -~$nvcc-std=c++17-Ipath/to/taskflow/--extended-lambdasimple.cu-osimple +~$nvcc-std=c++17-Ipath/to/taskflow/--extended-lambdasimple.cu-osimple ~$./simple helloCUDAGraph! -Codestin Search App -Large GPU applications often compile a program into separate objects and link them together to form an executable or a library. You can compile your CPU code and GPU code separately with Taskflow using nvcc and other compilers (such as g++ and clang++). Consider the following example that defines two tasks on two different pieces (main.cpp and cudaflow.cpp) of source code: +Codestin Search AppLarge GPU applications often compile a program into separate objects and link them together to form an executable or a library. You can compile your CPU code and GPU code separately with Taskflow using nvcc and other compilers (such as g++ and clang++). Consider the following example that defines two tasks on two different pieces (main.cpp and cudaflow.cpp) of source code: //main.cpp #include<taskflow/taskflow.hpp> @@ -83,7 +71,7 @@ tf::Executorexecutor; tf::Taskflowtaskflow; -tf::Tasktask1=taskflow.emplace([](){std::cout<<"main.cpp!\n";}) +tf::Tasktask1=taskflow.emplace([](){std::cout<<"main.cpp!\n";}) .name("cputask"); tf::Tasktask2=make_cudaflow(taskflow); @@ -100,34 +88,34 @@ tf::Taskmake_cudaflow(tf::Taskflow&taskflow){ returntaskflow.emplace([](){ -//createacudaFlowofasingle-threadedtask -tf::cudaFlowcf; -cf.single_task([]__device__(){printf("cudaflow.cpp!\n");}); +//createaCUDAgraphwithasingle-threadedtask +tf::cudaGraphcg; +cg.single_task([]__device__(){printf("cudaflow.cpp!\n");}); -//launchthecudaflowthroughastream -tf::cudaStreamstream; -cf.run(stream); -stream.synchronize(); +//instantiateanexecutableCUDAgraphandrunitthroughastream +tf::cudaStreamstream; +tf::cudaGraphExecexec(cg); + +stream.run(exec).synchronize(); }).name("gputask"); } Compile each source to an object (g++ as an example): -~$g++-std=c++17-Ipath/to/taskflow-cmain.cpp-omain.o +~$g++-std=c++17-Ipath/to/taskflow-cmain.cpp-omain.o ~$nvcc-std=c++17--extended-lambda-xcu-Ipath/to/taskflow\ -dccudaflow.cpp-ocudaflow.o ~$ls #nowwehavethetwocompiled.oobjects,main.oandcudaflow.o main.ocudaflow.o -The extended-lambda option tells nvcc to generate GPU code for the lambda defined with __device__. The -x cu tells nvcc to treat the input files as .cu files containing both CPU and GPU code. By default, nvcc treats .cpp files as CPU-only code. 
This option is required to have nvcc generate device code here, but it is also a handy way to avoid renaming source files in larger projects. The –dc option tells nvcc to generate device code for later linking. +The --extended-lambda option tells nvcc to generate GPU code for the lambda defined with __device__. The -x cu tells nvcc to treat the input files as .cu files containing both CPU and GPU code. By default, nvcc treats .cpp files as CPU-only code. This option is required to have nvcc generate device code here, but it is also a handy way to avoid renaming source files in larger projects. The -dc option tells nvcc to generate device code for later linking. You may also need to specify the target architecture to tell nvcc to target a compatible SM architecture using the option -arch. For instance, the following command requires device code linking to have compute capability 7.5 or later: -~$nvcc-std=c++17--extended-lambda-xcu-arch=sm_75-Ipath/to/taskflow\ +~$nvcc-std=c++17--extended-lambda-xcu-arch=sm_75-Ipath/to/taskflow\ -dccudaflow.cpp-ocudaflow.o -Codestin Search App -Using nvcc to link compiled object code is nothing special but replacing the normal compiler with nvcc and it takes care of all the necessary steps: -~$nvccmain.ocudaflow.o-omain +Codestin Search AppUsing nvcc to link compiled object code is nothing special: you simply replace the normal compiler with nvcc, which takes care of all the necessary steps: +~$nvccmain.ocudaflow.o-omain #runthemainprogram ~$./main @@ -136,15 +124,14 @@ -Codestin Search App -You can choose to use a compiler other than nvcc for the final link step. Since your CPU compiler does not know how to link CUDA device code, you have to add a step in your build to have nvcc link the CUDA device code, using the option -dlink: -~$nvcc-ogpuCode.o-dlinkmain.ocudaflow.o +Codestin Search AppYou can choose to use a compiler other than nvcc for the final link step. Since your CPU compiler does not know how to link CUDA device code, you have to add a step in your build to have nvcc link the CUDA device code, using the option -dlink: +~$nvcc-ogpuCode.o-dlinkmain.ocudaflow.o This step links all the device object code and places it into gpuCode.o. -Note that this step does not link the CPU object code and discards the CPU object code in main.o and cudaflow.o. +Note that this step does not link the CPU object code and discards the CPU object code in main.o and cudaflow.o. To complete the link to an executable, you can use, for example, ld or g++. -#replace/usr/local/cuda/lib64withyourownCUDAlibraryinstallationpath +#replace/usr/local/cuda/lib64withyourownCUDAlibraryinstallationpath ~$g++-pthread-L/usr/local/cuda/lib64/-lcudart\ gpuCode.omain.ocudaflow.o-omain @@ -154,12 +141,12 @@ To complete the link to an executable, you can use, for example, cudaflow.cpp! We give g++ all of the objects again because it needs the CPU object code, which is not in gpuCode.o. The device code stored in the original objects, main.o and cudaflow.o, does not conflict with the code in gpuCode.o. g++ ignores device code because it does not know how to link it, and the device code in gpuCode.o is already linked and ready to go. -This intentional ignorance is extremely useful in large builds where intermediate objects may have both CPU and GPU code. In this case, we just let the GPU and CPU linkers each do its own job, noting that the CPU linker is always the last one we run. 
The CUDA Runtime API library is automatically linked when we use nvcc for linking, but we must explicitly link it (-lcudart) when using another linker. +This intentional ignorance is extremely useful in large builds where intermediate objects may have both CPU and GPU code. In this case, we just let the GPU and CPU linkers each do its own job, noting that the CPU linker is always the last one we run. The CUDA Runtime API library is automatically linked when we use nvcc for linking, but we must explicitly link it (-lcudart) when using another linker. - + diff --git a/docs/xml/ComposableTasking.xml b/docs/xml/ComposableTasking.xml index 9616ed963..38a639fc7 100644 --- a/docs/xml/ComposableTasking.xml +++ b/docs/xml/ComposableTasking.xml @@ -1,5 +1,5 @@ - + ComposableTasking Codestin Search App @@ -7,29 +7,28 @@ Compose a Taskflow ComposableTasking_1ComposeATaskflow - + - Create a Module Task - ComposableTasking_1CreateAModuleTask - + Create a Module Task from a %Taskflow + ComposableTasking_1CreateAModuleTaskFromATaskflow + Create a Custom Composable Graph ComposableTasking_1CreateACustomComposableGraph - + Composition is a key to improve the programmability of a complex workflow. This chapter describes how to create a large parallel graph through composition of modular and reusable blocks that are easier to optimize. -Codestin Search App -A powerful feature of tf::Taskflow is its composable interface. You can break down a large parallel workload into smaller pieces each designed to run a specific task dependency graph. This largely facilitates the modularity of writing a parallel task program. +Codestin Search AppA powerful feature of tf::Taskflow is its composable interface. You can break down a large parallel workload into smaller pieces each designed to run a specific task dependency graph. This largely facilitates the modularity of writing a parallel task program. 1://f1hasthreeindependenttasks 2:tf::Taskflowf1; 3:f1.name("F1"); -4:tf::Taskf1A=f1.emplace([&](){std::cout<<"F1TaskA\n";}); -5:tf::Taskf1B=f1.emplace([&](){std::cout<<"F1TaskB\n";}); -6:tf::Taskf1C=f1.emplace([&](){std::cout<<"F1TaskC\n";}); +4:tf::Taskf1A=f1.emplace([&](){std::cout<<"F1TaskA\n";}); +5:tf::Taskf1B=f1.emplace([&](){std::cout<<"F1TaskB\n";}); +6:tf::Taskf1C=f1.emplace([&](){std::cout<<"F1TaskC\n";}); 7: 8:f1A.name("f1A"); 9:f1B.name("f1B"); @@ -42,10 +41,10 @@ 16://f2B--- 17:tf::Taskflowf2; 18:f2.name("F2"); -19:tf::Taskf2A=f2.emplace([&](){std::cout<<"F2TaskA\n";}); -20:tf::Taskf2B=f2.emplace([&](){std::cout<<"F2TaskB\n";}); -21:tf::Taskf2C=f2.emplace([&](){std::cout<<"F2TaskC\n";}); -22:tf::Taskf2D=f2.emplace([&](){std::cout<<"F2TaskD\n";}); +19:tf::Taskf2A=f2.emplace([&](){std::cout<<"F2TaskA\n";}); +20:tf::Taskf2B=f2.emplace([&](){std::cout<<"F2TaskB\n";}); +21:tf::Taskf2C=f2.emplace([&](){std::cout<<"F2TaskC\n";}); +22:tf::Taskf2D=f2.emplace([&](){std::cout<<"F2TaskD\n";}); 23: 24:f2A.name("f2A"); 25:f2B.name("f2B"); @@ -59,9 +58,9 @@ 33:f2C.precede(f1_module_task); 34:f1_module_task.precede(f2D); 35: -36:f2.dump(std::cout); +36:f2.dump(std::cout); - + Debrief: @@ -78,24 +77,22 @@ - -Codestin Search App -The task created from Taskflow::composed_of is a module task that runs on a pre-defined taskflow. A module task does not own the taskflow but maintains a soft mapping to the taskflow. You can create multiple module tasks from the same taskflow but only one module task can run at one time. For example, the following composition is valid. 
Even though the two module tasks module1 and module2 refer to the same taskflow F1, the dependency link prevents F1 from multiple executions at the same time. - + +Codestin Search AppThe task created from Taskflow::composed_of is a module task that runs on a pre-defined taskflow. A module task does not own the taskflow but maintains a soft mapping to the taskflow. You can create multiple module tasks from the same taskflow but only one module task can run at one time. For example, the following composition is valid. Even though the two module tasks module1 and module2 refer to the same taskflow F1, the dependency link prevents F1 from multiple executions at the same time. + However, the following composition is invalid. Both module tasks refer to the same taskflow. They cannot run at the same time because they are associated with the same graph. - + -Codestin Search App -Taskflow allows you to create a custom graph object that can participate in the scheduling using composition. To become a module task, your class T must define a method T::graph() that returns a reference to a tf::Graph object. The following example defines a custom graph object that can be assembled in a taskflow throw composition: +Codestin Search AppTaskflow allows you to create a custom graph object that can participate in the scheduling using composition. To become a module task, your class T must define the method T::graph() that returns a reference to the tf::Graph object managed by T. The following example defines a custom graph object that can be assembled in a taskflow through composition: 1:structCustomGraph{ 2:tf::Graphgraph; 3:CustomGraph(){ -4:tf::FlowBuilderbuilder(graph); +4:tf::FlowBuilderbuilder(graph);//inheritalltaskbuildersintf::Taskflow 5:tf::Tasktask=builder.emplace([](){ -6:std::cout<<"atask\n";//statictask +6:std::cout<<"atask\n";//statictask 7:}); 8:} 9://returnsareferencetothegraphfortaskflowcomposition @@ -113,12 +110,12 @@ Lines 13-14 creates a module task for the declared graph object in the taskflow -The composition method tf::Taskflow::composed_of requires the target to define the graph() method that returns a reference to a tf::Graph object defined by the target. At runtime, the executor will run dependent tasks in that graph using the same work-stealing scheduling algorithm as other taskflows. Taskflow leverages this powerful feature to design high-level algorithms, such as tf::Pipeline. -While Taskflow gives you the flexibility to create a composable graph object, you should consider using tf::Graph as an opaque data structure just to interact with the library. Additionally, as other module tasks, Taskflow does not own the lifetime of a custom composable graph object but keeps a soft mapping to it. You should keep the graph object alive during its execution. +The composition method tf::Taskflow::composed_of requires the target to define the graph() method that returns a reference to a tf::Graph object defined by the target. At runtime, the executor will schedule tasks in that graph using the same work-stealing algorithm as other taskflows. +Users are responsible for ensuring the given target remains valid throughout its execution. The executor does not assume ownership of the target object. 
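To tie these pieces together, here is a minimal sketch (an illustration built only from the composition APIs discussed above) of the valid composition just described: two module tasks are created from the same taskflow f1 and serialized by a dependency link, so f1 never runs more than once at a time, and both taskflows stay alive until the run finishes.
tf::Executor executor;
tf::Taskflow f1, f2;

f1.emplace([](){ std::cout << "F1 task\n"; });

// two module tasks over the same taskflow f1
tf::Task module1 = f2.composed_of(f1).name("module1");
tf::Task module2 = f2.composed_of(f1).name("module2");

// the dependency link prevents the two executions of f1 from overlapping
module1.precede(module2);

// f1 and f2 are owned by the caller and must outlive this call
executor.run(f2).wait();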
- + diff --git a/docs/xml/ConditionalTasking.xml b/docs/xml/ConditionalTasking.xml index 1258cd5d5..2ae877a90 100644 --- a/docs/xml/ConditionalTasking.xml +++ b/docs/xml/ConditionalTasking.xml @@ -1,5 +1,5 @@ - + ConditionalTasking Codestin Search App @@ -7,74 +7,73 @@ Create a Condition Task ConditionalTasking_1CreateAConditionTask - + Understand our Task-level Scheduling ConditionalTasking_1TaskSchedulingPolicy - - - Example - ConditionalTasking_1TaskLevelSchedulingExample - - - + + + Example + ConditionalTasking_1TaskLevelSchedulingExample + + + Avoid Common Pitfalls ConditionalTasking_1AvoidCommonPitfalls - + Implement Control-flow Graphs ConditionalTasking_1ImplementControlFlowGraphs - - - Implement If-Else Control Flow - ConditionalTasking_1ImplementIfElseControlFlow - - - Implement Switch Control Flow - ConditionalTasking_1ImplementSwitchControlFlow - - - Implement Do-While-Loop Control Flow - ConditionalTasking_1ImplementDoWhileLoopControlFlow - - - Implement While-Loop Control Flow - ConditionalTasking_1ImplementWhileLoopControlFlow - - - + + + Implement If-Else Control Flow + ConditionalTasking_1ImplementIfElseControlFlow + + + Implement Switch Control Flow + ConditionalTasking_1ImplementSwitchControlFlow + + + Implement Do-While-Loop Control Flow + ConditionalTasking_1ImplementDoWhileLoopControlFlow + + + Implement While-Loop Control Flow + ConditionalTasking_1ImplementWhileLoopControlFlow + + + Create a Multi-condition Task ConditionalTasking_1CreateAMultiConditionTask - + -Parallel workloads often require making control-flow decisions across dependent tasks. Taskflow supports an very efficient interface of conditional tasking for users to implement general control flow such as dynamic flow, cycles, and conditionals that are otherwise difficult to do with existing frameworks. +One of the most powerful features that distinguishes Taskflow from other systems is its support for conditional tasking, also known as the control taskflow programming model (CTFG). CTFG allows you to embed control flow directly within a taskflow graph, enabling tasks to make decisions dynamically during execution. This mechanism supports advanced in-graph control flow patterns, such as dynamic branching, loops, and conditionals—that are typically difficult or impossible to express in traditional task graph models. -Codestin Search App -A condition task evalutes a set of instructions and returns an integer index of the next successor task to execute. The index is defined with respect to the order of its successor construction. The following example creates an if-else block using a single condition task. +Codestin Search AppA condition task returns an integer index indicating which successor task to execute next. The index corresponds to the position of the successor in the order it was added during task construction. The following example creates an if-else block using a condition task. 1:tf::Taskflowtaskflow; 2: 3:auto[init,cond,yes,no]=taskflow.emplace( 4:[](){}, 5:[](){return0;}, -6:[](){std::cout<<"yes\n";}, -7:[](){std::cout<<"no\n";} +6:[](){std::cout<<"yes\n";}, +7:[](){std::cout<<"no\n";} 8:); 9: 10:cond.succeed(init) 11:.precede(yes,no);//executesyesifcondreturns0 12://executesnoifcondreturns1 - + Line 5 creates a condition task cond and line 11 creates two dependencies from cond to two other tasks, yes and no. With this order, when cond returns 0, the execution moves on to task yes. When cond returns 1, the execution moves on to task no. 
-It is your responsibility to ensure the return of a condition task goes to a correct successor task. If the return falls beyond the range of the successors, the executor will not schedule any tasks. +It is your responsibility to ensure that the return value of a condition task corresponds to a valid successor. If the returned index is out of range, the executor will not schedule any successor tasks. -Condition task can go cyclic to describe iterative control flow. The example below implements a simple yet commonly used feedback loop through a condition task (line 7-10) that returns a random binary value. If the return value from cond is 0, it loops back to itself, or otherwise to stop. +A condition task can form a cycle to express iterative control flow. The example below demonstrates a simple yet commonly used feedback loop implemented using a condition task (lines 7–10) that returns a random binary value. If the return value from cond is 0, the task loops back to itself; otherwise, it proceeds to stop. 1:tf::Taskflowtaskflow; 2: 3:tf::Taskinit=taskflow.emplace([](){}).name("init"); @@ -82,8 +81,8 @@ Condition task can go cyclic to describe iterative control 5: 6://createsaconditiontaskthatreturns0or1 7:tf::Taskcond=taskflow.emplace([](){ -8:std::cout<<"flippingacoin\n"; -9:returnstd::rand()%2; +8:std::cout<<"flippingacoin\n"; +9:returnstd::rand()%2; 10:}).name("cond"); 11: 12://createsafeedbackloop{0:cond,1:stop} @@ -92,9 +91,9 @@ Condition task can go cyclic to describe iterative control 15: 16:executor.run(taskflow).wait(); - + -A taskflow of complex control flow often just takes a few lines of code to implement, and different control flow blocks may run in parallel. The code below creates another taskflow with three condition tasks. +Creating a taskflow with complex control flow often requires only a few lines of code to implement. Different control flow paths can execute in parallel, making it easy to express both logic and concurrency. 
The code below creates a taskflow with three condition tasks to demonstrate this capability: tf::Taskflowtaskflow; tf::TaskA=taskflow.emplace([](){}).name("A"); @@ -109,9 +108,9 @@ Condition task can go cyclic to describe iterative control tf::TaskK=taskflow.emplace([](){}).name("K"); tf::TaskL=taskflow.emplace([](){}).name("L"); tf::TaskM=taskflow.emplace([](){}).name("M"); -tf::Taskcond_1=taskflow.emplace([](){returnstd::rand()%2;}).name("cond_1"); -tf::Taskcond_2=taskflow.emplace([](){returnstd::rand()%2;}).name("cond_2"); -tf::Taskcond_3=taskflow.emplace([](){returnstd::rand()%2;}).name("cond_3"); +tf::Taskcond_1=taskflow.emplace([](){returnstd::rand()%2;}).name("cond_1"); +tf::Taskcond_2=taskflow.emplace([](){returnstd::rand()%2;}).name("cond_2"); +tf::Taskcond_3=taskflow.emplace([](){returnstd::rand()%2;}).name("cond_3"); A.precede(B,F); B.precede(C); @@ -127,24 +126,25 @@ Condition task can go cyclic to describe iterative control cond_2.precede(G,H);//return0to'G'or1to'H' cond_3.precede(cond_3,L);//return0to'cond_3'or1to'L' -taskflow.dump(std::cout); +taskflow.dump(std::cout); -The above code creates three condition tasks: (1) a condition task cond_1 that loops back to B on returning 0, or proceeds to E on returning 1, (2) a condition task cond_2 that goes to G on returning 0, or H on returning 1, (3) a condition task cond_3 that loops back to itself on returning 0, or proceeds to L on returning 1 - +The above code creates three condition tasks to implement three different control-flow tasks: +A condition task cond_1 that loops back to B on returning 0, or proceeds to E on returning 1, +A condition task cond_2 that goes to G on returning 0, or H on returning 1, +A condition task cond_3 that loops back to itself on returning 0, or proceeds to L on returning 1 + -You can use condition tasks to create cycles as long as the graph does not introduce task race during execution. However, cycles are not allowed in non-condition tasks. -Conditional tasking lets you make in-task control-flow decisions to enable end-to-end parallelism, instead of resorting to client-side partition or synchronizing your task graph at the decision points of control flow. - + +In this particular example, we can clearly see the advantage of CTFG: the execution of cond_1 can overlap with cond_2 or cond_3, enabling greater concurrency in control-driven workloads. Unlike traditional task graph models that require static structure or external orchestration to handle control flow, CTFG allows tasks to make decisions dynamically and continue execution without global synchronization barriers. This design leads to better parallelism, reduced overhead, and more expressive task graphs, especially in workloads with branching or iterative control flows. -Codestin Search App -In order to understand how an executor schedules condition tasks, we define two dependency types, strong dependency and weak dependency. A strong dependency is a preceding link from a non-condition task to another task. A weak dependency is a preceding link from a condition task to another task. The number of dependents of a task is the sum of strong dependency and weak dependency. The table below lists the strong dependency and weak dependency numbers of each task in the previous example. +Codestin Search AppIn order to understand how an executor schedules condition tasks, we define two dependency types, strong dependency and weak dependency. A strong dependency is a preceding link from one non-condition task to another task. 
A weak dependency is a preceding link from one condition task to another task. The number of dependencies of a task is the sum of its strong dependencies and weak dependencies. The table below lists the number of strong dependencies and weak dependencies of each task in the previous example: task strong dependency weak dependency -dependents +dependencies A @@ -238,44 +238,40 @@ Condition task can go cyclic to describe iterative control
    -You can query the number of strong dependents, the number of weak dependents, and the number of dependents of a task. +You can query the number of strong dependencies, the number of weak dependencies, and the number of dependencies of a task. 1:tf::Taskflowtaskflow; 2: 3:tf::Tasktask=taskflow.emplace([](){}); 4: 5://...addmoretasksandprecedinglinks 6: -7:std::cout<<task.num_dependents()<<'\n'; -8:std::cout<<task.num_strong_dependents()<<'\n'; -9:std::cout<<task.num_weak_dependents()<<'\n'; +7:std::cout<<task.num_predecessors()<<'\n'; +8:std::cout<<task.num_strong_dependencies()<<'\n'; +9:std::cout<<task.num_weak_dependencies()<<'\n'; -When you submit a task to an executor, the scheduler starts with tasks of zero dependents (both zero strong and weak dependencies) and continues to execute successive tasks whenever their strong dependencies are met. However, the scheduler skips this rule when executing a condition task and jumps directly to its successors indexed by the return value. - +When you submit a task to an executor, the scheduler starts with tasks of zero dependencies (both zero strong and weak dependencies) and continues to execute successive tasks whenever their strong dependencies are met. However, the scheduler skips this rule when executing a condition task and jumps directly to its successors indexed by the return value. + -Each task has an atomic join counter to keep track of strong dependents that are met at runtime. When a task completes, the join counter is restored to the task's strong dependency number in the graph, such that the subsequent execution can reuse the counter again. +Each task has an atomic join counter to keep track of strong dependencies that are met at runtime. When a task completes, the join counter is restored to the task's strong dependency number in the graph, such that the subsequent execution can reuse the counter again. -Codestin Search App -Let's take a look at an example to understand how task-level scheduling works. Suppose we have the following taskflow of one condition task cond that forms a loop to itself on returning 0 and moves on to stop on returning 1: - +Codestin Search AppLet's take a look at an example to understand how task-level scheduling works. Suppose we have the following taskflow of one condition task cond that forms a loop to itself on returning 0 and moves on to stop on returning 1: + The scheduler starts with init task because it has no dependencies (both strong and weak dependencies). Then, the scheduler moves on to the condition task cond. If cond returns 0, the scheduler enqueues cond and runs it again. If cond returns 1, the scheduler enqueues stop and then moves on.
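The loop just described can be written out as the following minimal sketch (an illustration; the three-iteration bound is an arbitrary choice):
tf::Executor executor;
tf::Taskflow taskflow;

int i = 0;
tf::Task init = taskflow.emplace([&](){ i = 0; }).name("init");
tf::Task cond = taskflow.emplace([&](){ return i++ < 3 ? 0 : 1; }).name("cond");
tf::Task stop = taskflow.emplace([](){ std::cout << "stop\n"; }).name("stop");

init.precede(cond);
cond.precede(cond, stop);  // returning 0 loops back to cond; returning 1 proceeds to stop

executor.run(taskflow).wait();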
    -Codestin Search App -Condition tasks are handy in creasing dynamic and cyclic control flows, but they are also easy to make mistakes. It is your responsibility to ensure a taskflow is properly conditioned. Top things to avoid include no source tasks to start with and task race. The figure below shows common pitfalls and their remedies. - +Codestin Search AppCondition tasks are handy in creating dynamic and cyclic control flows, but they are also easy to make mistakes. It is your responsibility to ensure a taskflow is properly conditioned. Top things to avoid include no source tasks to start with and task race. The figure below shows common pitfalls and their remedies. + -In the error1 scenario, there is no source task for the scheduler to start with, and the simplest fix is to add a task S that has no dependents. In the error2 scenario, D might be scheduled twice by E through the strong dependency and C through the weak dependency (on returning 1). To fix this problem, you can add an auxiliary task D-aux to break the mixed use of strong dependency and weak dependency. In the risky scenario, task X may be raced by M and P if M returns 0 and P returns 1. +In the error1 scenario, there is no source task for the scheduler to start with, and the simplest fix is to add a task S that has no dependencies. In the error2 scenario, D might be scheduled twice by E through the strong dependency and C through the weak dependency (on returning 1). To fix this problem, you can add an auxiliary task D-aux to break the mixed use of strong dependency and weak dependency. In the risky scenario, task X may be raced by M and P if M returns 0 and P returns 1. It is your responsibility to ensure a written taskflow graph is properly conditioned. We suggest that you Understand our Task-level Scheduling and infer if task race exists in the execution of your graph. -Codestin Search App - -Codestin Search App -You can use conditional tasking to implement if-else control flow. The following example creates a nested if-else control flow diagram that executes three condition tasks to check the range of i. +Codestin Search App +Codestin Search AppYou can use conditional tasking to implement if-else control flow. The following example creates a nested if-else control flow diagram that executes three condition tasks to check the range of i. tf::Taskflowtaskflow; inti; @@ -285,41 +281,40 @@ Condition task can go cyclic to describe iterative control autocond1=taskflow.emplace([&](){returni>1?1:0;}); autocond2=taskflow.emplace([&](){returni>2?1:0;}); autocond3=taskflow.emplace([&](){returni>3?1:0;}); -autoequl1=taskflow.emplace([&](){std::cout<<"i=1\n";}); -autoequl2=taskflow.emplace([&](){std::cout<<"i=2\n";}); -autoequl3=taskflow.emplace([&](){std::cout<<"i=3\n";}); -autogrtr3=taskflow.emplace([&](){std::cout<<"i>3\n";}); +autoequl1=taskflow.emplace([&](){std::cout<<"i=1\n";}); +autoequl2=taskflow.emplace([&](){std::cout<<"i=2\n";}); +autoequl3=taskflow.emplace([&](){std::cout<<"i=3\n";}); +autogrtr3=taskflow.emplace([&](){std::cout<<"i>3\n";}); initi.precede(cond1); cond1.precede(equl1,cond2);//goestocond2ifi>1 cond2.precede(equl2,cond3);//goestocond3ifi>2 cond3.precede(equl3,grtr3);//goestogrtr3ifi>3 - + -Codestin Search App -You can use conditional tasking to implement switch control flow. The following example creates a switch control flow diagram that executes one of the three cases at random using four condition tasks. +Codestin Search AppYou can use condition tasks to implement switch-style control flow. 
The following example demonstrates this by creating a switch structure that randomly selects and executes one of three cases using four condition tasks. tf::Taskflowtaskflow; auto[source,swcond,case1,case2,case3,target]=taskflow.emplace( -[](){std::cout<<"source\n";}, -[](){std::cout<<"switch\n";returnrand()%3;}, -[](){std::cout<<"case1\n";return0;}, -[](){std::cout<<"case2\n";return0;}, -[](){std::cout<<"case3\n";return0;}, -[](){std::cout<<"target\n";} +[](){std::cout<<"source\n";}, +[](){std::cout<<"switch\n";returnrand()%3;}, +[](){std::cout<<"case1\n";return0;}, +[](){std::cout<<"case2\n";return0;}, +[](){std::cout<<"case3\n";return0;}, +[](){std::cout<<"target\n";} ); source.precede(swcond); swcond.precede(case1,case2,case3); target.succeed(case1,case2,case3); - + Assuming swcond returns 1, the program outputs: -source +source switch case2 target @@ -329,44 +324,43 @@ Condition task can go cyclic to describe iterative control tf::Taskflowtaskflow; auto[source,swcond,case1,case2,case3,target]=taskflow.emplace( -[](){std::cout<<"source\n";}, -[](){std::cout<<"switch\n";returnrand()%3;}, -[](){std::cout<<"case1\n";}, -[](){std::cout<<"case2\n";}, -[](){std::cout<<"case3\n";}, -[](){std::cout<<"target\n";}//targethasthreestrongdependencies +[](){std::cout<<"source\n";}, +[](){std::cout<<"switch\n";returnrand()%3;}, +[](){std::cout<<"case1\n";}, +[](){std::cout<<"case2\n";}, +[](){std::cout<<"case3\n";}, +[](){std::cout<<"target\n";}//targethasthreestrongdependencies ); source.precede(swcond); swcond.precede(case1,case2,case3); target.succeed(case1,case2,case3); - + In this faulty implementation, task target has three strong dependencies but only one of them will be met. This is because swcond is a condition task, and only one case task will be executed depending on the return of swcond. -Codestin Search App -You can use conditional tasking to implement do-while-loop control flow. The following example creates a do-while-loop control flow diagram that repeatedly increments variable i five times using one condition task. +Codestin Search AppYou can use conditional tasking to implement do-while-loop control flow. The following example creates a do-while-loop control flow diagram that repeatedly increments variable i five times using one condition task. tf::Taskflowtaskflow; inti; auto[init,body,cond,done]=taskflow.emplace( -[&](){std::cout<<"i=0\n";i=0;}, -[&](){std::cout<<"i++=>i=";i++;}, -[&](){std::cout<<i<<'\n';returni<5?0:1;}, -[&](){std::cout<<"done\n";} +[&](){std::cout<<"i=0\n";i=0;}, +[&](){std::cout<<"i++=>i=";i++;}, +[&](){std::cout<<i<<'\n';returni<5?0:1;}, +[&](){std::cout<<"done\n";} ); init.precede(body); body.precede(cond); cond.precede(body,done); - + The program outputs: -i=0 +i=0 i++=>i=1 i++=>i=2 i++=>i=3 @@ -376,18 +370,17 @@ Condition task can go cyclic to describe iterative control -Codestin Search App -You can use conditional tasking to implement while-loop control flow. The following example creates a while-loop control flow diagram that repeatedly increments variable i five times using two condition task. +Codestin Search AppYou can use conditional tasking to implement while-loop control flow. The following example creates a while-loop control flow diagram that repeatedly increments variable i five times using two condition task. 
tf::Taskflowtaskflow; inti; auto[init,cond,body,back,done]=taskflow.emplace( -[&](){std::cout<<"i=0\n";i=0;}, -[&](){std::cout<<"whilei<5\n";returni<5?0:1;}, -[&](){std::cout<<"i++="<<i++<<'\n';}, -[&](){std::cout<<"back\n";return0;}, -[&](){std::cout<<"done\n";} +[&](){std::cout<<"i=0\n";i=0;}, +[&](){std::cout<<"whilei<5\n";returni<5?0:1;}, +[&](){std::cout<<"i++="<<i++<<'\n';}, +[&](){std::cout<<"back\n";return0;}, +[&](){std::cout<<"done\n";} ); init.precede(cond); @@ -395,10 +388,10 @@ Condition task can go cyclic to describe iterative control body.precede(back); back.precede(cond); - + The program outputs: -i=0 +i=0 whilei<5 i++=0 back @@ -424,77 +417,45 @@ Condition task can go cyclic to describe iterative control inti; auto[init,cond,body,done]=taskflow.emplace( -[&](){std::cout<<"i=0\n";i=0;}, -[&](){std::cout<<"whilei<5\n";returni<5?0:1;}, -[&](){std::cout<<"i++="<<i++<<'\n';}, -[&](){std::cout<<"done\n";} +[&](){std::cout<<"i=0\n";i=0;}, +[&](){std::cout<<"whilei<5\n";returni<5?0:1;}, +[&](){std::cout<<"i++="<<i++<<'\n';}, +[&](){std::cout<<"done\n";} ); init.precede(cond); cond.precede(body,done); body.precede(cond); - + In the taskflow diagram above, the scheduler starts with init and then decrements the strong dependency of the loop condition task, while i<5. After this, there remains one strong dependency, i.e., introduced by the loop body task, i++. However, task i++ will not be executed until the loop condition task returns 0, causing a deadlock. -Codestin Search App -A multi-condition task is a generalized version of conditional tasking. In some cases, applications need to jump to multiple branches from a parent task. This can be done by creating a multi-condition task which allows a task to select one or more successor tasks to execute. Similar to a condition task, a multi-condition task returns a vector of integer indices that indicate the successors to execute when the multi-condition task completes. The index is defined with respect to the order of successors preceded by a multi-condition task. For example, the following code creates a multi-condition task, A, that informs the scheduler to run on its two successors, B and D. +Codestin Search AppA multi-condition task is a generalized version of conditional tasking. In some cases, applications need to jump to multiple branches from a parent task. This can be done by creating a multi-condition task which allows a task to select one or more successor tasks to execute. Similar to a condition task, a multi-condition task returns a vector of integer indices that indicate the successors to execute when the multi-condition task completes. The index is defined with respect to the order of successors preceded by a multi-condition task. For example, the following code creates a multi-condition task, A, that informs the scheduler to run on its two successors, B and D. 
tf::Executorexecutor; tf::Taskflowtaskflow; autoA=taskflow.emplace([&]()->tf::SmallVector<int>{ -std::cout<<"A\n"; +std::cout<<"A\n"; return{0,2}; }).name("A"); -autoB=taskflow.emplace([&](){std::cout<<"B\n";}).name("B"); -autoC=taskflow.emplace([&](){std::cout<<"C\n";}).name("C"); -autoD=taskflow.emplace([&](){std::cout<<"D\n";}).name("D"); +autoB=taskflow.emplace([&](){std::cout<<"B\n";}).name("B"); +autoC=taskflow.emplace([&](){std::cout<<"C\n";}).name("C"); +autoD=taskflow.emplace([&](){std::cout<<"D\n";}).name("D"); A.precede(B,C,D); executor.run(taskflow).wait(); - - -The return type of a multi-condition task is tf::SmallVector, which provides C++ vector-style functionalities but comes with small buffer optimization. - -One important application of conditional tasking is implementing iterative control flow. You can use multi-condition tasks to create multiple loops that run concurrently. The following code creates a sequential chain of four loops in which each loop increments a counter variable ten times. When the program completes, the value of the counter variable is 40. -tf::Executorexecutor; -tf::Taskflowtaskflow; -std::atomic<int>counter{0}; - -autoloop=[&,i=bool{true},c=int(0)]()mutable->tf::SmallVector<int>{ -if(i){ -i=false; -return{0,-1}; -} -else{ -counter.fetch_add(1,std::memory_order_relaxed); -return{++c<10?0:-1}; -} -} -autoA=taskflow.emplace([](){}); -autoB=taskflow.emplace(loop); -autoC=taskflow.emplace(loop); -autoD=taskflow.emplace(loop); - -A.precede(B); -B.precede(B,C); -C.precede(C,D); -D.precede(D); - -executor.run(taskflow).wait();//counter==40 - - + -It is your responsibility to ensure the return of a multi-condition task goes to a correct successor task. If a returned index falls outside the successor range of a multi-condition task, the scheduler will skip that index without doing anything. +The return type of a multi-condition task is tf::SmallVector, which provides C++ vector-style functionalities but comes with small buffer optimization.
    - +
    diff --git a/docs/xml/Contributing.xml b/docs/xml/Contributing.xml index 3c09f72e1..60da4dc51 100644 --- a/docs/xml/Contributing.xml +++ b/docs/xml/Contributing.xml @@ -1,5 +1,5 @@ - + Contributing Codestin Search App @@ -15,6 +15,6 @@ - + diff --git a/docs/xml/Cookbook.xml b/docs/xml/Cookbook.xml index d85a3d2c7..3f1a41a0d 100644 --- a/docs/xml/Cookbook.xml +++ b/docs/xml/Cookbook.xml @@ -1,5 +1,5 @@ - + Cookbook Codestin Search App @@ -11,13 +11,11 @@ Composable Tasking Asynchronous Tasking Asynchronous Tasking with Dependencies - Interact with the Runtime - Prioritized Tasking + Runtime Tasking Exception Handling - GPU Tasking (%cudaFlow) - GPU Tasking (%cudaFlowCapturer) Limit the Maximum Concurrency Request Cancellation + GPU Tasking Profile Taskflow Programs @@ -32,17 +30,15 @@ Composable Tasking Asynchronous Tasking Asynchronous Tasking with Dependencies -Interact with the Runtime -Prioritized Tasking +Runtime Tasking Exception Handling -GPU Tasking (cudaFlow) -GPU Tasking (cudaFlowCapturer) Limit the Maximum Concurrency Request Cancellation +GPU Tasking Profile Taskflow Programs - + diff --git a/docs/xml/Cookbook_8dox.xml b/docs/xml/Cookbook_8dox.xml index a2cda60dd..c41410cf8 100644 --- a/docs/xml/Cookbook_8dox.xml +++ b/docs/xml/Cookbook_8dox.xml @@ -1,5 +1,5 @@ - + Cookbook.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/DataParallelPipeline.xml b/docs/xml/DataParallelPipeline.xml index c11e1caca..f2e83ef89 100644 --- a/docs/xml/DataParallelPipeline.xml +++ b/docs/xml/DataParallelPipeline.xml @@ -1,5 +1,5 @@ - + DataParallelPipeline Codestin Search App @@ -7,46 +7,44 @@ Include the Header DataParallelPipeline_1ParallelDataPipelineIncludeHeaderFile - + Create a Data Pipeline Module Task DataParallelPipeline_1CreateADataPipelineModuleTask - + Understand Internal Data Storage DataParallelPipeline_1UnderstandInternalDataStorage - + Learn More about Taskflow Pipeline DataParallelPipeline_1DataParallelPipelineLearnMore - + Taskflow provides another variant, tf::DataPipeline, on top of tf::Pipeline (see Task-parallel Pipeline) to help you implement data-parallel pipeline algorithms while leaving data management to Taskflow. We recommend you finishing reading TaskParallelPipeline first before learning tf::DataPipeline. -Codestin Search App -You need to include the header file, taskflow/algorithm/data_pipeline.hpp, for implementing data-parallel pipeline algorithms. +Codestin Search AppYou need to include the header file, taskflow/algorithm/data_pipeline.hpp, for implementing data-parallel pipeline algorithms. #include<taskflow/algorithm/data_pipeline.hpp> -Codestin Search App -Similar to creating a task-parallel pipeline (tf::Pipeline), there are three steps to create a data-parallel pipeline application: +Codestin Search AppSimilar to creating a task-parallel pipeline (tf::Pipeline), there are three steps to create a data-parallel pipeline application: Define the pipeline structure (e.g., pipe type, pipe callable, stopping rule, line count) Define the data storage and layout, if needed for the application Define the pipeline taskflow graph using composition -The following example creates a data-parallel pipeline that generates a total of five dataflow tokens from void to int at the first stage, from int to std::string at the second stage, from std::string to float at the third stage, and float to void at the final stage. Data storage between stages is automatically managed by tf::DataPipeline. 
+The following example creates a data-parallel pipeline that generates a total of five dataflow tokens from void to int at the first stage, from int to std::string at the second stage, and std::string to void at the final stage. Data storage between stages is automatically managed by tf::DataPipeline. #include<taskflow/taskflow.hpp> #include<taskflow/algorithm/data_pipeline.hpp> intmain(){ -//dataflow=>void->int->std::string->float->void +//dataflow=>void->int->std::string->void tf::Taskflowtaskflow("pipeline"); tf::Executorexecutor; @@ -54,24 +52,24 @@ //createapipelinegraph tf::DataPipelinepl(num_lines, -tf::make_data_pipe<void,int>(tf::PipeType::SERIAL,[&](tf::Pipeflow&pf)->int{ +tf::make_data_pipe<void, int>(tf::PipeType::SERIAL,[&](tf::Pipeflow&pf)->int{ if(pf.token()==5){ -pf.stop(); -return0; +pf.stop(); +return0; } else{ -printf("firstpipereturns%lu\n",pf.token()); -returnpf.token(); +printf("firstpipereturns%lu\n",pf.token()); +returnpf.token(); } }), -tf::make_data_pipe<int,std::string>(tf::PipeType::SERIAL,[](int&input){ -printf("secondpipereturnsastrongof%d\n",input+100); -returnstd::to_string(input+100); +tf::make_data_pipe<int, std::string>(tf::PipeType::SERIAL,[](int&input){ +printf("secondpipereturnsastringof%d\n",input+100); +returnstd::to_string(input+100); }), -tf::make_data_pipe<std::string,void>(tf::PipeType::SERIAL,[](std::string&input){ -printf("thirdpipereceivestheinputstring%s\n",input.c_str()); +tf::make_data_pipe<std::string, void>(tf::PipeType::SERIAL,[](std::string&input){ +printf("thirdpipereceivestheinputstring%s\n",input.c_str()); }) ); @@ -79,7 +77,7 @@ taskflow.composed_of(pl).name("pipeline"); //dumpthepipelinegraphstructure(withcomposition) -taskflow.dump(std::cout); +taskflow.dump(std::cout); //runthepipeline executor.run(taskflow).wait(); @@ -88,38 +86,38 @@ } The interface of tf::DataPipeline is very similar to tf::Pipeline, except that the library transparently manages the dataflow between pipes. To create a stage in a data-parallel pipeline, you should always use the helper function tf::make_data_pipe: -tf::make_data_pipe<int,std::string>( +tf::make_data_pipe<int, std::string>( tf::PipeType::SERIAL, [](int&input){ -returnstd::to_string(input+100); +returnstd::to_string(input+100); } ); -The helper function starts with a pair of an input and an output types in its template arguments. Both types will always be decayed to their original form using std::decay (e.g., const int& becomes int) for storage purpose. In terms of function arguments, the first argument specifies the direction of this data pipe, which can be either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, and the second argument is a callable to invoke by the pipeline scheduler. The callable must take the input data type in its first argument and returns a value of the output data type. Additionally, the callable can take a tf::Pipeflow reference in its second argument which allows you to query the runtime information of a stage task, such as its line number and token number. -tf::make_data_pipe<int,std::string>( +The helper function starts with a pair of an input and an output types in its template arguments. Both types will always be decayed to their original form using std::decay (e.g., const int& becomes int) for storage purpose. In terms of function arguments, the first argument specifies the direction of this data pipe, which can be either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, and the second argument is a callable to invoke by the pipeline scheduler. 
The callable must take the input data type in its first argument and returns a value of the output data type. Additionally, the callable can take a tf::Pipeflow reference in its second argument which allows you to query the runtime information of a stage task, such as its line number and token number. +tf::make_data_pipe<int, std::string>( tf::PipeType::SERIAL, [](int&input,tf::Pipeflow&pf){ -printf("token=%lu,line=%lu\n",pf.token(),pf.line()); -returnstd::to_string(input+100); +printf("token=%lu,line=%lu\n",pf.token(),pf.line()); +returnstd::to_string(input+100); } ) -By default, tf::DataPipeline passes the data in reference to your callable at which you can take it in copy or in reference depending on application needs. +By default, tf::DataPipeline passes the data in reference to your callable at which you can take it in copy or in reference depending on application needs. For the first pipe, the input type should always be void and the callable must take a tf::Pipeflow reference in its argument. In this example, we will stop the pipeline when processing five tokens. -tf::make_data_pipe<void,int>(tf::PipeType::SERIAL,[](tf::Pipeflow&pf)->int{ +tf::make_data_pipe<void, int>(tf::PipeType::SERIAL,[](tf::Pipeflow&pf)->int{ if(pf.token()==5){ -pf.stop(); -return0;//returnsadummyvalue +pf.stop(); +return0;//returnsadummyvalue } else{ -returnpf.token(); +returnpf.token(); } }), Similarly, the output type of the last pipe should be void as no more data will go out of the final pipe. -tf::make_data_pipe<std::string,void>(tf::PipeType::SERIAL,[](std::string&input){ -std::cout<<input<<std::endl; +tf::make_data_pipe<std::string, void>(tf::PipeType::SERIAL,[](std::string&input){ +std::cout<<input<<std::endl; }) Finally, you need to compose the pipeline graph by creating a module task (i.e., tf::Taskflow::compoased_of). @@ -127,22 +125,19 @@ For the first pipe, the input type should always be voidtaskflow.composed_of(pl).name("pipeline"); //dumpthepipelinegraphstructure(withcomposition) -taskflow.dump(std::cout); +taskflow.dump(std::cout); //runthepipeline executor.run(taskflow).wait(); - - - + + -Codestin Search App -By default, tf::DataPipeline uses std::variant to store a type-safe union of all input and output data types extracted from the given data pipes. To avoid false sharing, each line keeps a variant that is aligned with the cacheline size. When invoking a pipe callable, the input data is acquired in reference from the variant using std::get. When returning from a pipe callable, the output data is stored back to the variant using assignment operator. +Codestin Search AppBy default, tf::DataPipeline uses std::variant to store a type-safe union of all input and output data types extracted from the given data pipes. To avoid false sharing, each line keeps a variant that is aligned with the cacheline size. When invoking a pipe callable, the input data is acquired in reference from the variant using std::get. When returning from a pipe callable, the output data is stored back to the variant using assignment operator. 
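Conceptually, the per-line storage described above behaves like the following sketch (an illustration of the idea only, not the actual tf::DataPipeline implementation; the int and std::string types come from the example pipes above, and the 64-byte alignment is an assumed cacheline size):
#include <string>
#include <variant>

// one storage slot per pipeline line, aligned to a cacheline boundary
// to avoid false sharing between lines
struct alignas(64) Line {
  // a type-safe union of all decayed input/output types of the data pipes
  std::variant<std::monostate, int, std::string> data;
};

// invoking a pipe callable: the input is acquired in reference via std::get
//   int& input = std::get<int>(line.data);
// returning from a pipe callable: the output is stored back via assignment
//   line.data = std::to_string(input + 100);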
-Codestin Search App -Visit the following pages to learn more about pipeline: +Codestin Search AppVisit the following pages to learn more about pipeline: Task-parallel Pipeline Task-parallel Scalable Pipeline @@ -153,6 +148,6 @@ For the first pipe, the input type should always be void - + diff --git a/docs/xml/DependentAsyncTasking.xml b/docs/xml/DependentAsyncTasking.xml index cd3f25efe..f10c16348 100644 --- a/docs/xml/DependentAsyncTasking.xml +++ b/docs/xml/DependentAsyncTasking.xml @@ -1,5 +1,5 @@ - + DependentAsyncTasking Codestin Search App @@ -7,105 +7,105 @@ Create a Dynamic Task Graph DependentAsyncTasking_1CreateADynamicTaskGraph - + Specify a Range of Dependent Async Tasks DependentAsyncTasking_1SpecifyARagneOfDependentAsyncTasks - + - Understand the Lifetime of a Dependent Async Task + Understand the Lifetime of a Dependent-async Task DependentAsyncTasking_1UnderstandTheLifeTimeOfADependentAsyncTask - + Create a Dynamic Task Graph by Multiple Threads DependentAsyncTasking_1CreateADynamicTaskGraphByMultipleThreads - + Query the Completion Status of Dependent Async Tasks DependentAsyncTasking_1QueryTheComppletionStatusOfDependentAsyncTasks - + -This chapters discusses how to create a task graph dynamically using asynchronous tasks, which is extremely beneficial for workloads that want to (1) explore task graph parallelism out of dynamic control flow or (2) overlap task graph creation time with individual task execution time. We recommend that you first read Asynchronous Tasking before digesting this chapter. +This chapter discusses how to create a task graph dynamically using dependent asynchronous (dependent-async) tasks, which is extremely beneficial for workloads that want to (1) explore task graph parallelism out of dynamic control flow or (2) overlap task graph creation time with individual task execution time. We recommend that you first read Asynchronous Tasking before digesting this chapter. -Codestin Search App -When the construct-and-run model of a task graph is not possible in your application, you can use tf::Executor::dependent_async and tf::Executor::silent_dependent_async to create a task graph dynamically. This type of parallelism is also known as on-the-fly task graph parallelism, which offers great flexibility for expressing dynamic task graph parallelism. The example below dynamically creates a task graph of four dependent async tasks, A, B, C, and D, where A runs before B and C and D runs after B and C: - +Codestin Search AppWhen the construct-and-run model of a task graph is not possible in your application, you can use tf::Executor::dependent_async and tf::Executor::silent_dependent_async to create a task graph on the fly. This style of execution is commonly referred to as dynamic task graph parallelism and provides greater flexibility in expressing parallelism that adapts to runtime conditions. 
The example below dynamically creates a task graph of four dependent-async tasks, A, B, C, and D, where A runs before B and C and D runs after B and C: + tf::Executorexecutor; -tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");}); -tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");},A); -tf::AsyncTaskC=executor.silent_dependent_async([](){printf("C\n");},A); -auto[D,fuD]=executor.dependent_async([](){printf("D\n");},B,C); -fuD.get();//waitforDtofinish,whichinturnsmeansA,B,Cfinish +tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");}); +tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");},A); +tf::AsyncTaskC=executor.silent_dependent_async([](){printf("C\n");},A); +auto[D,fuD]=executor.dependent_async([](){printf("D\n");},B,C); +fuD.get();//waitforDtofinish,whichinturnmeansA,B,Chavefinished -Both tf::Executor::dependent_async and tf::Executor::silent_dependent_async create a task of type tf::AsyncTask to run the given function asynchronously. Additionally, tf::Executor::dependent_async returns a std::future that eventually holds the result of the execution. When returning from both calls, the executor has scheduled a worker to run the task whenever its dependencies are met. That is, task execution happens simultaneously with the creation of the task graph, which is different from constructing a Taskflow and running it from an executor, illustrated in the figure below: +Both tf::Executor::dependent_async and tf::Executor::silent_dependent_async create a dependent-async task of type tf::AsyncTask to run the given function asynchronously. Additionally, tf::Executor::dependent_async returns a std::future that eventually holds the result of the execution. When returning from both calls, the executor has scheduled a worker to run the task whenever its dependencies are met. That is, task execution happens simultaneously with the creation of the task graph, which is different from constructing a Taskflow and running it from an executor, illustrated in the figure below: Since this model only allows relating a dependency from the current task to a previously created task, you need a correct topological order of graph expression. In our example, there are only two possible topological orderings, either ABCD or ACBD. 
The code below shows another feasible order of expressing this dynamic task graph parallelism: tf::Executorexecutor; -tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");}); -tf::AsyncTaskC=executor.silent_dependent_async([](){printf("C\n");},A); -tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");},A); -auto[D,fuD]=executor.dependent_async([](){printf("D\n");},B,C); -fuD.get();//waitforDtofinish,whichinturnsmeansA,B,Cfinish +tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");}); +tf::AsyncTaskC=executor.silent_dependent_async([](){printf("C\n");},A); +tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");},A); +auto[D,fuD]=executor.dependent_async([](){printf("D\n");},B,C); +fuD.get();//waitforDtofinish,whichinturnmeansA,B,Chavefinished -In addition to using std::future to synchronize the execution, you can use tf::Executor::wait_for_all to wait for all scheduled tasks to finish: +In addition to using std::future to synchronize the execution at a particular task point, you can use tf::Executor::wait_for_all to wait for all scheduled tasks to finish: tf::Executorexecutor; -tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");}); -tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");},A); -tf::AsyncTaskC=executor.silent_dependent_async([](){printf("C\n");},A); -tf::AsyncTaskD=executor.silent_dependent_async([](){printf("D\n");},B,C); +tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");}); +tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");},A); +tf::AsyncTaskC=executor.silent_dependent_async([](){printf("C\n");},A); +tf::AsyncTaskD=executor.silent_dependent_async([](){printf("D\n");},B,C); executor.wait_for_all(); -Codestin Search App -Both tf::Executor::dependent_async(F&& func, Tasks&&... tasks) and tf::Executor::silent_dependent_async(F&& func, Tasks&&... tasks) accept an arbitrary number of tasks in the dependency list. If the number of dependent tasks is unknown at programming time, such as those relying on runtime variables, you can use the following two overloads to specify dependent tasks in an iterable range [first, last): +Codestin Search AppBoth tf::Executor::dependent_async and tf::Executor::silent_dependent_async accept an arbitrary number of tasks in the dependency list. If the number of task dependencies (i.e., predecessors) is unknown at programming time, such as those relying on runtime variables, you can use the following two overloads to specify predecessor tasks in an iterable range [first, last): tf::Executor::dependent_async(F&& func, I first, I last) tf::Executor::silent_dependent_async(F&& func, I first, I last) -The code below creates an asynchronous task that depends on N previously created asynchronous tasks stored in a vector, where N is a runtime variable: +The range must be an input iterator whose dereferenced type is convertible to tf::AsyncTask. 
The following example creates a dependent-async task that depends on N previously created dependent-async tasks stored in a vector, where N is a runtime variable: tf::Executorexecutor; -std::vector<tf::AsyncTask>dependents; +std::vector<tf::AsyncTask>predecessors; for(size_ti=0;i<N;i++){//Nisaruntimevariable -dependents.push_back(executor.silent_dependent_async([](){})); +predecessors.push_back(executor.silent_dependent_async([](){})); } -executor.silent_dependent_async([](){},dependents.begin(),dependents.end()); +executor.silent_dependent_async([](){},predecessors.begin(),predecessors.end()); + +//waitfortheaboveN+1dependent-asynctaskstofinish executor.wait_for_all(); -Codestin Search App -A tf::AsyncTask is a lightweight handle that retains shared ownership of a dependent async task created by an executor. This shared ownership ensures that the async task remains alive when adding it to the dependency list of another async task, thus avoiding the classical ABA problem. +Codestin Search Apptf::AsyncTask is a lightweight handle that retains shared ownership of a dependent-async task created by an executor. This shared ownership ensures that the async task remains alive when adding it to the dependency list of another async task, thus avoiding the classical ABA problem. //mainthreadretainssharedownershipofasynctaskA tf::AsyncTaskA=executor.silent_dependent_async([](){}); +assert(A.use_count()>=1);//mainthreadholdsasharedownershiptoA //taskAremainsalive(i.e.,atleastonerefcountbythemainthread) //whenbeingaddedtothedependencylistofasynctaskB tf::AsyncTaskB=executor.silent_dependent_async([](){},A); +assert(B.use_count()>=1);//mainthreadholdsasharedownershiptoB -Currently, tf::AsyncTask is implemented based on the logic of C++ smart pointer std::shared_ptr and is considered cheap to copy or move as long as only a handful of objects own it. When a worker completes an async task, it will remove the task from the executor, decrementing the number of shared owners by one. If that counter reaches zero, the task is destroyed. +Currently, tf::AsyncTask is implemented based on C++ smart pointer (std::shared_ptr) and is considered cheap to copy or move as long as only a handful of objects own it. When a worker completes a dependent-async task, it will remove the task from the executor, decrementing the number of shared owners by one. If that counter reaches zero, the task is destroyed. -Codestin Search App -You can use multiple threads to create a dynamic task graph as long as the order of simultaneously creating tasks is topologically correct. The example below uses creates a dynamic task graph using three threads (including the main thread), where task A runs before task B and task C: +Codestin Search AppYou can use multiple threads to create a dynamic task graph as long as the order of simultaneously creating tasks is topologically correct. 
The example below creates a dynamic task graph using three threads (including the main thread), where task A runs before task B and task C: tf::Executorexecutor; -//mainthreadcreatesadependentasynctaskA +//mainthreadcreatesadependent-asynctaskA tf::AsyncTaskA=executor.silent_dependent_async([](){}); //spawnanewthreadtocreateanasynctaskBthatrunsafterA -std::threadt1([&](){ +std::threadt1([&](){ tf::AsyncTaskB=executor.silent_dependent_async([](){},A); }); //spawnanewthreadtocreateanasynctaskCthatrunsafterA -std::threadt2([&](){ +std::threadt2([&](){ tf::AsyncTaskC=executor.silent_dependent_async([](){},A); }); @@ -113,38 +113,37 @@ t1.join(); t2.join(); -Regardless of t1 runs before or after t2, the resulting topological order is always correct with the graph definition, either ABC or ACB. +Regardless of whether t1 runs before or after t2, the resulting topological order remains valid with respect to the graph definition. In this example, either ABC or ACB is a correct ordering. -Codestin Search App -When you create a dependent async task, you can query its completion status by tf::AsyncTask::is_done, which returns true upon completion or false otherwise. A completed dependent async task indicates that a worker has executed its associated callable. -//createadependentasynctaskthatreturns100 +Codestin Search AppWhen you create a dependent-async task, you can query its completion status using tf::AsyncTask::is_done, which returns true if the task has completed its execution, or false otherwise. A task is considered completed once a worker has finished executing its associated callable. +//createadependent-asynctaskthatreturns100 auto[task,fu]=executor.dependent_async([](){return100;}); -//loopsuntilthedependentasynctaskcompletes +//loopsuntilthedependent-asynctaskcompletes while(!task.is_done()); assert(fu.get()==100); -tf::AsyncTask::is_done is useful when you need to wait on the result of a dependent async task before moving onto the next program instruction. Often, tf::AsyncTask is used together with tf::Executor::corun_until to keep a worker awake in its work-stealing loop to avoid deadlock (see Execute a Taskflow from an Internal Worker for more details). +tf::AsyncTask::is_done is useful when you need to wait on the result of a dependent-async task before moving on to the next program instruction. Often, tf::AsyncTask is used together with tf::Executor::corun_until to keep a worker awake in its work-stealing loop to avoid deadlock (see Execute a Taskflow from an Internal Worker for more details). 
For instance, the code below implements the famous Fibonacci sequence using recursive dependent-async tasking: tf::Executorexecutor; -std::function<int(int)>fibonacci; +std::function<int(int)>fibonacci; //calculatetheFibonaccisequence:0,1,1,2,3,5,8,13,21,34,55,89 fibonacci=[&](intN){ if(N<2){ returnN; } -auto[t1,fu1]=executor.dependent_async(std::bind(fibonacci,N-1)); -auto[t2,fu2]=executor.dependent_async(std::bind(fibonacci,N-2)); +auto[t1,fu1]=executor.dependent_async(std::bind(fibonacci,N-1)); +auto[t2,fu2]=executor.dependent_async(std::bind(fibonacci,N-2)); executor.corun_until([&](){returnt1.is_done()&&t2.is_done();}); returnfu1.get()+fu2.get(); }; -auto[task,fib11]=executor.dependent_async(std::bind(fibonacci,11)); +auto[task,fib11]=executor.dependent_async(std::bind(fibonacci,11)); assert(fib11==89);//the11-thFibonaccinumberis89 - + diff --git a/docs/xml/Doxyfile.xml b/docs/xml/Doxyfile.xml index a31b43606..af207e8f7 100644 --- a/docs/xml/Doxyfile.xml +++ b/docs/xml/Doxyfile.xml @@ -1,10 +1,11 @@ - + + @@ -27,10 +28,12 @@ - - + - @@ -99,6 +102,7 @@ + @@ -112,6 +116,7 @@ + @@ -154,7 +159,9 @@ - + + @@ -167,6 +174,9 @@ - + + + @@ -376,6 +372,7 @@ + @@ -390,6 +387,7 @@ + @@ -428,10 +426,9 @@ - + - @@ -439,6 +436,8 @@ + @@ -451,11 +450,14 @@ + + + - + @@ -463,7 +465,8 @@ - @@ -475,7 +478,6 @@ - @@ -504,8 +506,7 @@ - + @@ -517,4 +518,7 @@ + + diff --git a/docs/xml/Examples.xml b/docs/xml/Examples.xml index 13e861346..25f488a8f 100644 --- a/docs/xml/Examples.xml +++ b/docs/xml/Examples.xml @@ -1,5 +1,5 @@ - + Examples Codestin Search App @@ -8,9 +8,9 @@ Flip Coins Graph Traversal Matrix Multiplication - Matrix Multiplication (cudaFlow) + Matrix Multiplication with CUDA GPU k-means Clustering - k-means Clustering (cudaFlow) + k-means Clustering with CUDA GPU Text Processing Pipeline Graph Processing Pipeline Taskflow Processing Pipeline @@ -24,15 +24,15 @@ Flip Coins Graph Traversal Matrix Multiplication -Matrix Multiplication (cudaFlow) +Matrix Multiplication with CUDA GPU k-means Clustering -k-means Clustering (cudaFlow) +k-means Clustering with CUDA GPU Text Processing Pipeline Graph Processing Pipeline Taskflow Processing Pipeline - + diff --git a/docs/xml/ExceptionHandling.xml b/docs/xml/ExceptionHandling.xml index 6dd94727a..da0e60472 100644 --- a/docs/xml/ExceptionHandling.xml +++ b/docs/xml/ExceptionHandling.xml @@ -1,5 +1,5 @@ - + ExceptionHandling Codestin Search App @@ -7,53 +7,60 @@ Catch an Exception from a Running Taskflow ExceptionHandling_1CatchAnExceptionFromARunningTaskflow - + + + Catch an Exception from a Subflow + ExceptionHandling_1CatchAnExceptionFromASubflow + Catch an Exception from an Async Task ExceptionHandling_1CatchAnExceptionFromAnAsyncTask - + Catch an Exception from a Corun Loop ExceptionHandling_1CatchAnExceptionFromACorunLoop - + + + Turn Off Exception Handling + ExceptionHandling_1TurnOffExceptionHandling + This chapters discusses how to handle exceptions from a submitted taskflow so you can properly catch or propagate exceptions in your workload. -Codestin Search App -When a task throws an exception, the executor will store that exception in the shared state referenced by the tf::Future handle. You can catch that exception via calling the get method: +Codestin Search AppWhen a task throws an exception, the executor will store that exception in the shared state referenced by the tf::Future handle. 
You can catch that exception by calling the get method: tf::Executorexecutor; tf::Taskflowtaskflow; -taskflow.emplace([](){throwstd::runtime_error("exception");}); +taskflow.emplace([](){throwstd::runtime_error("exception");}); try{ executor.run(taskflow).get(); } -catch(conststd::runtime_error&e){ -std::cerr<<e.what()<<std::endl; +catch(conststd::runtime_error&e){ +std::cerr<<e.what()<<std::endl; } -As tf::Future is derived from std::future, it inherits all the exception handling behaviors defined by the C++ standard. +As tf::Future is derived from std::future, it inherits all the exception handling behaviors defined by the C++ standard. An exception will automatically cancel the execution of its parent taskflow. All the subsequent tasks that have dependencies on that exception task will not run. For instance, the following code defines two tasks, A and B, where B runs after A. When A throws an exception, the executor will cancel the execution of the taskflow, stopping every task that runs after A. In this case, B will not run. tf::Executorexecutor; tf::Taskflowtaskflow; -tf::TaskA=taskflow.emplace([](){throwstd::runtime_error("exceptiononA");}); -tf::TaskB=taskflow.emplace([](){std::cout<<"TaskB\n";}); +tf::TaskA=taskflow.emplace([](){throwstd::runtime_error("exceptiononA");}); +tf::TaskB=taskflow.emplace([](){std::cout<<"TaskB\n";}); A.precede(B); try{ executor.run(taskflow).get(); } -catch(conststd::runtime_error&e){ -std::cerr<<e.what()<<std::endl; +catch(conststd::runtime_error&e){ +std::cerr<<e.what()<<std::endl; } -~$exceptiononA +~$exceptiononA #executionoftaskflowiscancelledafteranexceptionisthrown When multiple tasks throw exceptions simultaneously, the executor will only catch one exception and store it in the shared state. Other exceptions will be silently ignored. For example, the following taskflow may concurrently throw two exceptions from task B and task C. Only one exception, either B or C, will be propagated. @@ -61,16 +68,16 @@ An exception will automatically cancel the execution of its parent taskflow. All tf::Taskflowtaskflow; auto[A,B,C,D]=taskflow.emplace( -[](){std::cout<<"TaskA\n";}, +[](){std::cout<<"TaskA\n";}, [](){ -std::cout<<"TaskB\n"; -throwstd::runtime_error("ExceptiononTaskB"); +std::cout<<"TaskB\n"; +throwstd::runtime_error("ExceptiononTaskB"); }, [](){ -std::cout<<"TaskC\n"; -throwstd::runtime_error("ExceptiononTaskC"); +std::cout<<"TaskC\n"; +throwstd::runtime_error("ExceptiononTaskC"); }, -[](){std::cout<<"TaskDwillnotbeprintedduetoexception\n";} +[](){std::cout<<"TaskDwillnotbeprintedduetoexception\n";} ); A.precede(B,C);//ArunsbeforeBandC @@ -79,61 +86,120 @@ An exception will automatically cancel the execution of its parent taskflow. All try{ executor.run(taskflow).get(); } -catch(conststd::runtime_error&e){ -//catchedeitherB'sorC'sexception -std::cout<<e.what()<<std::endl; +catch(conststd::runtime_error&e){ +//caughteitherB'sorC'sexception +std::cout<<e.what()<<std::endl; +} + + + +Codestin Search AppWhen you join a subflow using tf::Subflow::join, you can catch an exception thrown by its child tasks. 
For example, the following code catches an exception from the child task A of the subflow sf: +tf::Executorexecutor; +tf::Taskflowtaskflow; + +taskflow.emplace([](tf::Subflow&sf){ +tf::TaskA=sf.emplace([](){ +std::cout<<"TaskA\n"; +throwstd::runtime_error("exceptiononA"); +}); +tf::TaskB=sf.emplace([](){ +std::cout<<"TaskB\n"; +}); +A.precede(B); + +//catchtheexception +try{ +sf.join(); +} +catch(conststd::runtime_error&re){ +std::cout<<"exceptionthrownduringsubflowjoining:"<<re.what()<<'\n'; +} +}); + +executor.run(taskflow).get(); + +When an exception is thrown, it will cancel the execution of the parent subflow. All the subsequent tasks that depend on that exception task will not run. The above code example has the following output: +TaskA +exceptionthrownduringsubflowjoining:exceptiononA + +An uncaught exception will be propagated to the parent level until it is explicitly caught. For example, the code below will propagate the exception to the parent of the subflow, which in this case is its taskflow. +tf::Executorexecutor; +tf::Taskflowtaskflow; + +taskflow.emplace([](tf::Subflow&sf){ +tf::TaskA=sf.emplace([](){ +std::cout<<"TaskA\n"; +throwstd::runtime_error("exceptiononA"); +}); +tf::TaskB=sf.emplace([](){ +std::cout<<"TaskB\n"; +}); +A.precede(B); + +//uncaughtexceptionwillpropagatetotheparent +sf.join(); +}); + +try +{ +executor.run(taskflow).get(); +} +catch(conststd::runtime_error&re) +{ +std::cout<<"exceptionthrownfromrunningthetaskflow:"<<re.what()<<'\n'; } +TaskA +exceptionthrownfromrunningthetaskflow:exceptiononA + -Codestin Search App -Similar to std::future, tf::Executor::async will store the exception in the shared state referenced by the returned std::future handle. +Codestin Search AppSimilar to std::future, tf::Executor::async will store the exception in the shared state referenced by the returned std::future handle. tf::Executorexecutor; -autofu=executor.async([](){throwstd::runtime_error("exception");}); +autofu=executor.async([](){throwstd::runtime_error("exception");}); try{ fu.get(); } -catch(conststd::runtime_error&e){ -std::cerr<<e.what()<<std::endl; +catch(conststd::runtime_error&e){ +std::cerr<<e.what()<<std::endl; } Running the program will show the exception message on the async task: -~$exception +~$exception On the other hand, since tf::Executor::silent_async does not return any future handle, any exception thrown from a silent-async task will be silently caught by the executor and (1) propagated to its parent task if the parent task exists or (2) ignored if the parent task does not exist. tf::Taskflowtaskflow; tf::Executorexecutor; -//execptionwillbesilentlyignored -executor.silent_async([](){throwstd::runtime_error("exception");}); +//exceptionwillbesilentlyignored +executor.silent_async([](){throwstd::runtime_error("exception");}); //exceptionwillbepropagatedtotheparenttf::RuntimetaskandthenitsTaskflow taskflow.emplace([&](tf::Runtime&rt){ -rt.silent_async([](){throwstd::runtime_error("exception");}); +rt.silent_async([](){throwstd::runtime_error("exception");}); }); try{ executor.run(taskflow).get(); } -catch(conststd::runtime_error&re){ -std::cout<<re.what()<<std::endl; +catch(conststd::runtime_error&re){ +std::cout<<re.what()<<std::endl; } -Codestin Search App -When you corun a graph via tf::Executor::corun or tf::Runtime::corun, any exception will be thrown during the execution. 
For example, the code below will throw an exception during the execution of taskflow1: +Codestin Search AppWhen you corun a graph via tf::Executor::corun or tf::Runtime::corun, any exception will be thrown during the execution. For example, the code below will throw an exception during the execution of taskflow1: tf::Executorexecutor; tf::Taskflowtaskflow1; tf::Taskflowtaskflow2; taskflow1.emplace([](){ -throwstd::runtime_error("exception"); +throwstd::runtime_error("exception"); }); taskflow2.emplace([&](){ try{ executor.corun(taskflow1); -}catch(conststd::runtime_error&re){ -std::cout<<re.what()<<std::endl; +}catch(conststd::runtime_error&re){ +std::cout<<re.what()<<std::endl; } }); executor.run(taskflow2).get(); @@ -144,13 +210,13 @@ An exception will automatically cancel the execution of its parent taskflow. All tf::Taskflowtaskflow2; taskflow1.emplace([](){ -throwstd::runtime_error("exception"); +throwstd::runtime_error("exception"); }); taskflow2.emplace([&](tf::Runtime&rt){ try{ rt.corun(taskflow1); -}catch(conststd::runtime_error&re){ -std::cout<<re.what()<<std::endl; +}catch(conststd::runtime_error&re){ +std::cout<<re.what()<<std::endl; } }); executor.run(taskflow2).get(); @@ -161,7 +227,7 @@ An exception will automatically cancel the execution of its parent taskflow. All tf::Taskflowtaskflow2; taskflow1.emplace([](){ -throwstd::runtime_error("exception"); +throwstd::runtime_error("exception"); }); taskflow2.emplace([&](tf::Runtime&rt){ rt.corun(taskflow1); @@ -170,13 +236,22 @@ An exception will automatically cancel the execution of its parent taskflow. All try{ executor.run(taskflow2).get(); } -catch(conststd::runtime_error&re){ -std::cout<<re.what()<<std::endl; +catch(conststd::runtime_error&re){ +std::cout<<re.what()<<std::endl; } -For the above example, if the exception is not caught with tf::Runtime::corun, it will be propagated to its parent task, which is the tf::Runtime object rt in this case. Then, the exception will be propagated to taskflow2. +For the above example, if the exception is not caught with tf::Runtime::corun, it will be propagated to its parent task, which is the tf::Runtime object rt in this case. Then, the exception will be propagated to taskflow2. + + +Codestin Search AppIn some applications, exception handling may not be desirable due to performance concerns, coding style preferences, or platform constraints. Taskflow allows you to disable exception handling entirely at compile time. To do this, simply define the macro TF_DISABLE_EXCEPTION_HANDLING when compiling your program: +~$g++-DTF_DISABLE_EXCEPTION_HANDLINGyour_taskflow_prog.cpp + +Disabling exception handling removes all try-catch blocks from the Taskflow runtime, resulting in a leaner binary and potentially faster execution. However, please note that this also means Taskflow will not catch or report runtime exceptions. +Disabling exception handling means that Taskflow will not catch or report runtime exceptions. Any exception thrown during execution will propagate unchecked and may cause your program to behave abnormally. Use this option only if you are confident that your application does not rely on exception safety. 
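If you compile with TF_DISABLE_EXCEPTION_HANDLING but your callables may still throw, one defensive pattern is to catch exceptions inside each task body so that nothing propagates into the scheduler. The following is a hedged sketch of that pattern rather than an official recipe; the throwing statement stands in for any user code that may throw:

#include <taskflow/taskflow.hpp>
#include <iostream>
#include <stdexcept>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  taskflow.emplace([](){
    try {
      // in real code, call a user function that may throw
      throw std::runtime_error("something went wrong");
    }
    catch (const std::exception& e) {
      // handle the error locally; no exception escapes the task body
      std::cerr << e.what() << '\n';
    }
  });

  executor.run(taskflow).wait();
}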
+ + - + diff --git a/docs/xml/ExecuteTaskflow.xml b/docs/xml/ExecuteTaskflow.xml index e8dad6738..3d07e78f9 100644 --- a/docs/xml/ExecuteTaskflow.xml +++ b/docs/xml/ExecuteTaskflow.xml @@ -1,5 +1,5 @@ - + ExecuteTaskflow Codestin Search App @@ -7,56 +7,75 @@ Create an Executor ExecuteTaskflow_1CreateAnExecutor - + + + Understand Work-stealing in Executor + ExecuteTaskflow_1UnderstandWorkStealingInExecutor + Execute a Taskflow ExecuteTaskflow_1ExecuteATaskflow - + Execute a Taskflow with Transferred Ownership ExecuteTaskflow_1ExecuteATaskflowWithTransferredOwnership - + Execute a Taskflow from an Internal Worker ExecuteTaskflow_1ExecuteATaskflowFromAnInternalWorker - + - Touch an Executor from Multiple Threads - ExecuteTaskflow_1ThreadSafety - + Thread Safety of Executor + ExecuteTaskflow_1ThreadSafetyOfExecution + Query the Worker ID ExecuteTaskflow_1QueryTheWorkerID - + Observe Thread Activities ExecuteTaskflow_1ObserveThreadActivities - + + + Modify Worker Property + ExecuteTaskflow_1ModifyWorkerProperty + After you create a task dependency graph, you need to submit it to threads for execution. In this chapter, we will show you how to execute a task dependency graph. -Codestin Search App -To execute a taskflow, you need to create an executor of type tf::Executor. An executor is a thread-safe object that manages a set of worker threads and executes tasks through an efficient work-stealing algorithm. Issuing a call to run a taskflow creates a topology, a data structure to keep track of the execution status of a running graph. tf::Executor takes an unsigned integer to construct with N worker threads. The default value is std::thread::hardware_concurrency. +Codestin Search AppTo execute a taskflow, you need to create an executor of type tf::Executor. An executor is a thread-safe object that manages a set of worker threads and executes tasks through an efficient work-stealing algorithm. Issuing a call to run a taskflow creates a topology, a data structure to keep track of the execution status of a running graph. tf::Executor takes an unsigned integer to construct with N worker threads. The default value is std::thread::hardware_concurrency. tf::Executorexecutor1;//createanexecutorwiththenumberofworkers //equaltostd::thread::hardware_concurrency tf::Executorexecutor2(4);//createanexecutorof4workerthreads -An executor can be reused to execute multiple taskflows. In most workloads, you may need only one executor to run multiple taskflows where each taskflow represents a part of a parallel decomposition. +Creating a tf::Executor has non-negligible overhead. Unless your application requires multiple executors, we recommend creating a single tf::Executor and reusing it to run multiple taskflows. + + + + +Codestin Search AppTaskflow designs a highly efficient work-stealing algorithm to schedule and run tasks in an executor. Work-stealing is a dynamic scheduling algorithm widely used in parallel computing to distribute and balance workload among multiple threads or cores. Specifically, within an executor, each worker maintains its own local queue of tasks. When a worker finishes its own tasks, instead of becoming idle or going to sleep, it (the thief) tries to steal a task from the queue of another worker (the victim). The figure below illustrates the idea of work-stealing: + + +The key advantage of work-stealing lies in its decentralized nature and efficiency. Most of the time, worker threads work on their local queues without contention. 
Stealing only occurs when a worker becomes idle, minimizing overhead associated with synchronization and task distribution. This decentralized strategy effectively balances the workload, ensuring that idle workers are put to work and that the overall computation progresses efficiently. +That being said, the internal scheduling mechanisms in tf::Executor are not trivial, and it's not easy to explain every detail in just a few sentences. If you're interested in learning more about the technical details, please refer to our paper published in 2022 IEEE Transactions on Parallel and Distributed Systems (TPDS): + +Tsung-Wei Huang, Dian-Lun Lin, Chun-Xun Lin, and Yibo Lin, "Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System," IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 6, pp. 1303-1320, June 2022 + + -Codestin Search App -tf::Executor provides a set of run_* methods, tf::Executor::run, tf::Executor::run_n, and tf::Executor::run_until to run a taskflow for one time, multiple times, or until a given predicate evaluates to true. All methods accept an optional callback to invoke after the execution completes, and return a tf::Future for users to access the execution status. The code below shows several ways to run a taskflow. +Codestin Search Apptf::Executor provides a set of run_* methods, tf::Executor::run, tf::Executor::run_n, and tf::Executor::run_until to run a taskflow for one time, multiple times, or until a given predicate evaluates to true. All methods accept an optional callback to invoke after the execution completes, and return a tf::Future for users to access the execution status. The code below shows several ways to run a taskflow. 1://Declareanexecutorandataskflow 2:tf::Executorexecutor; 3:tf::Taskflowtaskflow; 4: 5://Addthreetasksintothetaskflow -6:tf::TaskA=taskflow.emplace([](){std::cout<<"ThisisTaskA\n";}); -7:tf::TaskB=taskflow.emplace([](){std::cout<<"ThisisTaskB\n";}); -8:tf::TaskC=taskflow.emplace([](){std::cout<<"ThisisTaskC\n";}); +6:tf::TaskA=taskflow.emplace([](){std::cout<<"ThisisTaskA\n";}); +7:tf::TaskB=taskflow.emplace([](){std::cout<<"ThisisTaskB\n";}); +8:tf::TaskC=taskflow.emplace([](){std::cout<<"ThisisTaskC\n";}); 9: 10://Buildprecedencebetweentasks 11:A.precede(B,C); @@ -64,10 +83,10 @@ 13:tf::Future<void>fu=executor.run(taskflow); 14:fu.wait();//blockuntiltheexecutioncompletes 15: -16:executor.run(taskflow,[](){std::cout<<"endof1run";}).wait(); +16:executor.run(taskflow,[](){std::cout<<"endof1run";}).wait(); 17:executor.run_n(taskflow,4); 18:executor.wait_for_all();//blockuntilallassociatedexecutionsfinish -19:executor.run_n(taskflow,4,[](){std::cout<<"endof4runs";}).wait(); +19:executor.run_n(taskflow,4,[](){std::cout<<"endof4runs";}).wait(); 20:executor.run_until(taskflow,[cnt=0]()mutable{return++cnt==10;}); Debrief: @@ -80,7 +99,7 @@ Lines 17-18 run the taskflow four times and use tf::Executor::wait_for_all to wait for completion -Line 19 runs the taskflow four times and invokes a callback at the end of the forth execution +Line 19 runs the taskflow four times and invokes a callback at the end of the fourth execution Line 20 keeps running the taskflow until the predicate returns true @@ -103,7 +122,7 @@ Issuing multiple runs on the same taskflow will automatically synchron //... 
//runthetaskflow -executor.run(f); +executor.run(taskflow); }//leavingthescopewilldestroytaskflowwhileitisrunning, //resultinginundefinedbehavior @@ -117,23 +136,22 @@ Issuing multiple runs on the same taskflow will automatically synchron //Declareanexecutor tf::Executorexecutor; -tf::Future<void>future=taskflow.run(f);//non-blockingreturn +tf::Future<void>future=executor.run(taskflow);//non-blockingreturn //alterthetaskflowwhilerunningleadstoundefinedbehavior -f.emplace([](){std::cout<<"Addanewtask\n";}); +taskflow.emplace([](){std::cout<<"Addanewtask\n";}); You must always keep a taskflow alive and must not modify it while it is running on an executor. -Codestin Search App -You can transfer the ownership of a taskflow to an executor and run it without wrangling with the lifetime issue of that taskflow. Each run_* method discussed in the previous section comes with an overload that takes a moved taskflow object. +Codestin Search AppYou can transfer the ownership of a taskflow to an executor and run it without wrangling with the lifetime issue of that taskflow. Each run_* method discussed in the previous section comes with an overload that takes a moved taskflow object. tf::Taskflowtaskflow; tf::Executorexecutor; taskflow.emplace([](){}); //lettheexecutormanagethelifetimeofthesubmittedtaskflow -executor.run(std::move(taskflow)); +executor.run(std::move(taskflow)); //nowtaskflowhasnotasks assert(taskflow.num_tasks()==0); @@ -148,35 +166,32 @@ Issuing multiple runs on the same taskflow will automatically synchron executor.run(taskflow); //error!youcannotmoveataskflowwhileitisrunning -executor.run(std::move(taskflow)); +executor.run(std::move(taskflow)); The correct way to submit a taskflow with moved ownership to an executor is to ensure all previous runs have completed. The executor will automatically release the resources of a moved taskflow right after its execution completes. //submitthetaskflowandwaituntilitcompletes executor.run(taskflow).wait(); //nowit'ssafetomovethetaskflowtotheexecutorandrunit -executor.run(std::move(taskflow)); +executor.run(std::move(taskflow)); Likewise, you cannot move a taskflow that is running on an executor. You must wait until all the previous fires of runs on that taskflow complete before calling move. //submitthetaskflowandwaituntilitcompletes executor.run(taskflow).wait(); //nowit'ssafetomovethetaskflowtoanother -tf::Taskflowmoved_taskflow(std::move(taskflow)); +tf::Taskflowmoved_taskflow(std::move(taskflow)); -Codestin Search App -Each run variant of tf::Executor returns a tf::Future object which allows you to wait for the result to complete. When calling tf::Future::wait, the caller blocks without doing anything until the associated state is written to be ready. This design, however, can introduce deadlock problem especially when you need to run multiple taskflows from the internal workers of an executor. For example, the code below creates a taskflow of 1000 tasks with each task running a taskflow of 500 tasks in a blocking fashion: +Codestin Search AppEach run variant of tf::Executor returns a tf::Future object which allows you to wait for the result to complete. When calling tf::Future::wait, the caller blocks without doing anything until the associated state is written to be ready. This design, however, can introduce deadlock problem especially when you need to run multiple taskflows from the internal workers of an executor. 
For example, the code below creates a taskflow of 1000 tasks with each task running a taskflow of 500 tasks in a blocking fashion: tf::Executorexecutor(2); tf::Taskflowtaskflow; -std::array<tf::Taskflow, 1000>others; - -std::atomic<size_t>counter{0}; +std::array<tf::Taskflow, 1000>others; for(size_tn=0;n<1000;n++){ for(size_ti=0;i<500;i++){ -others[n].emplace([&](){counter++;}); +others[n].emplace([&](){}); } taskflow.emplace([&executor,&tf=others[n]](){ //blockingtheworkercanintroducedeadlockwhere @@ -189,9 +204,9 @@ Issuing multiple runs on the same taskflow will automatically synchron To avoid this problem, the executor has a method, tf::Executor::corun, to execute a taskflow from a worker of that executor. The worker will not block but co-run the taskflow with other tasks in its work-stealing loop. tf::Executorexecutor(2); tf::Taskflowtaskflow; -std::array<tf::Taskflow, 1000>others; +std::array<tf::Taskflow, 1000>others; -std::atomic<size_t>counter{0}; +std::atomic<size_t>counter{0}; for(size_tn=0;n<1000;n++){ for(size_ti=0;i<500;i++){ @@ -207,9 +222,9 @@ Issuing multiple runs on the same taskflow will automatically synchron Similar to tf::Executor::corun, the method tf::Executor::corun_until is another variant that keeps the calling worker in the work-stealing loop until the given predicate becomes true. You can use this method to prevent blocking a worker from doing useful things, such as being blocked when submitting an outstanding task (e.g., a GPU operation). taskflow.emplace([&](){ -autofu=std::async([](){std::sleep(100s);}); +autofu=std::async([](){std::sleep(100s);}); executor.corun_until([](){ -returnfu.wait_for(std::chrono::seconds(0))==future_status::ready; +returnfu.wait_for(std::chrono::seconds(0))==future_status::ready; }); }); @@ -217,25 +232,21 @@ Issuing multiple runs on the same taskflow will automatically synchron - -Codestin Search App -All run_* methods are thread-safe. You can have multiple threads call these methods from an executor to run different taskflows. However, the order which taskflow runs first is non-deterministic and is up to the runtime. -1:tf::Executorexecutor; -2: -3:for(inti=0;i<10;++i){ -4:std::thread([i,&](){ -5://...modifymytaskflowati -6:executor.run(taskflows[i]);//runmytaskflowati -7:}).detach(); -8:} -9: -10:executor.wait_for_all(); + +Codestin Search AppAll run_* methods of tf::Executor are thread-safe. You can safely invoke these methods from multiple threads to run different taskflows concurrently. However, the execution order of the submitted taskflows is non-deterministic and determined by the runtime scheduler. +tf::Executorexecutor; +for(inti=0;i<10;++i){ +std::thread([i,&](){ +//...modifymytaskflowati +executor.run(taskflows[i]);//runmytaskflowati +}).detach(); +} +executor.wait_for_all(); -Codestin Search App -Each worker in an executor has an unique integer identifier in the range [0, N) that can be queried by the caller thread using tf::Executor::this_worker_id. If the caller thread is not a worker in the executor, -1 is returned. This method is convenient for users to maintain a one-to-one mapping between a worker and its application data structure. -std::vector<int>worker_vectors[8];//onevectorperworker +Codestin Search AppEach worker thread in a tf::Executor is assigned a unique integer identifier in the range [0, N), where N is the number of worker threads in the executor. You can query the identifier of the calling thread using tf::Executor::this_worker_id. 
If the calling thread is not a worker of the executor, the method returns -1. This functionality is particularly useful for establishing a one-to-one mapping between worker threads and application-specific data structures. +std::vector<int>worker_vectors[8];//onevectorperworker tf::Taskflowtaskflow; tf::Executorexecutor(8);//anexecutorofeightworkers @@ -250,9 +261,8 @@ Issuing multiple runs on the same taskflow will automatically synchron -Codestin Search App -You can observe thread activities in an executor when a worker thread participates in executing a task and leaves the execution using tf::ObserverInterface an interface class that provides a set of methods for you to define what to do when a thread enters and leaves the execution context of a task. -classObserverInterface{ +Codestin Search AppYou can observe thread activities in an executor when a worker thread participates in executing a task and leaves the execution using tf::ObserverInterface, an interface class that provides a set of methods for you to define what to do when a thread enters and leaves the execution context of a task. +classObserverInterface{ virtual~ObserverInterface()=default; virtualvoidset_up(size_tnum_workers)=0; virtualvoidon_entry(tf::WorkerViewworker_view,tf::TaskViewtask_view)=0; @@ -260,29 +270,29 @@ Issuing multiple runs on the same taskflow will automatically synchron }; There are three methods you must define in your derived class, tf::ObserverInterface::set_up, tf::ObserverInterface::on_entry, and tf::ObserverInterface::on_exit. The method, tf::ObserverInterface::set_up, is a constructor-like method that will be called by the executor when the observer is constructed. It passes an argument of the number of workers to observe in the executor. You may use it to preallocate or initialize data storage, e.g., an independent vector for each worker. The methods, tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit, are called by a worker thread before and after the execution context of a task, respectively. Both methods provide immutable access to the underlying worker and the running task using tf::WorkerView and tf::TaskView. You may use them to record timepoints and calculate the elapsed time of a task. -You can associate an executor with one or multiple observers (though one is common) using tf::Executor::make_observer. We use std::shared_ptr to manage the ownership of an observer. The executor loops through each observer and invoke the corresponding methods accordingly. +You can associate an executor with one or multiple observers (though one is common) using tf::Executor::make_observer. We use std::shared_ptr to manage the ownership of an observer. The executor loops through each observer and invokes the corresponding methods accordingly. 
#include<taskflow/taskflow.hpp> structMyObserver:publictf::ObserverInterface{ -MyObserver(conststd::string&name){ -std::cout<<"constructingobserver"<<name<<'\n'; +MyObserver(conststd::string&name){ +std::cout<<"constructingobserver"<<name<<'\n'; } -voidset_up(size_tnum_workers)overridefinal{ -std::cout<<"settingupobserverwith"<<num_workers<<"workers\n"; +voidset_up(size_tnum_workers)overridefinal{ +std::cout<<"settingupobserverwith"<<num_workers<<"workers\n"; } -voidon_entry(tf::WorkerVieww,tf::TaskViewtv)overridefinal{ -std::ostringstreamoss; +voidon_entry(tf::WorkerVieww,tf::TaskViewtv)overridefinal{ +std::ostringstreamoss; oss<<"worker"<<w.id()<<"readytorun"<<tv.name()<<'\n'; -std::cout<<oss.str(); +std::cout<<oss.str(); } -voidon_exit(tf::WorkerVieww,tf::TaskViewtv)overridefinal{ -std::ostringstreamoss; +voidon_exit(tf::WorkerVieww,tf::TaskViewtv)overridefinal{ +std::ostringstreamoss; oss<<"worker"<<w.id()<<"finishedrunning"<<tv.name()<<'\n'; -std::cout<<oss.str(); +std::cout<<oss.str(); } }; @@ -294,17 +304,17 @@ Issuing multiple runs on the same taskflow will automatically synchron //Createataskflowofeighttasks tf::Taskflowtaskflow; -autoA=taskflow.emplace([](){std::cout<<"1\n";}).name("A"); -autoB=taskflow.emplace([](){std::cout<<"2\n";}).name("B"); -autoC=taskflow.emplace([](){std::cout<<"3\n";}).name("C"); -autoD=taskflow.emplace([](){std::cout<<"4\n";}).name("D"); -autoE=taskflow.emplace([](){std::cout<<"5\n";}).name("E"); -autoF=taskflow.emplace([](){std::cout<<"6\n";}).name("F"); -autoG=taskflow.emplace([](){std::cout<<"7\n";}).name("G"); -autoH=taskflow.emplace([](){std::cout<<"8\n";}).name("H"); +autoA=taskflow.emplace([](){std::cout<<"1\n";}).name("A"); +autoB=taskflow.emplace([](){std::cout<<"2\n";}).name("B"); +autoC=taskflow.emplace([](){std::cout<<"3\n";}).name("C"); +autoD=taskflow.emplace([](){std::cout<<"4\n";}).name("D"); +autoE=taskflow.emplace([](){std::cout<<"5\n";}).name("E"); +autoF=taskflow.emplace([](){std::cout<<"6\n";}).name("F"); +autoG=taskflow.emplace([](){std::cout<<"7\n";}).name("G"); +autoH=taskflow.emplace([](){std::cout<<"8\n";}).name("H"); //createanobserver -std::shared_ptr<MyObserver>observer=executor.make_observer<MyObserver>( +std::shared_ptr<MyObserver>observer=executor.make_observer<MyObserver>( "MyObserver" ); @@ -312,7 +322,7 @@ Issuing multiple runs on the same taskflow will automatically synchron executor.run(taskflow).get(); //removetheobserver(optional) -executor.remove_observer(std::move(observer)); +executor.remove_observer(std::move(observer)); return0; } @@ -345,9 +355,94 @@ Issuing multiple runs on the same taskflow will automatically synchron worker2finishedrunningG worker3finishedrunningH -It is expected each line of std::cout interleaves with each other as there are four workers participating in task scheduling. However, the ready message always appears before the corresponding task message (e.g., numbers) and then the finished message. +It is expected each line of std::cout interleaves with each other as there are four workers participating in task scheduling. However, the ready message always appears before the corresponding task message (e.g., numbers) and then the finished message. + + +Codestin Search AppYou can change the property of each worker thread from its executor, such as assigning thread-processor affinity before the worker enters the scheduler loop and post-processing additional information after the worker leaves the scheduler loop, by passing an instance derived from tf::WorkerInterface to the executor. 
The example below demonstrates the usage of tf::WorkerInterface to affine a worker to a specific CPU core equal to its id on a Linux platform: +//affinethegiventhreadtothegivencoreindex(linux-specific) +boolaffine(std::thread&thread,unsignedintcore_id){ +cpu_set_tcpuset; +CPU_ZERO(&cpuset); +CPU_SET(core_id,&cpuset); +pthread_tnative_handle=thread.native_handle(); +returnpthread_setaffinity_np(native_handle,sizeof(cpu_set_t),&cpuset)==0; +} + +classCustomWorkerBehavior:publictf::WorkerInterface{ + +public: + +//tocallbeforetheworkerenterstheschedulingloop +voidscheduler_prologue(tf::Worker&w)override{ +printf("worker%lupreparestoenterthework-stealingloop\n",w.id()); + +//nowaffinetheworkertoaparticularCPUcoreequaltoitsid +if(affine(w.thread(),w.id())){ +printf("successfullyaffinesworker%lutoCPUcore%lu\n",w.id(),w.id()); +} +else{ +printf("failedtoaffineworker%lutoCPUcore%lu\n",w.id(),w.id()); +} +} + +//tocallaftertheworkerleavestheschedulingloop +voidscheduler_epilogue(tf::Worker&w,std::exception_ptr)override{ +printf("worker%luleftthework-stealingloop\n",w.id()); +} +}; + +intmain(){ +tf::Executorexecutor(4,tf::make_worker_interface<CustomWorkerBehavior>()); +return0; +} + +When running the program, we see one possible output as follows: +worker3preparestoenterthework-stealingloop +successfullyaffinesworker3toCPUcore3 +worker3leftthework-stealingloop +worker0preparestoenterthework-stealingloop +successfullyaffinesworker0toCPUcore0 +worker0leftthework-stealingloop +worker1preparestoenterthework-stealingloop +worker2preparestoenterthework-stealingloop +successfullyaffinesworker1toCPUcore1 +worker1leftthework-stealingloop +successfullyaffinesworker2toCPUcore2 +worker2leftthework-stealingloop + +When you create an executor, it spawns a set of worker threads to run tasks using a work-stealing scheduling algorithm. The execution logic of the scheduler and its interaction with each spawned worker via tf::WorkerInterface is given below: +for(size_tn=0;n<num_workers;n++){ +create_thread([](Worker&worker) + +//pre-processingexecutor-specificworkerinformation +//... + +//entertheschedulingloop +//Here,WorkerInterface::scheduler_prologueisinvoked,ifany +worker_interface->scheduler_prologue(worker); + +try{ +while(1){ +perform_work_stealing_algorithm(); +if(stop){ +break; +} +} +}catch(...){ +exception_ptr=std::current_exception(); +} + +//leavestheschedulingloopandjoinsthisworkerthread +//Here,WorkerInterface::scheduler_epilogueisinvoked,ifany +worker_interface->scheduler_epilogue(worker,exception_ptr); +); +} + +tf::WorkerInterface::scheduler_prologue and tf::WorkerInterface::scheduler_epilogue are invoked by each worker simultaneously. It is your responsibility to ensure no data race can occur during their invocation. + + - + diff --git a/docs/xml/FAQ.xml b/docs/xml/FAQ.xml index 7af1c21a0..b0c61809c 100644 --- a/docs/xml/FAQ.xml +++ b/docs/xml/FAQ.xml @@ -1,5 +1,5 @@ - + FAQ Codestin Search App @@ -7,256 +7,226 @@ General Questions FAQ_1GeneralQuestions - - - Q1: What's the goal of Taskflow? - FAQ_1GeneralQuestion1 - - - Q2: How do I use Taskflow in my projects? - FAQ_1GeneralQuestion2 - - - Q3: What is the difference between static tasking and dynamic tasking? - FAQ_1GeneralQuestion3 - - - Q4: How many tasks can Taskflow handle? - FAQ_1GeneralQuestion4 - - - Q5: What is the weird hex value, like 0x7fc39d402ab0, in the dumped graph? - FAQ_1GeneralQuestion5 - - - Q6: Does Taskflow have backward compatibility with C++03/98/11/14? - FAQ_1GeneralQuestion6 - - - Q7: How does Taskflow schedule tasks? 
- FAQ_1GeneralQuestion7 - - - Q8: What is the overhead of taskflow? - FAQ_1GeneralQuestion8 - - - Q9: How does it compare to existing task programming systems? - FAQ_1GeneralQuestion9 - - - Q10: Do you try to simplify the GPU kernel programming? - FAQ_1GeneralQuestion10 - - - Q11: Do you have any real use cases? - FAQ_1GeneralQuestion11 - - - Q12: Who is the Principal Investigator of Taskflow I can talk to? - FAQ_1GeneralQuestion12 - - - Q13: Who are developing and maintaining Taskflow? - FAQ_1GeneralQuestion13 - - - Q14: Is Taskflow just an another API or model? - FAQ_1GeneralQuestion14 - - - Q15: How can I contribute? - FAQ_1GeneralQuestion15 - - - Q16: Does Taskflow support pipeline parallelism? - FAQ_1GeneralQuestion16 - - - + + + Q1: What's the goal of Taskflow? + FAQ_1GeneralQuestion1 + + + Q2: How do I use Taskflow in my projects? + FAQ_1GeneralQuestion2 + + + Q3: What is the difference between static tasking and dynamic tasking? + FAQ_1GeneralQuestion3 + + + Q4: How many tasks can Taskflow handle? + FAQ_1GeneralQuestion4 + + + Q5: What is the weird hex value, like 0x7fc39d402ab0, in the dumped graph? + FAQ_1GeneralQuestion5 + + + Q6: Does Taskflow have backward compatibility with C++03/98/11/14? + FAQ_1GeneralQuestion6 + + + Q7: How does Taskflow schedule tasks? + FAQ_1GeneralQuestion7 + + + Q8: What is the overhead of taskflow? + FAQ_1GeneralQuestion8 + + + Q9: How does it compare to existing task programming systems? + FAQ_1GeneralQuestion9 + + + Q10: Do you try to simplify the GPU kernel programming? + FAQ_1GeneralQuestion10 + + + Q11: Do you have any real use cases? + FAQ_1GeneralQuestion11 + + + Q12: Who is the Principal Investigator of Taskflow I can talk to? + FAQ_1GeneralQuestion12 + + + Q13: Who are developing and maintaining Taskflow? + FAQ_1GeneralQuestion13 + + + Q14: Is Taskflow just an another API or model? + FAQ_1GeneralQuestion14 + + + Q15: How can I contribute? + FAQ_1GeneralQuestion15 + + + Q16: Does Taskflow support pipeline parallelism? + FAQ_1GeneralQuestion16 + + + Programming Questions FAQ_1ProgrammingQuestions - - - Q1: What is the difference between Taskflow threads and workers? - FAQ_1ProgrammingQuestions1 - - - Q2: What is the Lifetime of a Task and a Graph? - FAQ_1ProgrammingQuestions2 - - - Q3: Is taskflow thread-safe? - FAQ_1ProgrammingQuestions3 - - - Q4: Is executor thread-safe? - FAQ_1ProgrammingQuestions4 - - - Q5: My program hangs and never returns after dispatching a taskflow graph. What's wrong? - FAQ_1ProgrammingQuestions5 - - - Q6: In the following example where B spawns a joined subflow of three tasks B1, B2, and B3, do they run concurrently with task A? - FAQ_1ProgrammingQuestions6 - - - Q7: What is the purpose of a condition task? - FAQ_1ProgrammingQuestions7 - - - Q8: Is the program master thread involved in running tasks? - FAQ_1ProgrammingQuestions8 - - - Q9: Are there any limits on the branches of conditional tasking? - FAQ_1ProgrammingQuestions9 - - - Q10: Why does Taskflow program GPU in a task graph? - FAQ_1ProgrammingQuestions10 - - - Q11: Can I limit the concurrency in certain sections of tasks? - FAQ_1ProgrammingQuestions11 - - - Q12: How can I attach custom data to a task and access it? - FAQ_1ProgrammingQuestions12 - - - + + + Q1: What is the difference between Taskflow threads and workers? + FAQ_1ProgrammingQuestions1 + + + Q2: What is the Lifetime of a Task and a Graph? + FAQ_1ProgrammingQuestions2 + + + Q3: Is taskflow thread-safe? + FAQ_1ProgrammingQuestions3 + + + Q4: Is executor thread-safe? 
+ FAQ_1ProgrammingQuestions4 + + + Q5: My program hangs and never returns after dispatching a taskflow graph. What's wrong? + FAQ_1ProgrammingQuestions5 + + + Q6: In the following example where B spawns a joined subflow of three tasks B1, B2, and B3, do they run concurrently with task A? + FAQ_1ProgrammingQuestions6 + + + Q7: What is the purpose of a condition task? + FAQ_1ProgrammingQuestions7 + + + Q8: Is the program master thread involved in running tasks? + FAQ_1ProgrammingQuestions8 + + + Q9: Are there any limits on the branches of conditional tasking? + FAQ_1ProgrammingQuestions9 + + + Q10: Why does Taskflow program GPU in a task graph? + FAQ_1ProgrammingQuestions10 + + + Q11: Can I limit the concurrency in certain sections of tasks? + FAQ_1ProgrammingQuestions11 + + + Q12: How can I attach custom data to a task and access it? + FAQ_1ProgrammingQuestions12 + + + This page summarizes a list of frequently asked questions about Taskflow. If you cannot find a solution here, please post an issue at here. -Codestin Search App - -Codestin Search App -Taskflow aims to help C++ developers quickly implement efficient parallel decomposition strategies using task-based approaches. +Codestin Search App +Codestin Search AppTaskflow aims to help C++ developers quickly implement efficient parallel decomposition strategies using task-based approaches. -Codestin Search App -Taskflow is a header-only library with zero dependencies. The only thing you need is a C++17 compiler. To use Taskflow, simply drop the folder taskflow/ to your project and include taskflow.hpp. +Codestin Search AppTaskflow is a header-only library with zero dependencies. The only thing you need is a C++17 compiler. To use Taskflow, simply drop the folder taskflow/ to your project and include taskflow.hpp. -Codestin Search App -Static tasking refers to those tasks created before execution, while dynamic tasking refers to those tasks created during the execution of static tasks or dynamic tasks (nested). Dynamic tasks created by the same task node are grouped together to a subflow. +Codestin Search AppStatic tasking refers to those tasks created before execution, while dynamic tasking refers to those tasks created during the execution of static tasks or dynamic tasks (nested). Dynamic tasks created by the same task node are grouped together to a subflow. -Codestin Search App -Benchmarks showed Taskflow can efficiently handle millions or billions of tasks (both large and small tasks) on a machine with up to 64 CPUs. +Codestin Search AppBenchmarks showed Taskflow can efficiently handle millions or billions of tasks (both large and small tasks) on a machine with up to 64 CPUs. -Codestin Search App -The hex value represents the memory address of the task. Each task has a method tf::Task::name(const std::string&) for user to assign a human readable string to ease the debugging process. If a task is not assigned a name or is an internal node, its address value in the memory is used instead. +Codestin Search AppThe hex value represents the memory address of the task. Each task has a method tf::Task::name(const std::string&) for user to assign a human readable string to ease the debugging process. If a task is not assigned a name or is an internal node, its address value in the memory is used instead. -Codestin Search App -Unfortunately, Taskflow is heavily relying on modern C++17's features/idoms/STL and it is very difficult to provide a version that compiles under older C++ versions. 
+Codestin Search AppUnfortunately, Taskflow relies heavily on modern C++17 features/idioms/STL and it is very difficult to provide a version that compiles under older C++ versions. -Codestin Search App -Taskflow implemented a very efficient work-stealing scheduler to execute task dependency graphs. The source code is available at taskflow/core/executor.hpp. +Codestin Search AppTaskflow implemented a very efficient work-stealing scheduler to execute task dependency graphs. The source code is available at taskflow/core/executor.hpp. -Codestin Search App -Creating a taskflow has certain overhead. For example, creating a task and a dependency takes about 61 and 14 nanoseconds in our system (Intel 4-core CPU at 2.00GHz). The time is amortized over 1M operations, since we have implemented an object pool to recycle tasks for minimal overhead. +Codestin Search AppCreating a taskflow has a certain overhead. For example, creating a task and a dependency takes about 61 and 14 nanoseconds in our system (Intel 4-core CPU at 2.00GHz). The time is amortized over 1M operations, since we have implemented an object pool to recycle tasks for minimal overhead. -Codestin Search App -There is a large amount of work on programming systems (e.g., StarPU, Intel TBB, OpenMP, PaRSEC, Kokkos, HPX) in the interest of simplifying the programming complexity of parallel and heterogeneous computing. Each of these systems has its own pros and cons and deserves a reason to exist. However, they do have some problems, particularly from the standpoint of ease of use, static control flow, and scheduling efficiency. Taskflow addresses these limitations through a simple, expressive, and transparent graph programming model. +Codestin Search AppThere is a large amount of work on programming systems (e.g., StarPU, Intel TBB, OpenMP, PaRSEC, Kokkos, HPX) in the interest of simplifying the programming complexity of parallel and heterogeneous computing. Each of these systems has its own pros and cons and deserves a reason to exist. However, they do have some problems, particularly from the standpoint of ease of use, static control flow, and scheduling efficiency. Taskflow addresses these limitations through a simple, expressive, and transparent graph programming model. -Codestin Search App -No, we do not develop new programming models to simplify the kernel programming. The rationale is simple: Writing efficient kernels requires domain-specific knowledge and developers often require direct access to the native GPU programming interface. High-level kernel programming models or abstractions all come with restricted applicability. Despite non-trivial kernel programming, we believe what makes heterogeneous computing difficult are surrounding tasks. A mistake made by task scheduling can outweigh all speed-up benefits from a highly optimized kernel. 
Therefore, Taskflow focuses on heterogeneous tasking that affects the overall system performance to a large extent. -Codestin Search App -We have applied Taskflow to solve many realistic workloads and demonstrated promising performance scalability and programming productivity. Please refer to Real Use Cases and References. +Codestin Search AppWe have applied Taskflow to solve many realistic workloads and demonstrated promising performance scalability and programming productivity. Please refer to Real Use Cases and References. -Codestin Search App -Please visit this page or email the investigator Dr. Tsung-Wei Huang. +Codestin Search AppPlease visit this page or email the investigator Dr. Tsung-Wei Huang. -Codestin Search App -Taskflow is in active development with core functionalities contributed by an academic group at the University of Wisconsin at Madison, led by Dr. Tsung-Wei Huang. While coming out of an academic lab, Taskflow aims to be industrial-strength and is committed to long-term support. +Codestin Search AppTaskflow is in active development with core functionalities contributed by an academic group at the University of Wisconsin at Madison, led by Dr. Tsung-Wei Huang. While coming out of an academic lab, Taskflow aims to be industrial-strength and is committed to long-term support. -Codestin Search App -OK, let me ask this first: Is your new car just another vehicle? Or, is your new home just another place to live? +Codestin Search AppOK, let me ask this first: Is your new car just another vehicle? Or, is your new home just another place to live? The answer to this question is the question itself. As technology advances, we can always find new ways to solve computational problems and achieve new performance milestones that were previously out-of-reach. -Codestin Search App -New contributors are always welcome! Please visit Contributing. +Codestin Search AppNew contributors are always welcome! Please visit Contributing. -Codestin Search App -Yes, Taskflow has a specialized programming model to create a pipeline scheduling framework. Please visit Task-parallel Pipeline and Data-parallel Pipeline. +Codestin Search AppYes, Taskflow has a specialized programming model to create a pipeline scheduling framework. Please visit Task-parallel Pipeline and Data-parallel Pipeline. -Codestin Search App - -Codestin Search App -The master thread owns the thread pool and can spawn workers to run tasks or shutdown the pool. Giving taskflow N threads means using N threads to do the works, and there is a total of N+1 threads (including the master thread) in the program. Please refer to Create an Executor for more details. +Codestin Search App +Codestin Search AppThe master thread owns the thread pool and can spawn workers to run tasks or shutdown the pool. Giving taskflow N threads means using N threads to do the works, and there is a total of N+1 threads (including the master thread) in the program. Please refer to Create an Executor for more details. -Codestin Search App -The lifetime of a task sticks with its parent graph. A task is not destroyed until its parent graph is destroyed. Please refer to Understand the Lifetime of a Task for more details. +Codestin Search AppThe lifetime of a task sticks with its parent graph. A task is not destroyed until its parent graph is destroyed. Please refer to Understand the Lifetime of a Task for more details. -Codestin Search App -No, the taskflow object is not thread-safe. Multiple threads cannot create tasks from the same taskflow at the same time. 
+Codestin Search AppNo, the taskflow object is not thread-safe. Multiple threads cannot create tasks from the same taskflow at the same time.

-Codestin Search App
-Yes, the executor object is thread-safe. You can have multiple threads submit different taskflows to the same executor.
+Codestin Search AppYes, the executor object is thread-safe. You can have multiple threads submit different taskflows to the same executor.

-Codestin Search App
-When the program hangs forever it is very likely your taskflow graph has a cycle or not properly conditioned (see Conditional Tasking). Try the tf::Taskflow::dump method to debug the graph before dispatching your taskflow graph.
+Codestin Search AppWhen the program hangs forever, it is very likely that your taskflow graph has a cycle or is not properly conditioned (see Conditional Tasking). Try the tf::Taskflow::dump method to debug the graph before dispatching your taskflow graph.

-Codestin Search App
-
+Codestin Search App
No. The subflow is spawned during the execution of B, and at this point A must have finished because A precedes B. It follows that B1, B2, and B3 must all run after A.

-Codestin Search App
-A condition task lets you perform in-task decision making so you can integrate control flow into a task graph with end-to-end parallelism without synchronizing or partitioning your parallelism across conditionals.
+Codestin Search AppA condition task lets you perform in-task decision making, so you can integrate control flow into a task graph with end-to-end parallelism without synchronizing or partitioning your parallelism across conditionals.

-Codestin Search App
-No, the program master thread is not involved in running taskflows. The executor keeps a set of private worker threads spawned upon construction time to run tasks.
+Codestin Search AppNo, the program master thread is not involved in running taskflows. The executor keeps a set of private worker threads spawned at construction time to run tasks.

-Codestin Search App
-No, as long as the return value points to a valid successors, your conditional tasking is valid.
+Codestin Search AppNo, as long as the return value points to a valid successor, your conditional tasking is valid.

-Codestin Search App
-We ask users to describe a GPU workload in a task graph and execute it in a second moment. This organization minimizes kernels launch overhead and allows the GPU runtime (e.g., CUDA) to optimize the whole workflow.
+Codestin Search AppWe ask users to describe a GPU workload in a task graph and execute it at a later time. This organization minimizes kernel launch overhead and allows the GPU runtime (e.g., CUDA) to optimize the whole workflow.

-Codestin Search App
-Yes, Taskflow provides a lightweight mechanism, tf::Semaphore, for you to limit the maximum concurrency (i.e., the number of workers) in a section of tasks. Please refer to Limit the Maximum Concurrency.
+Codestin Search AppYes, Taskflow provides a lightweight mechanism, tf::Semaphore, for you to limit the maximum concurrency (i.e., the number of workers) in a section of tasks. Please refer to Limit the Maximum Concurrency.

-Codestin Search App
-Each node in a taskflow is associated with a C-styled data pointer (i.e., void*) you can use to point to user data and access it in the body of a task callable.
Please refer to Attach User Data to a Task. - + diff --git a/docs/xml/FAQ_8dox.xml b/docs/xml/FAQ_8dox.xml index c31e4343f..f49702f9e 100644 --- a/docs/xml/FAQ_8dox.xml +++ b/docs/xml/FAQ_8dox.xml @@ -1,5 +1,5 @@ - + FAQ.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/ForEachCUDA.xml b/docs/xml/ForEachCUDA.xml deleted file mode 100644 index ea294e829..000000000 --- a/docs/xml/ForEachCUDA.xml +++ /dev/null @@ -1,75 +0,0 @@ - - - - ForEachCUDA - Codestin Search App - - - Include the Header - ForEachCUDA_1CUDAForEachIncludeTheHeader - - - Index-based Parallel Iterations - ForEachCUDA_1ForEachCUDAIndexBasedParallelFor - - - Iterator-based Parallel Iterations - ForEachCUDA_1ForEachCUDAIteratorBasedParallelIterations - - - Miscellaneous Items - ForEachCUDA_1ForEachCUDAMiscellaneousItems - - - - - -tf::cudaFlow provides two template methods, tf::cudaFlow::for_each and tf::cudaFlow::for_each_index, for creating tasks to perform parallel iterations over a range of items. - -Codestin Search App -You need to include the header file, taskflow/cuda/algorithm/for_each.hpp, for creating a parallel-iteration task. -#include<taskflow/cuda/algorithm/for_each.hpp> - - - -Codestin Search App -Index-based parallel-for performs parallel iterations over a range [first, last) with the given step size. The task created by tf::cudaFlow::for_each_index(I first, I last, I step, C callable) represents a kernel of parallel execution for the following loop: -//positivestep:first,first+step,first+2*step,... -for(autoi=first;i<last;i+=step){ -callable(i); -} -//negativestep:first,first-step,first-2*step,... -for(autoi=first;i>last;i+=step){ -callable(i); -} - -Each iteration i is independent of each other and is assigned one kernel thread to run the callable. Since the callable runs on GPU, it must be declared with a __device__ specifier. The following example creates a kernel that assigns each entry of gpu_data to 1 over the range [0, 100) with step size 1. -//assignseachelementingpu_datato1overtherange[0,100)withstepsize1 -cudaflow.for_each_index(0,100,1,[gpu_data]__device__(intidx){ -gpu_data[idx]=1; -}); - - - -Codestin Search App -Iterator-based parallel-for performs parallel iterations over a range specified by two STL-styled iterators, first and last. The task created by tf::cudaFlow::for_each(I first, I last, C callable) represents a parallel execution of the following loop: -for(autoi=first;i<last;i++){ -callable(*i); -} - -The two iterators, first and last, are typically two raw pointers to the first element and the next to the last element in the range in GPU memory space. The following example creates a for_each kernel that assigns each element in gpu_data to 1 over the range [gpu_data, gpu_data + 1000). -//assignseachelementto1overtherange[gpu_data,gpu_data+1000) -cudaflow.for_each(gpu_data,gpu_data+1000,[]__device__(int&item){ -item=1; -}); - -Each iteration is independent of each other and is assigned one kernel thread to run the callable. Since the callable runs on GPU, it must be declared with a __device__ specifier. - - -Codestin Search App -The parallel-iteration algorithms are also available in tf::cudaFlowCapturer::for_each and tf::cudaFlowCapturer::for_each_index. - - - - - diff --git a/docs/xml/GPUTasking.xml b/docs/xml/GPUTasking.xml new file mode 100644 index 000000000..173dbbbfe --- /dev/null +++ b/docs/xml/GPUTasking.xml @@ -0,0 +1,234 @@ + + + + GPUTasking + Codestin Search App + + + Include the Header + GPUTasking_1GPUTaskingIncludeTheHeader + + + What is a CUDA Graph? 
+ GPUTasking_1WhatIsACudaGraph
+
+
+ Create a CUDA Graph
+ GPUTasking_1CreateACUDAGraph
+
+
+ Compile a CUDA Graph Program
+ GPUTasking_1CompileACUDAGraphProgram
+
+
+ Run a CUDA Graph on Specific GPU
+ GPUTasking_1RunACUDAGraphOnASpecificGPU
+
+
+ Create Memory Operation Tasks
+ GPUTasking_1GPUMemoryOperations
+
+
+ Run a CUDA Graph
+ GPUTasking_1RunACUDAGraph
+
+
+ Update an Executable CUDA Graph
+ GPUTasking_1UpdateAnExecutableCUDAGraph
+
+
+ Integrate a CUDA Graph into Taskflow
+ GPUTasking_1IntegrateACUDAGraphIntoTaskflow
+
+
+
+
+
+Modern scientific computing typically leverages GPU-powered parallel processing cores to speed up large-scale applications. This chapter discusses how to implement CPU-GPU heterogeneous tasking algorithms with Nvidia CUDA Graph.
+
+Codestin Search AppYou need to include the header file, taskflow/cuda/cudaflow.hpp, for creating a GPU task graph using tf::cudaGraph.
+#include <taskflow/cuda/cudaflow.hpp>
+
+
+
+Codestin Search AppCUDA Graph is an execution model that enables a series of CUDA kernels to be defined and encapsulated as a single unit, i.e., a task graph of operations, rather than a sequence of individually launched operations. This organization allows multiple GPU operations to be launched through a single CPU operation and hence reduces launch overhead, especially for short-running kernels. The benefit of CUDA Graph can be demonstrated in the figure below:
+
+
+In this example, a sequence of short kernels is launched one by one by the CPU. The CPU launch overhead creates a significant gap between the kernels. If we replace this sequence of kernels with a CUDA graph, we initially need to spend a little extra time building the graph and launching the whole graph in one go on the first occasion, but subsequent executions will be very fast, as there will be very little gap between the kernels. The difference is more pronounced when the same sequence of operations is repeated many times, for example, over many training epochs in machine learning workloads. In that case, the initial costs of building and launching the graph are amortized over the entire set of training iterations.
+For a comprehensive introduction to CUDA Graph, please refer to the CUDA Graph Programming Guide.
+
+
+
+
+Codestin Search AppTaskflow leverages CUDA Graph to enable concurrent CPU-GPU tasking using a task graph model called tf::cudaGraph. A tf::cudaGraph is essentially a C++ wrapper over a native CUDA graph, designed to simplify GPU task graph programming by eliminating much of the boilerplate code required in raw CUDA Graph programming.
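+For a sense of the boilerplate being hidden, the sketch below shows roughly what building and launching a single-kernel graph looks like with the raw CUDA Graph runtime API; my_kernel and its launch configuration are hypothetical placeholders, and error checking is omitted:
+// raw CUDA Graph boilerplate: build, instantiate, launch, and destroy
+cudaGraph_t graph;
+cudaGraphCreate(&graph, 0);
+
+void* args[] = { /* pointers to my_kernel's arguments */ };
+cudaKernelNodeParams params{};
+params.func = (void*)my_kernel;   // hypothetical __global__ function
+params.gridDim = dim3(1);
+params.blockDim = dim3(256);
+params.kernelParams = args;
+
+cudaGraphNode_t node;
+cudaGraphAddKernelNode(&node, graph, nullptr, 0, &params);
+
+cudaGraphExec_t exec;
+cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
+cudaGraphLaunch(exec, stream);    // stream is an existing cudaStream_t
+cudaStreamSynchronize(stream);
+
+cudaGraphExecDestroy(exec);
+cudaGraphDestroy(graph);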
The following example creates a CUDA graph to perform the saxpy (A·X Plus Y) workload:
+#include <taskflow/cuda/cudaflow.hpp>
+
+// saxpy (single-precision A·X Plus Y) kernel
+__global__ void saxpy(int n, float a, float* x, float* y) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) {
+    y[i] = a * x[i] + y[i];
+  }
+}
+
+// main function begins
+int main() {
+
+  const unsigned N = 1 << 20;  // size of the vector
+
+  std::vector<float> hx(N, 1.0f);  // x vector at host
+  std::vector<float> hy(N, 2.0f);  // y vector at host
+
+  float* dx {nullptr};  // x vector at device
+  float* dy {nullptr};  // y vector at device
+
+  cudaMalloc(&dx, N*sizeof(float));
+  cudaMalloc(&dy, N*sizeof(float));
+
+  tf::cudaGraph cg;
+
+  // create data transfer tasks
+  tf::cudaTask h2d_x = cg.copy(dx, hx.data(), N);
+  tf::cudaTask h2d_y = cg.copy(dy, hy.data(), N);
+  tf::cudaTask d2h_x = cg.copy(hx.data(), dx, N);
+  tf::cudaTask d2h_y = cg.copy(hy.data(), dy, N);
+
+  // launch saxpy<<<(N+255)/256, 256, 0>>>(N, 2.0f, dx, dy)
+  tf::cudaTask kernel = cg.kernel(
+    (N+255)/256, 256, 0, saxpy, N, 2.0f, dx, dy
+  ).name("saxpy");
+
+  kernel.succeed(h2d_x, h2d_y)
+        .precede(d2h_x, d2h_y);
+
+  // instantiate a CUDA graph executable and run it through a stream
+  tf::cudaGraphExec exec(cg);
+  tf::cudaStream stream;
+  stream.run(exec).synchronize();
+
+  // dump the graph
+  cg.dump(std::cout);
+}
+
+The graph consists of two CPU-to-GPU data copies (h2d_x and h2d_y), one kernel (saxpy), and two GPU-to-CPU data copies (d2h_x and d2h_y), in this order of their task dependencies.
+
+
+We do not expend yet another effort on simplifying kernel programming but focus on tasking CUDA operations and their dependencies. That is, tf::cudaGraph is simply a lightweight C++ wrapper over the native CUDA Graph. This organization lets users fully take advantage of CUDA features that are commensurate with their domain knowledge, while leaving difficult task parallelism details to Taskflow.
+
+
+Codestin Search AppUse nvcc to compile a CUDA Graph program:
+~$ nvcc -std=c++20 my_cudaflow.cu -Ipath/to/include/taskflow -O2 -o my_cudaflow
+~$ ./my_cudaflow
+
+Please visit the page Compile Taskflow with CUDA for more details.
+
+
+Codestin Search AppBy default, a tf::cudaGraph runs on the current GPU context associated with the caller, which is typically GPU 0. Each CUDA GPU has an integer identifier in the range of [0, N) to represent the context of that GPU, where N is the number of GPUs in the system. You can run a CUDA graph on a specific GPU by switching the context to a different GPU using tf::cudaScopedDevice. The code below creates a CUDA graph and runs it on GPU 2.
+{
+  // create an RAII-styled switcher to the context of GPU 2
+  tf::cudaScopedDevice context(2);
+
+  // create a CUDA graph under GPU 2
+  tf::cudaGraph graph;
+  // ...
+
+  // create a stream under GPU 2 and offload the graph to that GPU
+  tf::cudaStream stream;
+  tf::cudaGraphExec exec(graph);
+  stream.run(exec).synchronize();
+}
+
+tf::cudaScopedDevice is an RAII-styled wrapper that performs a scoped switch to the given GPU context. When the scope is destroyed, it switches back to the original context.
+tf::cudaScopedDevice allows you to place a CUDA graph on a particular GPU device, but it is your responsibility to ensure correct memory access. For example, you may not allocate a memory block on GPU 2 while accessing it from a kernel on GPU 0. An easy practice for multi-GPU programming is to allocate unified shared memory using cudaMallocManaged and let the CUDA runtime perform automatic memory migration between GPUs.
+
+
+
+
+Codestin Search Apptf::cudaGraph provides a set of methods for users to manipulate device memory. There are two categories, raw data and typed data.
Raw data operations are methods with the prefix mem, such as memcpy and memset, that operate in bytes. Typed data operations, such as copy, fill, and zero, take a logical count of elements. For instance, the following three methods have the same result of zeroing sizeof(int)*count bytes of the device memory area pointed to by target.
+int* target;
+cudaMalloc(&target, count*sizeof(int));
+
+tf::cudaGraph cg;
+memset_target = cg.memset(target, 0, sizeof(int)*count);
+same_as_above = cg.fill(target, 0, count);
+same_as_above_again = cg.zero(target, count);
+
+The method tf::cudaGraph::fill is a more powerful variant of tf::cudaGraph::memset. It can fill a memory area with any value of type T, given that sizeof(T) is 1, 2, or 4 bytes. The following example creates a GPU task to fill count elements in the array target with value 1234.
+cg.fill(target, 1234, count);
+
+A similar concept applies to tf::cudaGraph::memcpy and tf::cudaGraph::copy as well. The following two methods are equivalent to each other.
+cg.memcpy(target, source, sizeof(int)*count);
+cg.copy(target, source, count);
+
+
+Codestin Search AppTo offload a CUDA graph to a GPU, you need to instantiate an executable CUDA graph of tf::cudaGraphExec and create a tf::cudaStream to run the executable graph. The run method is asynchronous and can be explicitly synchronized on the given stream.
+tf::cudaGraph graph;
+// modify the graph ...
+
+// create an executable CUDA graph and run it through a stream
+tf::cudaGraphExec exec(graph);
+tf::cudaStream stream;
+stream.run(exec);
+
+// wait for the executable cuda graph to finish
+stream.synchronize();
+
+There is always a one-to-one mapping between a tf::cudaGraphExec and its parent CUDA graph in terms of graph structure. However, the executable graph is an independent entity and has no lifetime dependency on its parent CUDA graph. You can instantiate multiple executable graphs from the same CUDA graph.
+
+
+Codestin Search AppMany GPU applications require launching a CUDA graph multiple times and updating node parameters (e.g., kernel arguments or memory addresses) between iterations. tf::cudaGraphExec allows you to update the parameters of tasks created from its parent CUDA graph. Every task creation method in tf::cudaGraph has a corresponding method in tf::cudaGraphExec for updating the parameters of that task.
+tf::cudaStream stream;
+tf::cudaGraph cg;
+
+// create a kernel task
+tf::cudaTask task = cg.kernel(grid1, block1, shm1, kernel, kernel_args_1);
+
+// instantiate an executable graph
+tf::cudaGraphExec exec(cg);
+stream.run(exec).synchronize();
+
+// update the created kernel task with different parameters
+exec.kernel(task, grid2, block2, shm2, kernel, kernel_args_2);
+
+// run the updated executable graph
+stream.run(exec).synchronize();
+
+Between successive offloads (i.e., iterative executions of a CUDA graph), you can ONLY update task parameters, such as changing the kernel execution parameters and memory operation parameters. However, you must NOT change the topology of the CUDA graph, such as adding a new task or adding a new dependency. This is a limitation of Nvidia CUDA Graph.
+There are a few restrictions on updating task parameters in an executable CUDA graph:
+You cannot change a task to a different type
+kernel task
+The kernel function is not allowed to change. This restriction applies to all algorithm tasks that are created using lambda.
+
+
+memset and memcpy tasks:
+The CUDA device(s) to which the operand(s) was allocated/mapped cannot change
+The source/destination memory must be allocated from the same contexts as the original source/destination memory.
+
+
+
+
+
+
+
+
+Codestin Search AppAs tf::cudaGraph is a standalone wrapper over Nvidia CUDA Graph, you can simply run it as a task. The following example runs a CUDA graph from a static task:
+tf::Executor executor;
+tf::Taskflow taskflow;
+
+taskflow.emplace([](){
+  // create a CUDA graph inside a static task
+  tf::cudaGraph cg;
+  cg.kernel(...);
+
+  // instantiate a CUDA graph executable and run it through a stream
+  tf::cudaGraphExec exec(cg);
+  tf::cudaStream stream;
+  stream.run(exec).synchronize();
+});
+
+
+
+
+
+
diff --git a/docs/xml/GPUTaskingcudaFlow.xml b/docs/xml/GPUTaskingcudaFlow.xml
deleted file mode 100644
index a41d100f0..000000000
--- a/docs/xml/GPUTaskingcudaFlow.xml
+++ /dev/null
@@ -1,236 +0,0 @@
-
-
-
- GPUTaskingcudaFlow
- Codestin Search App
-
-
- Include the Header
- GPUTaskingcudaFlow_1GPUTaskingcudaFlowIncludeTheHeader
-
-
- What is a CUDA Graph?
- GPUTaskingcudaFlow_1WhatIsACudaGraph
-
-
- Create a cudaFlow
- GPUTaskingcudaFlow_1Create_a_cudaFlow
-
-
- Compile a cudaFlow Program
- GPUTaskingcudaFlow_1Compile_a_cudaFlow_program
-
-
- Run a cudaFlow on Specific GPU
- GPUTaskingcudaFlow_1run_a_cudaflow_on_a_specific_gpu
-
-
- Create Memory Operation Tasks
- GPUTaskingcudaFlow_1GPUMemoryOperations
-
-
- Offload a cudaFlow
- GPUTaskingcudaFlow_1OffloadAcudaFlow
-
-
- Update a cudaFlow
- GPUTaskingcudaFlow_1UpdateAcudaFlow
-
-
- Integrate a cudaFlow into Taskflow
- GPUTaskingcudaFlow_1IntegrateCudaFlowIntoTaskflow
-
-
-
-
-
-Modern scientific computing typically leverages GPU-powered parallel processing cores to speed up large-scale applications. This chapter discusses how to implement CPU-GPU heterogeneous tasking algorithms with Nvidia CUDA.
-
-Codestin Search App
-You need to include the header file, taskflow/cuda/cudaflow.hpp, for creating a GPU task graph using tf::cudaFlow.
-#include<taskflow/cuda/cudaflow.hpp>
-
-
-
-Codestin Search App
-CUDA Graph is a new execution model that enables a series of CUDA kernels to be defined and encapsulated as a single unit, i.e., a task graph of operations, rather than a sequence of individually-launched operations. This organization allows launching multiple GPU operations through a single CPU operation and hence reduces the launching overheads, especially for kernels of short running time. The benefit of CUDA Graph can be demonstrated in the figure below:
-
-
-In this example, a sequence of short kernels is launched one-by-one by the CPU. The CPU launching overhead creates a significant gap in between the kernels. If we replace this sequence of kernels with a CUDA graph, initially we will need to spend a little extra time on building the graph and launching the whole graph in one go on the first occasion, but subsequent executions will be very fast, as there will be very little gap between the kernels. The difference is more pronounced when the same sequence of operations is repeated many times, for example, many training epochs in machine learning workloads. In that case, the initial costs of building and launching the graph will be amortized over the entire training iterations.
-A comprehensive introduction about CUDA Graph can be referred to the CUDA Graph Programming Guide.
-
-
-
-
-Codestin Search App
-Taskflow leverages CUDA Graph to enable concurrent CPU-GPU tasking using a task graph model called tf::cudaFlow.
A cudaFlow manages a CUDA graph explicitly to execute dependent GPU operations in a single CPU call. The following example implements a cudaFlow that performs an saxpy (A·X Plus Y) workload: -#include<taskflow/cuda/cudaflow.hpp> - -//saxpy(single-precisionA·XPlusY)kernel -__global__voidsaxpy(intn,floata,float*x,float*y){ -inti=blockIdx.x*blockDim.x+threadIdx.x; -if(i<n){ -y[i]=a*x[i]+y[i]; -} -} - -//mainfunctionbegins -intmain(){ - -constunsignedN=1<<20;//sizeofthevector - -std::vector<float>hx(N,1.0f);//xvectorathost -std::vector<float>hy(N,2.0f);//yvectorathost - -float*dx{nullptr};//xvectoratdevice -float*dy{nullptr};//yvectoratdevice - -cudaMalloc(&dx,N*sizeof(float)); -cudaMalloc(&dy,N*sizeof(float)); - -tf::cudaFlowcudaflow; - -//createdatatransfertasks -tf::cudaTaskh2d_x=cudaflow.copy(dx,hx.data(),N).name("h2d_x"); -tf::cudaTaskh2d_y=cudaflow.copy(dy,hy.data(),N).name("h2d_y"); -tf::cudaTaskd2h_x=cudaflow.copy(hx.data(),dx,N).name("d2h_x"); -tf::cudaTaskd2h_y=cudaflow.copy(hy.data(),dy,N).name("d2h_y"); - -//launchsaxpy<<<(N+255)/256,256,0>>>(N,2.0f,dx,dy) -tf::cudaTaskkernel=cudaflow.kernel( -(N+255)/256,256,0,saxpy,N,2.0f,dx,dy -).name("saxpy"); - -kernel.succeed(h2d_x,h2d_y) -.precede(d2h_x,d2h_y); - -//runthecudaflowthroughastream -tf::cudaStreamstream; -cudaflow.run(stream) -stream.synchronize(); - -//dumpthecudaflow -cudaflow.dump(std::cout); -} - -The cudaFlow graph consists of two CPU-to-GPU data copies (h2d_x and h2d_y), one kernel (saxpy), and two GPU-to-CPU data copies (d2h_x and d2h_y), in this order of their task dependencies. - - -We do not expend yet another effort on simplifying kernel programming but focus on tasking CUDA operations and their dependencies. In other words, tf::cudaFlow is a lightweight C++ abstraction over CUDA Graph. This organization lets users fully take advantage of CUDA features that are commensurate with their domain knowledge, while leaving difficult task parallelism details to Taskflow. - - -Codestin Search App -Use nvcc to compile a cudaFlow program: -~$nvcc-std=c++17my_cudaflow.cu-Ipath/to/include/taskflow-O2-omy_cudaflow -~$./my_cudaflow - -Please visit the page Compile Taskflow with CUDA for more details. - - -Codestin Search App -By default, a cudaFlow runs on the current GPU context associated with the caller, which is typically GPU 0. Each CUDA GPU has an integer identifier in the range of [0, N) to represent the context of that GPU, where N is the number of GPUs in the system. You can run a cudaFlow on a specific GPU by switching the context to a different GPU using tf::cudaScopedDevice. The code below creates a cudaFlow and runs it on GPU 2. -{ -//createanRAII-styledswitchertothecontextofGPU2 -tf::cudaScopedDevicecontext(2); - -//createacudaFlowcapturerunderGPU2 -tf::cudaFlowCapturercapturer; -//... - -//createastreamunderGPU2andoffloadthecapturertothatGPU -tf::cudaStreamstream; -capturer.run(stream); -stream.synchronize(); -} - -tf::cudaScopedDevice is an RAII-styled wrapper to perform scoped switch to the given GPU context. When the scope is destroyed, it switches back to the original context. -tf::cudaScopedDeviceallows you to place a cudaFlow on a particular GPU device, but it is your responsibility to ensure correct memory access. For example, you may not allocate a memory block on GPU 2 while accessing it from a kernel on GPU 0. An easy practice for multi-GPU programming is to allocate unified shared memory using cudaMallocManaged and let the CUDA runtime perform automatic memory migration between GPUs. 
- - - - -Codestin Search App -cudaFlow provides a set of methods for users to manipulate device memory. There are two categories, raw data and typed data. Raw data operations are methods with prefix mem, such as memcpy and memset, that operate in bytes. Typed data operations such as copy, fill, and zero, take logical count of elements. For instance, the following three methods have the same result of zeroing sizeof(int)*count bytes of the device memory area pointed to by target. -int*target; -cudaMalloc(&target,count*sizeof(int)); - -tf::cudaFlowcudaflow; -memset_target=cudaflow.memset(target,0,sizeof(int)*count); -same_as_above=cudaflow.fill(target,0,count); -same_as_above_again=cudaflow.zero(target,count); - -The method tf::cudaFlow::fill is a more powerful variant of tf::cudaFlow::memset. It can fill a memory area with any value of type T, given that sizeof(T) is 1, 2, or 4 bytes. The following example creates a GPU task to fill count elements in the array target with value 1234. -cf.fill(target,1234,count); - -Similar concept applies to tf::cudaFlow::memcpy and tf::cudaFlow::copy as well. The following two methods are equivalent to each other. -cudaflow.memcpy(target,source,sizeof(int)*count); -cudaflow.copy(target,source,count); - - - -Codestin Search App -To offload a cudaFlow to a GPU, you need to use tf::cudaFlow::run and pass a tf::cudaStream created on that GPU. The run method is asynchronous and can be explicitly synchronized through the given stream. -tf::cudaStreamstream; -//launchacudaflowasynchronouslythroughastream -cudaflow.run(stream); -//waitforthecudaflowtofinish -stream.synchronize(); - -When you offload a cudaFlow using tf::cudaFlow::run, the runtime transforms that cudaFlow (i.e., application GPU task graph) into a native executable instance and submit it to the CUDA runtime for execution. There is always an one-to-one mapping between cudaFlow and its native CUDA graph representation (except those constructed by using tf::cudaFlowCapturer). - - -Codestin Search App -Many GPU applications require you to launch a cudaFlow multiple times and update node parameters (e.g., kernel parameters and memory addresses) between iterations. cudaFlow allows you to update the parameters of created tasks and run the updated cudaFlow with new parameters. Every task-creation method in tf::cudaFlow has an overload to update the parameters of a created task by that method. -tf::cudaStreamstream; -tf::cudaFlowcf; - -//createakerneltask -tf::cudaTasktask=cf.kernel(grid1,block1,shm1,kernel,kernel_args_1); -cf.run(stream); -stream.synchronize(); - -//updatethecreatedkerneltaskwithdifferentparameters -cf.kernel(task,grid2,block2,shm2,kernel,kernel_args_2); -cf.run(stream); -stream.synchronize(); - -Between successive offloads (i.e., iterative executions of a cudaFlow), you can ONLY update task parameters, such as changing the kernel execution parameters and memory operation parameters. However, you must NOT change the topology of the cudaFlow, such as adding a new task or adding a new dependency. This is the limitation of CUDA Graph. -There are a few restrictions on updating task parameters in a cudaFlow. Notably, you must NOT change the topology of an offloaded graph. In addition, update methods have the following limitations: -kernel task -The kernel function is not allowed to change. This restriction applies to all algorithm tasks that are created using lambda. 
- - -memset and memcpy tasks: -The CUDA device(s) to which the operand(s) was allocated/mapped cannot change -The source/destination memory must be allocated from the same contexts as the original source/destination memory. - - - - - - - - -Codestin Search App -You can create a task to enclose a cudaFlow and run it from a worker thread. The usage of the cudaFlow remains the same except that the cudaFlow is run by a worker thread from a taskflow task. The following example runs a cudaFlow from a static task: -tf::Executorexecutor; -tf::Taskflowtaskflow; - -taskflow.emplace([](){ -//createacudaFlowinsideastatictask -tf::cudaFlowcudaflow; - -//...createakerneltask -cudaflow.kernel(...); - -//runthecapturerthroughastream -tf::cudaStreamstream; -capturer.run(stream); -stream.synchronize(); -}); - - - - - - diff --git a/docs/xml/GPUTaskingcudaFlowCapturer.xml b/docs/xml/GPUTaskingcudaFlowCapturer.xml deleted file mode 100644 index 653e2c357..000000000 --- a/docs/xml/GPUTaskingcudaFlowCapturer.xml +++ /dev/null @@ -1,210 +0,0 @@ - - - - GPUTaskingcudaFlowCapturer - Codestin Search App - - - Include the Header - GPUTaskingcudaFlowCapturer_1GPUTaskingcudaFlowCapturerIncludeTheHeader - - - Capture a cudaFlow - GPUTaskingcudaFlowCapturer_1Capture_a_cudaFlow - - - Common Capture Methods - GPUTaskingcudaFlowCapturer_1CommonCaptureMethods - - - Create a Capturer on a Specific GPU - GPUTaskingcudaFlowCapturer_1CreateACapturerOnASpecificGPU - - - Create a Capturer from a cudaFlow - GPUTaskingcudaFlowCapturer_1CreateACapturerWithinAcudaFlow - - - Offload a cudaFlow Capturer - GPUTaskingcudaFlowCapturer_1OffloadAcudaFlowCapturer - - - Update a cudaFlow Capturer - GPUTaskingcudaFlowCapturer_1UpdateAcudaFlowCapturer - - - Integrate a cudaFlow Capturer into Taskflow - GPUTaskingcudaFlowCapturer_1IntegrateCudaFlowCapturerIntoTaskflow - - - - - -You can create a cudaFlow through stream capture, which allows you to implicitly capture a CUDA graph using stream-based interface. Compared to explicit CUDA Graph construction (tf::cudaFlow), implicit CUDA Graph capturing (tf::cudaFlowCapturer) is more flexible in building GPU task graphs. - -Codestin Search App -You need to include the header file, taskflow/cuda/cudaflow.hpp, for capturing a GPU task graph using tf::cudaFlowCapturer. -#include<taskflow/cuda/cudaflow.hpp> - - - -Codestin Search App -When your program has no access to direct kernel calls but can only invoke them through a stream-based interface (e.g., cuBLAS and cuDNN library functions), you can use tf::cudaFlowCapturer to capture the hidden GPU operations into a CUDA graph. A cudaFlowCapturer is similar to a cudaFlow except it constructs a GPU task graph through stream capture. You use the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous GPU operations through the given stream. The following example creates a CUDA graph that captures two kernel tasks, task_1 (my_kernel_1) and task_2 (my_kernel_2) , where task_1 runs before task_2. 
-//createacudaFlowcapturertorunaCUDAgraphusingstreamcapturing -tf::cudaFlowCapturercapturer; - -//capturemy_kernel_1throughastreammanagedbycapturer -tf::cudaTasktask_1=capturer.on([&](cudaStream_tstream){ -my_kernel_1<<<grid_1,block_1,shm_size_1,stream>>>(my_parameters_1); -}).name("my_kernel_1"); - -//capturemy_kernel_2throughastreammanagedbycapturer -tf::cudaTasktask_2=capturer.on([&](cudaStream_tstream){ -my_kernel_2<<<grid_2,block_2,shm_size_2,stream>>>(my_parameters_2); -}).name("my_kernel_2"); - -//my_kernel_1runsbeforemy_kernel_2 -task_1.precede(task_2); - -//offloadcapturedGPUtasksusingtheCUDAGraphexecutionmodel -tf::cudaStreamstream; -capturer.run(stream); -stream.synchronize(); - -//dumpthecudaFlowtoaDOTformatthroughstd::cout -capturer.dump(std::cout) - - - -Inside tf::cudaFlowCapturer::on, you should NOT modify the properties of the stream argument but only use it to capture asynchronous GPU operations (e.g., kernel, cudaMemcpyAsync). The stream argument is internal to the capturer use only. - - - - -Codestin Search App -tf::cudaFlowCapturer defines a set of methods for capturing common GPU operations, such as tf::cudaFlowCapturer::kernel, tf::cudaFlowCapturer::memcpy, tf::cudaFlowCapturer::memset, and so on. For example, the following code snippet uses these pre-defined methods to construct a GPU task graph of one host-to-device copy, kernel, and one device-to-host copy, in this order of their dependencies. -tf::cudaFlowCapturercapturer; - -//copydatafromhost_datatogpu_data -tf::cudaTaskh2d=capturer.memcpy(gpu_data,host_data,bytes) -.name("h2d"); - -//capturemy_kerneltodocomputationongpu_data -tf::cudaTaskkernel=capturer.kernel(grid,block,shm_size,kernel,kernel_args); -.name("my_kernel"); - -//copydatafromgpu_datatohost_data -tf::cudaTaskd2h=capturer.memcpy(host_data,gpu_data,bytes) -.name("d2h"); - -//buildtaskdependencies -h2d.precede(kernel); -kernel.precede(d2h); - - - - - -Codestin Search App -You can run a cudaFlow capturer on a specific GPU by switching to the context of that GPU using tf::cudaScopedDevice, following the CUDA convention of multi-GPU programming. The example below creates a cudaFlow capturer and runs it on GPU 2: -{ -//createanRAII-styledswitchertothecontextofGPU2 -tf::cudaScopedDevicecontext(2); - -//createacudaFlowcapturerunderGPU2 -tf::cudaFlowCapturercapturer; -//... - -//createastreamunderGPU2andoffloadthecapturertothatGPU -tf::cudaStreamstream; -capturer.run(stream); -stream.synchronize(); -} - -tf::cudaScopedDevice is an RAII-styled wrapper to perform scoped switch to the given GPU context. When the scope is destroyed, it switches back to the original context. -By default, a cudaFlow capturer runs on the current GPU associated with the caller, which is typically 0. - - - - -Codestin Search App -Within a parent cudaFlow, you can capture a cudaFlow to form a subflow that eventually becomes a child node in the underlying CUDA task graph. The following example defines a captured flow task2 of two dependent tasks, task2_1 and task2_2, and task2 runs after task1. -tf::cudaFlowcudaflow; - -tf::cudaTasktask1=cudaflow.kernel(grid,block,shm,my_kernel,args...) 
-.name("kernel"); - -//task2formsasubflowasachildnodeintheunderlyingCUDAgraph -tf::cudaTasktask2=cudaflow.capture([&](tf::cudaFlowCapturer&capturer){ - -//capturekernel_1usingthegivenstream -tf::cudaTasktask2_1=capturer.on([&](cudaStream_tstream){ -kernel_2<<<grid1,block1,shm_size1,stream>>>(args1...); -}).name("kernel_1"); - -//capturekernel_2usingthegivenstream -tf::cudaTasktask2_2=capturer.on([&](cudaStream_tstream){ -kernel_2<<<grid2,block2,shm_size2,stream>>>(args2...); -}).name("kernel_2"); - -//kernel_1runsbeforekernel_2 -task2_1.precede(task2_2); -}).name("capturer"); - -task1.precede(task2); - - - - - -Codestin Search App -When you offload a cudaFlow capturer using tf::cudaFlowCapturer::run, the runtime transforms that capturer (i.e., application GPU task graph) into a native CUDA graph and an executable instance both optimized for maximum kernel concurrency. Depending on the optimization algorithm, the application GPU task graph may be different from the actual executable graph submitted to the CUDA runtime. -tf::cudaStreamstream; -//launchacudaflowcapturerasynchronouslythroughastream -capturer.run(stream); -//waitforthecudaflowtofinish -stream.synchronize(); - - - -Codestin Search App -Between successive offloads (i.e., executions of a cudaFlow capturer), you can update the captured task with a different set of parameters. Every task-creation method in tf::cudaFlowCapturer has an overload to update the parameters of a created task by that method. The following example creates a kernel task and updates its parameter between successive runs: -tf::cudaStreamstream; -tf::cudaFlowCapturercf; - -//createakerneltask -tf::cudaTasktask=cf.kernel(grid1,block1,shm1,kernel,kernel_args_1); -cf.run(stream); -stream.synchronize(); - -//updatethecreatedkerneltaskwithdifferentparameters -cf.kernel(task,grid2,block2,shm2,kernel,kernel_args_2); -cf.run(stream); -stream.synchronize(); - -When you run a updated cudaFlow capturer, Taskflow will try to update the underlying executable with the newly captured graph first. If that update is unsuccessful, Taskflow will destroy the executable graph and re-instantiate a new one from the newly captured graph. - - -Codestin Search App -You can create a task to enclose a cudaFlow capturer and run it from a worker thread. The usage of the capturer remains the same except that the capturer is run by a worker thread from a taskflow task. 
The following example runs a cudaFlow capturer from a static task: -tf::Executorexecutor; -tf::Taskflowtaskflow; - -taskflow.emplace([](){ -//createacudaFlowcapturerinsideastatictask -tf::cudaFlowCapturercapturer; - -//...captureaGPUtaskgraph -capturer.kernel(...); - -//runthecapturerthroughastream -tf::cudaStreamstream; -capturer.run(stream); -stream.synchronize(); -}); - - - - - - diff --git a/docs/xml/Governance.xml b/docs/xml/Governance.xml index 588aa97fd..0026b0bb5 100644 --- a/docs/xml/Governance.xml +++ b/docs/xml/Governance.xml @@ -1,5 +1,5 @@ - + Governance Codestin Search App @@ -17,6 +17,6 @@ - + diff --git a/docs/xml/GraphProcessingPipeline.xml b/docs/xml/GraphProcessingPipeline.xml index c404b113f..43c6e87b4 100644 --- a/docs/xml/GraphProcessingPipeline.xml +++ b/docs/xml/GraphProcessingPipeline.xml @@ -1,5 +1,5 @@ - + GraphProcessingPipeline Codestin Search App @@ -7,77 +7,75 @@ Formulate the Graph Processing Pipeline Problem GraphProcessingPipeline_1FormulateTheGraphProcessingPipelineProblem - + Create a Graph Processing Pipeline GraphProcessingPipeline_1CreateAGraphProcessingPipeline - - - Find a Topological Order of the Graph - GraphProcessingPipeline_1GraphPipelineFindATopologicalOrderOfTheGraph - - - Define the Stage Function - GraphProcessingPipeline_1GraphPipelineDefineTheStageFunction - - - Define the Pipes - GraphProcessingPipeline_1GraphPipelineDefineThePipes - - - Define the Task Graph - GraphProcessingPipeline_1GraphPipelineDefineTheTaskGraph - - - Submit the Task Graph - GraphProcessingPipeline_1GraphPipelineSubmitTheTaskGraph - - - + + + Find a Topological Order of the Graph + GraphProcessingPipeline_1GraphPipelineFindATopologicalOrderOfTheGraph + + + Define the Stage Function + GraphProcessingPipeline_1GraphPipelineDefineTheStageFunction + + + Define the Pipes + GraphProcessingPipeline_1GraphPipelineDefineThePipes + + + Define the Task Graph + GraphProcessingPipeline_1GraphPipelineDefineTheTaskGraph + + + Submit the Task Graph + GraphProcessingPipeline_1GraphPipelineSubmitTheTaskGraph + + + Reference GraphProcessingPipeline_1GraphPipelineReference - + We study a graph processing pipeline that propagates a sequence of linearly dependent tasks over a dependency graph. In this particular workload, we will learn how to transform task graph parallelism into pipeline parallelism. -Codestin Search App -Given a directed acyclic graph (DAG), where each node encapsulates a sequence of linearly dependent tasks, namely stage tasks, and each edge represents a dependency between two tasks at the same stages of adjacent nodes. For example, assuming fi(u) represents the ith-stage task of node u, a dependency from u to v requires fi(u) to run before fi(v). The following figures shows an example of three stage tasks in a DAG of three nodes (A, B, and C) and two dependencies (A->B and A->C): - +Codestin Search AppGiven a directed acyclic graph (DAG), where each node encapsulates a sequence of linearly dependent tasks, namely stage tasks, and each edge represents a dependency between two tasks at the same stages of adjacent nodes. For example, assuming fi(u) represents the ith-stage task of node u, a dependency from u to v requires fi(u) to run before fi(v). 
The following figure shows an example of three stage tasks in a DAG of three nodes (A, B, and C) and two dependencies (A->B and A->C):
+
While we can directly create a taskflow for the DAG (i.e., each task in the taskflow runs f1, f2, and f3 sequentially), we can describe the parallelism as a three-stage pipeline that propagates a topological order of the DAG through three stage tasks. Consider a valid topological order of this DAG, A, B, C; its pipeline parallelism can be illustrated in the following figure:
-
+
At the beginning, f1(A) runs first. When f1(A) completes, it moves on to f2(A) and, meanwhile, f1(B) can start to run together with f2(A), and so forth. The straight line represents two parallel tasks that can overlap in time in the pipeline. For example, f3(A), f2(B), and f1(C) can run simultaneously. The following figure shows the task dependency graph of this pipeline workload:
-
+
As we can see, tasks in diagonal lines (lower-left to upper-right) can run in parallel. This type of parallelism is also referred to as wavefront parallelism, which sweeps parallel elements in a diagonal direction.
-Depending on the graph size and the number of stage tasks, task graph parallelism and pipeline parallelism can bring very different performance results. For example, a small graph will a long chain of stage tasks may perform better with pipeline parallelism than task graph parallelism, and vice versa.
+Depending on the graph size and the number of stage tasks, task graph parallelism and pipeline parallelism can bring very different performance results. For example, a small graph with a long chain of stage tasks may perform better with pipeline parallelism than with task graph parallelism, and vice versa.

-Codestin Search App
-Using the example from the previous section, we create a three-stage pipeline that encapsulates the three stage tasks (f1, f2, f3) in three pipes. By finding a topological order of the graph, we can transform the node dependency into a sequence of linearly dependent data tokens to feed into the pipeline.
+Codestin Search AppUsing the example from the previous section, we create a three-stage pipeline that encapsulates the three stage tasks (f1, f2, f3) in three pipes. By finding a topological order of the graph, we can transform the node dependency into a sequence of linearly dependent data tokens to feed into the pipeline.
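+In this small example the order is simply hard-coded (as shown later), but for larger graphs one would compute it programmatically. A minimal sketch using Kahn's algorithm, under the assumption that the graph is stored as a successor map keyed by node name (all names here are hypothetical), is:
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// Kahn's algorithm: returns one valid topological order of a DAG
+// (adj maps each node to its successors; assumes the graph is acyclic)
+std::vector<std::string> topological_order(
+  const std::unordered_map<std::string, std::vector<std::string>>& adj
+) {
+  std::unordered_map<std::string, size_t> indegree;
+  for(const auto& [u, succs] : adj) {
+    indegree.try_emplace(u, 0);
+    for(const auto& v : succs) { ++indegree[v]; }
+  }
+  std::queue<std::string> ready;
+  for(const auto& [u, d] : indegree) {
+    if(d == 0) { ready.push(u); }
+  }
+  std::vector<std::string> order;
+  while(!ready.empty()) {
+    std::string u = ready.front();
+    ready.pop();
+    order.push_back(u);
+    if(auto it = adj.find(u); it != adj.end()) {
+      for(const auto& v : it->second) {
+        if(--indegree[v] == 0) { ready.push(v); }
+      }
+    }
+  }
+  return order;  // e.g., {"A", "B", "C"} for the DAG above
+}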
The overall implementation is shown below:
#include <taskflow/taskflow.hpp>
#include <taskflow/algorithm/pipeline.hpp>

// 1st-stage function
-voidf1(conststd::string&node){
-printf("f1(%s)\n",node.c_str());
+void f1(const std::string& node) {
+  printf("f1(%s)\n", node.c_str());
}

// 2nd-stage function
-voidf2(conststd::string&node){
-printf("f2(%s)\n",node.c_str());
+void f2(const std::string& node) {
+  printf("f2(%s)\n", node.c_str());
}

// 3rd-stage function
-voidf3(conststd::string&node){
-printf("f3(%s)\n",node.c_str());
+void f3(const std::string& node) {
+  printf("f3(%s)\n", node.c_str());
}

int main() {
@@ -91,7 +89,7 @@
// |->B
// A--|
// |->C
-conststd::vector<std::string>nodes={"A","B","C"};
+const std::vector<std::string> nodes = {"A", "B", "C"};

// the pipeline consists of three serial pipes
// and up to two concurrent scheduling tokens
@@ -119,11 +117,11 @@
);

// build the pipeline graph using composition
-tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";})
+tf::Task init = taskflow.emplace([](){ std::cout << "ready\n"; })
 .name("starting pipeline");
 tf::Task task = taskflow.composed_of(pl)
 .name("pipeline");
-tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";})
+tf::Task stop = taskflow.emplace([](){ std::cout << "stopped\n"; })
 .name("pipeline stopped");

// create task dependency
@@ -131,7 +129,7 @@
task.precede(stop);

// dump the pipeline graph structure (with composition)
-taskflow.dump(std::cout);
+taskflow.dump(std::cout);

// run the pipeline
executor.run(taskflow).wait();
@@ -140,36 +138,33 @@
}

-Codestin Search App
-The first step is to find a valid topological order of the graph, such that we can transform the graph dependency into a linear sequence. In this example, we simply hard-code it:
-conststd::vector<std::string>nodes={"A","B","C"};
+Codestin Search AppThe first step is to find a valid topological order of the graph, such that we can transform the graph dependency into a linear sequence. In this example, we simply hard-code it:
+const std::vector<std::string> nodes = {"A", "B", "C"};

-Codestin Search App
-This particular workload does not propagate data directly through the pipeline. In most situations, data is directly stored in a custom graph data structure, and the stage function will just need to know which node to process. For demo's sake, we simply output a message to show which stage function is processing which node:
+Codestin Search AppThis particular workload does not propagate data directly through the pipeline. In most situations, data is directly stored in a custom graph data structure, and the stage function will just need to know which node to process. For demo's sake, we simply output a message to show which stage function is processing which node:
// 1st-stage function
-voidf1(conststd::string&node){
-printf("f1(%s)\n",node.c_str());
+void f1(const std::string& node) {
+  printf("f1(%s)\n", node.c_str());
}

// 2nd-stage function
-voidf2(conststd::string&node){
-printf("f2(%s)\n",node.c_str());
+void f2(const std::string& node) {
+  printf("f2(%s)\n", node.c_str());
}

// 3rd-stage function
-voidf3(conststd::string&node){
-printf("f3(%s)\n",node.c_str());
+void f3(const std::string& node) {
+  printf("f3(%s)\n", node.c_str());
}

-A key advantage of Taskflow's pipeline programming model is that we do not provide any data abstraction but give users full control over data management, which is typically application-dependent. In an application like this graph processing pipeline, data is managed in a global custom graph data structure, and any data abstraction provided by the library can become a unnecessary overhead.
+A key advantage of Taskflow's pipeline programming model is that we do not provide any data abstraction but give users full control over data management, which is typically application-dependent. In an application like this graph processing pipeline, data is managed in a global custom graph data structure, and any data abstraction provided by the library can become an unnecessary overhead.

-Codestin Search App
-The pipe structure is straightforward. Each pipe encapsulates the corresponding stage function and passes the node into the function argument. The first pipe will cease the pipeline scheduling when it has processed all nodes. To identify which node is being processed at a running pipe, we use tf::Pipeflow::token to find the index:
+Codestin Search AppThe pipe structure is straightforward. Each pipe encapsulates the corresponding stage function and passes the node into the function argument. The first pipe will cease the pipeline scheduling when it has processed all nodes. To identify which node is being processed in a running pipe, we use tf::Pipeflow::token to find the index:
// first pipe calls f1
tf::Pipe{tf::PipeType::SERIAL, [&](tf::Pipeflow& pf) {
  if(pf.token() == nodes.size()) {
@@ -192,27 +187,25 @@

-Codestin Search App
-To build up the taskflow for the pipeline, we create a module task with the defined pipeline structure and connect it with two tasks that output helper messages before and after the pipeline:
-tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";})
+Codestin Search AppTo build up the taskflow for the pipeline, we create a module task with the defined pipeline structure and connect it with two tasks that output helper messages before and after the pipeline:
+tf::Task init = taskflow.emplace([](){ std::cout << "ready\n"; })
 .name("starting pipeline");
 tf::Task task = taskflow.composed_of(pl)
 .name("pipeline");
-tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";})
+tf::Task stop = taskflow.emplace([](){ std::cout << "stopped\n"; })
 .name("pipeline stopped");

 init.precede(task);
 task.precede(stop);
-
+

-Codestin Search App
-Finally, we submit the taskflow to the execution and run it once:
+Codestin Search AppFinally, we submit the taskflow to the executor and run it once:
 executor.run(taskflow).wait();

 Three possible outputs are shown below:
-#possibleoutput1
+# possible output 1
 ready
 f1(A)
 f2(A)
@@ -253,14 +246,13 @@

-Codestin Search App
-We have applied the graph processing pipeline technique to speed up a circuit analysis problem.
Details can be found in our publication below:
Cheng-Hsiang Chiu and Tsung-Wei Huang, "Efficient Timing Propagation with Simultaneous Structural and Pipeline Parallelisms," ACM/IEEE Design Automation Conference (DAC), San Francisco, CA, 2022

-
+
diff --git a/docs/xml/kmeans_cudaflow.xml b/docs/xml/KMeansWithCUDAGPU.xml
similarity index 81%
rename from docs/xml/kmeans_cudaflow.xml
rename to docs/xml/KMeansWithCUDAGPU.xml
index 16cd8471c..60b52a992 100644
--- a/docs/xml/kmeans_cudaflow.xml
+++ b/docs/xml/KMeansWithCUDAGPU.xml
@@ -1,29 +1,28 @@
-
-
- kmeans_cudaflow
- Codestin Search App
+
+
+ KMeansWithCUDAGPU
+ Codestin Search App

 Define the k-means Kernels
- kmeans_cudaflow_1DefineTheKMeansKernels
-
+ KMeansWithCUDAGPU_1DefineTheKMeansKernels
+

- Define the k-means cudaFlow
- kmeans_cudaflow_1DefineTheKMeanscudaFlow
-
+ Define the k-means CUDA Graph
+ KMeansWithCUDAGPU_1DefineTheKMeansCUDAGraph
+

 Benchmarking
- kmeans_cudaflow_1KMeanscudaFlowBenchmarking
-
+ KMeansWithCUDAGPU_1KMeansWithGPUBenchmarking
+

-Following up on k-means Clustering, this page studies how to accelerate a k-means workload on a GPU using tf::cudaFlow.
-
-Codestin Search App
-Recall that the k-means algorithm has the following steps:
+Following up on k-means Clustering, this page studies how to accelerate a k-means workload on a GPU using tf::cudaGraph.
+
+Codestin Search AppRecall that the k-means algorithm has the following steps:
@@ -89,24 +88,23 @@
 float* mx, float* my, float* sx, float* sy, int* c
 ) {
 int k = threadIdx.x;
-intcount=max(1,c[k]);//turn0/0to0/1
-mx[k]=sx[k]/count;
-my[k]=sy[k]/count;
+int count = max(1, c[k]);  // turn 0/0 to 0/1
+mx[k] = sx[k] / count;
+my[k] = sy[k] / count;
 }

When we recompute the cluster centroids to be the mean of all points assigned to a particular centroid, multiple GPU threads may access the sum arrays, sx and sy, and the count array, c. To avoid data races, we use a simple atomicAdd method.
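The accumulation pattern inside assign_clusters looks roughly like the sketch below (a sketch only; best_k, x, and y stand for the chosen centroid index and the coordinates of the point handled by the current thread):
atomicAdd(&sx[best_k], x);  // accumulate the x-coordinate sum of cluster best_k
atomicAdd(&sy[best_k], y);  // accumulate the y-coordinate sum of cluster best_k
atomicAdd(&c[best_k], 1);   // count one more point assigned to cluster best_k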
-
-Codestin Search App
-Based on the two kernels, we can define the cudaFlow for the k-means workload below:
+Codestin Search AppBased on the two kernels, we can define a CUDA graph for the k-means workload below:
 // N: number of points
 // K: number of clusters
 // M: number of iterations
 // px/py: 2D point vector
 void kmeans_gpu(
-intN,intK,intM,cconststd::vector<float>&px,conststd::vector<float>&py
+int N, int K, int M, const std::vector<float>& px, const std::vector<float>& py
 ) {
-std::vector<float>h_mx,h_my;
+std::vector<float> h_mx, h_my;
 float* d_px, *d_py, *d_mx, *d_my, *d_sx, *d_sy, *d_c;

 for(int i=0; i<K; ++i) {
@@ -154,31 +152,37 @@

 auto kmeans = taskflow.emplace([&](){

-tf::cudaFlowcf;
+tf::cudaGraph cg;

-autozero_c=cf.zero(d_c,K).name("zero_c");
-autozero_sx=cf.zero(d_sx,K).name("zero_sx");
-autozero_sy=cf.zero(d_sy,K).name("zero_sy");
+auto zero_c  = cg.zero(d_c,  K);
+auto zero_sx = cg.zero(d_sx, K);
+auto zero_sy = cg.zero(d_sy, K);

-autocluster=cf.kernel(
+auto cluster = cg.kernel(
 (N+512-1)/512, 512, 0,
 assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c
-).name("cluster");
+);

-autonew_centroid=cf.kernel(
+auto new_centroid = cg.kernel(
 1, K, 0,
 compute_new_means, d_mx, d_my, d_sx, d_sy, d_c
-).name("new_centroid");
+);

 cluster.precede(new_centroid)
        .succeed(zero_c, zero_sx, zero_sy);

-//RepeattheexecutionforMtimes
-tf::cudaStreamstream;
+// dump the CUDA graph
+cg.dump(std::cout);
+
+// instantiate an executable CUDA graph
+tf::cudaGraphExec exec(cg);
+
+// repeat the execution for M times and then synchronize
+tf::cudaStream stream;
 for(int i=0; i<M; i++) {
-cf.run(stream);
+stream.run(exec);
 }
-stream.synchronize();
+stream.synchronize();

 }).name("update_means");

 auto stop = taskflow.emplace([&](){
@@ -186,7 +190,7 @@
 cudaMemcpy(h_my.data(), d_my, K*sizeof(float), cudaMemcpyDefault);
 }).name("d2h");

-autofree=taskflow.emplace([&](){
+auto free = taskflow.emplace([&](){
 cudaFree(d_px);
 cudaFree(d_py);
 cudaFree(d_mx);
@@ -207,19 +211,16 @@
 // run the taskflow
 executor.run(taskflow).wait();

-//std::cout<<"dumpingkmeansgraph...\n";
-taskflow.dump(std::cout);

 return {h_mx, h_my};
 }

-The first dump before executing the taskflow produces the following diagram. The condition tasks introduces a cycle between itself and update_means. Each time it goes back to update_means, the cudaFlow is reconstructed with captured parameters in the closure and offloaded to the GPU.
-
+The first dump before executing the taskflow produces the following diagram. The condition task introduces a cycle between itself and update_means. Each time it goes back to update_means, the CUDA graph is reconstructed with captured parameters in the closure and offloaded to the GPU.
+

-The main cudaFlow task, update_means, must not run before all required data has settled down. It precedes a condition task that circles back to itself until we reach M iterations. When iteration completes, the condition task directs the execution path to the cudaFlow, h2d, to copy the results of clusters to h_mx and h_my and then deallocate all GPU memory.
+The main CUDA Graph task, update_means, must not run before all required data has settled down. It precedes a condition task that circles back to itself until we reach M iterations. When the iterations complete, the condition task directs the execution path to the task d2h, which copies the resulting centroids to h_mx and h_my, after which all GPU memory is deallocated.
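+The condition-task pattern described above can be sketched as follows (a sketch only, assuming kmeans and stop are the tasks created in the listing and M_iter is a counter for the outer iterations):
+int M_iter = 0;
+tf::Task condition = taskflow.emplace([&](){
+  return (++M_iter < M) ? 0 : 1;  // 0: loop back to update_means, 1: proceed
+}).name("converged?");
+
+kmeans.precede(condition);        // update_means runs, then the condition task
+condition.precede(kmeans, stop);  // return 0 -> kmeans again; return 1 -> stop (d2h)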
- -Codestin Search App -We run three versions of k-means, sequential CPU, parallel CPUs, and one GPU, on a machine of 12 Intel i7-8700 CPUs at 3.20 GHz and a Nvidia RTX 2080 GPU using various numbers of 2D point counts and iterations. + +Codestin Search AppWe run three versions of k-means, sequential CPU, parallel CPUs, and one GPU, on a machine of 12 Intel i7-8700 CPUs at 3.20 GHz and a Nvidia RTX 2080 GPU using various numbers of 2D point counts and iterations. N K @@ -270,9 +271,9 @@
-When the number of points is larger than 10K, both parallel CPU and GPU implementations start to pick up the speed over than the sequential version. We can see that using the built-in predicate, tf::cudaFlow::offload_n, can avoid repetitively creating the graph over and over, resulting in two times faster than conditional tasking.
+When the number of points is larger than 10K, both the parallel CPU and GPU implementations start to outperform the sequential version.
    - +
    diff --git a/docs/xml/LimitTheMaximumConcurrency.xml b/docs/xml/LimitTheMaximumConcurrency.xml index 53f737aee..068951657 100644 --- a/docs/xml/LimitTheMaximumConcurrency.xml +++ b/docs/xml/LimitTheMaximumConcurrency.xml @@ -1,5 +1,5 @@ - + LimitTheMaximumConcurrency Codestin Search App @@ -7,47 +7,54 @@ Define a Semaphore LimitTheMaximumConcurrency_1DefineASemaphore - + - Define a Critical Section - LimitTheMaximumConcurrency_1DefineACriticalRegion - + Use Semaphores Across Different Tasks + LimitTheMaximumConcurrency_1UseSemaphoresAcrossDifferentTasks + Define a Conflict Graph LimitTheMaximumConcurrency_1DefineAConflictGraph - + + + Reset a Semaphore + LimitTheMaximumConcurrency_1ResetASemaphore + + + Understand the Limitation of Semaphores + LimitTheMaximumConcurrency_1UnderstandTheLimitationOfSemaphores + -This chapters discusses how to limit the concurrency or the maximum number of workers in subgraphs of a taskflow. +This chapter discusses how to limit the concurrency or the maximum number of workers in your Taskflow applications. -Codestin Search App -Taskflow provides a mechanism, tf::Semaphore, for you to limit the maximum concurrency in a section of tasks. You can let a task acquire/release one or multiple semaphores before/after executing its work. A task can acquire and release a semaphore, or just acquire or just release it. A tf::Semaphore object starts with an initial count. As long as that count is above 0, tasks can acquire the semaphore and do their work. If the count is 0 or less, a task trying to acquire the semaphore will not run but goes to a waiting list of that semaphore. When the semaphore is released by another task, it reschedules all tasks on that waiting list. +Codestin Search AppTaskflow provides a mechanism, tf::Semaphore, for you to limit the maximum concurrency in a section of tasks. You can let a task acquire/release one or multiple semaphores before/after executing its work. A task can acquire and release a semaphore, or just acquire or just release it. A tf::Semaphore object starts with an initial value. As long as that value is above 0, tasks can acquire the semaphore and do their work. If the value is 0 or less, a task trying to acquire the semaphore will not run but goes to a waiting list of that semaphore. When the semaphore is released by another task, it reschedules all tasks on that waiting list. tf::Executorexecutor(8);//createanexecutorof8workers tf::Taskflowtaskflow; -tf::Semaphoresemaphore(1);//createasemaphorewithinitialcount1 +tf::Semaphoresemaphore(1);//createasemaphorewithinitialvalueof1 -std::vector<tf::Task>tasks{ -taskflow.emplace([](){std::cout<<"A"<<std::endl;}), +std::vector<tf::Task>tasks{ +taskflow.emplace([](){std::cout<<"A"<<std::endl;}), taskflow.emplace([](){std::cout<<"B"<<std::endl;}), -taskflow.emplace([](){std::cout<<"C"<<std::endl;}), +taskflow.emplace([](){std::cout<<"C"<<std::endl;}), taskflow.emplace([](){std::cout<<"D"<<std::endl;}), -taskflow.emplace([](){std::cout<<"E"<<std::endl;}) +taskflow.emplace([](){std::cout<<"E"<<std::endl;}) }; for(auto&task:tasks){//eachtaskacquiresandreleasethesemaphore -task.acquire(semaphore); -task.release(semaphore); +task.acquire(semaphore); +task.release(semaphore); } executor.run(taskflow).wait(); - + -The above example creates five tasks with no dependencies between them. Under normal circumstances, the five tasks would be executed concurrently. 
However, this example has a semaphore with initial count 1, and all tasks need to acquire that semaphore before running and release that semaphore after they are done. This organization limits the number of concurrently running tasks to only one. One possible output is shown below: -#theoutputisasequentialchainoffivetasks +The above example creates five tasks with no dependencies between them. Under normal circumstances, the five tasks would be executed concurrently. However, this example has a semaphore with initial value of 1, and all tasks need to acquire that semaphore before running and release that semaphore after they are done. This organization limits the number of concurrently running tasks to only one. One possible output is shown below: +#theoutputisasequentialchainoffivetasks A B E @@ -60,24 +67,24 @@ For the same example above, we can limit the semaphore concurrency to another va tf::Executorexecutor(8);//createanexecutorof8workers tf::Taskflowtaskflow; -tf::Semaphoresemaphore(3);//createasemaphorewithinitialcount3 +tf::Semaphoresemaphore(3);//createasemaphorewithinitialvalueof3 -std::vector<tf::Task>tasks{ -taskflow.emplace([](){std::cout<<"A"<<std::endl;}), +std::vector<tf::Task>tasks{ +taskflow.emplace([](){std::cout<<"A"<<std::endl;}), taskflow.emplace([](){std::cout<<"B"<<std::endl;}), -taskflow.emplace([](){std::cout<<"C"<<std::endl;}), +taskflow.emplace([](){std::cout<<"C"<<std::endl;}), taskflow.emplace([](){std::cout<<"D"<<std::endl;}), -taskflow.emplace([](){std::cout<<"E"<<std::endl;}) +taskflow.emplace([](){std::cout<<"E"<<std::endl;}) }; for(auto&task:tasks){//eachtaskacquiresandreleasethesemaphore -task.acquire(semaphore); -task.release(semaphore); +task.acquire(semaphore); +task.release(semaphore); } executor.run(taskflow).wait(); -#Onepossibleoutput:A,B,andCrunconcurrently,DandErunconcurrently +#Onepossibleoutput:A,B,andCrunconcurrently,DandErunconcurrently ABC ED @@ -91,9 +98,9 @@ For the same example above, we can limit the semaphore concurrency to another va for(inti=0;i<N;i++){ tf::Taskf=taskflow.emplace([&](){counter++;}) -.name("from-"s+std::to_string(i)); +.name("from-"s+std::to_string(i)); tf::Taskt=taskflow.emplace([&](){counter++;}) -.name("to-"s+std::to_string(i)); +.name("to-"s+std::to_string(i)); f.precede(t); f.acquire(semaphore); t.release(semaphore); @@ -103,34 +110,35 @@ For the same example above, we can limit the semaphore concurrency to another va assert(counter==2*N); - + Without semaphores, each pair of tasks, e.g., from-0 -> to-0, will run independently and concurrently. However, the program forces each from task to acquire the semaphore before running its work and not to release it until its paired to task is done. This constraint forces each pair of tasks to run sequentially, while the order of which pair runs first is up to the scheduler. - -Codestin Search App -tf::CriticalSection is a wrapper over tf::Semaphore specialized for limiting the maximum concurrency over a section of tasks. A critical section starts with an initial count representing that limit. When a task is added to the critical section, the task acquires and releases the semaphore internal to the critical section. This method tf::CriticalSection::add automatically calls tf::Task::acquire and tf::Task::release for each task added to the critical section. The following example creates a critical section of two workers to run five tasks in the critical section. 
+ +Codestin Search AppYou can use semaphores to limit the concurrency across different sections of taskflow graphs. When you submit multiple taskflows to an executor, the executor views them as a bag of dependent tasks. It does not matter which task in which taskflow graph acquires or releases a semaphore. tf::Executorexecutor(8);//createanexecutorof8workers -tf::Taskflowtaskflow; +tf::Taskflowtaskflow1; +tf::Taskflowtaskflow2; -//createacriticalsectionoftwoworkers -tf::CriticalSectioncritical_section(2); +tf::Semaphoresemaphore(1);//createasemaphorewithinitialvalueof1 -tf::TaskA=taskflow.emplace([](){std::cout<<"A"<<std::endl;}); -tf::TaskB=taskflow.emplace([](){std::cout<<"B"<<std::endl;}); -tf::TaskC=taskflow.emplace([](){std::cout<<"C"<<std::endl;}); -tf::TaskD=taskflow.emplace([](){std::cout<<"D"<<std::endl;}); -tf::TaskE=taskflow.emplace([](){std::cout<<"E"<<std::endl;}); +taskflow1.emplace([](){std::cout<<"taskintaskflow1";}) +.acquire(semaphore) +.release(semaphore); -critical_section.add(A,B,C,D,E); +taskflow2.emplace([](){std::cout<<"taskintaskflow2";}) +.acquire(semaphore) +.release(semaphore); -executor.run(taskflow).wait(); +executor.run(taskflow1); +executor.run(taskflow2); +executor.wait_for_all(); +The above example creates one task in each taskflow and submits the two taskflows to the executor. Again, under normal circumstances, the two tasks can run concurrently, but the semaphore restricts one worker to run the two tasks sequentially in arbitrary order. -Codestin Search App -One important application of tf::Semaphore is conflict-aware scheduling using a conflict graph. A conflict graph is a undirected graph where each vertex represents a task and each edge represents a conflict between a pair of tasks. When a task conflicts with another task, they cannot run together. Consider the conflict graph below, task A conflicts with task B and task C (and vice versa), meaning that A cannot run together with B and C whereas B and C can run together. - +Codestin Search AppOne important application of tf::Semaphore is conflict-aware scheduling using a conflict graph. A conflict graph is an undirected graph where each vertex represents a task and each edge represents a conflict between a pair of tasks. When a task conflicts with another task, they cannot run together. Consider the conflict graph below: task A conflicts with task B and task C (and vice versa), meaning that A cannot run together with B and C whereas B and C can run together. + We can create one semaphore of one concurrency for each edge in the conflict graph and let the two tasks of that edge acquire the semaphore. This organization forces the two tasks to not run concurrently. 
tf::Executorexecutor; @@ -139,9 +147,9 @@ For the same example above, we can limit the semaphore concurrency to another va tf::Semaphoreconflict_AB(1); tf::Semaphoreconflict_AC(1); -tf::TaskA=taskflow.emplace([](){std::cout<<"A"<<std::endl;}); -tf::TaskB=taskflow.emplace([](){std::cout<<"B"<<std::endl;}); -tf::TaskC=taskflow.emplace([](){std::cout<<"C"<<std::endl;}); +tf::TaskA=taskflow.emplace([](){std::cout<<"A"<<std::endl;}); +tf::TaskB=taskflow.emplace([](){std::cout<<"B"<<std::endl;}); +tf::TaskC=taskflow.emplace([](){std::cout<<"C"<<std::endl;}); //describetheconflictbetweenAandB A.acquire(conflict_AB).release(conflict_AB); @@ -153,31 +161,66 @@ For the same example above, we can limit the semaphore concurrency to another va executor.run(taskflow).wait(); -#Onepossibleoutput:BandCrunconcurrentlyafterA +#Onepossibleoutput:BandCrunconcurrentlyafterA A BC -A task can acquire and release multiple semaphores. When the executor is running a task, it will first try to acquire all semaphores of that task. When the executor finishes a task, it will release all acquired semaphores of that task. +A task can acquire and release multiple semaphores. When the executor runs a task, it will try to acquire all semaphores needed by that task. When the executor finishes that task, it will release all acquired semaphores by that task. -The above code can be rewritten with tf::CriticalSection for simplicity, as shown below: + + + +Codestin Search AppYou can reset a semaphore to its initial state using tf::Semaphore::reset(), or set a new maximum value with tf::Semaphore::reset(size_t new_max_value). The method tf::Semaphore::value() allows you to query the current value of the semaphore, which represents the number of available acquisitions. +tf::Semaphoresemaphore(4); +assert(semaphore.value()==4&&semaphore.max_value()==4); + +//resetthesemaphoretoanewvalue +semaphore.reset(11); +assert(semaphore.value()==11&&semaphore.max_value()==11); + +When a semaphore is acquired more times than its maximum value, an exception will be thrown. + + + + +Codestin Search AppCurrently, tf::Semaphore has limited support for exception handling and taskflow cancellation. If a task throws an exception or the taskflow is canceled, subsequent acquire and release operations on the semaphore may result in undefined behavior. To ensure correct behavior, you should call tf::Semaphore::reset before reusing the semaphore in the next run. For instance, in the code below, when task B throws an exception, the executor will cancel the execution of the taskflow. That is, tasks C and D will not run, and thus no task will release the acquired semaphore. To resolve this situation, we must reset the semaphore to a clean state for the next run. 
tf::Executorexecutor; tf::Taskflowtaskflow; +tf::Semaphoresemaphore(1); -tf::CriticalSectioncritical_section_AB(1); -tf::CriticalSectioncritical_section_AC(1); +tf::TaskA=taskflow.emplace([](){}); +tf::TaskB=taskflow.emplace([](){throwstd::runtime_error("exception");}); +tf::TaskC=taskflow.emplace([](){}); +tf::TaskD=taskflow.emplace([](){}); +A.precede(B); +B.precede(C); +C.precede(D); -tf::TaskA=taskflow.emplace([](){std::cout<<"A"<<std::endl;}); -tf::TaskB=taskflow.emplace([](){std::cout<<"B"<<std::endl;}); -tf::TaskC=taskflow.emplace([](){std::cout<<"C"<<std::endl;}); +A.acquire(semaphore); +D.release(semaphore); -//describetheconflictgraph -critical_section_AB.add(A,B); -critical_section_AC.add(A,C); +//currentsemaphorehasavalueof1 +assert(semaphore.value()==1); -executor.run(taskflow).wait(); +//whenBthrowstheexception,Dwillnotrunandthussemaphoreisnotreleased +try{ +executor.run(taskflow).get(); +} +catch(std::runtime_error&e){ +std::cout<<e.what()<<std::endl; +} + +//sinceAacquiredthesemaphore,itsvalueis0 +assert(semaphore.value()==0); + +//resetthesemaphoretoacleanstatebeforerunningthetaskflowagain +semaphore.reset(); +assert(semaphore.value()==1); + +executor.run(taskflow).get(); - + diff --git a/docs/xml/matrix_multiplication_cudaflow.xml b/docs/xml/MatrixMultiplicationWithCUDAGPU.xml similarity index 73% rename from docs/xml/matrix_multiplication_cudaflow.xml rename to docs/xml/MatrixMultiplicationWithCUDAGPU.xml index 9af784eda..13e861580 100644 --- a/docs/xml/matrix_multiplication_cudaflow.xml +++ b/docs/xml/MatrixMultiplicationWithCUDAGPU.xml @@ -1,29 +1,28 @@ - - - matrix_multiplication_cudaflow - Codestin Search App + + + MatrixMultiplicationWithCUDAGPU + Codestin Search App Define a Matrix Multiplication Kernel - matrix_multiplication_cudaflow_1GPUAcceleratedMatrixMultiplication - + MatrixMultiplicationWithCUDAGPU_1GPUAcceleratedMatrixMultiplication + - Define a cudaFlow for Matrix Multiplication - matrix_multiplication_cudaflow_1DefineAcudaFlowForMatrixMultiplication - + Define a CUDA Graph for Matrix Multiplication + MatrixMultiplicationWithCUDAGPU_1DefineACUDAGraphForMatrixMultiplication + Benchmarking - matrix_multiplication_cudaflow_1MatrixMultiplicationcudaFlowBenchmarking - + MatrixMultiplicationWithCUDAGPU_1MatrixMultiplicationcudaFlowBenchmarking + -Following up on Matrix Multiplication, this page studies how to accelerate a matrix multiplication workload on a GPU using tf::cudaFlow. - -Codestin Search App -GPU can perform a lot of parallel computations more than CPUs. It is especially useful for data-intensive computing such as matrix multiplication. With GPU, we express the parallel patterns at a fine-grained level. The kernel, written in CUDA, is described as follows: +Following up on Matrix Multiplication, this page studies how to accelerate a matrix multiplication workload on a GPU using tf::cudaGraph. + +Codestin Search AppGPU can perform a lot of parallel computations more than CPUs. It is especially useful for data-intensive computing such as matrix multiplication. With GPU, we express the parallel patterns at a fine-grained level. The kernel, written in CUDA, is described as follows: //CUDAkerneltoperformmatrixmultiplication __global__voidmatmul(int*A,int*B,int*C,intM,intK,intN){ introw=blockIdx.y*blockDim.y+threadIdx.y; @@ -41,9 +40,8 @@ - -Codestin Search App -The next step is to allocate memory for A, B, and C at a GPU. We create three tasks each calling cudaMalloc to allocate space for one matrix. 
Then, we create a cudaFlow to offload matrix multiplication to a GPU. The entire code is described as follows: + +Codestin Search AppThe next step is to allocate memory for A, B, and C at a GPU. We create three tasks each calling cudaMalloc to allocate space for one matrix. Then, we create a CUDA graph to offload matrix multiplication to a GPU. The entire code is described as follows: voidmatrix_multiplication(int*A,int*B,int*C,intM,intK,intN){ tf::Taskflowtaskflow; @@ -64,34 +62,37 @@ cudaMalloc(&dc,M*N*sizeof(int)); }).name("allocate_c"); -//createacudaFlowtasktorunthematrixmultiplication +//createaCUDAgraphtasktorunthematrixmultiplication tf::TaskcudaFlow=taskflow.emplace([&](){ -tf::cudaFlowcf; +tf::cudaGraphcg; //copydatatoda,db,anddc -tf::cudaTaskcopy_da=cf.copy(da,A,M*K).name("H2D_A"); -tf::cudaTaskcopy_db=cf.copy(db,B,K*N).name("H2D_B"); -tf::cudaTaskcopy_hc=cf.copy(C,dc,M*N).name("D2H_C"); +tf::cudaTaskcopy_da=cg.copy(da,A,M*K); +tf::cudaTaskcopy_db=cg.copy(db,B,K*N); +tf::cudaTaskcopy_hc=cg.copy(C,dc,M*N); dim3grid((K+16-1)/16,(M+16-1)/16); dim3block(16,16); -tf::cudaTaskkmatmul=cf.kernel(grid,block,0,matmul,da,db,dc,M,K,N) -.name("matmul"); +tf::cudaTaskkmatmul=cg.kernel(grid,block,0,matmul,da,db,dc,M,K,N); kmatmul.succeed(copy_da,copy_db) .precede(copy_hc); -//launchthecudaFlow -tf::cudaStreamstream; -cf.run(stream); -stream.synchronize(); +//dumptheCUDAgraph +cg.dump(std::cout); + +//instantiateanexecutableCUDAgraphandrunitthroughastream +tf::cudaStreamstream; +tf::cudaGraphExecexec(cg); +stream.run(exec) +.synchronize(); }).name("cudaFlow"); //freethegpustorage -autofree=taskflow.emplace([&](){ +autofree=taskflow.emplace([&](){ cudaFree(da); cudaFree(db); cudaFree(dc); @@ -101,26 +102,19 @@ cudaFlow.succeed(allocate_a,allocate_b,allocate_c) .precede(free); -//dumpthegraphwithoutunfoldingthecudaFlow -taskflow.dump(std::cout); - //runthetaskflow executor.run(taskflow).wait(); - -//dumptheentireexecutiongraphincludingunfoldedcudaFlow -taskflow.dump(std::cout); } Within the cudaFlow, we create two host-to-device (H2D) tasks that copy data from A and B to da and db, one device-to-host (D2H) task that copies the result from dc to C, and one kernel task that launches matmul on the GPU (by default, GPU 0). H2D tasks precede the kernel and the kernel precedes the D2H task. These GPU operations form a GPU task graph managed by a cudaFlow. The first dump of the taskflow gives the following graph: - + A cudaFlow encapsulates a GPU task dependency graph similar to a tf::Subflow (see Subflow Tasking). In order to visualize it, we need to execute the graph first and then dump the taskflow. - + - -Codestin Search App -We run three versions of matrix multiplication, sequential CPU, parallel CPUs, and one GPU, on a machine of 12 Intel i7-8700 CPUs at 3.20 GHz and a Nvidia RTX 2080 GPU using various matrix sizes of A, B, and C. + +Codestin Search AppWe run three versions of matrix multiplication, sequential CPU, parallel CPUs, and one GPU, on a machine of 12 Intel i7-8700 CPUs at 3.20 GHz and a Nvidia RTX 2080 GPU using various matrix sizes of A, B, and C. A B @@ -182,6 +176,6 @@ As the matrix size increases, the speed-up of GPU over CPUs becomes prominent. For example, at 4000x4000, the GPU runtime is 585.8 times faster than the sequential CPU runtime and is 92.8 times faster than the parallel CPU solutions. 
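The table above reports wall-clock runtimes; the benchmark harness itself is not part of this diff. A minimal sketch of how such a measurement could be taken around the matrix_multiplication function defined above (illustrative only, not the authors' harness):

#include <chrono>
#include <iostream>

// A, B, C are pre-allocated M*K, K*N, and M*N integer buffers
auto beg = std::chrono::steady_clock::now();
matrix_multiplication(A, B, C, M, K, N);
auto end = std::chrono::steady_clock::now();
std::cout << "elapsed: "
          << std::chrono::duration_cast<std::chrono::milliseconds>(end - beg).count()
          << " ms\n";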
- + diff --git a/docs/xml/ModuleAlgorithm.xml b/docs/xml/ModuleAlgorithm.xml new file mode 100644 index 000000000..b686490e4 --- /dev/null +++ b/docs/xml/ModuleAlgorithm.xml @@ -0,0 +1,136 @@ + + + + ModuleAlgorithm + Codestin Search App + + + Include the Header + ModuleAlgorithm_1ModuleAlgorithmInclude + + + What is a Module Task + ModuleAlgorithm_1WhatIsAModuleTask + + + Create a Module Task over a Custom Graph + ModuleAlgorithm_1CreateAModuleTaskOverACustomGraph + + + + + +Taskflow provides template methods that let users create reusable building blocks called modules. Users can connect modules together to build more complex parallel algorithms. + +Codestin Search AppYou need to include the header file, taskflow/algorithm/module.hpp, for creating a module task over a schedulable graph target. +#include<taskflow/algorithm/module.hpp> + + + +Codestin Search AppSimilar to Composable Tasking, but in a more general setting, the template function tf::make_module_task allows you to create a task over a Taskflow graph that can be executed by an executor. This provides a flexible mechanism to encapsulate and reuse complex task logic within your Taskflow applications. The following example demonstrates how to create and launch multiple Taskflow graphs in parallel using asynchronous tasking: +#include<taskflow/taskflow.hpp> +#include<taskflow/algorithm/module.hpp> + +intmain(){ + +tf::Executorexecutor; + +tf::TaskflowA; +tf::TaskflowB; +tf::TaskflowC; +tf::TaskflowD; + +A.emplace([](){printf("TaskflowA\n");}); +B.emplace([](){printf("TaskflowB\n");}); +C.emplace([](){printf("TaskflowC\n");}); +D.emplace([](){printf("TaskflowD\n");}); + +//launchthefourtaskflowsusingasynchronoustasking +executor.async(tf::make_module_task(A)); +executor.async(tf::make_module_task(B)); +executor.async(tf::make_module_task(C)); +executor.async(tf::make_module_task(D)); +executor.wait_for_all(); + +return0; +} + + + +Since the four taskflows are launched asynchronously without any dependencies between them, we can observe any order of the output message: +#onepossibleoutput +TaskflowB +TaskflowC +TaskflowA +TaskflowD + +#anotherpossibleoutput +TaskflowD +TaskflowA +TaskflowB +TaskflowC + +If you need to enforce dependencies among these four taskflows, you can use dependent-async tasks. The example below launches the four taskflows one by one in sequential: +tf::Executorexecutor; + +tf::TaskflowA; +tf::TaskflowB; +tf::TaskflowC; +tf::TaskflowD; + +A.emplace([](){printf("TaskflowA\n");}); +B.emplace([](){printf("TaskflowB\n");}); +C.emplace([](){printf("TaskflowC\n");}); +D.emplace([](){printf("TaskflowD\n");}); + +autoTA=executor.silent_dependent_async(tf::make_module_task(A)); +autoTB=executor.silent_dependent_async(tf::make_module_task(B),TA); +autoTC=executor.silent_dependent_async(tf::make_module_task(C),TB); +auto[TD,FD]=executor.dependent_async(tf::make_module_task(D),TC); +FD.get(); + + + +#dependent-asynctasksenforceasequentialexecutionofthefourtaskflows +TaskflowA +TaskflowB +TaskflowC +TaskflowD + +The module task maker, tf::make_module_task, operates similarly to tf::Taskflow::composed_of, but provides a more general interface that can be used beyond Taskflow. 
Specifically, the following two approaches achieve equivalent functionality: +//approach1:compositionusingcomposed_of +tf::Taskm1=taskflow1.composed_of(taskflow2); + +//approach2:compositionusingmake_module_task +tf::Taskm1=taskflow1.emplace(tf::make_module_task(taskflow2)); + +Similar to tf::Taskflow::composed_of, tf::make_module_task does not assume ownership of the provided taskflow but a soft reference. You are responsible for ensuring that the encapsulated taskflow remains valid throughout its execution. + + + + +Codestin Search AppIn addition to encapsulate taskflow graphs, you can create a module task to schedule a custom graph target. A schedulable target (of type T) must define the method T::graph() that returns a reference to the tf::Graph object managed by T. The following example defines a custom graph that can be scheduled through making module tasks: +structCustomGraph{ +tf::Graphgraph; +CustomGraph(){ +//useflowbuildertoinheritalltaskcreationmethodsintf::Taskflow +tf::FlowBuilderbuilder(graph); +tf::Tasktask=builder.emplace([](){ +std::cout<<"atask\n";//statictask +}); +} +//returnsareferencetothegraphfortaskflowcomposition +Graph&graph(){returngraph;} +}; + +CustomGraphtarget; +executor.async(tf::make_module_task(target)); + +Users are responsible for ensuring the given custom graph remains valid throughout its execution. The executor does not assume ownership of the custom graph. + + + + + + + diff --git a/docs/xml/ParallelFind.xml b/docs/xml/ParallelFind.xml index ee422529c..ab2a5585d 100644 --- a/docs/xml/ParallelFind.xml +++ b/docs/xml/ParallelFind.xml @@ -1,5 +1,5 @@ - + ParallelFind Codestin Search App @@ -7,58 +7,55 @@ Include the Header ParallelFind_1ParallelFindIncludeTheHeader - + What is a Find Algorithm? ParallelFind_1WhatIsAFindAlgorithm - + Create a Parallel Find-If Task ParallelFind_1CreateAParallelFindIfTask - + Capture Iterators by Reference ParallelFind_1ParallelFindCaptureIteratorsByReference - + Create a Parallel Find-If-Not Task ParallelFind_1CreateAParallelFindIfNotTask - + Find the Smallest and the Largest Elements ParallelFind_1ParallelFindMinMaxElement - + Configure a Partitioner ParallelFind_1ParallelFindConfigureAPartitioner - + Taskflow provides template functions for constructing tasks to perform parallel iterations over ranges of items. -Codestin Search App -You need to include the header file, taskflow/algorithm/find.hpp, for using parallel-find algorithms. +Codestin Search AppYou need to include the header file, taskflow/algorithm/find.hpp, for using parallel-find algorithms. #include<taskflow/algorithm/find.hpp> -Codestin Search App -A find algorithm allows you to find an element in a range [first, last) that satisfies a specific criteria. The algorithm returns an iterator to the first found element in the range or returns last if there is no such iterator. Taskflow provides the following parallel-find algorithms: +Codestin Search AppA find algorithm allows you to find an element in a range [first, last) that satisfies a specific criteria. The algorithm returns an iterator to the first found element in the range or returns last if there is no such iterator. 
Taskflow provides the following parallel-find algorithms: -tf::Taskflow::find_if(B first, E last, T& result, UOP predicate, P&& part) -tf::Taskflow::find_if_not(B first, E last, T& result, UOP predicate, P&& part) -tf::Taskflow::min_element(B first, E last, T& result, C comp, P&& part) -tf::Taskflow::max_element(B first, E last, T& result, C comp, P&& part) +tf::Taskflow::find_if(B first, E last, T& result, UOP predicate, P part) +tf::Taskflow::find_if_not(B first, E last, T& result, UOP predicate, P part) +tf::Taskflow::min_element(B first, E last, T& result, C comp, P part) +tf::Taskflow::max_element(B first, E last, T& result, C comp, P part) -Codestin Search App -tf::Taskflow::find_if performs parallel iterations to find the first element in the range [first, last) that makes the given predicate return true. It resembles a parallel implementation of the following loop: +Codestin Search Apptf::Taskflow::find_if performs parallel iterations to find the first element in the range [first, last) that makes the given predicate return true. It resembles a parallel implementation of the following loop: template<typenameInputIt,typenameUnaryPredicate> -InputItfind_if(InputItfirst,InputItlast,UnaryPredicatepredicate){ +InputItfind_if(InputItfirst,InputItlast,UnaryPredicatepredicate){ for(;first!=last;++first){ if(predicate(*first)){ returnfirst; @@ -68,8 +65,8 @@ } The example below creates a task to find the element that is equal to 22 from an input range of 10 elements. The result will be stored in the forth argument passed by reference: -std::vector<int>input={1,9,22,3,-6,13,12,0,9,11}; -std::vector<int>::iteratorresult; +std::vector<int>input={1,9,22,3,-6,13,12,0,9,11}; +std::vector<int>::iteratorresult; taskflow.find_if( input.begin(),input.end(),[](inti){returni==22;},result ); @@ -78,10 +75,9 @@ -Codestin Search App -You can pass iterators by reference using std::ref to marshal parameters update between dependent tasks. This is especially useful when the range iterators are not known at the time of creating a find-if task, but need initialization from another task. -std::vector<int>input; -std::vector<int>::iteratorresult,first,last; +Codestin Search AppYou can pass iterators by reference using std::ref to marshal parameters update between dependent tasks. This is especially useful when the range iterators are not known at the time of creating a find-if task, but need initialization from another task. +std::vector<int>input; +std::vector<int>::iteratorresult,first,last; //tasktosetuptherangeiterators tf::Taskinit=taskflow.emplace([&](){ @@ -92,7 +88,7 @@ //tasktoperformparallelfind tf::Tasktask=taskflow.find_if( -std::ref(first),std::ref(last),result,[](inti){returni==22;} +std::ref(first),std::ref(last),result,[](inti){returni==22;} ); init.precede(task); @@ -103,10 +99,9 @@ In the above example, when init finishes, input has been initialized to 10 elements with first and last pointing to the data range of input. The find-if task will then work on this initialized range as a result of passing iterators by reference. -Codestin Search App -tf::Taskflow::find_if_not performs parallel iterations to find the first element in the range [first, last) that makes the given predicate return false. It resembles a parallel implementation of the following loop: +Codestin Search Apptf::Taskflow::find_if_not performs parallel iterations to find the first element in the range [first, last) that makes the given predicate return false. 
It resembles a parallel implementation of the following loop: template<typenameInputIt,typenameUnaryPredicate> -InputItfind_if(InputItfirst,InputItlast,UnaryPredicatepredicate){ +InputItfind_if(InputItfirst,InputItlast,UnaryPredicatepredicate){ for(;first!=last;++first){ if(!predicate(*first)){ returnfirst; @@ -116,8 +111,8 @@ } The example below creates a task to find the element that is NOT equal to 22 from an input range of 10 elements. The result will be stored in the forth argument passed by reference: -std::vector<int>input={1,1,22,1,1,1,1,1,1,1}; -std::vector<int>::iteratorresult; +std::vector<int>input={1,1,22,1,1,1,1,1,1,1}; +std::vector<int>::iteratorresult; taskflow.find_if_not( input.begin(),input.end(),result,[](inti){returni==1;} ); @@ -127,37 +122,36 @@ Similar to Capture Iterators by Reference, iterators of tf::Taskflow::find_if_not are templated to allow passing iterators by reference using std::ref. This is especially useful when the range iterators are not known at the time of creating a find-if-not task, but need initialization from another task. -Codestin Search App -tf::Taskflow::min_element finds the smallest element in a range [first, last) using the given comparison function object. The example below finds the smallest element, i.e., -1, from an input range of 10 elements and stores the iterator to that smallest element in result: -std::vector<int>input={1,1,1,1,1,-1,1,1,1,1}; -std::vector<int>::iteratorresult; +Codestin Search Apptf::Taskflow::min_element finds the smallest element in a range [first, last) using the given comparison function object. The example below finds the smallest element, i.e., -1, from an input range of 10 elements and stores the iterator to that smallest element in result: +std::vector<int>input={1,1,1,1,1,-1,1,1,1,1}; +std::vector<int>::iteratorresult; taskflow.min_element( -input.begin(),input.end(),std::less<int>(),result +input.begin(),input.end(),std::less<int>(),result ); executor.run(taskflow).wait(); assert(*result==-1); Similarly, tf::Taskflow::max_element finds the largest element in a range [first, last) using the given comparison function object. The example below finds the largest element, i.e., 2, from an input range of 10 elements and stores the iterator to that largest element in result: -std::vector<int>input={1,1,1,1,1,2,1,1,1,1}; -std::vector<int>::iteratorresult; +std::vector<int>input={1,1,1,1,1,2,1,1,1,1}; +std::vector<int>::iteratorresult; taskflow.max_element( -input.begin(),input.end(),std::less<int>(),result +input.begin(),input.end(),std::less<int>(),result ); executor.run(taskflow).wait(); assert(*result==2); -When using tf::Taskflow::max_element to find the large element, we will still need to use std::less as our comparison function. Details can be referred to std::max_element. +When using tf::Taskflow::max_element to find the large element, we will still need to use std::less as our comparison function. Details can be referred to std::max_element. -Codestin Search App -You can configure a partitioner for parallel-find tasks (tf::Taskflow::find_if, tf::Taskflow::find_if_not, tf::Taskflow::min_element, tf::Taskflow::max_element) to run with different scheduling methods, such as guided partitioning, dynamic partitioning, and static partitioning. 
The following example creates two parallel-find tasks using two different partitioners, one with the static partitioning algorithm and another one with the guided partitioning algorithm: -std::vector<int>vec(1024,-1); -std::vector<int>::iteratorresult; +Codestin Search AppYou can configure a partitioner for parallel-find tasks (tf::Taskflow::find_if, tf::Taskflow::find_if_not, tf::Taskflow::min_element, tf::Taskflow::max_element) to run with different scheduling methods, such as guided partitioning, dynamic partitioning, and static partitioning. The following example creates two parallel-find tasks using two different partitioners, one with the static partitioning algorithm and another one with the guided partitioning algorithm: +std::vector<int>vec(1024,-1); +std::vector<int>::iteratorresult; -tf::ExecutionPolicy<tf::StaticPartitioner>static_partitioner; -tf::ExecutionPolicy<tf::GuidedPartitioner>guided_partitioner; +//createtwopartitionerswithachunksizeof10 +tf::StaticPartitionerstatic_partitioner(10); +tf::GuidedPartitionerguided_partitioner(10); //createaparallel-findtaskwithastaticpartitioner taskflow.find_if( @@ -169,11 +163,11 @@ vec.begin(),vec.end(),result,[&](inti){returni==-1;},guided_partitioner ); -By default, parallel-find tasks use tf::DefaultPartitioner if no partitioner is specified. +By default, parallel-find tasks use tf::DefaultPartitioner if no partitioner is specified. - + diff --git a/docs/xml/ParallelIterations.xml b/docs/xml/ParallelIterations.xml index 66b3a8807..d06d6202d 100644 --- a/docs/xml/ParallelIterations.xml +++ b/docs/xml/ParallelIterations.xml @@ -1,5 +1,5 @@ - + ParallelIterations Codestin Search App @@ -7,41 +7,39 @@ Include the Header ParallelIterations_1ParallelIterationsIncludeTheHeader - + Create an Index-based Parallel-Iteration Task ParallelIterations_1A1IndexBasedParallelFor - + Capture Indices by Reference ParallelIterations_1ParallelForEachCaptureIndicesByReference - + Create an Iterator-based Parallel-Iteration Task ParallelIterations_1A1IteratorBasedParallelFor - + Capture Iterators by Reference ParallelIterations_1ParallelForEachCaptureIteratorsByReference - + Configure a Partitioner ParallelIterations_1ParallelIterationsConfigureAPartitioner - + Taskflow provides template functions for constructing tasks to perform parallel iterations over ranges of items. -Codestin Search App -You need to include the header file, taskflow/algorithm/for_each.hpp, for using parallel-iteration algorithms. +Codestin Search AppYou need to include the header file, taskflow/algorithm/for_each.hpp, for using parallel-iteration algorithms. #include<taskflow/algorithm/for_each.hpp> -Codestin Search App -Index-based parallel-for performs parallel iterations over a range [first, last) with the given step size. The task created by tf::Taskflow::for_each_index(B first, E last, S step, C callable, P&& part) represents parallel execution of the following loop: +Codestin Search AppIndex-based parallel-for performs parallel iterations over a range [first, last) with the given step size. The task created by tf::Taskflow::for_each_index(B first, E last, S step, C callable, P part) represents parallel execution of the following loop: //positivestep for(autoi=first;i<last;i+=step){ callable(i); @@ -57,12 +55,26 @@ taskflow.for_each_index(100,0,-2,[](inti){});//50loopswitha-step Notice that either positive or negative direction is defined in terms of the range, [first, last), where end is excluded. In the positive case, the 50 items are 0, 2, 4, 6, 8, ..., 96, 98. 
In the negative case, the 50 items are 100, 98, 96, 04, ... 4, 2. An example of the Taskflow graph for the positive case under 12 workers is depicted below: - + +Instead of explicitly specifying the index range and the callable for each index invocation, the overload tf::Taskflow::for_each_by_index(R range, C callable, P part) provides you with a more flexible way to iterate over subranges of indices. This overload uses tf::IndexRange to partition the range into subranges, allowing finer control over how each subrange is processed. For instance, the code below does the same thing using two different approaches: +std::vector<int>data1(100),data2(100); + +//Approach1:initializedata1usingexplicitindexrange +taskflow.for_each_index(0,100,1,[&](inti){data1[i]=10;}); + +//Approach2:initializedata2usingtf::IndexRange +tf::IndexRange<int>range(0,100,1); +taskflow.for_each_by_index(range,[&](tf::IndexRange<int>subrange){ +for(inti=subrange.begin();i<subrange.end();i+=subrange.step_size()){ +data2[i]=10; +} +}); + +Both approaches produce the same result, but the second approach offers more flexibility in terms of how each partitioned subrange is iterated. This is particularly useful for applications that benefit from SIMD optimizations or other range-based processing strategies. -Codestin Search App -You can pass indices by reference using std::ref to marshal parameter update between dependent tasks. This is especially useful when the range indices are unknown at the time of creating a for-each-index task, but is initialized from another task. +Codestin Search AppYou can pass indices by reference using std::ref to marshal parameter update between dependent tasks. This is especially useful when the range indices are unknown at the time of creating a for-each-index task, but is initialized from another task. int*vec; intfirst,last; @@ -72,9 +84,9 @@ vec=newint[1000]; }); -autopf=taskflow.for_each_index(std::ref(first),std::ref(last),1, +autopf=taskflow.for_each_index(std::ref(first),std::ref(last),1, [&](inti){ -std::cout<<"paralleliterationonindex"<<vec[i]<<'\n'; +std::cout<<"paralleliterationonindex"<<vec[i]<<'\n'; } ); @@ -88,29 +100,27 @@ When init finishes, the parallel-for task pf will see first as 0 and last as 1000 and performs parallel iterations over the 1000 items. -Codestin Search App -Iterator-based parallel-for performs parallel iterations over a range specified by two STL-styled iterators, first and last. The task created by tf::Taskflow::for_each(B first, E last, C callable, P&& part) represents a parallel execution of the following loop: +Codestin Search AppIterator-based parallel-for performs parallel iterations over a range specified by two STL-styled iterators, first and last. The task created by tf::Taskflow::for_each(B first, E last, C callable, P part) represents a parallel execution of the following loop: for(autoi=first;i<last;i++){ callable(*i); } tf::Taskflow::for_each(B first, E last, C callable, P&& part) simultaneously applies the callable to the object obtained by dereferencing every iterator in the range [first, last). It is user's responsibility for ensuring the range is valid within the execution of the parallel-for task. Iterators must have the post-increment operator ++ defined. 
-std::vector<int>vec={1,2,3,4,5}; +std::vector<int>vec={1,2,3,4,5}; taskflow.for_each(vec.begin(),vec.end(),[](inti){ std::cout<<"parallelforonitem"<<i<<'\n'; }); -std::list<std::string>list={"hi","from","t","a","s","k","f","low"}; -taskflow.for_each(list.begin(),list.end(),[](conststd::string&str){ +std::list<std::string>list={"hi","from","t","a","s","k","f","low"}; +taskflow.for_each(list.begin(),list.end(),[](conststd::string&str){ std::cout<<"parallelforonitem"<<str<<'\n'; }); -Codestin Search App -Similar to tf::Taskflow::for_each_index, iterators of tf::Taskflow::for_each are templated to allow capturing range parameters by reference, such that one task can set up the range before another task performs the parallel-for algorithm. For example: -std::vector<int>vec; -std::vector<int>::iteratorfirst,last;; +Codestin Search AppSimilar to tf::Taskflow::for_each_index, iterators of tf::Taskflow::for_each are templated to allow capturing range parameters by reference, such that one task can set up the range before another task performs the parallel-for algorithm. For example: +std::vector<int>vec; +std::vector<int>::iteratorfirst,last;; tf::Taskinit=taskflow.emplace([&](){ vec.resize(1000); @@ -118,8 +128,8 @@ last=vec.end(); }); -tf::Taskpf=taskflow.for_each(std::ref(first),std::ref(last),[&](inti){ -std::cout<<"paralleliterationonitem"<<i<<'\n'; +tf::Taskpf=taskflow.for_each(std::ref(first),std::ref(last),[&](inti){ +std::cout<<"paralleliterationonitem"<<i<<'\n'; }); //wrong!mustusestd::ref,orfirstandlastarecapturedbycopy @@ -132,12 +142,12 @@ When init finishes, the parallel-for task pf will see first pointing to the beginning of vec and last pointing to the end of vec and performs parallel iterations over the 1000 items. The two tasks form an end-to-end task graph where the parameters of parallel-for are computed on the fly. -Codestin Search App -You can configure a partitioner for parallel-iteration tasks to run with different scheduling methods, such as guided partitioning, dynamic partitioning, and static partitioning. The following example creates two parallel-iteration tasks using two different partitioners, one with the static partitioning algorithm and another one with the guided partitioning algorithm: -std::vector<int>vec(1024,0); +Codestin Search AppYou can configure a partitioner for parallel-iteration tasks to run with different scheduling methods, such as guided partitioning, dynamic partitioning, and static partitioning. The following example creates two parallel-iteration tasks using two different partitioners, one with the static partitioning algorithm and another one with the guided partitioning algorithm: +std::vector<int>vec(1024,0); -tf::ExecutionPolicy<tf::StaticPartitioner>static_partitioner; -tf::ExecutionPolicy<tf::GuidedPartitioner>guided_partitioner; +//createtwopartitionerswithachunksizeof10 +tf::StaticPartitionerstatic_partitioner(10); +tf::GuidedPartitionerguided_partitioner(10); //createaparallel-iterationtaskwithstaticpartitioner taskflow.for_each( @@ -155,11 +165,11 @@ guided_partitioner ); -By default, parallel-iteration tasks use tf::DefaultPartitioner if no partitioner is specified. +By default, parallel-iteration tasks use tf::DefaultPartitioner if no partitioner is specified. 
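Pulling the pieces of this page together, a complete program that runs a parallel iteration with the chunk-size-10 static partitioner from the example above might look like the following sketch (assuming a Taskflow release with the partitioner API shown in this diff):

#include <taskflow/taskflow.hpp>
#include <taskflow/algorithm/for_each.hpp>
#include <vector>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  std::vector<int> vec(1024, 0);

  // static partitioning with a chunk size of 10
  tf::StaticPartitioner partitioner(10);
  taskflow.for_each(vec.begin(), vec.end(), [](int& i){ i = 1; }, partitioner);

  executor.run(taskflow).wait();
  return 0;
}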
- + diff --git a/docs/xml/ParallelReduction.xml b/docs/xml/ParallelReduction.xml index b16d11a22..dfe6efab7 100644 --- a/docs/xml/ParallelReduction.xml +++ b/docs/xml/ParallelReduction.xml @@ -1,5 +1,5 @@ - + ParallelReduction Codestin Search App @@ -7,44 +7,46 @@ Include the Header ParallelReduction_1ParallelReductionInclude - + Create a Parallel-Reduction Task ParallelReduction_1A2ParallelReduction - + Capture Iterators by Reference ParallelReduction_1ParallelReductionCaptureIteratorsByReference - + Create a Parallel-Transform-Reduction Task ParallelReduction_1A2ParallelTransformationReduction - + + + Create a Reduce-by-Index Task + ParallelReduction_1ParallelReductionCreateAReduceByIndexTask + Configure a Partitioner - ParallelReduction_1ParallelReductionCfigureAPartitioner - + ParallelReduction_1ParallelReductionConfigureAPartitioner + Taskflow provides template function that constructs a task to perform parallel reduction over a range of items. -Codestin Search App -You need to include the header file, taskflow/algorithm/reduce.hpp, for creating a parallel-reduction task. +Codestin Search AppYou need to include the header file, taskflow/algorithm/reduce.hpp, for creating a parallel-reduction task. #include<taskflow/algorithm/reduce.hpp> -Codestin Search App -The reduction task created by tf::Taskflow::reduce(B first, E last, T& result, O bop, P&& part) performs parallel reduction over a range of elements specified by [first, last) using the binary operator bop and stores the reduced result in result. It represents the parallel execution of the following reduction loop: +Codestin Search AppThe reduction task created by tf::Taskflow::reduce(B first, E last, T& result, O bop, P part) performs parallel reduction over a range of elements specified by [first, last) using the binary operator bop and stores the reduced result in result. It represents the parallel execution of the following reduction loop: for(autoitr=first;itr<last;itr++){ result=bop(result,*itr); } At runtime, the reduction task spawns a subflow to perform parallel reduction. The reduced result is stored in result that will be captured by reference in the reduction task. It is your responsibility to ensure result remains alive during the parallel execution. intsum=100; -std::vector<int>vec={1,2,3,4,5,6,7,8,9,10}; +std::vector<int>vec={1,2,3,4,5,6,7,8,9,10}; tf::Tasktask=taskflow.reduce(vec.begin(),vec.end(),sum, [](intl,intr){returnl+r;}//binaryreduceroperator @@ -56,11 +58,10 @@ The order in which the binary operator is applied to pairs of elements is unspecified. In other words, the elements of the range may be grouped and rearranged in arbitrary order. The result and the argument types of the binary operator must be consistent with the input data type. -Codestin Search App -You can pass iterators by reference using std::ref to marshal parameter update between dependent tasks. This is especially useful when the range is unknown at the time of creating a parallel-reduction task, but needs initialization from another task. +Codestin Search AppYou can pass iterators by reference using std::ref to marshal parameter update between dependent tasks. This is especially useful when the range is unknown at the time of creating a parallel-reduction task, but needs initialization from another task. 
intsum=100; -std::vector<int>vec; -std::vector<int>::iteratorfirst,last; +std::vector<int>vec; +std::vector<int>::iteratorfirst,last; tf::Taskinit=taskflow.emplace([&](){ vec={1,2,3,4,5,6,7,8,9,10}; @@ -68,7 +69,7 @@ last=vec.end(); }); -tf::Tasktask=taskflow.reduce(std::ref(first),std::ref(last),sum, +tf::Tasktask=taskflow.reduce(std::ref(first),std::ref(last),sum, [](intl,intr){returnl+r;}//binaryreduceroperator ); @@ -86,14 +87,13 @@ In the above example, when init finishes, vec has been initialized to 10 elements with first and last pointing to the data range of vec. The reduction task will then work on this initialized range as a result of passing iterators by reference. -Codestin Search App -It is common to transform each element into a new data type and then perform reduction on the transformed elements. Taskflow provides a method, tf::Taskflow::transform_reduce(B first, E last, T& result, BOP bop, UOP uop, P&& part), that applies uop to transform each element in the specified range and then perform parallel reduction over result and transformed elements. It represents the parallel execution of the following reduction loop: +Codestin Search AppIt is common to transform each element into a new data type and then perform reduction on the transformed elements. Taskflow provides a method, tf::Taskflow::transform_reduce(B first, E last, T& result, BOP bop, UOP uop, P part), that applies uop to transform each element in the specified range and then perform parallel reduction over result and transformed elements. It represents the parallel execution of the following reduction loop: for(autoitr=first;itr<last;itr++){ result=bop(result,uop(*itr)); } The example below transforms each digit in a string to an integer number and then sums up all integers in parallel. -std::stringstr="12345678"; +std::stringstr="12345678"; intsum{0}; tf::Tasktask=taskflow.transform_reduce(str.begin(),str.end(),sum, [](inta,intb){//binaryreductionoperator @@ -108,14 +108,42 @@ The order in which we apply the binary operator on the transformed elements is unspecified. It is possible that the binary operator will take r-value in both arguments, for example, bop(uop(*itr1), uop(*itr2)), due to the transformed temporaries. When data passing is expensive, you may define the result type T to be move-constructible. - -Codestin Search App -You can configure a partitioner for parallel-reduction tasks to run with different scheduling methods, such as guided partitioning, dynamic partitioning, and static partitioning. The following example creates two parallel-reduction tasks using two different partitioners, one with the static partitioning algorithm and another one with the guided partitioning algorithm: + +Codestin Search AppUnlike tf::Taskflow::reduce, the tf::Taskflow::reduce_by_index function lets you perform a parallel reduction over an index range, but with more control over how each part of the range is processed. This is useful when you need to customize the reduction process for each subrange or you want to incorporate optimizations like SIMD. 
The example below performs a sum-reduction over all elements in data with res: + +std::vector<double>data(100000); +doubleres=1.0; +taskflow.reduce_by_index( +//indexrange +tf::IndexRange<size_t>(0,N,1), +//finalresult +res, +//localreducer +[&](tf::IndexRange<size_t>subrange,std::optional<double>running_total){ +doubleresidual=running_total?*running_total:0.0; +for(size_ti=subrange.begin();i<subrange.end();i+=subrange.step_size()){ +data[i]=1.0;//weinitializethedatahere +residual+=data[i]; +} +printf("partialsum=%lf\n",residual); +returnresidual; +}, +//globalreducer +std::plus<double>() +); + +executor.run(taskflow).wait(); +assert(res==100001); + +The local reducer lop computes a partial sum for each subrange, and the global reducer gop combines the partial results into the final result and store it in res, whose initial value (i.e., 1.0 here) also participates in the reduction process. The second argument of the local reducer is a std::optional type, which indicates the current partial sum until this subrange. Apparently, the first subrange does not have any partial sum since there is no running total from previous subranges (i.e., running_total is std::nullopt). + + +Codestin Search AppYou can configure a partitioner for parallel-reduction tasks to run with different scheduling methods, such as guided partitioning, dynamic partitioning, and static partitioning. The following example creates two parallel-reduction tasks using two different partitioners, one with the static partitioning algorithm and another one with the guided partitioning algorithm: tf::StaticPartitionerstatic_partitioner; tf::GuidedPartitionerguided_partitioner; intsum1=100,sum2=100; -std::vector<int>vec={1,2,3,4,5,6,7,8,9,10}; +std::vector<int>vec={1,2,3,4,5,6,7,8,9,10}; //createaparallel-reductiontaskwithstaticpartitioner taskflow.reduce(vec.begin(),vec.end(),sum1, @@ -129,11 +157,11 @@ guided_partitioner ); -By default, parallel-reduction tasks use tf::DefaultPartitioner if no partitioner is specified. +By default, parallel-reduction tasks use tf::DefaultPartitioner if no partitioner is specified. - + diff --git a/docs/xml/ParallelScan.xml b/docs/xml/ParallelScan.xml index bb6616533..bc0079fe2 100644 --- a/docs/xml/ParallelScan.xml +++ b/docs/xml/ParallelScan.xml @@ -1,5 +1,5 @@ - + ParallelScan Codestin Search App @@ -7,91 +7,87 @@ Include the Header ParallelScan_1ParallelScanInclude - + What is a Scan Operation? ParallelScan_1WhatIsAScanOperation - + Create a Parallel Inclusive Scan Task ParallelScan_1CreateAParallelInclusiveScanTask - + Create a Parallel Transform-Inclusive Scan Task ParallelScan_1CreateAParallelTransformInclusiveScanTask - + Create a Parallel Exclusive Scan Task ParallelScan_1CreateAParallelExclusiveScanTask - + Create a Parallel Transform-Exclusive Scan Task ParallelScan_1CreateAParallelTransformExclusiveScanTask - + Taskflow provide template methods that construct tasks to perform parallel scan over a range of items. -Codestin Search App -You need to include the header file, taskflow/algorithm/scan.hpp, for creating a parallel-scan task. +Codestin Search AppYou need to include the header file, taskflow/algorithm/scan.hpp, for creating a parallel-scan task. #include<taskflow/algorithm/scan.hpp> -Codestin Search App -A parallel scan task performs the cumulative sum, also known as prefix sum or scan, of the input range and writes the result to the output range. Each element of the output range contains the running total of all earlier elements using the given binary operator for summation. 
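Before the Taskflow-specific APIs, the two scan flavors can be illustrated with the C++17 standard-library scans, which produce exactly the results discussed in the sections below:

#include <numeric>
#include <vector>

std::vector<int> in{1, 2, 3, 4, 5}, out(5);

// inclusive scan: out[i] = in[0] + ... + in[i]
std::inclusive_scan(in.begin(), in.end(), out.begin());
// out = {1, 3, 6, 10, 15}

// exclusive scan with init = -1: out[i] = -1 + in[0] + ... + in[i-1]
std::exclusive_scan(in.begin(), in.end(), out.begin(), -1);
// out = {-1, 0, 2, 5, 9}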
+Codestin Search AppA parallel scan task performs the cumulative sum, also known as prefix sum or scan, of the input range and writes the result to the output range. Each element of the output range contains the running total of all earlier elements using the given binary operator for summation. -Codestin Search App -tf::Taskflow::inclusive_scan(B first, E last, D d_first, BOP bop) generates an inclusive scan, meaning that the N-th element of the output range is the sum of the first N input elements, so the N-th input element is included. For example, the code below performs an inclusive scan over five elements: -std::vector<int>input={1,2,3,4,5}; -std::vector<int>output(input.size()) +Codestin Search Apptf::Taskflow::inclusive_scan(B first, E last, D d_first, BOP bop) generates an inclusive scan, meaning that the N-th element of the output range is the sum of the first N input elements, so the N-th input element is included. For example, the code below performs an inclusive scan over five elements: +std::vector<int>input={1,2,3,4,5}; +std::vector<int>output(input.size()) taskflow.inclusive_scan( -input.begin(),input.end(),output.begin(),std::plus<int>{} +input.begin(),input.end(),output.begin(),std::plus<int>{} ); executor.run(taskflow).wait(); //outputis{1,3,6,10,15} The output range may be the same as the input range, in which the scan operation is in-place with results written to the input range. For example, the code below performs an in-place inclusive scan over five elements: -std::vector<int>input={1,2,3,4,5}; +std::vector<int>input={1,2,3,4,5}; taskflow.inclusive_scan( -input.begin(),input.end(),input.begin(),std::plus<int>{} +input.begin(),input.end(),input.begin(),std::plus<int>{} ); executor.run(taskflow).wait(); //inputis{1,3,6,10,15} -Similar to tf::Taskflow::inclusive_scan(B first, E last, D d_first, BOP bop), tf::Taskflow::inclusive_scan(B first, E last, D d_first, BOP bop, T init) performs an inclusive scan but with an additional initial value init. For example, the code below performs an inclusive scan over five elements plus an initial value: -std::vector<int>input={1,2,3,4,5}; -std::vector<int>output(input.size()); +Similar to tf::Taskflow::inclusive_scan(B first, E last, D d_first, BOP bop), tf::Taskflow::inclusive_scan(B first, E last, D d_first, BOP bop, T init) performs an inclusive scan but with an additional initial value init. For example, the code below performs an inclusive scan over five elements plus an initial value: +std::vector<int>input={1,2,3,4,5}; +std::vector<int>output(input.size()); //performsinclusivescanwithaninitialvalue taskflow.inclusive_scan( -input.begin(),input.end(),output.begin(),std::plus<int>{},-1 +input.begin(),input.end(),output.begin(),std::plus<int>{},-1 ); executor.run(taskflow).wait(); //outputis{0,2,5,9,14} -Codestin Search App -You can transform elements in the input range before running inclusive scan using tf::Taskflow::transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop) and tf::Taskflow::transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init). For example, the code below performs an inclusive scan over five transformed elements: -std::vector<int>input={1,2,3,4,5}; -std::vector<int>output(input.size()); +Codestin Search AppYou can transform elements in the input range before running inclusive scan using tf::Taskflow::transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop) and tf::Taskflow::transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init). 
For example, the code below performs an inclusive scan over five transformed elements: +std::vector<int>input={1,2,3,4,5}; +std::vector<int>output(input.size()); taskflow.transform_inclusive_scan( -input.begin(),input.end(),output.begin(),std::plus<int>{}, +input.begin(),input.end(),output.begin(),std::plus<int>{}, [](intitem){return-item;} ); executor.run(taskflow).wait(); 
//outputis{-1,-3,-6,-10,-15} 
-You can also associate the transform-inclusive scan with an initial value using tf::Taskflow::transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init). Only elements in the input range will be transformed using uop, i.e., the initial value init does not participate in uop. -std::vector<int>input={1,2,3,4,5}; -std::vector<int>output(input.size()); +You can also associate the transform-inclusive scan with an initial value using tf::Taskflow::transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init). Only elements in the input range will be transformed using uop, i.e., the initial value init does not participate in uop. +std::vector<int>input={1,2,3,4,5}; +std::vector<int>output(input.size()); taskflow.transform_inclusive_scan( -input.begin(),input.end(),output.begin(),std::plus<int>{}, +input.begin(),input.end(),output.begin(),std::plus<int>{}, [](intitem){return-item;}, -1 ); @@ -100,33 +96,31 @@ 

-Codestin Search App 
-tf::Taskflow::exclusive_scan(B first, E last, D d_first, T init, BOP bop) generates an exclusive scan with the given initial value. The N-th element of the output range is the sum of the first N-1 input elements, so the N-th input element is included. For example, the code below performs an exclusive scan over five elements with an initial value -1: -std::vector<int>input={1,2,3,4,5}; -std::vector<int>output(input.size()) +Codestin Search Apptf::Taskflow::exclusive_scan(B first, E last, D d_first, T init, BOP bop) generates an exclusive scan with the given initial value. The N-th element of the output range is the sum of the first N-1 input elements, so the N-th input element is not included. For example, the code below performs an exclusive scan over five elements with an initial value -1: +std::vector<int>input={1,2,3,4,5}; +std::vector<int>output(input.size()); taskflow.exclusive_scan( -input.begin(),input.end(),output.begin(),-1,std::plus<int>{} +input.begin(),input.end(),output.begin(),-1,std::plus<int>{} ); executor.run(taskflow).wait(); 
//outputis{-1,0,2,5,9} 
The output range may be the same as the input range, in which the scan operation is in-place with results written to the input range. For example, the code below performs an in-place exclusive scan over five elements with an initial -1: -std::vector<int>input={1,2,3,4,5}; -std::vector<int>output(input.size()); +std::vector<int>input={1,2,3,4,5}; +std::vector<int>output(input.size()); taskflow.exclusive_scan( -input.begin(),input.end(),output.begin(),-1,std::plus<int>{} +input.begin(),input.end(),output.begin(),-1,std::plus<int>{} ); executor.run(taskflow).wait(); 
//outputis{-1,0,2,5,9} 

-Codestin Search App 
-You can transform elements in the input range before running exclusive scan using tf::Taskflow::transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop). 
-Codestin Search App
-You can transform elements in the input range before running exclusive scan using tf::Taskflow::transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop). For example, the code below performs an exclusive scan over five transformed elements:
-std::vector<int>input={1,2,3,4,5};
-std::vector<int>output(input.size());
+Codestin Search AppYou can transform elements in the input range before running exclusive scan using tf::Taskflow::transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop). For example, the code below performs an exclusive scan over five transformed elements:
+std::vector<int>input={1,2,3,4,5};
+std::vector<int>output(input.size());
taskflow.transform_exclusive_scan(
-input.begin(),input.end(),input.begin(),-1,std::plus<int>{},
+input.begin(),input.end(),output.begin(),-1,std::plus<int>{},
[](intitem){return-item;}
);
executor.run(taskflow).wait();
@@ -134,6 +128,6 @@
-
+
diff --git a/docs/xml/ParallelSort.xml b/docs/xml/ParallelSort.xml
index 54173e75d..de8e55324 100644
--- a/docs/xml/ParallelSort.xml
+++ b/docs/xml/ParallelSort.xml
@@ -1,5 +1,5 @@
-
+
ParallelSort
Codestin Search App

Include the Header
ParallelSort_1ParallelSortInclude
-
+
Sort a Range of Items
ParallelSort_1SortARangeOfItems
-
+
Sort a Range of Items with a Custom Comparator
ParallelSort_1SortARangeOfItemsWithACustomComparator
-
+
Enable Stateful Data Passing
ParallelSort_1ParallelSortEnableStatefulDataPassing
-
+
Taskflow provides template functions for constructing tasks to sort ranges of items in parallel.
-Codestin Search App
-You need to include the header file, taskflow/algorithm/sort.hpp, for creating a parallel-sort task.
+Codestin Search AppYou need to include the header file, taskflow/algorithm/sort.hpp, for creating a parallel-sort task.
#include<taskflow/algorithm/sort.hpp>
-Codestin Search App
-The task created by tf::Taskflow::sort(B first, E last) performs parallel sort to rank a range of elements specified by [first, last) in increasing order. The given iterators must be random-accessible. The following example creates a task to sort a data vector in increasing order.
+Codestin Search AppThe task created by tf::Taskflow::sort(B first, E last) performs a parallel sort to rank a range of elements specified by [first, last) in increasing order. The given iterators must be random-accessible. The following example creates a task to sort a data vector in increasing order.
tf::Taskflowtaskflow;
tf::Executorexecutor;
-std::vector<int>data={1,4,9,2,3,11,-8};
+std::vector<int>data={1,4,9,2,3,11,-8};
-tf::Tasksort=taskflow.sort(data.begin(),data.end());
+tf::Tasksort=taskflow.sort(data.begin(),data.end());
executor.run(taskflow).wait();
-assert(std::is_sorted(data.begin(),data.end()));
+assert(std::is_sorted(data.begin(),data.end()));
-Elements are compared using the operator <.
+Elements are compared using the operator <.
-Codestin Search App
-tf::Taskflow::sort(B first, E last, C cmp) is an overload of parallel sort that allows users to specify a custom comparator. The following example sorts a data vector in decreasing order.
+Codestin Search Apptf::Taskflow::sort(B first, E last, C cmp) is an overload of parallel sort that allows users to specify a custom comparator. The following example sorts a data vector in decreasing order.
tf::Taskflowtaskflow;
tf::Executorexecutor;
-std::vector<int>data={1,4,9,2,3,11,-8};
+std::vector<int>data={1,4,9,2,3,11,-8};
-tf::Tasksort=taskflow.sort(data.begin(),data.end(),
+tf::Tasksort=taskflow.sort(data.begin(),data.end(),
[](inta,intb){returna>b;}
);
executor.run(taskflow).wait();
-assert(std::is_sorted(data.begin(),data.end(),std::greater<int>{}));
+assert(std::is_sorted(data.begin(),data.end(),std::greater<int>{}));
-tf::Taskflow::sort is not stable. That is, two or more objects with equal keys may not appear in the same order before sorting.
+tf::Taskflow::sort is not stable. That is, two or more objects with equal keys may not appear in the same order after sorting as they did before sorting.
-Codestin Search App
-The iterators taken by tf::Taskflow::sort are templated. You can use std::reference_wrapper to enable stateful data passing between the sort task and others. The following example creates a task init to initialize the data vector and a task sort to sort the data in parallel after init finishes.
+Codestin Search AppThe iterators taken by tf::Taskflow::sort are templated. You can use std::reference_wrapper to enable stateful data passing between the sort task and others. The following example creates a task init to initialize the data vector and a task sort to sort the data in parallel after init finishes.
tf::Taskflowtaskflow;
tf::Executorexecutor;
-std::vector<int>data;
-std::vector<int>::iteratorfirst,last;
+std::vector<int>data;
+std::vector<int>::iteratorfirst,last;
tf::Taskinit=taskflow.emplace([&](){
data={1,4,9,2,3,11,-8};
first=data.begin();
last=data.end();
});
-tf::Tasksort=taskflow.sort(
-std::ref(first),std::ref(last),[](intl,intr){returnl<r;}
+tf::Tasksort=taskflow.sort(
+std::ref(first),std::ref(last),[](intl,intr){returnl<r;}
);
init.precede(sort);
executor.run(taskflow).wait();
-assert(std::is_sorted(data.begin(),data.end()));
+assert(std::is_sorted(data.begin(),data.end()));
-
+
diff --git a/docs/xml/ParallelTransforms.xml b/docs/xml/ParallelTransforms.xml
index 11c0491ab..660660f5e 100644
--- a/docs/xml/ParallelTransforms.xml
+++ b/docs/xml/ParallelTransforms.xml
@@ -1,5 +1,5 @@
-
+
ParallelTransforms
Codestin Search App

Include the Header
ParallelTransforms_1ParallelTransformsInclude
-
+
Create a Unary Parallel-Transform Task
ParallelTransforms_1ParallelTransformsOverARange
-
+
Capture Iterators by Reference
ParallelTransforms_1ParallelTransformsCaptureIteratorsByReference
-
+
Create a Binary Parallel-Transform Task
ParallelTransforms_1ParallelBinaryTransformsOverARange
-
+
Configure a Partitioner
ParallelTransforms_1ParallelTransformsCfigureAPartitioner
-
+
Taskflow provides template functions for constructing tasks to perform parallel transforms over ranges of items.
-Codestin Search App
-You need to include the header file, taskflow/algorithm/transform.hpp, for creating a parallel-transform task.
+Codestin Search AppYou need to include the header file, taskflow/algorithm/transform.hpp, for creating a parallel-transform task.
#include<taskflow/algorithm/transform.hpp>
-Codestin Search App
-Parallel-transform transforms a range of items, possibly with a different type for the transformed data, and stores the result in another range. The task created by tf::Taskflow::transform(B first1, E last1, O d_first, C c, P&& part) is equivalent to a parallel execution of the following loop:
+Codestin Search AppParallel-transform transforms a range of items, possibly with a different type for the transformed data, and stores the result in another range.
The task created by tf::Taskflow::transform(B first1, E last1, O d_first, C c, P part) is equivalent to a parallel execution of the following loop:
while(first1!=last1){
*d_first++=c(*first1++);
}
tf::Taskflow::transform simultaneously applies the callable c to the object obtained by dereferencing every iterator in the range [first1, last1) and stores the result in another range beginning at d_first. It is the user's responsibility to ensure the range is valid within the execution of the parallel-transform task.
-std::vector<int>src={1,2,3,4,5};
-std::vector<int>tgt(src.size());
+std::vector<int>src={1,2,3,4,5};
+std::vector<int>tgt(src.size());
taskflow.transform(src.begin(),src.end(),tgt.begin(),[](inti){
std::cout<<"transformingitem"<<i<<"to"<<i+1<<'\n';
returni+1;
@@ -52,10 +50,9 @@
-Codestin Search App
-You can pass iterators by reference using std::ref to marshal parameter update between dependent tasks. This is especially useful when the range is unknown at the time of creating a parallel-transform task, but needs initialization from another task.
-std::vector<int>src,tgt;
-std::vector<int>::iteratorfirst,last,d_first;
+Codestin Search AppYou can pass iterators by reference using std::ref to marshal parameter updates between dependent tasks. This is especially useful when the range is unknown at the time of creating a parallel-transform task, but needs initialization from another task.
+std::vector<int>src,tgt;
+std::vector<int>::iteratorfirst,last,d_first;
tf::Taskinit=taskflow.emplace([&](){
src.resize(1000);
@@ -65,10 +62,10 @@
d_first=tgt.begin();
});
-tf::Tasktransform=taskflow.for_each(
-std::ref(first),std::ref(last),std::ref(d_first),
+tf::Tasktransform=taskflow.transform(
+std::ref(first),std::ref(last),std::ref(d_first),
[&](inti){
-std::cout<<"transformingitem"<<i<<"to"<<i+1<<'\n';
+std::cout<<"transformingitem"<<i<<"to"<<i+1<<'\n';
returni+1;
}
);
@@ -78,16 +75,15 @@
When init finishes, the parallel-transform task transform will see first pointing to the beginning of src and last pointing to the end of src. Then, it simultaneously transforms these 1000 items by adding one to each element and stores the result in another range starting at d_first.
-Codestin Search App
-You can use the overload, tf::Taskflow::transform(B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part), to perform parallel transforms on two source ranges pointed by first1 and first2 using the binary operator c and store the result in another range pointed by d_first.
+Codestin Search AppYou can use the overload, tf::Taskflow::transform(B1 first1, E1 last1, B2 first2, O d_first, C c, P part), to perform parallel transforms on two source ranges pointed to by first1 and first2 using the binary operator c and store the result in another range pointed to by d_first.
This method is equivalent to the parallel execution of the following loop: while(first1!=last1){ *d_first++=c(*first1++,*first2++); } The following example creates a parallel-transform task that adds two ranges of elements one by one and stores the result in a target range: -std::vector<int>src1={1,2,3,4,5}; -std::vector<int>src2={5,4,3,2,1}; -std::vector<int>tgt(src1.size()); +std::vector<int>src1={1,2,3,4,5}; +std::vector<int>src2={5,4,3,2,1}; +std::vector<int>tgt(src1.size()); taskflow.transform( src1.begin(),src1.end(),src2.begin(),tgt.begin(), [](inti,intj){ @@ -97,15 +93,14 @@ -Codestin Search App -You can configure a partitioner for parallel-transform tasks to run with different scheduling methods, such as guided partitioning, dynamic partitioning, and static partitioning. The following example creates two parallel-transform tasks using two different partitioners, one with the static partitioning algorithm and another one with the guided partitioning algorithm: +Codestin Search AppYou can configure a partitioner for parallel-transform tasks to run with different scheduling methods, such as guided partitioning, dynamic partitioning, and static partitioning. The following example creates two parallel-transform tasks using two different partitioners, one with the static partitioning algorithm and another one with the guided partitioning algorithm: tf::StaticPartitionerstatic_partitioner; tf::GuidedPartitionerguided_partitioner; -std::vector<int>src1={1,2,3,4,5}; -std::vector<int>src2={5,4,3,2,1}; -std::vector<int>tgt1(src1.size()); -std::vector<int>tgt2(src2.size()); +std::vector<int>src1={1,2,3,4,5}; +std::vector<int>src2={5,4,3,2,1}; +std::vector<int>tgt1(src1.size()); +std::vector<int>tgt2(src2.size()); //createaparallel-transformtaskwithstaticexecutionpartitioner taskflow.transform( @@ -125,11 +120,11 @@ guided_partitioner ); -By default, parallel-transform tasks use tf::DefaultPartitioner if no partitioner is specified. +By default, parallel-transform tasks use tf::DefaultPartitioner if no partitioner is specified. - + diff --git a/docs/xml/ParallelTransformsCUDA.xml b/docs/xml/ParallelTransformsCUDA.xml deleted file mode 100644 index 7ab58415a..000000000 --- a/docs/xml/ParallelTransformsCUDA.xml +++ /dev/null @@ -1,70 +0,0 @@ - - - - ParallelTransformsCUDA - Codestin Search App - - - Include the Header - ParallelTransformsCUDA_1CUDAParallelTransformsIncludeTheHeader - - - Transform a Range of Items - ParallelTransformsCUDA_1cudaFlowTransformARangeOfItems - - - Transform Two Ranges of Items - ParallelTransformsCUDA_1cudaFlowTransformTwoRangesOfItems - - - Miscellaneous Items - ParallelTransformsCUDA_1ParallelTransformCUDAMiscellaneousItems - - - - - -tf::cudaFlow provides template methods for transforming ranges of items to different outputs. - -Codestin Search App -You need to include the header file, taskflow/cuda/algorithm/transform.hpp, for creating a parallel-transform task. -#include<taskflow/cuda/algorithm/transform.hpp> - - - -Codestin Search App -Iterator-based parallel-transform applies the given transform function to a range of items and store the result in another range specified by two iterators, first and last. The task created by tf::cudaFlow::transform(I first, I last, O output, C op) represents a parallel execution for the following loop: -while(first!=last){ -*output++=op(*first++); -} - -The following example creates a transform kernel that transforms an input range of N items to an output range by multiplying each item by 10. 
-//output[i]=input[i]*10 -cudaflow.transform( -input,input+N,output,[]__device__(intx){returnx*10;} -); - -Each iteration is independent of each other and is assigned one kernel thread to run the callable. Since the callable runs on GPU, it must be declared with a __device__ specifier. - - -Codestin Search App -You can transform two ranges of items to an output range through a binary operator. The task created by tf::cudaFlow::transform(I1 first1, I1 last1, I2 first2, O output, C op) represents a parallel execution for the following loop: -while(first1!=last1){ -*output++=op(*first1++,*first2++); -} - -The following example creates a transform kernel that transforms two input ranges of N items to an output range by summing each pair of items in the input ranges. -//output[i]=input1[i]+inpu2[i] -cudaflow.transform( -input1,input1+N,input2,output,[]__device__(inta,intb){returna+b;} -); - - - -Codestin Search App -The parallel-transform algorithms are also available in tf::cudaFlowCapturer. - - - - - diff --git a/docs/xml/PartitioningAlgorithm.xml b/docs/xml/PartitioningAlgorithm.xml index d12573d8a..e1ef29bb0 100644 --- a/docs/xml/PartitioningAlgorithm.xml +++ b/docs/xml/PartitioningAlgorithm.xml @@ -1,5 +1,5 @@ - + PartitioningAlgorithm Codestin Search App @@ -7,32 +7,31 @@ Define a Partitioner for Parallel Algorithms PartitioningAlgorithm_1DefineAPartitionerForParallelAlgorithms - + Define a Static Partitioner PartitioningAlgorithm_1DefineAStaticPartitioner - + Define a Dynamic Partitioner PartitioningAlgorithm_1DefineADynamicPartitioner - + Define a Guided Partitioner PartitioningAlgorithm_1DefineAGuidedPartitioner - + Define a Closure Wrapper for a Partitioner PartitioningAlgorithm_1DefineAClosureWrapperForAPartitioner - + A partitioning algorithm allows applications to optimize parallel algorithms using different scheduling methods, such as static partitioning, dynamic partitioning, and guided partitioning. -Codestin Search App -A partitioner defines how to partition and distribute iterations to different workers when running parallel algorithms in Taskflow, such as tf::Taskflow::for_each and tf::Taskflow::transform. The following example shows how to create parallel-iteration tasks with different execution policies: -std::vector<int>data={1,2,3,4,5,6,7,8,9,10} +Codestin Search AppA partitioner defines how to partition and distribute iterations to different workers when running parallel algorithms in Taskflow, such as tf::Taskflow::for_each and tf::Taskflow::transform. The following example shows how to create parallel-iteration tasks with different execution policies: +std::vector<int>data={1,2,3,4,5,6,7,8,9,10} //createdifferentpartitioners tf::GuidedPartitionerguided_partitioner; @@ -50,37 +49,33 @@ Depending on applications, partitioning algorithms can impact the performance a lot. For example, if a parallel-iteration workload contains a regular work unit per iteration, tf::StaticPartitioner may deliver the best performance. On the other hand, if the work unit per iteration is irregular and unbalanced, tf::GuidedPartitioner or tf::DynamicPartitioner can outperform tf::StaticPartitioner. -By default, all parallel algorithms in Taskflow use tf::DefaultPartitioner, which is based on guided scheduling via tf::GuidedPartitioner. +By default, all parallel algorithms in Taskflow use tf::DefaultPartitioner, which is based on guided scheduling via tf::GuidedPartitioner. 
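To make the partitioner discussion above concrete, here is a minimal, self-contained sketch that runs the same parallel iteration twice under two different partitioners. It is illustrative only; it assumes tf::Taskflow::for_each accepts a partitioner as its trailing argument (as the transform examples elsewhere in this diff do) and that the algorithm is declared in taskflow/algorithm/for_each.hpp:

#include <taskflow/taskflow.hpp>
#include <taskflow/algorithm/for_each.hpp> // assumed header, following the taskflow/algorithm/*.hpp pattern
#include <vector>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  std::vector<int> data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

  tf::StaticPartitioner static_partitioner;  // equal-size chunks, assigned to workers in order
  tf::GuidedPartitioner guided_partitioner;  // adaptive chunk sizes for irregular workloads

  // one pass with static partitioning
  taskflow.for_each(data.begin(), data.end(), [](int& i){ i *= 2; }, static_partitioner);
  executor.run(taskflow).wait();

  // rebuild the graph and run another pass with guided partitioning
  taskflow.clear();
  taskflow.for_each(data.begin(), data.end(), [](int& i){ i *= 2; }, guided_partitioner);
  executor.run(taskflow).wait();

  return 0;
}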
-Codestin Search App
-Static partitioner splits iterations into iter_size/chunk_size chunks and distribute chunks to workers in order. If no chunk size is given (chunk_size is 0), Taskflow will partition iterations into chunks that are approximately equal in size. The following code creates a static partitioner with chunk size equal to 100:
+Codestin Search AppStatic partitioner splits iterations into iter_size/chunk_size chunks and distributes chunks to workers in order. If no chunk size is given (chunk_size is 0), Taskflow will partition iterations into chunks that are approximately equal in size. The following code creates a static partitioner with chunk size equal to 100:
tf::StaticPartitionerstatic_partitioner(100);
-Codestin Search App
-Dynamic partitioner splits iterations into iter_size/chunk_size chunks and distribute chunks to workers without any specific order. If no chunk size is given (chunk_size is 0), Taskflow will use 1 for the minimum size of a partition. The following code creates a dynamic partitioner with chunk size equal to 2:
+Codestin Search AppDynamic partitioner splits iterations into iter_size/chunk_size chunks and distributes chunks to workers without any specific order. If no chunk size is given (chunk_size is 0), Taskflow will use 1 for the minimum size of a partition. The following code creates a dynamic partitioner with chunk size equal to 2:
tf::DynamicPartitionerdynamic_partitioner(2);
-Codestin Search App
-Guided partitioner dynamically decides the chunk size. The size of a chunk is proportional to the number of unassigned iterations divided by the number of the threads, and the size will gradually decrease to the specified chunk size (default 1). The last chunk may be smaller than the specified chunk size. If no chunk size is given (chunk_size is 0), Taskflow will use 1 for the minimum size of a partition. The following code creates a guided partitioner with chunk size equal to 10:
+Codestin Search AppGuided partitioner dynamically decides the chunk size. The size of a chunk is proportional to the number of unassigned iterations divided by the number of threads, and the size will gradually decrease to the specified chunk size (default 1). The last chunk may be smaller than the specified chunk size. If no chunk size is given (chunk_size is 0), Taskflow will use 1 for the minimum size of a partition. The following code creates a guided partitioner with chunk size equal to 10:
tf::GuidedPartitionerguided_partitioner(10);
In most situations, guided partitioner can achieve decent performance due to adaptive parallelism, especially for workloads with an irregular and unbalanced work unit per iteration. As a result, guided partitioner is used as the default partitioner for our parallel algorithms.
-Codestin Search App
-In addition to partition size, applications can specify a closure wrapper for a partitioner. A closure wrapper allows the application to wrap a partitioned task, i.e., a closure, with a custom function object that performs additional tasks.
For example: +std::atomic<int>count=0; tf::Taskflowtaskflow; taskflow.for_each_index(0,100,1, [](){ -printf("%d\n",i); +printf("%d\n",i); }, tf::StaticPartitioner(0,[](auto&&closure){ //dosomethingbeforeinvokingthepartitionedtask @@ -95,14 +90,14 @@ ); executor.run(taskflow).wait(); -Each partitioner uses a default closure wrapper (tf::DefaultClosureWrapper) that does nothing but simply invokes the given closure to perform the ordinary partitioned task. -structDefaultClosureWrapper{ +Each partitioner uses a default closure wrapper (tf::DefaultClosureWrapper) that does nothing but simply invokes the given closure to perform the ordinary partitioned task. +structDefaultClosureWrapper{ template<typenameC> -voidoperator()(C&&closure)const{std::forward<C>(closure)();} +voidoperator()(C&&closure)const{std::forward<C>(closure)();} }; - + diff --git a/docs/xml/PrioritizedTasking.xml b/docs/xml/PrioritizedTasking.xml deleted file mode 100644 index 469485aa3..000000000 --- a/docs/xml/PrioritizedTasking.xml +++ /dev/null @@ -1,60 +0,0 @@ - - - - PrioritizedTasking - Codestin Search App - - - Assign a Priority to a Task - PrioritizedTasking_1AssignAPriorityToATask - - - - - -This chapter demonstrates how to assigns a task a priority to hint the scheduler about one task of a higher priority should start earlier than another task of a lower priority. Task priorities are useful in many cases. For instance, we may prioritize some tasks over others to improve responsiveness or data locality of parallel tasks. - -Codestin Search App -Taskflow supports three different priority levels, tf::TaskPriority::HIGH, tf::TaskPriority::NORMAL, and tf::TaskPriority::LOW, as defined in tf::TaskPriority. When there are parallel tasks (i.e., no dependencies), Taskflow will try to execute tasks of higher priorities before tasks of lower priorities. By default, all tasks have the highest priorities (tf::TaskPriority::HIGH) unless otherwise assigned. -tf::Executorexecutor(1); -tf::Taskflowtaskflow; - -intcounter=0; - -auto[A,B,C,D,E]=taskflow.emplace( -[](){}, -[&](){ -std::cout<<"TaskB:"<<counter++<<'\n';//0 -}, -[&](){ -std::cout<<"TaskC:"<<counter++<<'\n';//2 -}, -[&](){ -std::cout<<"TaskD:"<<counter++<<'\n';//1 -}, -[](){} -); - -A.precede(B,C,D); -E.succeed(B,C,D); - -B.priority(tf::TaskPriority::HIGH); -C.priority(tf::TaskPriority::LOW); -D.priority(tf::TaskPriority::NORMAL); - -executor.run(taskflow).wait(); - -In the above code, we have a task graph of five tasks, A, B, C, D, and E, in which B, C, and D can run in simultaneously when A finishes. Since we only uses one worker thread in the executor, we can deterministically run B first, then D, and C in order of their priority values. The output of the above code is as follows: -TaskB:0 -TaskD:1 -TaskC:2 - -Task priorities are just hints to Taskflow's work-stealing scheduler about which task should run before another. Due to the randomness nature of work stealing, there is no guarantee that the scheduler will always follow these hints to run tasks when multiple workers exist. -Currently, Taskflow does not have any high-level abstraction for assigning priorities to threads but tasks. 
- - - - - -
diff --git a/docs/xml/Profiler.xml b/docs/xml/Profiler.xml
index a67b634f6..ceff7615d 100644
--- a/docs/xml/Profiler.xml
+++ b/docs/xml/Profiler.xml
@@ -1,5 +1,5 @@
-
+
Profiler
Codestin Search App

Enable Taskflow Profiler
Profiler_1ProfilerEnableTFProf
-
+
Enable Taskflow Profiler on a HTTP Server
Profiler_1ProfilerEnableTFProfServer
-
+
Display Profile Summary
Profiler_1ProfilerDisplayProfileSummary
-
+
@@ -24,9 +24,8 @@
-Codestin Search App
-All taskflow programs come with a lightweight profiling module to observer worker activities in every executor. To enable the profiler, set the environment variable TF_ENABLE_PROFILER to a file name in which the profiling result will be stored.
-~$TF_ENABLE_PROFILER=result.json./my_taskflow
+Codestin Search AppAll taskflow programs come with a lightweight profiling module to observe worker activities in every executor. To enable the profiler, set the environment variable TF_ENABLE_PROFILER to a file name in which the profiling result will be stored.
+~$TF_ENABLE_PROFILER=result.json./my_taskflow
~$catresult.json
[
{"executor":"0","data":[{"worker":12,"level":0,"data":[{"span":[72,117],"name":"12_0","type":"static"},{"span":[121,123],"name":"12_1","type":"static"},{"span":[123,125],"name":"12_2","type":"static"},{"span":[125,127],"name":"12_3","type":"static"}]}]}
@@ -43,19 +42,18 @@
TFProf implements a clustering-based algorithm to efficiently visualize tasks and their execution timelines in a browser. Without losing much visual accuracy, each clustered task indicates a group of adjacent tasks clustered by the algorithm, and you can zoom in to see these tasks.
-Codestin Search App
-When profiling large taskflow programs, the method in the previous section may not work because of the limitation of processing large JSON files. For example, a taskflow program of a million tasks can produce several GBs of profiling data, and the profile may respond to your requests very slowly. To solve this problem, we have implemented a C++-based http server optimized for our profiling data. To compile the server, enable the cmake option TF_BUILD_PROFILER. You may visit Building and Installing to understand Taskflow's build environment.
-#underthebuilddirectory
+Codestin Search AppWhen profiling large taskflow programs, the method in the previous section may not work because of the limitations of processing large JSON files. For example, a taskflow program of a million tasks can produce several GBs of profiling data, and the profiler may respond to your requests very slowly. To solve this problem, we have implemented a C++-based HTTP server optimized for our profiling data. To compile the server, enable the cmake option TF_BUILD_PROFILER. You may visit Building and Installing to understand Taskflow's build environment.
+#underthebuilddirectory
~$cmake../-DTF_BUILD_PROFILER=ON
~$make
After successfully compiling the server, you can find the executable at tfprof/server/tfprof. Now, generate profiling data from running a taskflow program but specify the output file with extension .tfp.
-~$TF_ENABLE_PROFILER=my_taskflow.tfp./my_taskflow
+~$TF_ENABLE_PROFILER=my_taskflow.tfp./my_taskflow
~$ls
my_taskflow.tfp#my_taskflow.tfpisofbinaryformat
-Launch the server program tfprof/server/tfprof and pass (1) the directory of index.html (default at tfprof/) via the option mount and (2) the my_taskflow.tfp via the option input.
-#underthebuild/directory +Launch the server program tfprof/server/tfprof and pass (1) the directory of index.html (default at tfprof/) via the option --mount and (2) the my_taskflow.tfp via the option --input. +#underthebuild/directory ~$./tfprof/server/tfprof--mount../tfprof/--inputmy_taskflow.tfp Now, open your favorite browser at localhost:8080 to visualize and profile your my_taskflow program. @@ -68,9 +66,8 @@ -Codestin Search App -You can display a profile summary by specifying only the environment variable TF_ENABLE_PROFILER without any value. The Taskflow will generate a separate summary report of tasks and workers for each executor created by the program. -#enabletheenvironmentvariablewithoutanyvalue +Codestin Search AppYou can display a profile summary by specifying only the environment variable TF_ENABLE_PROFILER without any value. The Taskflow will generate a separate summary report of tasks and workers for each executor created by the program. +#enabletheenvironmentvariablewithoutanyvalue ~$TF_ENABLE_PROFILER=./my_taskflow_program #yourprogramoutput @@ -91,6 +88,6 @@ The report consists of two sections, task summary and worker summary. In the first section, the summary reports for each task type the number of executions (Count), the total execution time (Time), average execution time per task (Avg), and the minimum (Min) and the maximum (Max) execution time among all tasks. Similarly in the second section, the summary reports for each worker the task execution statistics. - + diff --git a/docs/xml/ProjectMotivation.xml b/docs/xml/ProjectMotivation.xml index b97381cdb..14e9d0f99 100644 --- a/docs/xml/ProjectMotivation.xml +++ b/docs/xml/ProjectMotivation.xml @@ -1,5 +1,5 @@ - + ProjectMotivation Codestin Search App @@ -7,59 +7,54 @@ The Era of Multicore ProjectMotivation_1TheEraOfMulticore - + Heterogeneous Computing ProjectMotivation_1C0HeterogeneousComputing - + Loop-level Parallelism ProjectMotivation_1LoopLevelParallelism - + Task-based Parallelism ProjectMotivation_1TaskBasedParallelism - + The Project Mantra ProjectMotivation_1TheProjectMantra - + Taskflow addresses a long-standing problem, how can we make it easier for C++ developers to quickly write parallel and heterogeneous programs with high performance scalability and simultaneous high productivity? -Codestin Search App -In the past, we embrace free performance scaling on our software thanks to advances in manufacturing technologies and micro-architectural innovations. Approximately for every 1.5 year we can speed up our programs by simply switching to new hardware and compiler vendors that brings 2x more transistors, faster clock rates, and higher instruction-level parallelism. However, this paradigm was challenged by the power wall and increasing difficulties in exploiting instruction-level parallelism. The boost to computing performance has stemmed from changes to multicore chip designs. +Codestin Search AppIn the past, we embrace free performance scaling on our software thanks to advances in manufacturing technologies and micro-architectural innovations. Approximately for every 1.5 year we can speed up our programs by simply switching to new hardware and compiler vendors that brings 2x more transistors, faster clock rates, and higher instruction-level parallelism. However, this paradigm was challenged by the power wall and increasing difficulties in exploiting instruction-level parallelism. The boost to computing performance has stemmed from changes to multicore chip designs. 
The above sweeping visualization (thanks to Prof. Mark Horowitz and his group) shows the evolution of computer architectures is moving toward multicore designs. Today, multicore processors and multiprocessor systems are common in many electronic products such as mobiles, laptops, desktops, and servers. In order to keep up with the performance scaling, it is becoming necessary for software developers to write parallel programs that utilize the number of available cores. -Codestin Search App -With the influence of artificial intelligence (AI) through new and merged workloads, heterogeneous computing becomes demanding and will continue to be heard for years to come. We have not just CPUs but GPUs, TPUs, FPGAs, and ASICs to accelerator a wide variety of scientific computing problems. +Codestin Search AppWith the influence of artificial intelligence (AI) through new and merged workloads, heterogeneous computing becomes demanding and will continue to be heard for years to come. We have not just CPUs but GPUs, TPUs, FPGAs, and ASICs to accelerator a wide variety of scientific computing problems. The question is: How are we going to program these beasts? Writing a high-performance sequential program is hard. Parallel programming is harder. Parallel programming of heterogeneous devices is extremely challenging if we care about performance and power efficiency. Programming models need to deal with productivity versus performance. -Codestin Search App -The most basic and simplest concept of parallel programming is loop-level parallelism, exploiting parallelism that exists among the iterations of a loop. The program typically partitions a loop of iterations into a set of of blocks, either fixed or dynamic, and run each block in parallel. Below the figure illustrates this pattern. +Codestin Search AppThe most basic and simplest concept of parallel programming is loop-level parallelism, exploiting parallelism that exists among the iterations of a loop. The program typically partitions a loop of iterations into a set of of blocks, either fixed or dynamic, and run each block in parallel. Below the figure illustrates this pattern. The main advantage of the loop-based approach is its simplicity in speeding up a regular workload in line with Amdahl's Law. Programmers only need to discover independence of each iteration within a loop and, once possible, the parallel decomposition strategy can be easily implemented. Many existing libraries have built-in support to write a parallel-for loop. -Codestin Search App -The traditional loop-level parallelism is simple but hardly allows users to exploit parallelism in more irregular applications such as graph algorithms, incremental flows, recursion, and dynamically-allocated data structures. To address these challenges, parallel programming and libraries are evolving from the tradition loop-based parallelism to the task-based model. - +Codestin Search AppThe traditional loop-level parallelism is simple but hardly allows users to exploit parallelism in more irregular applications such as graph algorithms, incremental flows, recursion, and dynamically-allocated data structures. To address these challenges, parallel programming and libraries are evolving from the tradition loop-based parallelism to the task-based model. + The above figure shows an example task dependency graph. Each node in the graph represents a task unit at function level and each edge indicates the task dependency between a pair of tasks. 
Task-based model offers a powerful means to express both regular and irregular parallelism in a top-down manner, and provides transparent scaling to large number of cores. In fact, it has been proven, both by the research community and the evolution of parallel programming standards, task-based approach scales the best with future processor generations and architectures. -Codestin Search App -The goal of Taskflow is simple - We help developers quickly write parallel programs with high performance scalability and simultaneous high productivity. We want developers to write simple and effective parallel code, specifically with the following objectives: +Codestin Search AppThe goal of Taskflow is simple - We help developers quickly write parallel programs with high performance scalability and simultaneous high productivity. We want developers to write simple and effective parallel code, specifically with the following objectives: Expressiveness @@ -71,6 +66,6 @@ In a nutshell, code written with Taskflow explains itself. The transparency allows developers to focus on the development of application algorithms and parallel decomposition strategies, rather than low-level, system-specific details. - + diff --git a/docs/xml/QuickStart_8dox.xml b/docs/xml/QuickStart_8dox.xml index 81c0947cb..7063da84a 100644 --- a/docs/xml/QuickStart_8dox.xml +++ b/docs/xml/QuickStart_8dox.xml @@ -1,5 +1,5 @@ - + QuickStart.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/References.xml b/docs/xml/References.xml index 8ba2eddb9..75827bc91 100644 --- a/docs/xml/References.xml +++ b/docs/xml/References.xml @@ -1,96 +1,24 @@ - + References Codestin Search App - - - Conference - References_1RefConference - - - Journal - References_1RefJournal - - - Recognition - References_1RefRecognition - - -This page summarizes a list of publication related to Taskflow. If you are using Taskflow, please cite the following paper we publised at 2022 IEEE TPDS: +This page summarizes a list of publication related to Taskflow. If you are using Taskflow, please cite the following paper we published at 2022 IEEE Transactions on Parallel and Distributed Systems (TPDS): Tsung-Wei Huang, Dian-Lun Lin, Chun-Xun Lin, and Yibo Lin, "Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System," IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 6, pp. 1303-1320, June 2022 - -Codestin Search App - - -Dian-Lun Lin, Yanqing Zhang, Haoxing Ren, Shih-Hsin Wang, Brucek Khailany and Tsung-Wei Huang, "GenFuzz: GPU-accelerated Hardware Fuzzing using Genetic Algorithm with Multiple Inputs," ACM/IEEE Design Automation Conference (DAC), San Francisco, CA, 2023 - - -Tsung-Wei Huang, "qTask: Task-parallel Quantum Circuit Simulation with Incrementality," IEEE International Parallel and Distributed Processing Symposium (IPDPS), St. Petersburg, Florida, 2023 - - -Elmir Dzaka, Dian-Lun Lin, and Tsung-Wei Huang, "Parallel And-Inverter Graph Simulation Using a Task-graph Computing System," IEEE International Parallel and Distributed Processing Symposium Workshop (IPDPSW), St. 
Petersburg, Florida, 2023 - - -Tsung-Wei Huang and Leslie Hwang, "Task-Parallel Programming with Constrained Parallelism," IEEE High-Performance Extreme Computing Conference (HPEC), MA, 2022 - - -Tsung-Wei Huang, "Enhancing the Performance Portability of Heterogeneous Circuit Analysis Programs," IEEE High-Performance Extreme Computing Conference (HPEC), MA, 2022 - - -Dian-Lun Lin, Haoxing Ren, Yanqing Zhang, and Tsung-Wei Huang, "From RTL to CUDA: A GPU Acceleration Flow for RTL Simulation with Batch Stimulus," ACM International Conference on Parallel Processing (ICPP), Bordeaux, France, 2022 - - -Cheng-Hsiang Chiu and Tsung-Wei Huang, "Composing Pipeline Parallelism using Control Taskflow Graph," ACM International Symposium on High-Performance Parallel and Distributed Computing (HPDC), Minneapolis, Minnesota, 2022 - - -Cheng-Hsiang Chiu and Tsung-Wei Huang, "Efficient Timing Propagation with Simultaneous Structural and Pipeline Parallelisms," ACM/IEEE Design Automation Conference (DAC), San Francisco, CA, 2022 - - -Dian-Lun Lin and Tsung-Wei Huang, "Efficient GPU Computation using Task Graph Parallelism," European Conference on Parallel and Distributed Computing (EuroPar), 2021 - - -Tsung-Wei Huang, "A General-purpose Parallel and Heterogeneous Task Programming System for VLSI CAD," IEEE/ACM International Conference on Computer-aided Design (ICCAD), CA, 2020 - - -Chun-Xun Lin, Tsung-Wei Huang, and Martin Wong, "An Efficient Work-Stealing Scheduler for Task Dependency Graph," IEEE International Conference on Parallel and Distributed Systems (ICPADS), Hong Kong, 2020 - - -Tsung-Wei Huang, Chun-Xun Lin, Guannan Guo, and Martin Wong, "Cpp-Taskflow: Fast Task-based Parallel Programming using Modern C++," IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 974-983, Rio de Janeiro, Brazil, 2019 - - -Chun-Xun Lin, Tsung-Wei Huang, Guannan Guo, and Martin Wong, "A Modern C++ Parallel Task Programming Library," ACM Multimedia Conference (MM), pp. 2284-2287, Nice, France, 2019 - - -Chun-Xun Lin, Tsung-Wei Huang, Guannan Guo, and Martin Wong, "An Efficient and Composable Parallel Task Programming Library," IEEE High-performance and Extreme Computing Conference (HPEC), pp. 1-7, Waltham, MA, 2019 - - - - - -Codestin Search App - - -Dian-Lun Lin and Tsung-Wei Huang, "Accelerating Large Sparse Neural Network Inference using GPU Task Graph Parallelism," IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 11, pp. 3041-3052, Nov 2022 - + +Codestin Search App -Tsung-Wei Huang, Dian-Lun Lin, Chun-Xun Lin, and Yibo Lin, "Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System," IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 6, pp. 1303-1320, June 2022 +Second Place of Fast Code Programming Challenge at the 2025 ACM PPoPP -Tsung-Wei Huang, Dian-Lun Lin, Yibo Lin, and Chun-Xun Lin, "Cpp-Taskflow: A General-purpose Parallel Task Programming System at Scale," IEEE Transactions on Computer-aided Design of Integrated Circuits and Systems (TCAD), vol. 
40, no.8, 2021 +Innovation Award of the 2023 IEEE HPEC/MIT/Amazon Stochastic Block Partition Challenge - - - - -Codestin Search App - Champion of Graph Challenge at the 2020 IEEE High-performance Extreme Computing Conference @@ -107,6 +35,6 @@ - + diff --git a/docs/xml/Releases.xml b/docs/xml/Releases.xml index 04cd7f4bd..5dbd234d3 100644 --- a/docs/xml/Releases.xml +++ b/docs/xml/Releases.xml @@ -1,10 +1,14 @@ - + Releases Codestin Search App Release Roadmap - Release 3.7.0 (Master) + Release 3.11.0 (Master) + Release 3.10.0 (2025/05/01) + Release 3.9.0 (2025/01/02) + Release 3.8.0 (2024/10/02) + Release 3.7.0 (2024/05/07) Release 3.6.0 (2023/05/07) Release 3.5.0 (2023/01/05) Release 3.4.0 (2022/05/23) @@ -31,7 +35,11 @@ All releases are available in Project GitHub. Release Roadmap -Release 3.7.0 (Master) +Release 3.11.0 (Master) +Release 3.10.0 (2025/05/01) +Release 3.9.0 (2025/01/02) +Release 3.8.0 (2024/10/02) +Release 3.7.0 (2024/05/07) Release 3.6.0 (2023/05/07) Release 3.5.0 (2023/01/05) Release 3.4.0 (2022/05/23) @@ -52,6 +60,6 @@ - + diff --git a/docs/xml/RequestCancellation.xml b/docs/xml/RequestCancellation.xml index 48de6468a..1afb3195b 100644 --- a/docs/xml/RequestCancellation.xml +++ b/docs/xml/RequestCancellation.xml @@ -1,31 +1,30 @@ - + RequestCancellation Codestin Search App - Cancel Execution of Taskflows + Cancel a Running Taskflow RequestCancellation_1CancelARunningTaskflow - + Understand the Limitations of Cancellation RequestCancellation_1UnderstandTheLimitationsOfCancellation - + -This chapters discusses how to cancel submitted tasks. +This chapters discusses how to cancel a running taskflow. -Codestin Search App -When you submit a taskflow to an executor (e.g., tf::Executor::run), the executor returns a tf::Future object that will hold the result of the execution. tf::Future is a derived class from std::future. In addition to base methods of std::future, you can call tf::Future::cancel to cancel the execution of a running taskflow. The following example cancels a submission of a taskflow that contains 1000 tasks each running one second. +Codestin Search AppWhen you submit a taskflow to an executor using the run series (e.g., tf::Executor::run), the executor returns a tf::Future object that holds the result of the execution. tf::Future is derived from std::future. In addition to the base methods of std::future, you can call tf::Future::cancel to cancel the execution of a running taskflow. The following example demonstrates cancelling a submission of a taskflow containing 1000 tasks, each running for one second. tf::Executorexecutor; tf::Taskflowtaskflow; for(inti=0;i<1000;i++){ taskflow.emplace([](){ -std::this_thread::sleep_for(std::chrono::seconds(1)); +std::this_thread::sleep_for(std::chrono::seconds(1)); }); } @@ -36,14 +35,9 @@ fu.cancel(); //waituntilthecancellationcompletes -fu.get(); +fu.wait(); -tf::Future::cancel is non-deterministic and out-of-order. - -When you request a cancellation, the executor will stop scheduling the rest tasks of the taskflow. Tasks that are already running will continue to finish, but their successor tasks will not be scheduled to run. A cancellation is considered complete when all these running tasks finish. To wait for a cancellation to complete, you may explicitly call tf::Future::get. -It is your responsibility to ensure that the taskflow remains alive before the cancellation completes. 
- -For instance, the following code results in undefined behavior: +When you request a cancellation, the executor will stop scheduling the remaining tasks of the taskflow. Requesting a cancellation does not guarantee an immediate stop of a running taskflow. Tasks that are already running will continue to finish, but their successor tasks will not be scheduled. A cancellation is considered complete only after all running tasks have finished. To wait for the cancellation to complete, you can explicitly call tf::Future::wait. Note that it is your responsibility to ensure that the taskflow remains alive until the cancellation is complete, as there may still be running tasks that cannot be canceled. For instance, the following code results in undefined behavior: tf::Executorexecutor; { tf::Taskflowtaskflow; @@ -58,7 +52,7 @@ For instance, the following code results in undefined behavior: }//destroyingtaskflowherecanresultinundefinedbehavior -The undefined behavior problem exists because tf::Future::cancel does not guarantee an immediate cancellation. To fix the problem, call get to ensure the cancellation completes before the end of the scope destroys the taskflow. +To avoid this issue, call wait to ensure the cancellation completes before the taskflow is destroyed at the end of the scope. tf::Executorexecutor; { tf::Taskflowtaskflow; @@ -70,20 +64,19 @@ For instance, the following code results in undefined behavior: tf::Futurefu=executor.run(taskflow); fu.cancel();//therecanstillbetaskrunningaftercancellation -fu.get();//waitsuntilthecancellationcompletes +fu.wait();//waituntilthecancellationcompletes } -Codestin Search App -Canceling the execution of a running taskflow has the following limitations: -Cancellation is non-preemptive. A running task will not be cancelled until it finishes. -Cancelling a taskflow with tasks acquiring and/or releasing tf::Semaphore results is currently not supported. +Codestin Search AppDue to its asynchronous and non-deterministic nature, taskflow cancellation has the following limitations: +Non-preemptive behavior: Cancellation does not forcibly terminate running tasks. Any task already in execution will continue to completion before cancellation takes effect. +Semaphore incompatibility: Cancelling a taskflow that includes tasks involving tf::Semaphore (i.e., acquiring or releasing) is currently unsupported and may lead to undefined behavior. We may overcome these limitations in the future releases. - + diff --git a/docs/xml/RuntimeTasking.xml b/docs/xml/RuntimeTasking.xml index 54ceec31f..4b4bd3748 100644 --- a/docs/xml/RuntimeTasking.xml +++ b/docs/xml/RuntimeTasking.xml @@ -1,59 +1,57 @@ - + RuntimeTasking - Codestin Search App + Codestin Search App - Create a Runtime Object + Create a Runtime Task RuntimeTasking_1CreateARuntimeTask - + Acquire the Running Executor RuntimeTasking_1AcquireTheRunningExecutor - + - Run a Task Graph Synchronously - RuntimeTasking_1RuntimeTaskingRunATaskGraphSynchronously - + Corun Taskflows from a Runtime Task + RuntimeTasking_1CorunTaskflowsFromARuntimeTask + - Learn More About Runtime - RuntimeTasking_1LearnMoreAboutRuntime - + Corun Asynchronous Tasks from a Runtime Task + RuntimeTasking_1CorunAsynchronousTasksFromARuntimeTask + -Taskflow allows you to interact with the scheduling runtime by taking a runtime object as an argument of a task. This is mostly useful for designing specialized parallel algorithms extended from the existing facility of Taskflow. 
+Taskflow allows you to interact with the scheduling runtime by taking a runtime object as an argument of a task. This is mostly useful for designing recursive parallel algorithms that require dynamic tasking on the fly. -Codestin Search App -Taskflow allows a static task and a condition task to take a referenced tf::Runtime object that provides a set of methods to interact with the scheduling runtime. The following example creates a static task that leverages tf::Runtime to explicitly schedule a conditioned task which would never run under the normal scheduling circumstance: +Codestin Search AppTaskflow allows users to define a runtime task that accepts a reference to a tf::Runtime object. This object provides methods to interact with the underlying scheduling engine. For example, a runtime task can be used to explicitly schedule another task that would not normally execute due to the graph's structure or conditional dependencies: tf::TaskA,B,C,D; -std::tie(A,B,C,D)=taskflow.emplace( +std::tie(A,B,C,D)=taskflow.emplace( [](){return0;}, [&C](tf::Runtime&rt){//Cmustbecapturedbyreference -std::cout<<"B\n"; +std::cout<<"B\n"; rt.schedule(C); }, -[](){std::cout<<"C\n";}, -[](){std::cout<<"D\n";} +[](){std::cout<<"C\n";}, +[](){std::cout<<"D\n";} ); A.precede(B,C,D); executor.run(taskflow).wait(); - + -When the condition task A completes and returns 0, the scheduler moves on to task B. Under the normal circumstance, tasks C and D will not run because their conditional dependencies never happen. This can be broken by forcefully scheduling C or/and D via a runtime object of a task that resides in the same graph. Here, task B call tf::Runtime::schedule to forcefully run task C even though the weak dependency between A and C will never happen based on the graph structure itself. As a result, we will see both B and C in the output: -B#BleveragesaruntimeobjecttoscheduleCoutofitsdependencyconstraint +In the above code, when the condition task A completes and returns 0, the scheduler moves on to task B. Under normal circumstances, tasks C and D will not run because their conditional dependencies never occur. This behavior can be overridden by forcefully scheduling C or/and D via a runtime object of a task that resides in the same graph. Here, task B calls tf::Runtime::schedule to forcefully run task C, even though the weak dependency between A and C will never occur based on the graph structure itself. As a result, we will see both B and C in the output: +B#BusesaruntimeobjecttoscheduleCoutofitsdependencyconstraint C -You should only schedule an active task from a runtime object. An active task is a task in a running taskflow. The task may or may not be running, and scheduling that task will immediately put it into the task queue of the worker that is running the runtime object. +You should only schedule an active task when using tf::Runtime::schedule. An active task is one that belongs to a currently running taskflow. The task may or may not be executing at the moment, but scheduling it will immediately place it into the task queue of the worker that invoked the runtime object. -Codestin Search App -You can acquire the reference to the running executor using tf::Runtime::executor(). The executor associated with a runtime object is the executor that runs the parent task of that runtime object. +Codestin Search AppYou can acquire the reference to the running executor using tf::Runtime::executor. 
The executor associated with a runtime object is the executor that runs the parent task of that runtime object. tf::Executorexecutor; tf::Taskflowtaskflow; taskflow.emplace([&](tf::Runtime&rt){ @@ -62,39 +60,27 @@ executor.run(taskflow).wait(); - -Codestin Search App -A runtime object can spawn and run a task graph synchronously using tf::Runtime::corun. This model allows you to leverage dynamic tasking to execute a parallel workload within a runtime object. The following code creates a subflow of two independent tasks and executes it synchronously via the given runtime object: -taskflow.emplace([](tf::Runtime&rt){ -rt.corun([](tf::Subflow&sf){ -sf.emplace([](){std::cout<<"independenttask1\n";}); -sf.emplace([](){std::cout<<"independenttask2\n";}); -//subflowjoinsuponcorunreturns -}); -}); - -You can also create a task graph yourself and execute it through a runtime object. This organization avoids repetitive creation of a subflow with the same topology, such as running a runtime object repetitively. The following code performs the same execution logic as the above example but using the given task graph to avoid repetitive creations of a subflow: + +Codestin Search AppOne of the most powerful features of a runtime task is tf::Runtime::corun. The method tf::Runtime::corun provides a non-blocking mechanism that allows the calling worker to continue executing other available tasks in the executor while waiting for all tasks spawned from that runtime to complete. This behavior is critical for avoiding deadlock in nested or recursive tasking patterns, where workers may otherwise block while waiting on subgraphs of children tasks to finish, leading to a situation where no workers are left to make forward progress. The following example demonstrates how to use tf::Runtime::corun to run a predefined task graph during the execution of a runtime task, without blocking the calling worker: //createacustomgraph tf::Taskflowgraph; -graph.emplace([](){std::cout<<"independenttask1\n";}); -graph.emplace([](){std::cout<<"independenttask2\n";}); +graph.emplace([](){std::cout<<"independenttask1\n";}); +graph.emplace([](){std::cout<<"independenttask2\n";}); taskflow.emplace([&](tf::Runtime&rt){ -//thisworkercorunsthegraphthroughitswork-stealingloop +//corunsthegraphwithoutblockingthecallingworkerofthisruntime rt.corun(graph); }); executor.run_n(taskflow,10000); -Although tf::Runtime::corun blocks until the operation completes, the caller thread (worker) is not preempted (e.g., sleep or holding any lock). Instead, the caller thread joins the work-stealing loop of the executor and leaves whenever the spawned task graph completes. This is different from waiting for a submitted taskflow using tf::Future<T>::wait which blocks the caller thread until the submitted taskflow completes. When multiple submitted taskflows are being waited, their executions can potentially lead to deadlock. For example, the code below creates a taskflow of 1000 tasks with each task running a taskflow of 500 tasks in a blocking fashion: +Although tf::Runtime::corun does not return control to the program until the given graph finishes its execution, the calling worker (i.e., parent worker) of the runtime indeed joins the executor's work-stealing loop and continues executing other tasks together with graph execution. This behavior differs from waiting on a submitted taskflow using std::future<T>::wait (i.e., base class of tf::Future), which blocks the calling thread entirely until completion. 
If multiple taskflows are submitted and waited on in this blocking manner, it can potentially lead to deadlock, especially in recursive or nested patterns. For example, the code below submits a taskflow of 1000 tasks to an executor of two workers, where each worker blocks while waiting on another taskflow of 500 tasks, causing deadlock: tf::Executorexecutor(2); tf::Taskflowtaskflow; -std::array<tf::Taskflow, 1000>others; - -std::atomic<size_t>counter{0}; +std::array<tf::Taskflow, 1000>others; for(size_tn=0;n<1000;n++){ for(size_ti=0;i<500;i++){ -others[n].emplace([&](){counter++;}); +others[n].emplace([&](){}); } taskflow.emplace([&executor,&tf=others[n]](){ //blockingtheworkercanintroducedeadlockwhere @@ -104,19 +90,17 @@ } executor.run(taskflow).wait(); -Using tf::Runtime::corun allows each worker to corun these taskflows through its work-stealing loop, thus avoiding deadlock problem caused by blocking wait. +To avoid deadlock, you should instead use tf::Runtime::corun that allows the calling worker to corun these taskflows without blocking its execution, thereby avoiding deadlocks. tf::Executorexecutor(2); tf::Taskflowtaskflow; -std::array<tf::Taskflow, 1000>others; - -std::atomic<size_t>counter{0}; +std::array<tf::Taskflow, 1000>others; for(size_tn=0;n<1000;n++){ for(size_ti=0;i<500;i++){ -others[n].emplace([&](){counter++;}); +others[n].emplace([&](){}); } taskflow.emplace([&tf=others[n]](tf::Runtime&rt){ -//thecallerworkerwillnotblockbutcorunthese +//thecallerworkerwillnotblockonwaitbutcorunthese //taskflowsthroughitswork-stealingloop rt.corun(tf); }); @@ -124,15 +108,48 @@ executor.run(taskflow).wait(); - -Codestin Search App -t the following pages to learn more about tf::Runtime: - -Launch Asynchronous Tasks from a Runtime - + +Codestin Search AppSimilar to tf::Executor, tf::Runtime allows you to create asynchronous tasks on the fly using tf::Runtime::async or tf::Runtime::silent_async. Asynchronous tasks spawned from a runtime task are logically parented to that runtime and can be explicitly synchronized using tf::Runtime::corun. Furthermore, each asynchronous task can itself be a runtime task, enabling recursive task creation and dynamic parallelism. This model is particularly powerful for implementing divide-and-conquer algorithms, such as parallel sort, graph traversal, and recursion. For instance, the example below demonstrates a parallel recursive implementation of Fibonacci numbers using recursive asynchronous tasking with tf::Runtime: +#include<taskflow/taskflow.hpp> + +size_tfibonacci(size_tN,tf::Runtime&rt){ + +if(N<2)returnN; + +size_tres1,res2; +rt.silent_async([N,&res1](tf::Runtime&rt1){res1=fibonacci(N-1,rt1);}); + +//tailoptimizationfortherightchild +res2=fibonacci(N-2,rt); + +//usecoruntoavoidblockingtheworkerfromwaitingchildrentaskstofinish +rt.corun(); + +returnres1+res2; +} + +intmain(){ + +tf::Executorexecutor; + +size_tN=5,res; +executor.silent_async([N,&res](tf::Runtime&rt){res=fibonacci(N,rt);}); +executor.wait_for_all(); + +std::cout<<N<<"-thFibonaccinumberis"<<res<<'\n'; + +return0; +} + +The figure below shows the execution diagram, where the task with suffix *_1 represents the left child spawned by its parent runtime. + + +For more details, please refer to Asynchronous Tasking and Fibonacci Number. +While asynchronous tasks spawned from a runtime task are parented to that runtime task, the runtime task does not automatically synchronize their execution or wait for their completion upon destruction. 
To ensure all spawned tasks finish before proceeding, you should explicitly call tf::Runtime::corun to synchronize them. This prevents potential issues such as tasks being destroyed prematurely or lost without execution.
+
-
+
diff --git a/docs/xml/SingleTaskCUDA.xml b/docs/xml/SingleTaskCUDA.xml
deleted file mode 100644
index 36c24fad0..000000000
--- a/docs/xml/SingleTaskCUDA.xml
+++ /dev/null
@@ -1,54 +0,0 @@
-
-
-
- SingleTaskCUDA
- Codestin Search App
-
-
- Include the Header
- SingleTaskCUDA_1CUDASingleTaskIncludeTheHeader
-
-
- Run a Task with a Single Thread
- SingleTaskCUDA_1SingleTaskCUDASingleTask
-
-
- Miscellaneous Items
- SingleTaskCUDA_1SingleTaskCUDAMiscellaneousItems
-
-
-
-
-tf::cudaFlow provides a template method, tf::cudaFlow::single_task, for creating a task to run the given callable using a single kernel thread.
-
-Codestin Search App
-You need to include the header file, taskflow/cuda/algorithm/for_each.hpp, for creating a single-threaded task.
-#include<taskflow/cuda/algorithm/for_each.hpp>
-
-
-
-Codestin Search App
-You can create a task to run a kernel function just once, i.e., using one GPU thread. This is handy when you want to set up a single or a few global variables that do not need multiple threads and will be used by multiple kernels afterwards. The following example creates a single-task kernel that sets a device variable to 1.
-int*gpu_variable;
-cudaMalloc(&gpu_variable,sizeof(int));
-
-tf::cudaFlowcf;
-cf.single_task([gpu_variable]__device__(){
-*gpu_Variable=1;
-});
-
-tf::cudaStreamstream;
-cf.run(stream);
-stream.synchronize();
-
-Since the callable runs on GPU, it must be declared with a __device__ specifier.
-
-
-Codestin Search App
-The single-task algorithm is also available in tf::cudaFlowCapturer::single_task.
-
-
-
-
-
diff --git a/docs/xml/StaticTasking.xml b/docs/xml/StaticTasking.xml
index cf7b27b7d..2a6e78c16 100644
--- a/docs/xml/StaticTasking.xml
+++ b/docs/xml/StaticTasking.xml
@@ -1,5 +1,5 @@
-
+
StaticTasking
Codestin Search App
@@ -7,47 +7,46 @@
Create a Task Dependency Graph
StaticTasking_1CreateATaskDependencyGraph
-
+
Visualize a Task Dependency Graph
StaticTasking_1VisualizeATaskDependencyGraph
-
+
Modify Task Attributes
StaticTasking_1ModifyTaskAttributes
-
+
Traverse Adjacent Tasks
StaticTasking_1TraverseAdjacentTasks
-
+
Attach User Data to a Task
StaticTasking_1AttachUserDataToATask
-
+
Understand the Lifetime of a Task
StaticTasking_1UnderstandTheLifetimeOfATask
-
+
Move a Taskflow
StaticTasking_1MoveATaskflow
-
+
This chapter demonstrates how to create a static task dependency graph. Static tasking captures the static parallel structure of a decomposition and is defined only by the program itself. It has a flat task hierarchy and cannot spawn new tasks from a running dependency graph.
-Codestin Search App
-A task in Taskflow is a callable object for which the operation std::invoke is applicable. It can be either a functor, a lambda expression, a bind expression, or a class objects with operator() overloaded. All tasks are created from tf::Taskflow, the class that manages a task dependency graph. Taskflow provides two methods, tf::Taskflow::placeholder and tf::Taskflow::emplace to create a task.
+Codestin Search AppA task in Taskflow is a callable object for which the operation std::invoke is applicable. It can be either a functor, a lambda expression, a bind expression, or a class object with operator() overloaded. All tasks are created from tf::Taskflow, the class that manages a task dependency graph.
Taskflow provides two methods, tf::Taskflow::placeholder and tf::Taskflow::emplace, to create a task.
1:tf::Taskflowtaskflow;
2:tf::TaskA=taskflow.placeholder();
-3:tf::TaskB=taskflow.emplace([](){std::cout<<"taskB\n";});
+3:tf::TaskB=taskflow.emplace([](){std::cout<<"taskB\n";});
4:
5:auto[D,E,F]=taskflow.emplace(
-6:[](){std::cout<<"TaskA\n";},
-7:[](){std::cout<<"TaskB\n";},
-8:[](){std::cout<<"TaskC\n";}
+6:[](){std::cout<<"TaskA\n";},
+7:[](){std::cout<<"TaskB\n";},
+8:[](){std::cout<<"TaskC\n";}
9:);
Debrief:
@@ -57,24 +56,24 @@
Line 3 creates a task from a given callable object and returns a task handle
-Lines 5-9 create three tasks in one call using C++ structured binding coupled with std::tuple
+Lines 5-9 create three tasks in one call using C++ structured binding coupled with std::tuple
Each time you create a task, the taskflow object creates a node in the task graph and returns a task handle of type tf::Task. A task handle is a lightweight object that wraps up a particular node in a graph and provides a set of methods for you to assign different attributes to the task, such as adding dependencies, naming, and assigning new work.
1:tf::Taskflowtaskflow;
-2:tf::TaskA=taskflow.emplace([](){std::cout<<"createataskA\n";});
-3:tf::TaskB=taskflow.emplace([](){std::cout<<"createataskB\n";});
+2:tf::TaskA=taskflow.emplace([](){std::cout<<"createataskA\n";});
+3:tf::TaskB=taskflow.emplace([](){std::cout<<"createataskB\n";});
4:
5:A.name("TaskA");
-6:A.work([](){std::cout<<"reassignAtoanewcallable\n";});
+6:A.work([](){std::cout<<"reassignAtoanewcallable\n";});
7:A.precede(B);
8:
-9:std::cout<<A.name()<<std::endl;//TaskA
-10:std::cout<<A.num_successors()<<std::endl;//1
-11:std::cout<<A.num_dependents()<<std::endl;//0
+9:std::cout<<A.name()<<std::endl;//TaskA
+10:std::cout<<A.num_successors()<<std::endl;//1
+11:std::cout<<A.num_predecessors()<<std::endl;//0
12:
-13:std::cout<<B.num_successors()<<std::endl;//0
-14:std::cout<<B.num_dependents()<<std::endl;//1
+13:std::cout<<B.num_successors()<<std::endl;//0
+14:std::cout<<B.num_predecessors()<<std::endl;//1
Debrief:
Line 1 creates a taskflow object
@@ -88,15 +87,14 @@ Each time you create a task, the taskflow object creates a node in the task grap
Lines 9-14 dump the task attributes
-Taskflow uses general-purpose polymorphic function wrapper, std::function, to store and invoke a callable in a task. You need to follow its contract to create a task. For example, the callable to construct a task must be copyable, and thus the code below won't compile:
+Taskflow uses the general-purpose polymorphic function wrapper std::function to store and invoke a callable in a task. You need to follow its contract to create a task. For example, the callable to construct a task must be copyable, and thus the code below won't compile:
taskflow.emplace([ptr=std::make_unique<int>(1)](){
-std::cout<<"captureduniquepointerisnotcopyable";
+std::cout<<"captureduniquepointerisnotcopyable";
});
-Codestin Search App
-You can dump a taskflow to a DOT format and visualize the graph using free online tools such as GraphvizOnline and WebGraphviz.
+Codestin Search AppYou can dump a taskflow to a DOT format and visualize the graph using free online tools such as GraphvizOnline and WebGraphviz.
1:#include<taskflow/taskflow.hpp> 2: 3:intmain(){ @@ -104,10 +102,10 @@ Taskflow uses general-purpose polymorphic function wrapper, 5:tf::Taskflowtaskflow; 6: 7://createataskdependencygraph -8:tf::TaskA=taskflow.emplace([](){std::cout<<"TaskA\n";}); -9:tf::TaskB=taskflow.emplace([](){std::cout<<"TaskB\n";}); -10:tf::TaskC=taskflow.emplace([](){std::cout<<"TaskC\n";}); -11:tf::TaskD=taskflow.emplace([](){std::cout<<"TaskD\n";}); +8:tf::TaskA=taskflow.emplace([](){std::cout<<"TaskA\n";}); +9:tf::TaskB=taskflow.emplace([](){std::cout<<"TaskB\n";}); +10:tf::TaskC=taskflow.emplace([](){std::cout<<"TaskC\n";}); +11:tf::TaskD=taskflow.emplace([](){std::cout<<"TaskD\n";}); 12: 13://adddependencylinks 14:A.precede(B); @@ -115,7 +113,7 @@ Taskflow uses general-purpose polymorphic function wrapper, 16:B.precede(D); 17:C.precede(D); 18: -19:taskflow.dump(std::cout); +19:taskflow.dump(std::cout); 20:} Debrief: @@ -128,19 +126,18 @@ Taskflow uses general-purpose polymorphic function wrapper, + -Codestin Search App -This example demonstrates how to modify a task's attributes using methods defined in the task handler. +Codestin Search AppThis example demonstrates how to modify a task's attributes using methods defined in the task handler. 1:#include<taskflow/taskflow.hpp> 2: 3:intmain(){ 4: 5:tf::Taskflowtaskflow; 6: -7:std::vector<tf::Task>tasks={ +7:std::vector<tf::Task>tasks={ 8:taskflow.placeholder(),//createataskwithnowork 9:taskflow.placeholder()//createataskwithnowork 10:}; @@ -150,22 +147,22 @@ Taskflow uses general-purpose polymorphic function wrapper, 14:tasks[0].precede(tasks[1]); 15: 16:for(autotask:tasks){//printouteachtask'sattributes -17:std::cout<<task.name()<<":" -18:<<"num_dependents="<<task.num_dependents()<<"," +17:std::cout<<task.name()<<":" +18:<<"num_predecessors="<<task.num_predecessors()<<"," 19:<<"num_successors="<<task.num_successors()<<'\n'; 20:} 21: -22:taskflow.dump(std::cout);//dumpthetaskflowgraph +22:taskflow.dump(std::cout);//dumpthetaskflowgraph 23: -24:tasks[0].work([](){std::cout<<"gotanewwork!\n";}); -25:tasks[1].work([](){std::cout<<"gotanewwork!\n";}); +24:tasks[0].work([](){std::cout<<"gotanewwork!\n";}); +25:tasks[1].work([](){std::cout<<"gotanewwork!\n";}); 26: 27:return0; 28:} The output of this program looks like the following: -ThisisTask0:num_dependents=0,num_successors=1 -ThisisTask1:num_dependents=1,num_successors=0 +ThisisTask0:num_predecessors=0,num_successors=1 +ThisisTask1:num_predecessors=1,num_successors=0 digraphTaskflow{ "ThisisTask1"; "ThisisTask0"; @@ -181,7 +178,7 @@ Taskflow uses general-purpose polymorphic function wrapper, GraphViz Online format (dot) @@ -191,28 +188,31 @@ Taskflow uses general-purpose polymorphic function wrapper, -Codestin Search App -You can iterate the successor list and the dependent list of a task by using tf::Task::for_each_successor and tf::Task::for_each_dependent, respectively. Each method takes a lambda and applies it to a successor or a dependent being traversed. +Codestin Search AppYou can iterate the successor list and the predecessor list of a task by using tf::Task::for_each_successor and tf::Task::for_each_predecessor, respectively. Each method takes a lambda and applies it to a successor or a predecessor being traversed. 
//traverseallsuccessorsofmy_task
my_task.for_each_successor([s=0](tf::Tasksuccessor)mutable{
-std::cout<<"successor"<<s++<<'\n';
+std::cout<<"successor"<<s++<<'\n';
});
-//traversealldependentsofmy_task
-my_task.for_each_dependent([d=0](tf::Taskdependent)mutable{
-std::cout<<"dependent"<<d++<<'\n';
+//traverseallpredecessorsofmy_task
+my_task.for_each_predecessor([d=0](tf::Taskpredecessor)mutable{
+std::cout<<"predecessor"<<d++<<'\n';
+});
+
+If the task contains a subflow, you can use tf::Task::for_each_subflow_task to iterate all tasks associated with that subflow.
+my_task.for_each_subflow_task([](tf::Taskstask){
+std::cout<<"subflowtask"<<stask.name()<<'\n';
});
-Codestin Search App
-You can attach custom data to a task using tf::Task::data(void*) and access it using tf::Task::data(). Each node in a taskflow is associated with a C-styled data pointer (i.e., void*) you can use to point to user data and access it in the body of a task callable. The following example attaches an integer to a task and accesses that integer through capturing the data in the callable.
+Codestin Search AppYou can attach custom data to a task using tf::Task::data(void*) and access it using tf::Task::data(). Each node in a taskflow is associated with a C-styled data pointer (i.e., void*) you can use to point to user data and access it in the body of a task callable. The following example attaches an integer to a task and accesses that integer through capturing the data in the callable.
intmy_data=5;
tf::Tasktask=taskflow.placeholder();
task.data(&my_data)
.work([task](){
intmy_data=*static_cast<int*>(task.data());
-std::cout<<"my_data:"<<my_data;
+std::cout<<"my_data:"<<my_data;
});
Notice that you need to create a placeholder task first before assigning it a work callable. Only this way can you capture that task in the lambda and access its attached data in the lambda body.
@@ -221,30 +221,28 @@
You can change the name and work of a task at any time before running the graph.
-Codestin Search App
-A task lives with its graph and belongs to only a graph at a time, and is not destroyed until the graph gets cleaned up. The lifetime of a task refers to the user-given callable object, including captured values. As long as the graph is alive, all the associated tasks exist.
+Codestin Search AppA task lives with its graph and belongs to only one graph at a time, and is not destroyed until the graph gets cleaned up. The lifetime of a task refers to the user-given callable object, including captured values. As long as the graph is alive, all the associated tasks exist.
It is your responsibility to keep tasks and graph alive during their execution.
-Codestin Search App
-You can construct or assign a taskflow from a moved taskflow. Moving a taskflow to another will result in transferring the underlying graph data structures from one to the other.
+Codestin Search AppYou can construct or assign a taskflow from a moved taskflow. Moving a taskflow to another will result in transferring the underlying graph data structures from one to the other.
tf::Taskflowtaskflow1,taskflow3; taskflow1.emplace([](){}); //move-constructtaskflow2fromtaskflow1 -tf::Taskflowtaskflow2(std::move(taskflow1)); +tf::Taskflowtaskflow2(std::move(taskflow1)); assert(taskflow2.num_tasks()==1&&taskflow1.num_tasks()==0); //move-assigntaskflow3totaskflow2 -taskflow3=std::move(taskflow2); +taskflow3=std::move(taskflow2); assert(taskflow3.num_tasks()==1&&taskflow2.num_tasks()==0); You can only move a taskflow to another while that taskflow is not being run by an executor. Moving a running taskflow can result in undefined behavior. Please see Execute a Taskflow with Transferred Ownership for more details. - + diff --git a/docs/xml/SubflowTasking.xml b/docs/xml/SubflowTasking.xml index 91fd2a498..ad99bf469 100644 --- a/docs/xml/SubflowTasking.xml +++ b/docs/xml/SubflowTasking.xml @@ -1,5 +1,5 @@ - + SubflowTasking Codestin Search App @@ -7,27 +7,26 @@ Create a Subflow SubflowTasking_1CreateASubflow - + - Join a Subflow - SubflowTasking_1JoinASubflow - + Retain a Subflow + SubflowTasking_1RetainASubflow + - Detach a Subflow - SubflowTasking_1DetachASubflow - + Join a Subflow Explicitly + SubflowTasking_1JoinASubflow + Create a Nested Subflow SubflowTasking_1CreateANestedSubflow - + It is very common for a parallel program to spawn task dependency graphs at runtime. In Taskflow, we call this subflow tasking. -Codestin Search App -Subflow tasks are those created during the execution of a graph. These tasks are spawned from a parent task and are grouped together to a subflow dependency graph. To create a subflow, emplace a callable that takes an argument of type tf::Subflow. A tf::Subflow object will be created and forwarded to the execution context of the task. All methods you find in tf::Taskflow are applicable for tf::Subflow. +Codestin Search AppSubflow tasks are those created during the execution of a graph. These tasks are spawned from a parent task and are grouped together to a subflow dependency graph. To create a subflow, emplace a callable that takes an argument of type tf::Subflow. A tf::Subflow object will be created and forwarded to the execution context of the task. All methods you find in tf::Taskflow are applicable for tf::Subflow. 1:tf::Taskflowtaskflow; 2:tf::Executorexecutor; 3: @@ -39,7 +38,7 @@ 9:tf::TaskB1=subflow.emplace([](){}).name("B1");//subflowtaskB1 10:tf::TaskB2=subflow.emplace([](){}).name("B2");//subflowtaskB2 11:tf::TaskB3=subflow.emplace([](){}).name("B3");//subflowtaskB3 -12:B1.precede(B3);//B1runsboforeB3 +12:B1.precede(B3);//B1runsbeforeB3 13:B2.precede(B3);//B2runsbeforeB3 14:}).name("B"); 15: @@ -48,10 +47,9 @@ 18:B.precede(D);//DrunsafterB 19:C.precede(D);//DrunsafterC 20: -21:executor.run(taskflow).get();//executethegraphtospawnthesubflow -22:taskflow.dump(std::cout);//dumpthetaskflowtoaDOTformat +21:executor.run(taskflow).get();//executethegraphtospawnthesubflow - + Debrief: @@ -63,16 +61,32 @@ Lines 16-19 add dependencies among A, B, C, and D -Line 21 submits the graph to an executor and waits until it finishes - -Line 22 dumps the entire task dependency graph +Line 21 submits the graph to an executor and waits until it finishes -Lines 8-14 are the main block to enable subflow tasking at task B. The runtime will create a tf::Subflow passing it to task B, and spawn a dependency graph as described by the associated callable. This new subflow graph will be added to the topology of its parent task B. Due to the property of subflow tasking, we cannot dump its structure before execution. 
We will need to run the graph first to spawn the graph and then call tf::Taskflow::dump.
+Lines 8-14 are the main block to enable subflow tasking at task B. The runtime will create a tf::Subflow, pass it to task B, and spawn a dependency graph as described by the associated callable. This new subflow graph will be added to the topology of its parent task B.
+
+
+Codestin Search AppBy default, a tf::Subflow automatically clears its internal task graph once it is joined. After a subflow joins, its structure and associated resources are no longer accessible. This behavior is designed to reduce memory usage, particularly in applications that recursively spawn many subflows. For applications that require post-processing, such as visualizing the subflow through tf::Taskflow::dump, users can disable this default cleanup behavior by calling tf::Subflow::retain with true. This instructs the runtime to retain the subflow's task graph even after it has joined, enabling further inspection or visualization.
+tf::Taskflowtaskflow;
+tf::Executorexecutor;
+
+taskflow.emplace([&](tf::Subflow&sf){
+sf.retain(true);//retainthesubflowafterjoinforvisualization
+autoA=sf.emplace([](){std::cout<<"A\n";});
+autoB=sf.emplace([](){std::cout<<"B\n";});
+autoC=sf.emplace([](){std::cout<<"C\n";});
+A.precede(B,C);//ArunsbeforeBandC
+});//subflowimplicitlyjoinshere
+
+executor.run(taskflow).wait();
+
+//Thesubflowgraphisnowretainedandcanbevisualizedusingtaskflow.dump(...)
+taskflow.dump(std::cout);
+
-Codestin Search App
-By default, a subflow joins its parent task when the program leaves its execution context. All nodes of zero outgoing edges in the subflow precede its parent task. You can explicitly join a subflow within its execution context to carry out recursive patterns. A famous implementation is fibonacci recursion.
+Codestin Search AppBy default, a subflow implicitly joins its parent task when execution leaves its context. All terminal nodes (i.e., nodes with no outgoing edges) in the subflow are guaranteed to precede the parent task. Upon joining, the subflow's task graph and associated resources are automatically cleaned up. If your application needs to access variables defined within the subflow after it joins, you can explicitly join the subflow and handle post-processing accordingly. A common use case is parallelizing recursive computations such as the Fibonacci sequence:
intspawn(intn,tf::Subflow&sbf){
if(n<2)returnn;
intres1,res2;
@@ -88,68 +102,30 @@
executor.run(taskflow).wait();
-The code above computes the fifth fibonacci number using recursive subflow. Calling tf::Subflow::join immediately materializes the subflow by executing all associated tasks to recursively compute fibonacci numbers. The taskflow graph is shown below:
-
+The code above computes the fifth Fibonacci number using recursive subflow. Calling tf::Subflow::join immediately materializes the subflow by executing all associated tasks to recursively compute Fibonacci numbers. The taskflow graph is shown below:
+
-Our implementation to join subflows is recursive in order to preserve the thread context in each subflow task. Having a deep recursion of subflows may cause stack overflow.
-
-
-Codestin Search App
-In contract to joined subflow, you can detach a subflow from its parent task, allowing its execution to flow independently.
-1:tf::Taskflowtaskflow; -2: -3:tf::TaskA=taskflow.emplace([](){}).name("A");//statictaskA -4:tf::TaskC=taskflow.emplace([](){}).name("C");//statictaskC -5:tf::TaskD=taskflow.emplace([](){}).name("D");//statictaskD -6: -7:tf::TaskB=taskflow.emplace([](tf::Subflow&subflow){ -8:tf::TaskB1=subflow.emplace([](){}).name("B1");//statictaskB1 -9:tf::TaskB2=subflow.emplace([](){}).name("B2");//statictaskB2 -10:tf::TaskB3=subflow.emplace([](){}).name("B3");//statictaskB3 -11:B1.precede(B3);//B1runsboforeB3 -12:B2.precede(B3);//B2runsbeforeB3 -13:subflow.detach();//detachthissubflow -14:}).name("B"); -15: -16:A.precede(B);//BrunsafterA -17:A.precede(C);//CrunsafterA -18:B.precede(D);//DrunsafterB -19:C.precede(D);//DrunsafterC -20: -21:tf::Executorexecutor; -22:executor.run(taskflow).wait();//executethegraphtospawnthesubflow -22:taskflow.dump(std::cout);//dumpthetaskflowtoDOTformat - -The figure below demonstrates a detached subflow based on the previous example. A detached subflow will eventually join the topology of its parent task. - - -Detached subflow becomes an independent graph attached to the top-most taskflow. Running a taskflow multiple times will accumulate all detached tasks in the graph. For example, running the above taskflow 5 times results in a total of 19 tasks. -executor.run_n(taskflow,5).wait(); -assert(taskflow.num_tasks()==19); -taskflow.dump(std::cout); - -The dumped graph is shown as follows: - +Using tf::Subflow to implement recursive parallelism like finding Fibonacci numbers may not be as efficient as tf::Runtime due to additional task graph overhead. For more details, readers can refer to Fibonacci Number. + -Codestin Search App -A subflow can be nested or recursive. You can create another subflow from the execution of a subflow and so on. +Codestin Search AppA subflow can be nested or recursive. You can create another subflow from the execution of a subflow and so on. 1:tf::Taskflowtaskflow; 2: -3:tf::TaskA=taskflow.emplace([](tf::Subflow&sbf){ -4:std::cout<<"AspawnsA1&subflowA2\n"; -5:tf::TaskA1=sbf.emplace([](){ -6:std::cout<<"subtaskA1\n"; +3:tf::TaskA=taskflow.emplace([](tf::Subflow&sf){ +4:std::cout<<"AspawnsA1&subflowA2\n"; +5:tf::TaskA1=sf.emplace([](){ +6:std::cout<<"subtaskA1\n"; 7:}).name("A1"); 8: -9:tf::TaskA2=sbf.emplace([](tf::Subflow&sbf2){ -10:std::cout<<"A2spawnsA2_1&A2_2\n"; -11:tf::TaskA2_1=sbf2.emplace([](){ -12:std::cout<<"subtaskA2_1\n"; +9:tf::TaskA2=sf.emplace([](tf::Subflow&sf2){ +10:std::cout<<"A2spawnsA2_1&A2_2\n"; +11:tf::TaskA2_1=sf2.emplace([](){ +12:std::cout<<"subtaskA2_1\n"; 13:}).name("A2_1"); -14:tf::TaskA2_2=sbf2.emplace([](){ -15:std::cout<<"subtaskA2_2\n"; +14:tf::TaskA2_2=sf2.emplace([](){ +15:std::cout<<"subtaskA2_2\n"; 16:}).name("A2_2"); 17:A2_1.precede(A2_2); 18:}).name("A2"); @@ -158,9 +134,8 @@ Lines 8-14 are the main block to enable subflow tasking at task B. The runtime w 21: 22://executethegraphtospawnthesubflow 23:tf::Executor().run(taskflow).get(); -24:taskflow.dump(std::cout); - + Debrief: Line 1 creates a taskflow object @@ -169,12 +144,14 @@ Lines 8-14 are the main block to enable subflow tasking at task B. The runtime w Lines 9-18 spawn another subflow of two tasks A2_1 and A2_2 out of its parent task A2 -Lines 23-24 runs the graph asynchronously and dump its structure when it finishes +Lines 23 runs the defined taskflow graph -Similarly, you can detach a nested subflow from its parent subflow. A detached subflow will run independently and eventually join the topology of its parent subflow. 
+To properly visualize subflows, you must call tf::Subflow::retain on each subflow and execute the taskflow once to ensure all associated subflows are spawned.
+
+
-
+
diff --git a/docs/xml/TaskParallelPipeline.xml b/docs/xml/TaskParallelPipeline.xml
index 08c2eb203..129cb2ffb 100644
--- a/docs/xml/TaskParallelPipeline.xml
+++ b/docs/xml/TaskParallelPipeline.xml
@@ -1,5 +1,5 @@
-
+
TaskParallelPipeline
Codestin Search App
@@ -7,65 +7,62 @@
Include the Header
TaskParallelPipeline_1TaskParallelPipelineIncludeHeaderFile
-
+
Understand the Pipeline Scheduling Framework
TaskParallelPipeline_1UnderstandPipelineScheduling
-
+
Create a Task-parallel Pipeline Module Task
TaskParallelPipeline_1CreateATaskParallelPipelineModuleTask
-
+
Connect Pipeline with Other Tasks
TaskParallelPipeline_1ConnectWithTasks
-
-
- Example 1: Iterate a Pipeline
- TaskParallelPipeline_1IterateAPipeline
-
-
- Example 2: Concatenate Two Pipelines
- TaskParallelPipeline_1ConcatenateTwoPipelines
-
-
- Example 3: Define Multiple Parallel Pipelines
- TaskParallelPipeline_1DefineMultipleTaskParallelPipelines
-
-
-
+
+
+ Example 1: Iterate a Pipeline
+ TaskParallelPipeline_1IterateAPipeline
+
+
+ Example 2: Concatenate Two Pipelines
+ TaskParallelPipeline_1ConcatenateTwoPipelines
+
+
+ Example 3: Define Multiple Parallel Pipelines
+ TaskParallelPipeline_1DefineMultipleTaskParallelPipelines
+
+
+
Reset a Pipeline
TaskParallelPipeline_1ResetPipeline
-
+
Learn More about Taskflow Pipeline
TaskParallelPipeline_1TaskParallelPipelineLearnMore
-
+
Taskflow provides a task-parallel pipeline programming framework for you to implement a pipeline algorithm. Pipeline parallelism refers to a parallel execution of multiple data tokens through a linear chain of pipes or stages. Each stage processes the data token sent from the previous stage, applies the given callable to that data token, and then sends the result to the next stage. Multiple data tokens can be processed simultaneously across different stages.
-Codestin Search App
-You need to include the header file, taskflow/algorithm/pipeline.hpp, for implementing task-parallel pipeline algorithms.
+Codestin Search AppYou need to include the header file, taskflow/algorithm/pipeline.hpp, for implementing task-parallel pipeline algorithms.
#include<taskflow/algorithm/pipeline.hpp>
-Codestin Search App
-A tf::Pipeline object is a composable graph to create a pipeline scheduling framework through a module task in a taskflow (see Composable Tasking). Unlike the conventional pipeline programming frameworks (e.g., Intel TBB Parallel Pipeline), Taskflow's pipeline algorithm does not provide any data abstraction, which often restricts users from optimizing data layouts in their applications, but a flexible framework for users to customize their application data atop an efficient pipeline scheduling framework.
+Codestin Search AppA tf::Pipeline object is a composable graph to create a pipeline scheduling framework through a module task in a taskflow (see Composable Tasking). Unlike the conventional pipeline programming frameworks (e.g., Intel TBB Parallel Pipeline), Taskflow's pipeline algorithm does not provide any data abstraction, which often restricts users from optimizing data layouts in their applications; instead, it offers a flexible framework for users to customize their application data atop an efficient pipeline scheduling framework.
+
The figure above gives an example of our pipeline scheduling framework.
The framework consists of three pipes (serial-parallel-serial stages) and four lines (maximum parallelism), where each line processes at most one data token. A pipeline of three pipes and four lines will propagate each data token through a sequential chain of three pipes and can simultaneously process up to four data tokens at the four lines. Each edge represents a task dependency. For example, the edge from pipe-0 to pipe-1 in line 0 represents the task dependency between the first and the second pipes in the first line; the edge from pipe-0 in line 0 to pipe-0 in line 1 represents the task dependency between two adjacent lines when processing two data tokens at the same pipe. Each pipe can be either a serial type or a parallel type, where a serial pipe processes data tokens sequentially and a parallel pipe processes different data tokens simultaneously. -Due to the nature of pipeline, Taskflow requires the first pipe to be a serial type. The pipeline scheduling algorithm operates in a circular fashion with a factor of line count. +Due to the nature of pipeline, Taskflow requires the first pipe to be a serial type. The pipeline scheduling algorithm operates in a circular fashion with a factor of line count. -Codestin Search App -Taskflow leverages modern C++ and template techniques to strike a balance between the expressiveness and generality in designing the pipeline programming model. In general, there are three steps to create a task-parallel pipeline application: +Codestin Search AppTaskflow leverages modern C++ and template techniques to strike a balance between the expressiveness and generality in designing the pipeline programming model. In general, there are three steps to create a task-parallel pipeline application: Define the pipeline structure (e.g., pipe type, pipe callable, stopping rule, line count) Define the data storage and layout, if needed for the application @@ -80,7 +77,7 @@ 5:constsize_tnum_lines=4; 6: 7://customdatastorage -8:std::array<int, num_lines>buffer; +8:std::array<int, num_lines>buffer; 9: 10://thepipelineconsistsofthreepipes(serial-parallel-serial) 11://anduptofourconcurrentschedulingtokens @@ -92,13 +89,13 @@ 17:} 18://savetheresultofthispipeintothebuffer 19:else{ -20:printf("pipe0:inputtoken=%zu\n",pf.token()); +20:printf("pipe0:inputtoken=%zu\n",pf.token()); 21:buffer[pf.line()]=pf.token(); 22:} 23:}}, 24: 25:tf::Pipe{tf::PipeType::PARALLEL,[&buffer](tf::Pipeflow&pf){ -26:printf( +26:printf( 27:"pipe1:inputbuffer[%zu]=%d\n", 28:pf.line(),buffer[pf.line()] 29:); @@ -107,7 +104,7 @@ 32:}}, 33: 34:tf::Pipe{tf::PipeType::SERIAL,[&buffer](tf::Pipeflow&pf){ -35:printf( +35:printf( 36:"pipe2:inputbuffer[%zu]=%d\n", 37:pf.line(),buffer[pf.line()] 38:); @@ -141,18 +138,18 @@ Line 48 executes the taskflow -Taskflow leverages Interact with the Runtime and Composable Tasking to implement the pipeline scheduling framework. The taskflow graph of this pipeline example is shown as follows, where 1) one condition task is used to decide which runtime task to run and 2) four runtime tasks is used to schedule tokens at four parallel lines, respectively. - +Taskflow leverages Runtime Tasking and Composable Tasking to implement the pipeline scheduling framework. The taskflow graph of this pipeline example is shown as follows, where 1) one condition task is used to decide which runtime task to run and 2) four runtime tasks are used to schedule tokens at four parallel lines, respectively. 
+
In this example, we customize the data storage, buffer, as a one-dimensional array of 4 integers, since the pipeline structure defines only four parallel lines. Each entry of buffer stores the data being processed in the corresponding line. For example, buffer[1] stores the processed data at line 1. The following figure shows the data layout of buffer.
-
+
-In practice, you may need to add padding to the data type of the buffer or align it with the cacheline size to avoid false sharing. If the data type varies at different pipes, you can use std::variant to store the data types in a uniform storage.
+In practice, you may need to add padding to the data type of the buffer or align it with the cacheline size to avoid false sharing. If the data type varies at different pipes, you can use std::variant to store the data types in a uniform storage.
For each scheduling token, you can use tf::Pipeflow::line() to get its line identifier and tf::Pipeflow::pipe() to get its pipe identifier. For example, if a scheduling token is at the third pipe of the fourth line, tf::Pipeflow::line() will return 3 and tf::Pipeflow::pipe() will return 2 (index starts from 0).
To stop the execution of the pipeline, you need to call tf::Pipeflow::stop() at the first pipe. Once the stop signal has been triggered, the pipeline will stop scheduling any new tokens after the callable. As we can see from this example, tf::Pipeline gives you full control to customize your application data on top of a pipeline scheduling framework.
-
+
Calling tf::Pipeflow::stop() not at the first pipe has no effect on the pipeline scheduling.
-In most cases, std::thread::hardware_concurrency is a good number for line count.
+In most cases, std::thread::hardware_concurrency is a good number for line count.
@@ -176,16 +173,14 @@ Our pipeline algorithm schedules tokens in a circular manne
There are a total of five tokens running through three pipes. Each pipe prints its input data value, except the first pipe, which prints its token identifier. Since the second pipe is a parallel pipe, the output can interleave.
-Codestin Search App
-You can connect the pipeline module task with other tasks to create a taskflow application that embeds one or multiple pipeline algorithms. We describe three common examples below:
+Codestin Search AppYou can connect the pipeline module task with other tasks to create a taskflow application that embeds one or multiple pipeline algorithms. We describe three common examples below:
Example 1: Iterate a Pipeline
Example 2: Concatenate Two Pipelines
Example 3: Define Multiple Parallel Pipelines
-Codestin Search App
-This example emulates a data streaming application that iteratively runs a stream of data through a pipeline using conditional tasking. The taskflow graph consists of one pipeline module task and one condition task. The pipeline module task processes a stream of data. The condition task decides the availability of data and reruns the pipeline when the next stream of data becomes available.
+Codestin Search AppThis example emulates a data streaming application that iteratively runs a stream of data through a pipeline using conditional tasking. The taskflow graph consists of one pipeline module task and one condition task. The pipeline module task processes a stream of data. The condition task decides the availability of data and reruns the pipeline when the next stream of data becomes available.
1:tf::Taskflowtaskflow; 2:tf::Executorexecutor; @@ -194,7 +189,7 @@ Our pipeline algorithm schedules tokens in a circular manne 5: 6:inti=0,N=0; 7://customdatastorage -8:std::array<int, num_lines>buffer; +8:std::array<int, num_lines>buffer; 9: 10://thepipelineconsistsofthreepipes(serial-parallel-serial) 11://anduptofourconcurrentschedulingtokens @@ -206,13 +201,13 @@ Our pipeline algorithm schedules tokens in a circular manne 17:} 18://savetheresultofthispipeintothebuffer 19:else{ -20:printf("stage0:inputtoken=%zu\n",pf.token()); +20:printf("stage0:inputtoken=%zu\n",pf.token()); 21:buffer[pf.line()]=pf.token(); 22:} 23:}}, 24: 25:tf::Pipe{tf::PipeType::PARALLEL,[&buffer](tf::Pipeflow&pf){ -26:printf( +26:printf( 27:"stage1:inputbuffer[%zu]=%d\n", 28:pf.line(),buffer[pf.line()] 29:); @@ -221,7 +216,7 @@ Our pipeline algorithm schedules tokens in a circular manne 32:}}, 33: 34:tf::Pipe{tf::PipeType::SERIAL,[&buffer](tf::Pipeflow&pf){ -35:printf( +35:printf( 36:"stage2:inputbuffer[%zu]=%d\n", 37:pf.line(),buffer[pf.line()] 38:); @@ -233,7 +228,7 @@ Our pipeline algorithm schedules tokens in a circular manne 44:tf::Taskconditional=taskflow.emplace([&N,&i](){ 45:i=0; 46:if(++N<2){ -47:std::cout<<"Rerunthepipeline\n"; +47:std::cout<<"Rerunthepipeline\n"; 48:return0; 49:} 50:else{ @@ -244,10 +239,10 @@ Our pipeline algorithm schedules tokens in a circular manne 55://buildthepipelinegraphusingcomposition 56:tf::Taskpipeline=taskflow.composed_of(pl) 57:.name("pipeline"); -58:tf::Taskinitial=taskflow.emplace([](){std::cout<<"initial\n";}) +58:tf::Taskinitial=taskflow.emplace([](){std::cout<<"initial\n";}) 59:.name("initial"); -60:tf::Taskstop=taskflow.emplace([](){std::cout<<"stop\n";}) -61:.name("stop"); +60:tf::Taskstop=taskflow.emplace([](){std::cout<<"stop\n";}) +61:.name("stop"); 62: 63://specifythegraphdependency 64:initial.precede(pipeline); @@ -286,7 +281,7 @@ Our pipeline algorithm schedules tokens in a circular manne The taskflow graph of this pipeline example is illustrated as follows: - + The following snippet shows one of the possible outputs: initial @@ -326,8 +321,7 @@ The taskflow graph of this pipeline example is illustrated as follows: The pipeline runs twice as controlled by the condition task conditional. The starting token in the second run of the pipeline is 5 rather than 0 because the pipeline keeps a stateful number of tokens. The last token is 9, which means the pipeline processes in total 10 scheduling tokens. The first five tokens (token 0 to 4) are processed in the first run, and the remaining five tokens (token 5 to 9) are processed in the second run. In the condition task, we use N as a decision-making counter to process the next stream of data. -Codestin Search App -This example demonstrates two concatenated pipelines where a sequence of data tokens run synchronously from one pipeline to another pipeline. The first pipeline task precedes the second pipeline task. +Codestin Search AppThis example demonstrates two concatenated pipelines where a sequence of data tokens run synchronously from one pipeline to another pipeline. The first pipeline task precedes the second pipeline task. 
1:tf::Taskflowtaskflow("pipeline"); 2:tf::Executorexecutor; 3: @@ -335,8 +329,8 @@ The taskflow graph of this pipeline example is illustrated as follows: 5:constsize_tnum_lines=4; 6: 7://customdatastorage -8:std::array<int, num_lines>buffer_1; -9:std::array<int, num_lines>buffer_2; +8:std::array<int, num_lines>buffer_1; +9:std::array<int, num_lines>buffer_2; 10: 11://thepipeline_1consistsofthreepipes(serial-parallel-serial) 12://anduptofourconcurrentschedulingtokens @@ -348,13 +342,13 @@ The taskflow graph of this pipeline example is illustrated as follows: 18:} 19://savetheresultofthispipeintothebuffer 20:else{ -21:printf("pipeline1,pipe0:inputtoken=%zu\n",pf.token()); +21:printf("pipeline1,pipe0:inputtoken=%zu\n",pf.token()); 22:buffer_1[pf.line()]=pf.token(); 23:} 24:}}, 25: 26:tf::Pipe{tf::PipeType::PARALLEL,[&buffer_1](tf::Pipeflow&pf){ -27:printf( +27:printf( 28:"pipeline1,pipe1:inputbuffer_1[%zu]=%d\n", 29:pf.line(),buffer_1[pf.line()] 30:); @@ -363,7 +357,7 @@ The taskflow graph of this pipeline example is illustrated as follows: 33:}}, 34: 35:tf::Pipe{tf::PipeType::SERIAL,[&buffer_1](tf::Pipeflow&pf){ -36:printf( +36:printf( 37:"pipeline1,pipe2:inputbuffer_1[%zu]=%d\n", 38:pf.line(),buffer_1[pf.line()] 39:); @@ -383,13 +377,13 @@ The taskflow graph of this pipeline example is illustrated as follows: 53:} 54://savetheresultofthispipeintothebuffer 55:else{ -56:printf("pipeline2,pipe0:inputvalue=%d\n",buffer_1[pf.line()]); +56:printf("pipeline2,pipe0:inputvalue=%d\n",buffer_1[pf.line()]); 57:buffer_2[pf.line()]=buffer_1[pf.line()]; 58:} 59:}}, 60: 61:tf::Pipe{tf::PipeType::PARALLEL,[&buffer_2](tf::Pipeflow&pf){ -62:printf( +62:printf( 63:"pipeline2,pipe1:inputbuffer_2[%zu]=%d\n", 64:pf.line(),buffer_2[pf.line()] 65:); @@ -398,7 +392,7 @@ The taskflow graph of this pipeline example is illustrated as follows: 68:}}, 69: 70:tf::Pipe{tf::PipeType::SERIAL,[&buffer_2](tf::Pipeflow&pf){ -71:printf( +71:printf( 72:"pipeline2,pipe2:inputbuffer_2[%zu]=%d\n", 73:pf.line(),buffer_2[pf.line()] 74:); @@ -443,7 +437,7 @@ The taskflow graph of this pipeline example is illustrated as follows: The taskflow graph of this pipeline example is illustrated as follows: - + The following snippet shows one of the possible outputs: pipeline1,pipe0:inputtoken=0 @@ -474,8 +468,7 @@ The taskflow graph of this pipeline example is illustrated as follows: The output of pipelines pl_1 and pl_2 can be different from run to run because their second pipes are both parallel types. Due to the task dependency between pipeline_1 and pipeline_2, the output of pl_1 precedes the output of pl_2. -Codestin Search App -This example creates two independent pipelines that run in parallel on different data sets. +Codestin Search AppThis example creates two independent pipelines that run in parallel on different data sets. 
1:tf::Taskflowtaskflow("pipeline"); 2:tf::Executorexecutor; 3: @@ -483,8 +476,8 @@ The taskflow graph of this pipeline example is illustrated as follows: 5:constsize_tnum_lines=4; 6: 7://customdatastorage -8:std::array<int, num_lines>buffer_1; -9:std::array<int, num_lines>buffer_2; +8:std::array<int, num_lines>buffer_1; +9:std::array<int, num_lines>buffer_2; 10: 11://thepipeline_1consistsofthreepipes(serial-parallel-serial) 12://anduptofourconcurrentschedulingtokens @@ -496,13 +489,13 @@ The taskflow graph of this pipeline example is illustrated as follows: 18:} 19://savetheresultofthispipeintothebuffer 20:else{ -21:printf("pipeline1,pipe0:inputtoken=%zu\n",pf.token()); +21:printf("pipeline1,pipe0:inputtoken=%zu\n",pf.token()); 22:buffer_1[pf.line()]=pf.token(); 23:} 24:}}, 25: 26:tf::Pipe{tf::PipeType::PARALLEL,[&buffer_1](tf::Pipeflow&pf){ -27:printf( +27:printf( 28:"pipeline1,pipe1:inputbuffer_1[%zu]=%d\n", 29:pf.line(),buffer_1[pf.line()] 30:); @@ -511,7 +504,7 @@ The taskflow graph of this pipeline example is illustrated as follows: 33:}}, 34: 35:tf::Pipe{tf::PipeType::SERIAL,[&buffer_1](tf::Pipeflow&pf){ -36:printf( +36:printf( 37:"pipeline1,pipe2:inputbuffer_1[%zu]=%d\n", 38:pf.line(),buffer_1[pf.line()] 39:); @@ -530,13 +523,13 @@ The taskflow graph of this pipeline example is illustrated as follows: 52:} 53://savetheresultofthispipeintothebuffer 54:else{ -55:printf("pipeline2,pipe0:inputtoken=%zu\n",pf.token()); +55:printf("pipeline2,pipe0:inputtoken=%zu\n",pf.token()); 56:buffer_2[pf.line()]="pipeline"; 57:} 58:}}, 59: 60:tf::Pipe{tf::PipeType::PARALLEL,[&buffer_2](tf::Pipeflow&pf){ -61:printf( +61:printf( 62:"pipeline2,pipe1:inputbuffer_2[%zu]=%d\n", 63:pf.line(),buffer_2[pf.line()] 64:); @@ -545,7 +538,7 @@ The taskflow graph of this pipeline example is illustrated as follows: 67:}}, 68: 69:tf::Pipe{tf::PipeType::SERIAL,[&buffer_2](tf::Pipeflow&pf){ -70:printf( +70:printf( 71:"pipeline2,pipe2:inputbuffer_2[%zu]=%d\n", 72:pf.line(),buffer_2[pf.line()] 73:); @@ -558,7 +551,7 @@ The taskflow graph of this pipeline example is illustrated as follows: 80:.name("pipeline_1"); 81:tf::Taskpipeline_2=taskflow.composed_of(pl_2) 82:.name("pipeline_2"); -83:tf::Taskinitial=taskflow.emplace([](){std::cout<<"initial";}) +83:tf::Taskinitial=taskflow.emplace([](){std::cout<<"initial";}) 84:.name("initial"); 85: 86:initial.precede(pipeline_1,pipeline_2); @@ -593,7 +586,7 @@ The taskflow graph of this pipeline example is illustrated as follows: The taskflow graph of this pipeline example is illustrated as follows: - + The following snippet shows one of the possible outputs: initial @@ -632,8 +625,7 @@ The taskflow graph of this pipeline example is illustrated as follows: -Codestin Search App -Our pipeline scheduling framework keeps a stateful number of scheduled tokens at each submitted run. You can reset the pipeline to the initial state using tf::Pipeline::reset(), where the number of scheduled tokens will start from zero in the next run. Borrowed from Example 1: Iterate a Pipeline, the program below resets the pipeline at the second iteration (inside the condition task) so the scheduling token will start from zero in the next run. +Codestin Search AppOur pipeline scheduling framework keeps a stateful number of scheduled tokens at each submitted run. You can reset the pipeline to the initial state using tf::Pipeline::reset(), where the number of scheduled tokens will start from zero in the next run. 
Borrowed from Example 1: Iterate a Pipeline, the program below resets the pipeline at the second iteration (inside the condition task) so the scheduling token will start from zero in the next run. tf::Taskflowtaskflow("pipeline"); tf::Executorexecutor; @@ -641,7 +633,7 @@ The taskflow graph of this pipeline example is illustrated as follows: constsize_tnum_lines=4; //customdatastorage -std::array<int, num_lines>buffer; +std::array<int, num_lines>buffer; //thepipelineconsistsofthreepipes(serial-parallel-serial) //anduptofourconcurrentschedulingtokens @@ -653,13 +645,13 @@ The taskflow graph of this pipeline example is illustrated as follows: } //savetheresultofthispipeintothebuffer else{ -printf("pipe0:inputtoken=%zu\n",pf.token()); +printf("pipe0:inputtoken=%zu\n",pf.token()); buffer[pf.line()]=pf.token(); } }}, tf::Pipe{tf::PipeType::PARALLEL,[&buffer](tf::Pipeflow&pf){ -printf( +printf( "pipe1:inputbuffer_1[%zu]=%d\n",pf.line(),buffer[pf.line()] ); //propagatethepreviousresulttothispipebyaddingone @@ -667,7 +659,7 @@ The taskflow graph of this pipeline example is illustrated as follows: }}, tf::Pipe{tf::PipeType::SERIAL,[&buffer](tf::Pipeflow&pf){ -printf( +printf( "pipe2:inputbuffer[%zu][%zu]=%d\n",pf.line(),buffer[pf.line()] ); //propagatethepreviousresulttothispipebyaddingone @@ -678,7 +670,7 @@ The taskflow graph of this pipeline example is illustrated as follows: tf::Taskconditional=taskflow.emplace([&](){ if(++N<2){ pl.reset(); -std::cout<<"Rerunthepipeline\n"; +std::cout<<"Rerunthepipeline\n"; return0; } else{ @@ -688,9 +680,9 @@ The taskflow graph of this pipeline example is illustrated as follows: tf::Taskpipeline=taskflow.composed_of(pl) .name("pipeline"); -tf::Taskinitial=taskflow.emplace([](){std::cout<<"initial";}) +tf::Taskinitial=taskflow.emplace([](){std::cout<<"initial";}) .name("initial"); -tf::Taskstop=taskflow.emplace([](){std::cout<<"stop";}) +tf::Taskstop=taskflow.emplace([](){std::cout<<"stop";}) .name("stop"); initial.precede(pipeline); @@ -737,8 +729,7 @@ The taskflow graph of this pipeline example is illustrated as follows: The output can be different from run to run, since the second pipe is a parallel type. At the second iteration from the condition task, we reset the pipeline so the token identifier starts from 0 rather than 5. 
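To isolate the effect of tf::Pipeline::reset() from the surrounding condition-task logic, consider the following minimal sketch (an illustrative program, not part of the original example, using a single serial pipe that stops after five tokens):

#include <taskflow/taskflow.hpp>
#include <taskflow/algorithm/pipeline.hpp>
#include <cstdio>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  // one serial pipe that schedules tokens 0-4 and then stops
  tf::Pipeline pl(4, tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf){
    if(pf.token() == 5) {
      pf.stop();
      return;
    }
    printf("token=%zu\n", pf.token());
  }});

  taskflow.composed_of(pl).name("pipeline");

  executor.run(taskflow).wait();  // schedules tokens 0 to 4
  pl.reset();                     // token counting restarts from zero
  executor.run(taskflow).wait();  // schedules tokens 0 to 4 again, not 5 to 9

  return 0;
}

Without the call to pl.reset(), the second run would continue from token 5, as demonstrated in Example 1: Iterate a Pipeline.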
-Codestin Search App
-Visit the following pages to learn more about pipeline:
+Codestin Search AppVisit the following pages to learn more about pipeline:
Task-parallel Scalable Pipeline
Data-parallel Pipeline
@@ -749,6 +740,6 @@
-
+
diff --git a/docs/xml/TaskParallelPipelineWithTokenDependencies.xml b/docs/xml/TaskParallelPipelineWithTokenDependencies.xml
index c54060ad3..34804da0b 100644
--- a/docs/xml/TaskParallelPipelineWithTokenDependencies.xml
+++ b/docs/xml/TaskParallelPipelineWithTokenDependencies.xml
@@ -1,5 +1,5 @@
-
+
TaskParallelPipelineWithTokenDependencies
Codestin Search App
@@ -7,35 +7,34 @@
Understand Token Dependencies
TaskParallelPipelineWithTokenDependencies_1DeferredPipelineTokenDependencies
-
+
Resolve Token Dependencies
TaskParallelPipelineWithTokenDependencies_1DeferredPipelineResolveTokenDependencies
-
+
Include the Header
TaskParallelPipelineWithTokenDependencies_1DeferredPipelineIncludeHeaderFile
-
+
Create a Deferred Pipeline Module Task
TaskParallelPipelineWithTokenDependencies_1CreateADeferredPipelineModuleTask
-
+
Create a Deferred Scalable Pipeline Module Task
TaskParallelPipelineWithTokenDependencies_1CreateADeferredScalablePipelineModuleTask
-
+
Learn More about Taskflow Pipeline
TaskParallelPipelineWithTokenDependencies_1ParalleliDeferredScalablePipelineLearnMore
-
+
Taskflow pipeline allows you to defer the execution of a token to future tokens. This deferral introduces a dependency from a future token to the current token, particularly suitable for many video encoding applications. We recommend reading Task-parallel Pipeline first before learning this interface.
-Codestin Search App
-Token dependencies establish the order in which data tokens should execute in a task-parallel pipeline. When token t1 completes before t2 starts, there is a dependency from t1 to t2. We categorize token dependencies into two types:
+Codestin Search AppToken dependencies establish the order in which data tokens should execute in a task-parallel pipeline. When token t1 completes before t2 starts, there is a dependency from t1 to t2. We categorize token dependencies into two types:
forward token dependencies (FTD): dependencies from earlier to future tokens
backward token dependencies (BTD): dependencies from future to earlier tokens
The following figure illustrates a sample token dependency diagram and its token execution sequence. The edge pointing from token 2 to 5 is an FTD, and those from 8 to 2, 7 to 5, and 9 to 5 are BTDs. Based on the dependencies, the tokens execute in the corresponding execution sequence.


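In code, a token declares such a dependency inside the first pipe through tf::Pipeflow::defer. The following fragment is a minimal sketch (illustrative only; the complete example appears later in this page) that encodes the backward dependency of token 2 on token 8 from the figure above:

tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf){
  if(pf.token() == 11) {
    pf.stop();
    return;
  }
  // on its first visit, token 2 defers to token 8
  if(pf.token() == 2 && pf.num_deferrals() == 0) {
    pf.defer(8);
    return;  // token 2 is put aside until token 8 finishes
  }
  // token 2 reaches this point only after token 8 has completed
}}

Tokens that never call tf::Pipeflow::defer run in their natural order, while a deferred token re-enters the first pipe once all of its declared dependencies are resolved.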
+Codestin Search AppTo resolve the token dependencies, the basic idea is to defer the execution of a token with unresolved dependencies and save the token in a data structure until its dependencies are resolved. To implement the idea, we leverage three data structures, deferred_tokens (DT), token_dependencies (TD), and ready_tokens (RT). DT and TD are associative containers and RT is a queue. DT stores deferred tokens and their dependents by which the deferred tokens are deferred. TD stores a dependent and its related deferred tokens. RT stores the tokens that were deferred tokens and now are ready because their dependencies are resolved. The following image illustrates the usages of the three data structures to resolve the token dependencies and get the corresponding serial execution sequence exemplified in Understand Token Dependencies. The whole process has the following steps: Token 1 is not a deferred token and then 1 is finished. Now the execution sequence is {1}. -Token 2 defers to 8. We insert DT[2]={8} and TD[8]={2}. The black cicle 2 in the above image illustrates this step. +Token 2 defers to 8. We insert DT[2]={8} and TD[8]={2}. The black circle 2 in the above image illustrates this step. Token 3 is not a deferred token and then 3 is finished. Now the execution sequence is {1,3}. Token 4 is not a deferred token and then 4 is finished. Now the execution sequence is {1,3,4}. -Token 5 defers to 2 and 7. We insert DT[5]={2,7}, TD[2]={5}, and TD[7]={5}. The black cicle 5 in the above image illustrates this step. +Token 5 defers to 2 and 7. We insert DT[5]={2,7}, TD[2]={5}, and TD[7]={5}. The black circle 5 in the above image illustrates this step. Token 6 is not a deferred token and then 6 is finished. Now the execution sequence is {1,3,4,6}. -Token 7 is not a deferred token and then 7 is finished. Now the execution sequence is {1,3,4,6,7}. Since TD[7]={5}, we directly remove 7 from DT[5]. The black cicle 7 in the above image illustrates this step. -Token 8 is not a deferred token and then 8 is finished. Now the execution sequence is {1,3,4,6,7,8}. Since TD[8]={2}, we directly remove 8 from DT[2] and find out DT[2] is empty. Now token 2 is no longer a deferred token and we move 2 to RT. The black cicle 8 in the above image illustrates this step. -RT is not empty and has a token 2. Then we finish running 2. Now the execution sequence is {1,3,4,6,7,8,2}. Since TD[2]={5}, we directly remove 2 from DT[5] and find out DT[5] is empty. Now token 5 is no longer a deferred token and we move 5 to RT. The black cicle 9 in the above image illustrates this step. -RT is not empty and has a token 5. Then we run 5 and find out token 5 defers the second time, defers to 9. We insert DT[5]={9} and TD[9]={5}. The black cicle 20 in the above image illustrates this step. -Token 9 is not a deferred token and then 9 is finished. Now the execution sequence is {1,3,4,6,7,8,2,9}. Since TD[9]={5}, we directly remove 9 from DT[5] and find out DT[5] is empty. Now token 5 is no longer a deferred token and we move 5 to RT. The black cicle 11 in the above image illustrates this step. -RT is not empty and has a token 5. Then we finish running 5. Now the execution sequence is {1,3,4,6,7,8,2,9,5}. The black cicle 12 in the above image illustrates this step. +Token 7 is not a deferred token and then 7 is finished. Now the execution sequence is {1,3,4,6,7}. Since TD[7]={5}, we directly remove 7 from DT[5]. The black circle 7 in the above image illustrates this step. 
Token 8 is not a deferred token and then 8 is finished. Now the execution sequence is {1,3,4,6,7,8}. Since TD[8]={2}, we directly remove 8 from DT[2] and find out DT[2] is empty. Now token 2 is no longer a deferred token and we move 2 to RT. The black circle 8 in the above image illustrates this step.
+RT is not empty and has a token 2. Then we finish running 2. Now the execution sequence is {1,3,4,6,7,8,2}. Since TD[2]={5}, we directly remove 2 from DT[5] and find out DT[5] is empty. Now token 5 is no longer a deferred token and we move 5 to RT. The black circle 9 in the above image illustrates this step.
+RT is not empty and has a token 5. Then we run 5 and find out token 5 defers a second time, to token 9. We insert DT[5]={9} and TD[9]={5}. The black circle 10 in the above image illustrates this step.
+Token 9 is not a deferred token and then 9 is finished. Now the execution sequence is {1,3,4,6,7,8,2,9}. Since TD[9]={5}, we directly remove 9 from DT[5] and find out DT[5] is empty. Now token 5 is no longer a deferred token and we move 5 to RT. The black circle 11 in the above image illustrates this step.
+RT is not empty and has a token 5. Then we finish running 5. Now the execution sequence is {1,3,4,6,7,8,2,9,5}. The black circle 12 in the above image illustrates this step.
Token 10 is not a deferred token and then 10 is finished. Now the execution sequence is {1,3,4,6,7,8,2,9,5,10}.
-Codestin Search App
-You need to include the header file, taskflow/algorithm/pipeline.hpp, for implementing deferred pipeline algorithms.
+Codestin Search AppYou need to include the header file, taskflow/algorithm/pipeline.hpp, for implementing deferred pipeline algorithms.
#include<taskflow/algorithm/pipeline.hpp>
-Codestin Search App
-To create a deferred pipeline application, there are four steps, one more step than creating a task-parallel pipeline (tf::Pipeline):
+Codestin Search AppTo create a deferred pipeline application, there are four steps, one more step than creating a task-parallel pipeline (tf::Pipeline):
Define the pipeline structure (e.g., pipe type, pipe callable, stopping rule, line count)
Define the token dependencies at the first pipe
@@ -102,20 +98,20 @@
17:switch(pf.num_deferrals()){
18:case0:
19:pf.defer(2);
-20:printf("1st-time:Token%zuisdeferredby2\n",pf.token());
+20:printf("1st-time:Token%zuisdeferredby2\n",pf.token());
21:pf.defer(7);
-22:printf("1st-time:Token%zuisdeferredby7\n",pf.token());
+22:printf("1st-time:Token%zuisdeferredby7\n",pf.token());
23:return;
24:break;
25:
26:case1:
27:pf.defer(9);
-28:printf("2nd-time:Token%zuisdeferredby9\n",pf.token());
+28:printf("2nd-time:Token%zuisdeferredby9\n",pf.token());
29:return;
30:break;
31:
32:case2:
-33:printf("3rd-time:Tokens2,7and9resolveddependencies\
fortoken%zu\n",pf.token());
+33:printf("3rd-time:Tokens2,7and9resolveddependencies\
fortoken%zu\n",pf.token());
34:break;
35:}
@@ -125,27 +121,27 @@
39:switch(pf.num_deferrals()){
40:case0:
41:pf.defer(8);
-42:printf("1st-time:Token%zuisdeferredby8\n",pf.token());
+42:printf("1st-time:Token%zuisdeferredby8\n",pf.token());
43:break;
44:case1:
-45:printf("2nd-time:Token8resolveddependenciesfortoken%zu\n",
pf.token());
+45:printf("2nd-time:Token8resolveddependenciesfortoken%zu\n",
pf.token());
46:break;
47:}
48:}
49:else{
-50:printf("stage1:Non-deferredtoken%zu\n",pf.token());
+50:printf("stage1:Non-deferredtoken%zu\n",pf.token());
51:}
52:}
53:}},
54:
55:tf::Pipe{tf::PipeType::SERIAL,[](tf::Pipeflow&pf){
-56:printf("stage2:inputtoken%zu(deferrals=%zu)\n",
pf.token(),pf.num_deferrals()); 57:}}, 58: 59:tf::Pipe{tf::PipeType::SERIAL,[](tf::Pipeflow&pf){ -60:printf("stage3:inputtoken%zu\n",pf.token()); +60:printf("stage3:inputtoken%zu\n",pf.token()); 61:}} 62:); 63: @@ -173,7 +169,7 @@ Line 67 executes the taskflow -The following is one of the possible outcomes of the exmaple. +The following is one of the possible outcomes of the example. stage1:Non-deferredtoken0 stage2:inputtoken0(deferrals=0) stage3:inputtoken0 @@ -212,13 +208,12 @@ The following is one of the possible outcomes of the exmaple. stage2:inputtoken10(deferrals=0) stage3:inputtoken10 -You can only specify the token dependencies at the first pipe to get the serial execution of tokens. +You can only specify the token dependencies at the first pipe to get the serial execution of tokens. -Codestin Search App -In addition to task-parallel pipeline (tf::Pipeline), you can specify token dependencies on top of a task-parallel scalable pipeline (tf::ScalablePipeline). We recommend reading Task-parallel Scalable Pipeline first before learning this interface. +Codestin Search AppIn addition to task-parallel pipeline (tf::Pipeline), you can specify token dependencies on top of a task-parallel scalable pipeline (tf::ScalablePipeline). We recommend reading Task-parallel Scalable Pipeline first before learning this interface. To create a deferred scalable pipeline application, there are four steps, which are identical to the steps described in Create a Deferred Pipeline Module Task. They are: Define the pipeline structure (e.g., pipe type, pipe callable, stopping rule, line count) @@ -229,7 +224,7 @@ The following is one of the possible outcomes of the exmaple. The following code creates a deferred scalable pipeline that uses four parallel lines to schedule tokens through two serial pipes in the given vector, then resetting that pipeline to three serial pipes. The three pipe callables are identical to the pipe callables demonstrated in the code snippet in Create a Deferred Pipeline Module Task. The token dependencies are exemplified in Understand Token Dependencies. 1://createavectorofthreepipes -2:std::vector<tf::Pipe<std::function<void(tf::Pipeflow&)>>>pipes; +2:std::vector<tf::Pipe<std::function<void(tf::Pipeflow&)>>>pipes; 3: 4://definepipecallables 5://first_pipe_callableissameaslines15-53intheabovecodesnippet @@ -244,20 +239,20 @@ The following is one of the possible outcomes of the exmaple. 14:switch(pf.num_deferrals()){ 15:case0: 16:pf.defer(2); -17:printf("1st-time:Token%zuisdeferredby2\n",pf.token()); +17:printf("1st-time:Token%zuisdeferredby2\n",pf.token()); 18:pf.defer(7); -19:printf("1st-time:Token%zuisdeferredby7\n",pf.token()); +19:printf("1st-time:Token%zuisdeferredby7\n",pf.token()); 20:return; 21:break; 22: 23:case1: 24:pf.defer(9); -25:printf("2nd-time:Token%zuisdeferredby9\n",pf.token()); +25:printf("2nd-time:Token%zuisdeferredby9\n",pf.token()); 26:return; 27:break; 28: 29:case2: -30:printf("3rd-time:Tokens2,7and9resolveddependenciesfortoken%zu\n", +30:printf("3rd-time:Tokens2,7and9resolveddependenciesfortoken%zu\n", pf.token()); 31:break; 32:} @@ -267,28 +262,28 @@ The following is one of the possible outcomes of the exmaple. 
36:switch(pf.num_deferrals()){ 37:case0: 38:pf.defer(8); -39:printf("1st-time:Token%zuisdeferredby8\n",pf.token()); +39:printf("1st-time:Token%zuisdeferredby8\n",pf.token()); 40:break; 41:case1: -42:printf("2nd-time:Token8resolveddependenciesfortoken%zu\n", +42:printf("2nd-time:Token8resolveddependenciesfortoken%zu\n", pf.token()); 43:break; 44:} 45:} 46:else{ -47:printf("stage1:Non-deferredtoken%zu\n",pf.token()); +47:printf("stage1:Non-deferredtoken%zu\n",pf.token()); 48:} 49:}; 50: 51://second_pipe_callableissameaslines55-57intheabovecodesnippet 52:autosecond_pipe_callable=[](tf::Pipeflow&pf){ -53:printf("stage2:inputtoken%zu(deferrals=%zu)\n", +53:printf("stage2:inputtoken%zu(deferrals=%zu)\n", pf.token(),pf.num_deferrals()); 54:}; 55: 56://third_pipe_callableissameaslines59-61intheabovecodesnippet 57:autothird_pipe_callable=[](tf::Pipeflow&pf){ -58:printf("stage3:inputtoken%zu\n",pf.token()); +58:printf("stage3:inputtoken%zu\n",pf.token()); 59:}; 60: 61:pipes.emplace_back(tf::PipeType::SERIAL,first_pipe_callable); @@ -337,8 +332,7 @@ The following is one of the possible outcomes of the exmaple. -Codestin Search App -Visit the following pages to learn more about pipeline: +Codestin Search AppVisit the following pages to learn more about pipeline: Task-parallel Pipeline Data-parallel Pipeline @@ -350,6 +344,6 @@ The following is one of the possible outcomes of the exmaple. - + diff --git a/docs/xml/TaskParallelScalablePipeline.xml b/docs/xml/TaskParallelScalablePipeline.xml index ca16cf84a..29d944215 100644 --- a/docs/xml/TaskParallelScalablePipeline.xml +++ b/docs/xml/TaskParallelScalablePipeline.xml @@ -1,5 +1,5 @@ - + TaskParallelScalablePipeline Codestin Search App @@ -7,44 +7,42 @@ Include the Header TaskParallelScalablePipeline_1IncludeTheScalablePipelineHeader - + Create a Scalable Pipeline Module Task TaskParallelScalablePipeline_1CreateAScalablePipelineModuleTask - + Reset a Placeholder Scalable Pipeline TaskParallelScalablePipeline_1ResetAPlaceholderScalablePipeline - + Use Other Iterator Types TaskParallelScalablePipeline_1ScalablePipelineUseOtherIteratorTypes - + Learn More about Taskflow Pipeline TaskParallelScalablePipeline_1ParallelScalablePipelineLearnMore - + Unlike tf::Pipeline (see Task-parallel Pipeline) that instantiates all pipes at the construction time, Taskflow provides a scalable alternative called tf::ScalablePipeline to allow variable assignments of pipes using range iterators. A scalable pipeline is thus more flexible for applications to create a pipeline scheduling framework whose pipeline structure depends on runtime variables. -Codestin Search App -You need to include the header file, taskflow/algorithm/pipeline.hpp, for creating a scalable pipeline scheduling framework. +Codestin Search AppYou need to include the header file, taskflow/algorithm/pipeline.hpp, for creating a scalable pipeline scheduling framework. #include<taskflow/algorithm/pipeline.hpp> -Codestin Search App -Similar to tf::Pipeline, tf::ScalablePipeline is a composable graph object to implement a pipeline scheduling framework in a taskflow. The key difference between tf::Pipeline and tf::ScalablePipeline is that a scalable pipeline can accept variable assignments of pipes rather than instantiating all pipes at construction or programming time. Users define a linear range of pipes, each of the same callable type, and apply that range to construct a scalable pipeline. Between successive runs, users can reset the pipeline to a different range of pipes. 
The following code creates a scalable pipeline that uses four parallel lines to schedule tokens through three serial pipes in the given vector, and then resets that pipeline to a new range of five serial pipes:
tf::Taskflowtaskflow("pipeline");
tf::Executorexecutor;

constsize_tnum_lines=4;

//createdatastorage
-std::array<int, num_lines>buffer;
+std::array<int, num_lines>buffer;

//definethepipecallable
autopipe_callable=[&buffer](tf::Pipeflow&pf)mutable{
@@ -56,7 +54,7 @@
pf.stop();
}
else{
-printf("stage1:inputtoken=%zu\n",pf.token());
+printf("stage1:inputtoken=%zu\n",pf.token());
buffer[pf.line()]=pf.token();
}
return;
@@ -66,7 +64,7 @@
//otherstagespropagatethepreviousresulttothispipeand
//incrementitbyone
default:{
-printf(
+printf(
"stage%zu:inputbuffer[%zu]=%d\n",pf.pipe(),pf.line(),buffer[pf.line()]
);
buffer[pf.line()]=buffer[pf.line()]+1;
@@ -76,7 +74,7 @@
};

//createavectorofthreepipes
-std::vector<tf::Pipe<std::function<void(tf::Pipeflow&)>>>pipes;
+std::vector<tf::Pipe<std::function<void(tf::Pipeflow&)>>>pipes;

for(size_ti=0;i<3;i++){
pipes.emplace_back(tf::PipeType::SERIAL,pipe_callable);
@@ -86,11 +84,11 @@
tf::ScalablePipelinepl(num_lines,pipes.begin(),pipes.end());

//buildthepipelinegraphusingcomposition
-tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";})
+tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";})
.name("startingpipeline");
tf::Tasktask=taskflow.composed_of(pl)
.name("pipeline");
-tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";})
+tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";})
.name("pipelinestopped");

//createtaskdependency
@@ -98,7 +96,7 @@
task.precede(stop);

//dumpthepipelinegraphstructure(withcomposition)
-taskflow.dump(std::cout);
+taskflow.dump(std::cout);

//runthepipeline
executor.run(taskflow).wait();
@@ -113,34 +111,33 @@ executor.run(taskflow).wait();

The program defines a uniform pipe type of tf::Pipe<std::function<void(tf::Pipeflow&)>> and keep all pipes in a vector that is amenable to change. Then, it constructs a scalable pipeline using two range iterators, [first, last), that point to the beginning and the end of the pipe vector, resulting in a pipeline of three serial stages:
- 
+ 

Then, the program appends another two pipes into the vector and resets the pipeline to the new range of two additional pipes, resulting in a pipeline of five serial stages:
- 
+ 

When resetting a scalable pipeline to a new range, it will start from the initial state as if it has just been constructed, i.e., the token number counts from zero.

-Unlike tf::Pipeline that keeps the given pipes in a std::tuple object, tf::ScalablePipeline does not own the given pipe but maintains a vector of iterators to each pipe in the given range.
It is your responsibility to keep those pipe objects alive during the execution of the pipeline task.
+Unlike tf::Pipeline that keeps the given pipes in a std::tuple object, tf::ScalablePipeline does not own the given pipe but maintains a vector of iterators to each pipe in the given range. It is your responsibility to keep those pipe objects alive during the execution of the pipeline task.


-Codestin Search App
-It is possible to create a scalable pipeline as a placeholder using the constructor tf::ScalablePipeline(size_t num_lines) and reset it to another range later in the application. The following code creates a task to emplace a range of pipes and reset the pipeline to that range, before running the pipeline task:
+Codestin Search AppIt is possible to create a scalable pipeline as a placeholder using the constructor tf::ScalablePipeline(size_t num_lines) and reset it to another range later in the application. The following code creates a task to emplace a range of pipes and reset the pipeline to that range, before running the pipeline task:
tf::Executorexecutor;
tf::Taskflowtaskflow;

size_tnum_pipes=10;
size_tnum_lines=10;

-std::vector<tf::Pipe<std::function<void(tf::Pipeflow&)>>>pipes;
-tf::ScalablePipeline<typenamedecltype(pipes)::iterator>spl(num_lines);
+std::vector<tf::Pipe<std::function<void(tf::Pipeflow&)>>>pipes;
+tf::ScalablePipeline<typenamedecltype(pipes)::iterator>spl(num_lines);

tf::Taskinit=taskflow.emplace([&](){
for(size_ti=0;i<num_pipes;i++){
pipes.emplace_back(tf::PipeType::SERIAL,[&](tf::Pipeflow&pf){
if(pf.pipe()==0&&pf.token()==1024){
-pf.stop();
-return;
+pf.stop();
+return;
}
});
}
@@ -152,32 +149,30 @@ executor.run(taskflow).wait();

The task graph of this program is shown below:
- 
+ 

It is your responsibility to ensure a scalable pipeline has a valid structure before running it. A valid pipeline must have at least one parallel line and one pipe, where the first pipe is a serial type.

Similarly, you can create an empty scalable pipeline using the default constructor tf::ScalablePipeline() and reset it later in your program.
-std::vector<tf::Pipe<std::function<void(tf::Pipeflow&)>>>pipes;
-tf::ScalablePipeline<typenamedecltype(pipes)::iterator>spl;
+std::vector<tf::Pipe<std::function<void(tf::Pipeflow&)>>>pipes;
+tf::ScalablePipeline<typenamedecltype(pipes)::iterator>spl;

//createpipes...

-spl.reset(num_lines,pipes.begin(),pipes.end());
+spl.reset(num_lines,pipes.begin(),pipes.end());


-Codestin Search App
-When assigning a range to a scalable pipeline, the pipeline fetches all pipe iterators in that range to an internal vector. This organization allows invoking a pipe callable to be a random accessible operation, regardless of the pipe container type. Taskflow does not have much restriction on the iterator type, as long as these pipes can be iterated in a sequential order using the postfix increment operator, ++.
+Codestin Search AppWhen assigning a range to a scalable pipeline, the pipeline fetches all pipe iterators in that range into an internal vector. This organization makes invoking a pipe callable a random-access operation, regardless of the pipe container type. Taskflow does not impose many restrictions on the iterator type, as long as these pipes can be iterated in a sequential order using the postfix increment operator, ++.
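To tie the placeholder construction, the reset call, and the iterator flexibility together, here is a minimal, self-contained sketch; it assumes only the tf::ScalablePipeline interface shown above, and the std::deque container, the two-pipe/two-line sizes, and the three-token stopping rule are illustrative choices rather than anything prescribed by Taskflow:

#include <taskflow/taskflow.hpp>
#include <taskflow/algorithm/pipeline.hpp>
#include <deque>
#include <functional>

int main() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  const size_t num_lines = 2;

  // keep the pipes in a std::deque; the pipeline only walks the
  // range sequentially with operator++
  std::deque<tf::Pipe<std::function<void(tf::Pipeflow&)>>> pipes;
  for (size_t i = 0; i < 2; ++i) {
    pipes.emplace_back(tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
      // the first pipe stops the pipeline after three tokens
      if (pf.pipe() == 0 && pf.token() == 3) {
        pf.stop();
      }
    });
  }

  tf::ScalablePipeline spl(num_lines, pipes.begin(), pipes.end());
  taskflow.composed_of(spl).name("pipeline");
  executor.run(taskflow).wait();

  // grow the range and reset before the next run; the pipe objects must
  // stay alive because the pipeline stores only iterators to them
  pipes.emplace_back(tf::PipeType::SERIAL, [](tf::Pipeflow&) {});
  spl.reset(num_lines, pipes.begin(), pipes.end());
  executor.run(taskflow).wait();

  return 0;
}

Note that appending to a std::deque invalidates its outstanding iterators; this is harmless here only because reset re-fetches fresh begin/end iterators before the pipeline runs again. The shorter vector and list declarations below show the container flexibility in isolation.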
//usevectortostorepipes -std::vector<tf::Pipe<std::function<void(tf::Pipeflow&)>>>vector; +std::vector<tf::Pipe<std::function<void(tf::Pipeflow&)>>>vector; tf::ScalablePipelinespl1(num_lines,vector.begin(),vector.end()); //uselisttostorepipes -std::list<tf::Pipe<std::function<void(tf::Pipeflow&)>>>list; +std::list<tf::Pipe<std::function<void(tf::Pipeflow&)>>>list; tf::ScalablePipelinespl2(num_lines,list.begin(),list.end()); -Codestin Search App -Visit the following pages to learn more about pipeline: +Codestin Search AppVisit the following pages to learn more about pipeline: Task-parallel Pipeline Data-parallel Pipeline @@ -188,6 +183,6 @@ Similarly, you can create an empty scalable pipeline using the default construct - + diff --git a/docs/xml/TaskflowProcessingPipeline.xml b/docs/xml/TaskflowProcessingPipeline.xml index eed99646e..031b1228f 100644 --- a/docs/xml/TaskflowProcessingPipeline.xml +++ b/docs/xml/TaskflowProcessingPipeline.xml @@ -1,5 +1,5 @@ - + TaskflowProcessingPipeline Codestin Search App @@ -7,55 +7,53 @@ Formulate the Taskflow Processing Pipeline Problem TaskflowProcessingPipeline_1FormulateTheTaskflowProcessingPipelineProblem - + Create a Taskflow Processing Pipeline TaskflowProcessingPipeline_1CreateATaskflowProcessingPipeline - - - Define Taskflows - TaskflowProcessingPipeline_1TaskflowPipelineDefineTaskflows - - - Define the Pipes - TaskflowProcessingPipeline_1TaskflowPipelineDefineThePipes - - - Define the Task Graph - TaskflowProcessingPipeline_1TaskflowPipelineDefineTheTaskGraph - - - Submit the Task Graph - TaskflowProcessingPipeline_1TaskflowPipelineSubmitTheTaskGraph - - - + + + Define Taskflows + TaskflowProcessingPipeline_1TaskflowPipelineDefineTaskflows + + + Define the Pipes + TaskflowProcessingPipeline_1TaskflowPipelineDefineThePipes + + + Define the Task Graph + TaskflowProcessingPipeline_1TaskflowPipelineDefineTheTaskGraph + + + Submit the Task Graph + TaskflowProcessingPipeline_1TaskflowPipelineSubmitTheTaskGraph + + + We study a taskflow processing pipeline that propagates a sequence of tokens through linearly dependent taskflows. The pipeline embeds a taskflow in each pipe to run a parallel algorithm using task graph parallelism. -Codestin Search App -Many complex and irregular pipeline applications require each pipe to run a parallel algorithm using task graph parallelism. We can formulate such applications as scheduling a sequence of tokens through linearly dependent taskflows. The following example illustrates the pipeline propagation of three scheduling tokens through three linearly dependent taskflows: - - +Codestin Search AppMany complex and irregular pipeline applications require each pipe to run a parallel algorithm using task graph parallelism. We can formulate such applications as scheduling a sequence of tokens through linearly dependent taskflows. The following example illustrates the pipeline propagation of three scheduling tokens through three linearly dependent taskflows: + + Each pipe (stage) in the pipeline embeds a taskflow to perform a stage-specific parallel algorithm on an input scheduling token. Parallelism exhibits both inside and outside the three taskflows, combining both task graph parallelism and pipeline parallelism. -Codestin Search App -Using the example from the previous section, we create a pipeline of three serial pipes each running a taskflow on a sequence of five scheduling tokens. 
The overall implementation is shown below: +Codestin Search AppUsing the example from the previous section, we create a pipeline of three serial pipes each running a taskflow on a sequence of five scheduling tokens. The overall implementation is shown below: #include<taskflow/taskflow.hpp> #include<taskflow/algorithm/pipeline.hpp> //taskflowonthefirstpipe voidmake_taskflow1(tf::Taskflow&tf){ auto[A1,B1,C1,D1]=tf.emplace( -[](){printf("A1\n");}, -[](){printf("B1\n");}, -[](){printf("C1\n");}, -[](){printf("D1\n");} +[](){printf("A1\n");}, +[](){printf("B1\n");}, +[](){printf("C1\n");}, +[](){printf("D1\n");} ); A1.precede(B1,C1); D1.succeed(B1,C1); @@ -64,10 +62,10 @@ //taskflowonthesecondpipe voidmake_taskflow2(tf::Taskflow&tf){ auto[A2,B2,C2,D2]=tf.emplace( -[](){printf("A2\n");}, -[](){printf("B2\n");}, -[](){printf("C2\n");}, -[](){printf("D2\n");} +[](){printf("A2\n");}, +[](){printf("B2\n");}, +[](){printf("C2\n");}, +[](){printf("D2\n");} ); tf.linearize({A2,B2,C2,D2}); } @@ -75,10 +73,10 @@ //taskflowonthethirdpipe voidmake_taskflow3(tf::Taskflow&tf){ auto[A3,B3,C3,D3]=tf.emplace( -[](){printf("A3\n");}, -[](){printf("B3\n");}, -[](){printf("C3\n");}, -[](){printf("D3\n");} +[](){printf("A3\n");}, +[](){printf("B3\n");}, +[](){printf("C3\n");}, +[](){printf("D3\n");} ); A3.precede(B3,C3,D3); } @@ -93,7 +91,7 @@ //definethetaskflowstorage //weusethepipedimensionbecausewecreatethree'serial'pipes -std::array<tf::Taskflow, num_pipes>taskflows; +std::array<tf::Taskflow, num_pipes>taskflows; //createthreedifferenttaskflowsforthethreepipes make_taskflow1(taskflows[0]); @@ -110,7 +108,7 @@ pf.stop(); return; } -printf("begintoken%zu\n",pf.token()); +printf("begintoken%zu\n",pf.token()); executor.corun(taskflows[pf.pipe()]); }}, @@ -126,11 +124,11 @@ ); //buildthepipelinegraphusingcomposition -tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";}) +tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";}) .name("startingpipeline"); tf::Tasktask=taskflow.composed_of(pl) .name("pipeline"); -tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";}) +tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";}) .name("pipelinestopped"); //createtaskdependency @@ -138,7 +136,7 @@ task.precede(stop); //dumpthepipelinegraphstructure(withcomposition) -taskflow.dump(std::cout); +taskflow.dump(std::cout); //runthepipeline executor.run(taskflow).wait(); @@ -147,15 +145,14 @@ } -Codestin Search App -First, we define three taskflows for the three pipes in the pipeline: +Codestin Search AppFirst, we define three taskflows for the three pipes in the pipeline: //taskflowonthefirstpipe voidmake_taskflow1(tf::Taskflow&tf){ auto[A1,B1,C1,D1]=tf.emplace( -[](){printf("A1\n");}, -[](){printf("B1\n");}, -[](){printf("C1\n");}, -[](){printf("D1\n");} +[](){printf("A1\n");}, +[](){printf("B1\n");}, +[](){printf("C1\n");}, +[](){printf("D1\n");} ); A1.precede(B1,C1); D1.succeed(B1,C1); @@ -164,10 +161,10 @@ //taskflowonthesecondpipe voidmake_taskflow2(tf::Taskflow&tf){ auto[A2,B2,C2,D2]=tf.emplace( -[](){printf("A2\n");}, -[](){printf("B2\n");}, -[](){printf("C2\n");}, -[](){printf("D2\n");} +[](){printf("A2\n");}, +[](){printf("B2\n");}, +[](){printf("C2\n");}, +[](){printf("D2\n");} ); tf.linearize({A2,B2,C2,D2}); } @@ -175,16 +172,16 @@ //taskflowonthethirdpipe voidmake_taskflow3(tf::Taskflow&tf){ auto[A3,B3,C3,D3]=tf.emplace( -[](){printf("A3\n");}, -[](){printf("B3\n");}, -[](){printf("C3\n");}, -[](){printf("D3\n");} +[](){printf("A3\n");}, +[](){printf("B3\n");}, +[](){printf("C3\n");}, 
+[](){printf("D3\n");} ); A3.precede(B3,C3,D3); } As each taskflow corresponds to a pipe in the pipeline, we create a linear array to store the three taskflows: -std::array<tf::Taskflow, num_pipes>taskflows; +std::array<tf::Taskflow, num_pipes>taskflows; make_taskflow1(taskflows[0]); make_taskflow2(taskflows[1]); make_taskflow3(taskflows[2]); @@ -192,15 +189,14 @@ Since the three taskflows are linearly dependent, at most one taskflow will run at a pipe. We can store the three taskflows in a linear array of dimension equal to the number of pipes. If there is a parallel pipe, we need to use two-dimensional array, as multiple taskflows at a stage can run simultaneously across parallel lines. -Codestin Search App -The pipe definition is straightforward. Each pipe runs the corresponding taskflow, which can be indexed at taskflows with the pipe's identifier, tf::Pipeflow::pipe(). The first pipe will cease the pipeline scheduling when it has processed five scheduling tokens: +Codestin Search AppThe pipe definition is straightforward. Each pipe runs the corresponding taskflow, which can be indexed at taskflows with the pipe's identifier, tf::Pipeflow::pipe(). The first pipe will cease the pipeline scheduling when it has processed five scheduling tokens: //firstpiperunstaskflow1 tf::Pipe{tf::PipeType::SERIAL,[&](tf::Pipeflow&pf){ if(pf.token()==5){ pf.stop(); return; } -printf("begintoken%zu\n",pf.token()); +printf("begintoken%zu\n",pf.token()); executor.corun(taskflows[pf.pipe()]); }}, @@ -214,30 +210,28 @@ executor.corun(taskflows[pf.pipe()]); }} -At each pipe, we use tf::Executor::corun to execute the corresponding taskflow and wait until the execution completes. This is important because we want te caller thread, which is the worker that invokes the pipe callable, to not block (i.e., executor.run(taskflows[pf.pipe()]).wait()) but participate in the work-stealing loop of the scheduler to avoid deadlock. +At each pipe, we use tf::Executor::corun to execute the corresponding taskflow and wait until the execution completes. This is important because we want the caller thread, which is the worker that invokes the pipe callable, to not block (i.e., executor.run(taskflows[pf.pipe()]).wait()) but participate in the work-stealing loop of the scheduler to avoid deadlock. 
-Codestin Search App
-To build up the taskflow for the pipeline, we create a module task with the defined pipeline structure and connect it with two tasks that output helper messages before and after the pipeline:
-tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";})
+Codestin Search AppTo build up the taskflow for the pipeline, we create a module task with the defined pipeline structure and connect it with two tasks that output helper messages before and after the pipeline:
+tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";})
.name("startingpipeline");
tf::Tasktask=taskflow.composed_of(pl)
.name("pipeline");
-tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";})
+tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";})
.name("pipelinestopped");

init.precede(task);
task.precede(stop);

- 
+ 

-Codestin Search App
-Finally, we submit the taskflow to the execution and run it once:
+Codestin Search AppFinally, we submit the taskflow to the executor and run it once:
executor.run(taskflow).wait();

One possible output is shown below:
-ready
+ready
begintoken0
A1
C1
@@ -308,6 +302,6 @@


- 
+ 

diff --git a/docs/xml/TextProcessingPipeline.xml b/docs/xml/TextProcessingPipeline.xml
index 77856655b..baf1cebf7 100644
--- a/docs/xml/TextProcessingPipeline.xml
+++ b/docs/xml/TextProcessingPipeline.xml
@@ -1,5 +1,5 @@
- 
+ 

TextProcessingPipeline
Codestin Search App
@@ -7,38 +7,37 @@
Formulate the Text Processing Pipeline Problem
TextProcessingPipeline_1FormulateTheTextProcessingPipelineProblem
- 
+ 
Create a Text Processing Pipeline
TextProcessingPipeline_1CreateAParallelTextPipeline
- 
- 
- Define the Data Buffer
- TextProcessingPipeline_1TextPipelineDefineTheDataBuffer
- 
- 
- Define the Pipes
- TextProcessingPipeline_1TextPipelineDefineThePipes
- 
- 
- Define the Task Graph
- TextProcessingPipeline_1TextPipelineDefineTheTaskGraph
- 
- 
- Submit the Task Graph
- TextProcessingPipeline_1TextPipelineSubmitTheTaskGraph
- 
- 
- 
+ 
+ 
+ Define the Data Buffer
+ TextProcessingPipeline_1TextPipelineDefineTheDataBuffer
+ 
+ 
+ Define the Pipes
+ TextProcessingPipeline_1TextPipelineDefineThePipes
+ 
+ 
+ Define the Task Graph
+ TextProcessingPipeline_1TextPipelineDefineTheTaskGraph
+ 
+ 
+ Submit the Task Graph
+ TextProcessingPipeline_1TextPipelineSubmitTheTaskGraph
+ 
+ 
+ 

We study a text processing pipeline that finds the most frequent character of each string from an input source. Parallelism exhibits in the form of a three-stage pipeline that transforms the input string to a final pair type.

-Codestin Search App
-Given an input vector of strings, we want to compute the most frequent character for each string using a series of transform operations. For example:
-#inputstrings
+Codestin Search AppGiven an input vector of strings, we want to compute the most frequent character for each string using a series of transform operations. For example:
+#inputstrings
abade
ddddf
eefge
@@ -66,14 +65,13 @@

The first and the third stages process inputs and generate results in serial, and the second stage can run in parallel. The algorithm is a perfect fit to pipeline parallelism, as different stages can overlap with each other in time across parallel lines.

-Codestin Search App
-We create a pipeline of three pipes (stages) and two parallel lines to solve the problem. The number of parallel lines is a tunable parameter. In most cases, we can just use std::thread::hardware_concurrency as the line count.
+Codestin Search AppWe create a pipeline of three pipes (stages) and two parallel lines to solve the problem. The number of parallel lines is a tunable parameter. In most cases, we can just use std::thread::hardware_concurrency as the line count.
The first pipe reads an input string from the vector in order, the second pipe transforms the input string from the first pipe to a frequency map in parallel, and the third pipe reduces the frequency map to find the most frequent character. The overall implementation is shown below: +Codestin Search AppWe create a pipeline of three pipes (stages) and two parallel lines to solve the problem. The number of parallel lines is a tunable parameter. In most cases, we can just use std::thread::hardware_concurrency as the line count. The first pipe reads an input string from the vector in order, the second pipe transforms the input string from the first pipe to a frequency map in parallel, and the third pipe reduces the frequency map to find the most frequent character. The overall implementation is shown below: #include<taskflow/taskflow.hpp> #include<taskflow/algorithm/pipeline.hpp> //Function:formatthemap -std::stringformat_map(conststd::unordered_map<char, size_t>&map){ -std::ostringstreamoss; +std::stringformat_map(conststd::unordered_map<char, size_t>&map){ +std::ostringstreamoss; for(constauto&[i,j]:map){ oss<<i<<':'<<j<<''; } @@ -88,7 +86,7 @@ constsize_tnum_lines=2; //inputdata -std::vector<std::string>input={ +std::vector<std::string>input={ "abade", "ddddf", "eefge", @@ -99,10 +97,10 @@ }; //customdatastorage -usingdata_type=std::variant< -std::string,std::unordered_map<char, size_t>,std::pair<char, size_t> +usingdata_type=std::variant< +std::string,std::unordered_map<char, size_t>,std::pair<char, size_t> >; -std::array<data_type, num_lines>mybuffer; +std::array<data_type, num_lines>mybuffer; //thepipelineconsistsofthreepipes(serial-parallel-serial) //anduptotwoconcurrentschedulingtokens @@ -114,39 +112,39 @@ pf.stop(); } else{ -printf("stage1:inputtoken=%s\n",input[pf.token()].c_str()); +printf("stage1:inputtoken=%s\n",input[pf.token()].c_str()); mybuffer[pf.line()]=input[pf.token()]; } }}, //secondpipecountsthefrequencyofeachcharacter tf::Pipe{tf::PipeType::PARALLEL,[&](tf::Pipeflow&pf){ -std::unordered_map<char, size_t>map; +std::unordered_map<char, size_t>map; for(autoc:std::get<std::string>(mybuffer[pf.line()])){ map[c]++; } -printf("stage2:map=%s\n",format_map(map).c_str()); +printf("stage2:map=%s\n",format_map(map).c_str()); mybuffer[pf.line()]=map; }}, //thirdpipereducesthemostfrequentcharacter tf::Pipe{tf::PipeType::SERIAL,[&mybuffer](tf::Pipeflow&pf){ auto&map=std::get<std::unordered_map<char,size_t>>(mybuffer[pf.line()]); -autosol=std::max_element(map.begin(),map.end(),[](auto&a,auto&b){ +autosol=std::max_element(map.begin(),map.end(),[](auto&a,auto&b){ returna.second<b.second; }); -printf("stage3:%c:%zu\n",sol->first,sol->second); +printf("stage3:%c:%zu\n",sol->first,sol->second); //notnecessarytostorethelast-stagedata,justfordemopurpose mybuffer[pf.line()]=*sol; }} ); //buildthepipelinegraphusingcomposition -tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";}) +tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";}) .name("startingpipeline"); tf::Tasktask=taskflow.composed_of(pl) .name("pipeline"); -tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";}) +tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";}) .name("pipelinestopped"); //createtaskdependency @@ -154,7 +152,7 @@ task.precede(stop); //dumpthepipelinegraphstructure(withcomposition) -taskflow.dump(std::cout); +taskflow.dump(std::cout); //runthepipeline executor.run(taskflow).wait(); @@ -163,70 +161,66 @@ } -Codestin Search App -Taskflow does not provide any data abstraction to perform pipeline 
scheduling, but give users full control over data management in their applications. In this example, we create an one-dimensional buffer of a std::variant data type to store the output of each pipe in a uniform storage:
-usingdata_type=std::variant<
-std::string,std::unordered_map<char, size_t>,std::pair<char, size_t>
+Codestin Search AppTaskflow does not provide any data abstraction to perform pipeline scheduling, but gives users full control over data management in their applications. In this example, we create a one-dimensional buffer of a std::variant data type to store the output of each pipe in a uniform storage:
+usingdata_type=std::variant<
+std::string,std::unordered_map<char, size_t>,std::pair<char, size_t>
>;
-std::array<std::array<data_type, num_pipes>,num_lines>mybuffer;
+std::array<data_type, num_lines>mybuffer;

-One-dimensional buffer is sufficient because Taskflow enables only one scheduling token per line at a time.
+A one-dimensional buffer is sufficient because Taskflow enables only one scheduling token per line at a time.

-Codestin Search App
-The first pipe reads one string and puts it in the corresponding entry at the buffer, mybuffer[pf.line()]. Since we read in each string in order, we declare the pipe as a serial type:
+Codestin Search AppThe first pipe reads one string and puts it in the corresponding entry at the buffer, mybuffer[pf.line()]. Since we read in each string in order, we declare the pipe as a serial type:
tf::Pipe{tf::PipeType::SERIAL,[&](tf::Pipeflow&pf){
if(pf.token()==input.size()){
pf.stop();
}
else{
mybuffer[pf.line()]=input[pf.token()];
-printf("stage1:inputtoken=%s\n",input[pf.token()].c_str());
+printf("stage1:inputtoken=%s\n",input[pf.token()].c_str());
}
}},

The second pipe needs to get the input string from the previous pipe and then transforms that input string into a frequency map that records the occurrence of each character in the string. As multiple transforms can operate simultaneously, we declare the pipe as a parallel type:
tf::Pipe{tf::PipeType::PARALLEL,[&](tf::Pipeflow&pf){
-std::unordered_map<char, size_t>map;
+std::unordered_map<char, size_t>map;
for(autoc:std::get<std::string>(mybuffer[pf.line()])){
map[c]++;
}
mybuffer[pf.line()]=map;
-printf("stage2:map=%s\n",format_map(map).c_str());
+printf("stage2:map=%s\n",format_map(map).c_str());
}}

Similarly, the third pipe needs to get the input frequency map from the previous pipe and then reduces the result to find the most frequent character. We may not need to store the result in the buffer but other places defined by the application (e.g., an output file).
As we want to output the result in the same order as the input, we declare the pipe as a serial type:
tf::Pipe{tf::PipeType::SERIAL,[&mybuffer](tf::Pipeflow&pf){
auto&map=std::get<std::unordered_map<char,size_t>>(mybuffer[pf.line()]);
-autosol=std::max_element(map.begin(),map.end(),[](auto&a,auto&b){
+autosol=std::max_element(map.begin(),map.end(),[](auto&a,auto&b){
returna.second<b.second;
});
-printf("stage3:%c:%zu\n",sol->first,sol->second);
+printf("stage3:%c:%zu\n",sol->first,sol->second);
}}


-Codestin Search App
-To build up the taskflow graph for the pipeline, we create a module task out of the pipeline structure and connect it with two tasks that outputs messages before and after the pipeline:
-tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";})
+Codestin Search AppTo build up the taskflow graph for the pipeline, we create a module task out of the pipeline structure and connect it with two tasks that output messages before and after the pipeline:
+tf::Taskinit=taskflow.emplace([](){std::cout<<"ready\n";})
.name("startingpipeline");
tf::Tasktask=taskflow.composed_of(pl)
.name("pipeline");
-tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";})
+tf::Taskstop=taskflow.emplace([](){std::cout<<"stopped\n";})
.name("pipelinestopped");

init.precede(task);
task.precede(stop);


-Codestin Search App
-Finally, we submit the taskflow to the execution and run it once:
+Codestin Search AppFinally, we submit the taskflow to the executor and run it once:
executor.run(taskflow).wait();

As the second stage is a parallel pipe, the output may interleave. One possible result is shown below:
-ready
+ready
stage1:inputtoken=abade
stage1:inputtoken=ddddf
stage2:map=f:1d:4
@@ -251,11 +245,11 @@ stopped

We can see seven outputs at the third stage that show the most frequent character for each of the seven strings in order (a:2, d:4, e:3, z:2, j:4, i:4, k:3).
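As a quick standalone check of the stage-2 and stage-3 logic outside the pipeline, the following sketch mirrors the two pipe callables above on the first input string and prints a:2; it is an illustrative example rather than part of the original program:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <string>
#include <unordered_map>

int main() {

  const std::string input = "abade";

  // stage-2 transform: count the occurrence of each character
  std::unordered_map<char, std::size_t> map;
  for (char c : input) {
    map[c]++;
  }

  // stage-3 reduction: pick the most frequent character
  auto sol = std::max_element(map.begin(), map.end(),
    [](const auto& a, const auto& b) { return a.second < b.second; });

  std::printf("%c:%zu\n", sol->first, sol->second);  // prints "a:2"

  return 0;
}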
The taskflow graph of this pipeline workload is shown below: - + - + diff --git a/docs/xml/algorithms_8dox.xml b/docs/xml/algorithms_8dox.xml index 635e98716..6352681ea 100644 --- a/docs/xml/algorithms_8dox.xml +++ b/docs/xml/algorithms_8dox.xml @@ -1,5 +1,5 @@ - + algorithms.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/async__task_8hpp.xml b/docs/xml/async__task_8hpp.xml index 1c3eff2d2..22f6743e0 100644 --- a/docs/xml/async__task_8hpp.xml +++ b/docs/xml/async__task_8hpp.xml @@ -1,7 +1,251 @@ - + async_task.hpp + graph.hpp + taskflow/core/executor.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::AsyncTask tf @@ -9,6 +253,6 @@ - + diff --git a/docs/xml/async__tasking_8dox.xml b/docs/xml/async__tasking_8dox.xml index e3f43e8a5..d43336a49 100644 --- a/docs/xml/async__tasking_8dox.xml +++ b/docs/xml/async__tasking_8dox.xml @@ -1,5 +1,5 @@ - + async_tasking.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/benchmark__taskflow_8dox.xml b/docs/xml/benchmark__taskflow_8dox.xml index e4d35b525..e26a20269 100644 --- a/docs/xml/benchmark__taskflow_8dox.xml +++ b/docs/xml/benchmark__taskflow_8dox.xml @@ -1,5 +1,5 @@ - + benchmark_taskflow.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/cancellation_8dox.xml b/docs/xml/cancellation_8dox.xml index 2a5f9e892..d0d116fcb 100644 --- a/docs/xml/cancellation_8dox.xml +++ b/docs/xml/cancellation_8dox.xml @@ -1,5 +1,5 @@ - + cancellation.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/classtf_1_1AnchorGuard.xml b/docs/xml/classtf_1_1AnchorGuard.xml new file mode 100644 index 000000000..b7c3c98e0 --- /dev/null +++ b/docs/xml/classtf_1_1AnchorGuard.xml @@ -0,0 +1,66 @@ + + + + tf::AnchorGuard + + + Node * + Node* tf::AnchorGuard::_node + + _node + tf::AnchorGuard::_node + + + + + + + + + + + + + tf::AnchorGuard::AnchorGuard + (Node *node) + AnchorGuard + tf::AnchorGuard::AnchorGuard + + Node * + node + + + + + + + + + + + + tf::AnchorGuard::~AnchorGuard + () + ~AnchorGuard + tf::AnchorGuard::~AnchorGuard + + + + + + + + + + + + + + + + tf::AnchorGuard_node + tf::AnchorGuardAnchorGuard + tf::AnchorGuard~AnchorGuard + + + diff --git a/docs/xml/classtf_1_1AsyncTask.xml b/docs/xml/classtf_1_1AsyncTask.xml index d7184ac67..d62e660d1 100644 --- a/docs/xml/classtf_1_1AsyncTask.xml +++ b/docs/xml/classtf_1_1AsyncTask.xml @@ -1,14 +1,15 @@ - + tf::AsyncTask - async_task.hpp - + taskflow/core/async_task.hpp + class friend class Executor Executor + tf::AsyncTask::Executor Executor @@ -18,15 +19,16 @@ - + - - + + Node * Node* tf::AsyncTask::_node _node + tf::AsyncTask::_node {nullptr} @@ -34,15 +36,16 @@ - + - - + + tf::AsyncTask::AsyncTask ()=default AsyncTask + tf::AsyncTask::AsyncTask constructs an empty task handle @@ -50,173 +53,209 @@ - + tf::AsyncTask::~AsyncTask () ~AsyncTask + tf::AsyncTask::~AsyncTask -destroys the managed asynchronous task if this is the last owner +destroys the managed dependent-async task if this is the last owner - + tf::AsyncTask::AsyncTask (const AsyncTask &rhs) AsyncTask + tf::AsyncTask::AsyncTask const AsyncTask & rhs -constructs an asynchronous task that shares ownership of rhs +constructs a 
dependent-async task that shares ownership of rhs




- 
+ 


tf::AsyncTask::AsyncTask
(AsyncTask &&rhs)
AsyncTask
+ tf::AsyncTask::AsyncTask

AsyncTask &&
rhs


-move-constructs an asynchronous task from rhs
+move-constructs a dependent-async task from rhs




- 
+ 


AsyncTask &
AsyncTask & tf::AsyncTask::operator=
(const AsyncTask &rhs)
operator=
+ tf::AsyncTask::operator=

const AsyncTask &
rhs


-copy-assigns the asynchronous task from rhs
+copy-assigns the dependent-async task from rhs

Releases the managed object of this and retains a new shared ownership of rhs.


- 
+ 


AsyncTask &
AsyncTask & tf::AsyncTask::operator=
(AsyncTask &&rhs)
operator=
+ tf::AsyncTask::operator=

AsyncTask &&
rhs


-move-assigns the asynchronous task from rhs
+move-assigns the dependent-async task from rhs

Releases the managed object of this and takes over the ownership of rhs.


- 
+ 


bool
bool tf::AsyncTask::empty
() const
empty
+ tf::AsyncTask::empty


-checks if the asynchronous task stores nothing
+checks if this dependent-async task is associated with any task

+An empty dependent-async task is not associated with any task created from the executor.
+tf::AsyncTasktask;
+assert(task.empty());



- 
+ 


void
void tf::AsyncTask::reset
()
reset
+ tf::AsyncTask::reset


release the managed object of this

+Releases the ownership of the managed task, if any. After the call *this manages no task.
+tf::AsyncTasktask=executor.silent_dependent_async([](){});
+assert(task.empty()==false);

+task.reset();
+assert(task.empty()==true);



- 
+ 


size_t
size_t tf::AsyncTask::hash_value
() const
hash_value
+ tf::AsyncTask::hash_value


-obtains a hash value of this asynchronous task
+obtains the hashed value of this dependent-async task

+tf::AsyncTasktask=executor.silent_dependent_async([](){});
+std::cout<<task.hash_value()<<'\n';



- 
+ 


size_t
size_t tf::AsyncTask::use_count
() const
use_count
+ tf::AsyncTask::use_count


-returns the number of shared owners that are currently managing this asynchronous task
+returns the number of shared owners that are currently managing this dependent-async task

+In a multithreaded environment, use_count atomically retrieves (with memory_order_relaxed load) the number of tf::AsyncTask instances that manage the current task.
+tf::AsyncTasktask;
+assert(task.use_count()==0);



- 
+ 


bool
bool tf::AsyncTask::is_done
() const
is_done
+ tf::AsyncTask::is_done


-returns the boolean indicating whether the async task is done
+checks if this dependent-async task finishes

+In a multithreaded environment, is_done atomically retrieves (with memory_order_acquire load) the underlying state bit that indicates the completion of this dependent-async task. If the dependent-async task is empty, returns true.
+tf::AsyncTasktask=executor.silent_dependent_async([](){});
+while(task.is_done()==false);
+std::cout<<"dependent-asynctaskfinishes\n";
+
+task.reset();
+assert(task.is_done()==true);



- 
+ 

- 
- 
+ 
+ 

tf::AsyncTask::AsyncTask
(Node *)
AsyncTask
+ tf::AsyncTask::AsyncTask

Node *
ptr




- 
+ 


void
void tf::AsyncTask::_incref
()
_incref
+ tf::AsyncTask::_incref




- 
+ 


void
void tf::AsyncTask::_decref
()
_decref
+ tf::AsyncTask::_decref




- 
+ 

- 
+ 

-class to create a dependent asynchronous task
+class to hold a dependent asynchronous task with shared ownership

-A tf::AsyncTask is a lightweight handle that retains shared ownership of a dependent async task created by an executor. This shared ownership ensures that the async task remains alive when adding it to the dependency list of another async task, thus avoiding the classical ABA problem.
+A tf::AsyncTask is a lightweight handle that retains shared ownership of a dependent asynchronous (dependent-async) task created by an executor. This shared ownership ensures that the dependent-async task remains alive when adding it to the dependency list of another dependent-async task, thus avoiding the classical ABA problem. //mainthreadretainssharedownershipofasynctaskA tf::AsyncTaskA=executor.silent_dependent_async([](){}); @@ -268,9 +309,12 @@ //whenbeingaddedtothedependencylistofasynctaskB tf::AsyncTaskB=executor.silent_dependent_async([](){},A); -Currently, tf::AsyncTask is implemented based on the logic of C++ smart pointer std::shared_ptr and is considered cheap to copy or move as long as only a handful of objects own it. When a worker completes an async task, it will remove the task from the executor, decrementing the number of shared owners by one. If that counter reaches zero, the task is destroyed. +tf::AsyncTask is implemented based on the logic of C++ smart pointer std::shared_ptr and is considered cheap to copy or move as long as only a handful of objects own it. When a worker completes an async task, it will remove the task from the executor, decrementing the number of shared owners by one. If that counter reaches zero, the task is destroyed. +To know more about dependent-async task, please refer to Asynchronous Tasking with Dependencies. + + - + tf::AsyncTask_decref tf::AsyncTask_incref diff --git a/docs/xml/classtf_1_1BoundedTaskQueue.xml b/docs/xml/classtf_1_1BoundedTaskQueue.xml new file mode 100644 index 000000000..1ed254256 --- /dev/null +++ b/docs/xml/classtf_1_1BoundedTaskQueue.xml @@ -0,0 +1,388 @@ + + + + tf::BoundedTaskQueue + taskflow/core/tsq.hpp + + + typename T + + + size_t + LogSize + LogSize + TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE + + + + + int64_t + int64_t tf::BoundedTaskQueue< T, LogSize >::BufferSize + + BufferSize + tf::BoundedTaskQueue::BufferSize + = int64_t{1} << LogSize + + + + + + + + + + int64_t + int64_t tf::BoundedTaskQueue< T, LogSize >::BufferMask + + BufferMask + tf::BoundedTaskQueue::BufferMask + = (BufferSize - 1) + + + + + + + + + + + + std::atomic< int64_t > + std::atomic<int64_t> tf::BoundedTaskQueue< T, LogSize >::_top + + _top + tf::BoundedTaskQueue::_top + {0} + + + + + + + + + + std::atomic< int64_t > + std::atomic<int64_t> tf::BoundedTaskQueue< T, LogSize >::_bottom + + _bottom + tf::BoundedTaskQueue::_bottom + {0} + + + + + + + + + + std::atomic< T > + std::atomic<T> tf::BoundedTaskQueue< T, LogSize >::_buffer[BufferSize] + [BufferSize] + _buffer + tf::BoundedTaskQueue::_buffer + + + + + + + + + + + + + tf::BoundedTaskQueue< T, LogSize >::BoundedTaskQueue + ()=default + BoundedTaskQueue + tf::BoundedTaskQueue::BoundedTaskQueue + +constructs the queue with a given capacity + + + + + + + + + + tf::BoundedTaskQueue< T, LogSize >::~BoundedTaskQueue + ()=default + ~BoundedTaskQueue + tf::BoundedTaskQueue::~BoundedTaskQueue + +destructs the queue + + + + + + + + + bool + bool tf::BoundedTaskQueue< T, LogSize >::empty + () const noexcept + empty + tf::BoundedTaskQueue::empty + +queries if the queue is empty at the time of this call + + + + + + + + + size_t + size_t tf::BoundedTaskQueue< T, LogSize >::size + () const noexcept + size + tf::BoundedTaskQueue::size + +queries the number of items at the time of this call + + + + + + + + + size_t + size_t tf::BoundedTaskQueue< T, LogSize >::capacity + () const + capacity + tf::BoundedTaskQueue::capacity + +queries the capacity of the queue + + + + + + + + + + + typename O + + + bool + bool 
tf::BoundedTaskQueue< T, LogSize >::try_push + (O &&item) + try_push + tf::BoundedTaskQueue::try_push + + O && + item + + +tries to insert an item to the queue + + + + +O + + +data type + + + + + +item + + +the item to perfect-forward to the queue + + + +true if the insertion succeed or false (queue is full) + +Only the owner thread can insert an item to the queue. + + + + + + + + + typename O + + + typename C + + + void + void tf::BoundedTaskQueue< T, LogSize >::push + (O &&item, C &&on_full) + push + tf::BoundedTaskQueue::push + + O && + item + + + C && + on_full + + +tries to insert an item to the queue or invoke the callable if fails + + + + +O + + +data type + + + + +C + + +callable type + + + + + +item + + +the item to perfect-forward to the queue + + + + +on_full + + +callable to invoke when the queue is full (insertion fails) + + + +Only the owner thread can insert an item to the queue. + + + + + + + T + T tf::BoundedTaskQueue< T, LogSize >::pop + () + pop + tf::BoundedTaskQueue::pop + +pops out an item from the queue + + +Only the owner thread can pop out an item from the queue. The return can be a nullptr if this operation failed (empty queue). + + + + + + + T + T tf::BoundedTaskQueue< T, LogSize >::steal + () + steal + tf::BoundedTaskQueue::steal + +steals an item from the queue + + +Any threads can try to steal an item from the queue. The return can be a nullptr if this operation failed (not necessary empty). + + + + + + + T + T tf::BoundedTaskQueue< T, LogSize >::steal_with_hint + (size_t &num_empty_steals) + steal_with_hint + tf::BoundedTaskQueue::steal_with_hint + + size_t & + num_empty_steals + + +attempts to steal a task with a hint mechanism + + + + +num_empty_steals + + +a reference to a counter tracking consecutive empty steal attempts + + + +This function tries to steal a task from the queue. If the steal attempt is successful, the stolen task is returned. Additionally, if the queue is empty, the provided counter num_empty_steals is incremented; otherwise, num_empty_steals is reset to zero. + + + + + + + +class to create a lock-free bounded work-stealing queue + + + + +T + + +data type + + + + +LogSize + + +the base-2 logarithm of the queue size + + + + +This class implements the work-stealing queue described in the paper, "Correct and Efficient Work-Stealing for Weak Memory Models," available at https://www.di.ens.fr/~zappa/readings/ppopp13.pdf. +Only the queue owner can perform pop and push operations, while others can steal data from the queue. + + + + tf::BoundedTaskQueue_bottom + tf::BoundedTaskQueue_buffer + tf::BoundedTaskQueue_top + tf::BoundedTaskQueueBoundedTaskQueue + tf::BoundedTaskQueueBufferMask + tf::BoundedTaskQueueBufferSize + tf::BoundedTaskQueuecapacity + tf::BoundedTaskQueueempty + tf::BoundedTaskQueuepop + tf::BoundedTaskQueuepush + tf::BoundedTaskQueuesize + tf::BoundedTaskQueuesteal + tf::BoundedTaskQueuesteal_with_hint + tf::BoundedTaskQueuetry_push + tf::BoundedTaskQueue~BoundedTaskQueue + + + diff --git a/docs/xml/classtf_1_1CachelineAligned.xml b/docs/xml/classtf_1_1CachelineAligned.xml new file mode 100644 index 000000000..7ada14654 --- /dev/null +++ b/docs/xml/classtf_1_1CachelineAligned.xml @@ -0,0 +1,98 @@ + + + + tf::CachelineAligned + taskflow/utility/os.hpp + + + typename T + + + + + T + T tf::CachelineAligned< T >::data + + data + tf::CachelineAligned::data + +The stored object, aligned to twice the cacheline size. 
+ + + + + + + + + + + T & + T & tf::CachelineAligned< T >::get + () + get + tf::CachelineAligned::get + +accesses the underlying object + + +a reference to the underlying object. + + + + + + + + + const T & + const T & tf::CachelineAligned< T >::get + () const + get + tf::CachelineAligned::get + +accesses the underlying object as a constant reference + + +a constant reference to the underlying object. + + + + + + + + + +class to ensure cacheline-aligned storage for an object. + + + + +T + + +The type of the stored object. + + + +This utility class aligns the stored object data to twice the size of a cacheline. The alignment improves performance by optimizing data access in cache-sensitive scenarios. +//createtwointegersontwoseparatecachelinestoavoidfalsesharing +tf::CachelineAligned<int>counter1; +tf::CachelineAligned<int>counter2; + +//twothreadsaccessthetwocounterswithoutfalsesharing +std::threadt1([&]{counter1.get()=1;}); +std::threadt2([&]{counter2.get()=2;}); +t1.join(); +t2.join(); + + + + + tf::CachelineAligneddata + tf::CachelineAlignedget + tf::CachelineAlignedget + + + diff --git a/docs/xml/classtf_1_1ChromeObserver.xml b/docs/xml/classtf_1_1ChromeObserver.xml index b9f7b4635..eb8b1698a 100644 --- a/docs/xml/classtf_1_1ChromeObserver.xml +++ b/docs/xml/classtf_1_1ChromeObserver.xml @@ -1,17 +1,18 @@ - + tf::ChromeObserver tf::ObserverInterface - observer.hpp + taskflow/core/observer.hpp tf::ChromeObserver::Segment tf::ChromeObserver::Timeline - + class friend class Executor Executor + tf::ChromeObserver::Executor Executor @@ -21,32 +22,34 @@ - + - - + + Timeline Timeline tf::ChromeObserver::_timeline _timeline + tf::ChromeObserver::_timeline - + - - + + void void tf::ChromeObserver::dump (std::ostream &ostream) const dump + tf::ChromeObserver::dump - std::ostream & + std::ostream & ostream @@ -56,13 +59,14 @@ - + - std::string + std::string std::string tf::ChromeObserver::dump () const dump + tf::ChromeObserver::dump dumps the timelines into a Chrome Tracing format @@ -70,13 +74,14 @@ - + void void tf::ChromeObserver::clear () clear + tf::ChromeObserver::clear clears the timeline data @@ -84,13 +89,14 @@ - + size_t size_t tf::ChromeObserver::num_tasks () const num_tasks + tf::ChromeObserver::num_tasks queries the number of tasks observed @@ -98,15 +104,16 @@ - + - - + + void void tf::ChromeObserver::set_up (size_t num_workers) override final set_up + tf::ChromeObserver::set_up set_up size_t @@ -129,13 +136,14 @@ - + void void tf::ChromeObserver::on_entry (WorkerView w, TaskView task_view) override final on_entry + tf::ChromeObserver::on_entry on_entry WorkerView @@ -170,13 +178,14 @@ - + void void tf::ChromeObserver::on_exit (WorkerView w, TaskView task_view) override final on_exit + tf::ChromeObserver::on_exit on_exit WorkerView @@ -211,9 +220,9 @@ - + - + class to create an observer based on Chrome tracing format @@ -226,40 +235,40 @@ //... //createacustomobserver -std::shared_ptr<tf::ChromeObserver>observer=executor.make_observer<tf::ChromeObserver>(); +std::shared_ptr<tf::ChromeObserver>observer=executor.make_observer<tf::ChromeObserver>(); //runthetaskflow executor.run(taskflow).wait(); //dumpthethreadactivitiestoachrome-tracingformat. 
-observer->dump(std::cout); +observer->dump(std::cout); - - - - - - + + + + + + - + tf::ChromeObserver_timeline tf::ChromeObserverclear diff --git a/docs/xml/classtf_1_1CriticalSection.xml b/docs/xml/classtf_1_1CriticalSection.xml deleted file mode 100644 index 9fcfcbc2f..000000000 --- a/docs/xml/classtf_1_1CriticalSection.xml +++ /dev/null @@ -1,107 +0,0 @@ - - - - tf::CriticalSection - tf::Semaphore - critical.hpp - - - - tf::CriticalSection::CriticalSection - (size_t max_workers=1) - CriticalSection - - size_t - max_workers - 1 - - -constructs a critical region of a limited number of workers - - - - - - - - - - - typename... - Tasks - Tasks - - - void - void tf::CriticalSection::add - (Tasks...tasks) - add - - Tasks... - tasks - - -adds a task into the critical region - - - - - - - - - -class to create a critical region of limited workers to run tasks - - -tf::CriticalSection is a warpper over tf::Semaphore and is specialized for limiting the maximum concurrency over a set of tasks. A critical section starts with an initial count representing that limit. When a task is added to the critical section, the task acquires and releases the semaphore internal to the critical section. This design avoids explicit call of tf::Task::acquire and tf::Task::release. The following example creates a critical section of one worker and adds the five tasks to the critical section. -tf::Executorexecutor(8);//createanexecutorof8workers -tf::Taskflowtaskflow; - -//createacriticalsectionof1worker -tf::CriticalSectioncritical_section(1); - -tf::TaskA=taskflow.emplace([](){std::cout<<"A"<<std::endl;}); -tf::TaskB=taskflow.emplace([](){std::cout<<"B"<<std::endl;}); -tf::TaskC=taskflow.emplace([](){std::cout<<"C"<<std::endl;}); -tf::TaskD=taskflow.emplace([](){std::cout<<"D"<<std::endl;}); -tf::TaskE=taskflow.emplace([](){std::cout<<"E"<<std::endl;}); - -critical_section.add(A,B,C,D,E); - -executor.run(taskflow).wait(); - - - - - - - - - - - - - - - - - - - - - - - - - - - - - tf::CriticalSectionadd - tf::CriticalSectioncount - tf::CriticalSectionCriticalSection - tf::CriticalSectionSemaphore - - - diff --git a/docs/xml/classtf_1_1DataPipe.xml b/docs/xml/classtf_1_1DataPipe.xml index ceb60ae7a..ce2fdf7a6 100644 --- a/docs/xml/classtf_1_1DataPipe.xml +++ b/docs/xml/classtf_1_1DataPipe.xml @@ -1,8 +1,8 @@ - + tf::DataPipe - data_pipeline.hpp + taskflow/algorithm/data_pipeline.hpp typename Input @@ -14,12 +14,13 @@ typename C - + C using tf::DataPipe< Input, Output, C >::callable_t = C callable_t + tf::DataPipe::callable_t callable type of the data pipe @@ -27,13 +28,14 @@ - + Input using tf::DataPipe< Input, Output, C >::input_t = Input input_t + tf::DataPipe::input_t input type of the data pipe @@ -41,13 +43,14 @@ - + Output using tf::DataPipe< Input, Output, C >::output_t = Output output_t + tf::DataPipe::output_t output type of the data pipe @@ -55,10 +58,10 @@ - + - - + + @@ -71,6 +74,7 @@ friend class DataPipeline DataPipeline + tf::DataPipe::DataPipeline DataPipeline @@ -80,43 +84,46 @@ - + - - + + PipeType PipeType tf::DataPipe< Input, Output, C >::_type _type + tf::DataPipe::_type - + callable_t callable_t tf::DataPipe< Input, Output, C >::_callable _callable + tf::DataPipe::_callable - + - - + + tf::DataPipe< Input, Output, C >::DataPipe ()=default DataPipe + tf::DataPipe::DataPipe default constructor @@ -124,13 +131,14 @@ - + tf::DataPipe< Input, Output, C >::DataPipe (PipeType d, callable_t &&callable) DataPipe + tf::DataPipe::DataPipe PipeType d @@ -147,13 +155,14 @@ - + PipeType PipeType tf::DataPipe< 
Input, Output, C >::type () const type + tf::DataPipe::type queries the type of the data pipe @@ -162,13 +171,14 @@ - + void void tf::DataPipe< Input, Output, C >::type (PipeType type) type + tf::DataPipe::type PipeType type @@ -180,7 +190,7 @@ - + @@ -192,6 +202,7 @@ void tf::DataPipe< Input, Output, C >::callable (U &&callable) callable + tf::DataPipe::callable U && callable @@ -222,31 +233,31 @@ Assigns a new callable to the pipe using universal forwarding. - + - + class to create a stage in a data-parallel pipeline A data pipe represents a stage of a data-parallel pipeline. A data pipe can be either parallel direction or serial direction (specified by tf::PipeType) and is associated with a callable to invoke by the pipeline scheduler. -You need to use the template function, tf::make_data_pipe, to create a data pipe. The input and output types of a tf::DataPipe should be decayed types (though the library will always decay them for you using std::decay) to allow internal storage to work. The data will be passed by reference to your callable, at which you can take it by copy or reference. -tf::make_data_pipe<int,std::string>( +You need to use the template function, tf::make_data_pipe, to create a data pipe. The input and output types of a tf::DataPipe should be decayed types (though the library will always decay them for you using std::decay) to allow internal storage to work. The data will be passed by reference to your callable, at which you can take it by copy or reference. +tf::make_data_pipe<int, std::string>( tf::PipeType::SERIAL, -[](int&input){returnstd::to_string(input+100);} +[](int&input){returnstd::to_string(input+100);} ); In addition to the data, you callable can take an additional reference of tf::Pipeflow in the second argument to probe the runtime information for a stage task, such as its line number and token number: -tf::make_data_pipe<int,std::string>( +tf::make_data_pipe<int, std::string>( tf::PipeType::SERIAL, [](int&input,tf::Pipeflow&pf){ -printf("token=%lu,line=%lu\n",pf.token(),pf.line()); -returnstd::to_string(input+100); +printf("token=%lu,line=%lu\n",pf.token(),pf.line()); +returnstd::to_string(input+100); } ); - + tf::DataPipe_callable tf::DataPipe_type diff --git a/docs/xml/classtf_1_1DataPipeline.xml b/docs/xml/classtf_1_1DataPipeline.xml index 888c8deb4..e959a78fe 100644 --- a/docs/xml/classtf_1_1DataPipeline.xml +++ b/docs/xml/classtf_1_1DataPipeline.xml @@ -1,8 +1,8 @@ - + tf::DataPipeline - data_pipeline.hpp + taskflow/algorithm/data_pipeline.hpp tf::DataPipeline::Line tf::DataPipeline::PipeMeta @@ -12,12 +12,17 @@ Ps - - + + unique_variant_t< std::variant< std::conditional_t< std::is_void_v< typename Ps::output_t >, std::monostate, std::decay_t< typename Ps::output_t > >... > > - using tf::DataPipeline< Ps >::data_t = unique_variant_t<std::variant<std::conditional_t< std::is_void_v<typename Ps::output_t>, std::monostate, std::decay_t<typename Ps::output_t> >... > > + using tf::DataPipeline< Ps >::data_t = unique_variant_t<std::variant<std::conditional_t< + std::is_void_v<typename Ps::output_t>, + std::monostate, + std::decay_t<typename Ps::output_t>>... + >> data_t + tf::DataPipeline::data_t internal storage type for each data token (default std::variant) @@ -25,121 +30,130 @@ - + - - + + Graph Graph tf::DataPipeline< Ps >::_graph _graph + tf::DataPipeline::_graph - + size_t size_t tf::DataPipeline< Ps >::_num_tokens _num_tokens + tf::DataPipeline::_num_tokens - + - std::tuple< Ps... > + std::tuple< Ps... 
> std::tuple<Ps...> tf::DataPipeline< Ps >::_pipes _pipes + tf::DataPipeline::_pipes - + - std::array< PipeMeta, sizeof...(Ps)> + std::array< PipeMeta, sizeof...(Ps)> std::array<PipeMeta, sizeof...(Ps)> tf::DataPipeline< Ps >::_meta _meta + tf::DataPipeline::_meta - + - std::vector< std::array< Line, sizeof...(Ps)> > + std::vector< std::array< Line, sizeof...(Ps)> > std::vector<std::array<Line, sizeof...(Ps)> > tf::DataPipeline< Ps >::_lines _lines + tf::DataPipeline::_lines - + - std::vector< Task > + std::vector< Task > std::vector<Task> tf::DataPipeline< Ps >::_tasks _tasks + tf::DataPipeline::_tasks - + - std::vector< Pipeflow > + std::vector< Pipeflow > std::vector<Pipeflow> tf::DataPipeline< Ps >::_pipeflows _pipeflows + tf::DataPipeline::_pipeflows - + - std::vector< CachelineAligned< data_t > > + std::vector< CachelineAligned< data_t > > std::vector<CachelineAligned<data_t> > tf::DataPipeline< Ps >::_buffer _buffer + tf::DataPipeline::_buffer - + - - + + tf::DataPipeline< Ps >::DataPipeline (size_t num_lines, Ps &&... ps) DataPipeline + tf::DataPipeline::DataPipeline size_t num_lines @@ -173,19 +187,20 @@ Constructs a data-parallel pipeline of up to num_lines - + tf::DataPipeline< Ps >::DataPipeline (size_t num_lines, std::tuple< Ps... > &&ps) DataPipeline + tf::DataPipeline::DataPipeline size_t num_lines - std::tuple< Ps... > && + std::tuple< Ps... > && ps @@ -209,17 +224,18 @@ Constructs a data-parallel pipeline of up to num_lines -Constructs a data-parallel pipeline of up to num_lines parallel lines to schedule tokens through the given linear chain of pipes stored in a std::tuple. The first pipe must define a serial direction (tf::PipeType::SERIAL) or an exception will be thrown. +Constructs a data-parallel pipeline of up to num_lines parallel lines to schedule tokens through the given linear chain of pipes stored in a std::tuple. The first pipe must define a serial direction (tf::PipeType::SERIAL) or an exception will be thrown. - + size_t size_t tf::DataPipeline< Ps >::num_lines () const noexcept num_lines + tf::DataPipeline::num_lines queries the number of parallel lines @@ -228,13 +244,14 @@ Constructs a data-parallel pipeline of up to num_lines - + - constexpr size_t - constexpr size_t tf::DataPipeline< Ps >::num_pipes + size_t + size_t tf::DataPipeline< Ps >::num_pipes () const noexcept num_pipes + tf::DataPipeline::num_pipes queries the number of pipes @@ -243,13 +260,14 @@ Constructs a data-parallel pipeline of up to num_lines - + void void tf::DataPipeline< Ps >::reset () reset + tf::DataPipeline::reset resets the pipeline @@ -258,13 +276,14 @@ Constructs a data-parallel pipeline of up to num_lines - + size_t size_t tf::DataPipeline< Ps >::num_tokens () const noexcept num_tokens + tf::DataPipeline::num_tokens queries the number of generated tokens in the pipeline @@ -273,13 +292,14 @@ Constructs a data-parallel pipeline of up to num_lines - + Graph & Graph & tf::DataPipeline< Ps >::graph () graph + tf::DataPipeline::graph obtains the graph object associated with the pipeline construct @@ -288,10 +308,10 @@ Constructs a data-parallel pipeline of up to num_lines - + - - + + @@ -304,8 +324,9 @@ Constructs a data-parallel pipeline of up to num_linesauto tf::DataPipeline< Ps >::_gen_meta (std::tuple< Ps... > &&, std::index_sequence< I... >) _gen_meta + tf::DataPipeline::_gen_meta - std::tuple< Ps... > && + std::tuple< Ps... 
> && ps @@ -317,13 +338,14 @@ Constructs a data-parallel pipeline of up to num_lines - + void void tf::DataPipeline< Ps >::_on_pipe (Pipeflow &, Runtime &) _on_pipe + tf::DataPipeline::_on_pipe Pipeflow & pf @@ -337,22 +359,23 @@ Constructs a data-parallel pipeline of up to num_lines - + void void tf::DataPipeline< Ps >::_build () _build + tf::DataPipeline::_build - + - + class to create a data-parallel pipeline scheduling framework @@ -366,33 +389,33 @@ Constructs a data-parallel pipeline of up to num_lines -Similar to tf::Pipeline, a tf::DataPipeline is a composable graph object for users to create a data-parallel pipeline scheduling framework using a module task in a taskflow. The only difference is that tf::DataPipeline provides a data abstraction for users to quickly express dataflow in a pipeline. The following example creates a data-parallel pipeline of three stages that generate dataflow from void to int, std::string, float, and void. +Similar to tf::Pipeline, a tf::DataPipeline is a composable graph object for users to create a data-parallel pipeline scheduling framework using a module task in a taskflow. The only difference is that tf::DataPipeline provides a data abstraction for users to quickly express dataflow in a pipeline. The following example creates a data-parallel pipeline of three stages that generate dataflow from void to int, std::string, and void. #include<taskflow/taskflow.hpp> #include<taskflow/algorithm/data_pipeline.hpp> intmain(){ -//dataflow=>void->int->std::string->float->void +//dataflow=>void->int->std::string->void tf::Taskflowtaskflow("pipeline"); tf::Executorexecutor; constsize_tnum_lines=4; tf::DataPipelinepl(num_lines, -tf::make_data_pipe<void,int>(tf::PipeType::SERIAL,[&](tf::Pipeflow&pf)->int{ +tf::make_data_pipe<void, int>(tf::PipeType::SERIAL,[&](tf::Pipeflow&pf)->int{ if(pf.token()==5){ -pf.stop(); -return0; +pf.stop(); +return0; } else{ -returnpf.token(); +returnpf.token(); } }), -tf::make_data_pipe<int,std::string>(tf::PipeType::SERIAL,[](int&input){ -returnstd::to_string(input+100); +tf::make_data_pipe<int, std::string>(tf::PipeType::SERIAL,[](int&input){ +returnstd::to_string(input+100); }), -tf::make_data_pipe<std::string,void>(tf::PipeType::SERIAL,[](std::string&input){ -std::cout<<input<<std::endl; +tf::make_data_pipe<std::string, void>(tf::PipeType::SERIAL,[](std::string&input){ +std::cout<<input<<std::endl; }) ); @@ -400,7 +423,7 @@ Similar to tf::Pipelinetaskflow.composed_of(pl).name("pipeline"); //dumpthepipelinegraphstructure(withcomposition) -taskflow.dump(std::cout); +taskflow.dump(std::cout); //runthepipeline executor.run(taskflow).wait(); @@ -409,7 +432,7 @@ Similar to tf::Pipeline} The pipeline schedules five tokens over four parallel lines in a circular fashion, as depicted below: -o->o->o +o->o->o ||| vvv o->o->o @@ -421,7 +444,7 @@ Similar to tf::Pipelineo->o->o - + tf::DataPipeline_buffer tf::DataPipeline_build @@ -434,7 +457,7 @@ Similar to tf::Pipelinetf::DataPipeline_pipeflows tf::DataPipeline_pipes tf::DataPipeline_tasks - tf::DataPipelinedata_t + tf::DataPipelinedata_t tf::DataPipelineDataPipeline tf::DataPipelineDataPipeline tf::DataPipelinegraph diff --git a/docs/xml/classtf_1_1DefaultClosureWrapper.xml b/docs/xml/classtf_1_1DefaultClosureWrapper.xml new file mode 100644 index 000000000..ed17d4023 --- /dev/null +++ b/docs/xml/classtf_1_1DefaultClosureWrapper.xml @@ -0,0 +1,15 @@ + + + + tf::DefaultClosureWrapper + taskflow/algorithm/partitioner.hpp + +class to create a default closure wrapper + + + + + + + + diff --git 
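The std::tuple-based constructor documented above carries no example in the text. The following is a minimal sketch under the same headers as the surrounding examples; it assumes class template argument deduction from the tuple works here (otherwise the pipe types must be spelled out explicitly):

#include <taskflow/taskflow.hpp>
#include <taskflow/algorithm/data_pipeline.hpp>
#include <iostream>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  // build the pipe chain first, then hand it to the pipeline as a tuple;
  // the first pipe must be tf::PipeType::SERIAL or construction throws
  auto pipes = std::make_tuple(
    tf::make_data_pipe<void, int>(tf::PipeType::SERIAL, [](tf::Pipeflow& pf) -> int {
      if (pf.token() == 5) { pf.stop(); return 0; }
      return static_cast<int>(pf.token());
    }),
    tf::make_data_pipe<int, void>(tf::PipeType::SERIAL, [](int& input) {
      std::cout << input << '\n';
    })
  );

  tf::DataPipeline pipeline(4, std::move(pipes));  // four parallel lines

  taskflow.composed_of(pipeline).name("pipeline");
  executor.run(taskflow).wait();
}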
a/docs/xml/classtf_1_1DefaultTaskParams.xml b/docs/xml/classtf_1_1DefaultTaskParams.xml new file mode 100644 index 000000000..f9e4d720e --- /dev/null +++ b/docs/xml/classtf_1_1DefaultTaskParams.xml @@ -0,0 +1,15 @@ + + + + tf::DefaultTaskParams + taskflow/core/graph.hpp + +class to create an empty task parameter for compile-time optimization + + + + + + + + diff --git a/docs/xml/classtf_1_1DeferredPipeflow.xml b/docs/xml/classtf_1_1DeferredPipeflow.xml index 86bee4f93..ab83366c6 100644 --- a/docs/xml/classtf_1_1DeferredPipeflow.xml +++ b/docs/xml/classtf_1_1DeferredPipeflow.xml @@ -1,8 +1,8 @@ - + tf::DeferredPipeflow - + @@ -15,6 +15,7 @@ friend class Pipeline Pipeline + tf::DeferredPipeflow::Pipeline Pipeline @@ -24,7 +25,7 @@ - + @@ -36,6 +37,7 @@ friend class ScalablePipeline ScalablePipeline + tf::DeferredPipeflow::ScalablePipeline ScalablePipeline @@ -45,69 +47,74 @@ - + - - + + size_t size_t tf::DeferredPipeflow::_token _token + tf::DeferredPipeflow::_token - + size_t size_t tf::DeferredPipeflow::_num_deferrals _num_deferrals + tf::DeferredPipeflow::_num_deferrals - + - std::unordered_set< size_t > + std::unordered_set< size_t > std::unordered_set<size_t> tf::DeferredPipeflow::_dependents _dependents + tf::DeferredPipeflow::_dependents - + - - + + tf::DeferredPipeflow::DeferredPipeflow ()=default DeferredPipeflow + tf::DeferredPipeflow::DeferredPipeflow - + tf::DeferredPipeflow::DeferredPipeflow (const DeferredPipeflow &)=delete DeferredPipeflow + tf::DeferredPipeflow::DeferredPipeflow const DeferredPipeflow & @@ -117,13 +124,14 @@ - + tf::DeferredPipeflow::DeferredPipeflow (DeferredPipeflow &&)=delete DeferredPipeflow + tf::DeferredPipeflow::DeferredPipeflow DeferredPipeflow && @@ -133,13 +141,14 @@ - + tf::DeferredPipeflow::DeferredPipeflow (size_t t, size_t n, std::unordered_set< size_t > &&dep) DeferredPipeflow + tf::DeferredPipeflow::DeferredPipeflow size_t t @@ -149,7 +158,7 @@ n - std::unordered_set< size_t > && + std::unordered_set< size_t > && dep @@ -158,13 +167,14 @@ - + - + DeferredPipeflow & - DeferredPipeflow& tf::DeferredPipeflow::operator= + DeferredPipeflow & tf::DeferredPipeflow::operator= (const DeferredPipeflow &)=delete operator= + tf::DeferredPipeflow::operator= const DeferredPipeflow & @@ -174,13 +184,14 @@ - + - + DeferredPipeflow & - DeferredPipeflow& tf::DeferredPipeflow::operator= + DeferredPipeflow & tf::DeferredPipeflow::operator= (DeferredPipeflow &&)=delete operator= + tf::DeferredPipeflow::operator= DeferredPipeflow && @@ -190,14 +201,14 @@ - + - + - + tf::DeferredPipeflow_dependents tf::DeferredPipeflow_num_deferrals @@ -206,8 +217,8 @@ tf::DeferredPipeflowDeferredPipeflow tf::DeferredPipeflowDeferredPipeflow tf::DeferredPipeflowDeferredPipeflow - tf::DeferredPipeflowoperator= - tf::DeferredPipeflowoperator= + tf::DeferredPipeflowoperator= + tf::DeferredPipeflowoperator= tf::DeferredPipeflowPipeline tf::DeferredPipeflowScalablePipeline diff --git a/docs/xml/classtf_1_1DynamicPartitioner.xml b/docs/xml/classtf_1_1DynamicPartitioner.xml index 7672f444b..35b46d2c8 100644 --- a/docs/xml/classtf_1_1DynamicPartitioner.xml +++ b/docs/xml/classtf_1_1DynamicPartitioner.xml @@ -1,21 +1,22 @@ - + tf::DynamicPartitioner tf::PartitionerBase< DefaultClosureWrapper > - partitioner.hpp + taskflow/algorithm/partitioner.hpp typename C - DefaultClosureWrapper + DefaultClosureWrapper - + - constexpr PartitionerType + PartitionerType static constexpr PartitionerType tf::DynamicPartitioner< C >::type () type + tf::DynamicPartitioner::type queries the partition type 
(dynamic) @@ -23,15 +24,16 @@ - + - - + + tf::DynamicPartitioner< C >::DynamicPartitioner ()=default DynamicPartitioner + tf::DynamicPartitioner::DynamicPartitioner default constructor @@ -39,13 +41,14 @@ - + tf::DynamicPartitioner< C >::DynamicPartitioner (size_t sz) DynamicPartitioner + tf::DynamicPartitioner::DynamicPartitioner size_t sz @@ -57,13 +60,14 @@ - + tf::DynamicPartitioner< C >::DynamicPartitioner (size_t sz, C &&closure) DynamicPartitioner + tf::DynamicPartitioner::DynamicPartitioner size_t sz @@ -79,10 +83,10 @@ - + - - + + @@ -97,6 +101,7 @@ void tf::DynamicPartitioner< C >::loop (size_t N, size_t, std::atomic< size_t > &next, F &&func) const loop + tf::DynamicPartitioner::loop size_t N @@ -105,7 +110,7 @@ size_t - std::atomic< size_t > & + std::atomic< size_t > & next @@ -118,7 +123,7 @@ - + @@ -134,6 +139,7 @@ void tf::DynamicPartitioner< C >::loop_until (size_t N, size_t, std::atomic< size_t > &next, F &&func) const loop_until + tf::DynamicPartitioner::loop_until size_t N @@ -142,7 +148,7 @@ size_t - std::atomic< size_t > & + std::atomic< size_t > & next @@ -155,11 +161,11 @@ - + - + -class to construct a dynamic partitioner for scheduling parallel algorithms +class to create a dynamic partitioner for scheduling parallel algorithms @@ -167,17 +173,17 @@ C -closure wrapper type (default tf::DefaultClosureWrapper) +closure wrapper type (default tf::DefaultClosureWrapper) The partitioner splits iterations into many partitions each of size equal to the given chunk size. Different partitions are distributed dynamically to workers without any specific order. In addition to partition size, the application can specify a closure wrapper for a dynamic partitioner. A closure wrapper allows the application to wrap a partitioned task (i.e., closure) with a custom function object that performs additional tasks, as shown in the sketch below. 
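A compilable sketch of this pattern (the per-iteration callable takes the iteration index; the chunk size of 1 and the wrapper body are illustrative):

#include <taskflow/taskflow.hpp>
#include <taskflow/algorithm/for_each.hpp>
#include <cstdio>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  taskflow.for_each_index(0, 100, 1,
    [](int i) { std::printf("%d\n", i); },        // per-iteration body
    tf::DynamicPartitioner(1, [](auto&& closure) {
      // do something before invoking the partitioned task
      closure();
      // do something else after invoking the partitioned task
    })
  );

  executor.run(taskflow).wait();
}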
For example: -std::atomic<int>count=0; +std::atomic<int>count=0; tf::Taskflowtaskflow; taskflow.for_each_index(0,100,1, [](){ -printf("%d\n",i); +printf("%d\n",i); }, tf::DynamicPartitioner(0,[](auto&&closure){ //dosomethingbeforeinvokingthepartitionedtask @@ -211,6 +217,10 @@ The partitioner splits iterations into many partitions each of size equal to the + + + + @@ -225,22 +235,28 @@ The partitioner splits iterations into many partitions each of size equal to the + + _closure_wrapper + - + tf::DynamicPartitioner_chunk_size tf::DynamicPartitioner_closure_wrapper tf::DynamicPartitionerchunk_size tf::DynamicPartitionerchunk_size - tf::DynamicPartitionerclosure_wrapper + tf::DynamicPartitionerclosure_wrapper + tf::DynamicPartitionerclosure_wrapper tf::DynamicPartitionerclosure_wrapper tf::DynamicPartitionerclosure_wrapper_type tf::DynamicPartitionerDynamicPartitioner tf::DynamicPartitionerDynamicPartitioner tf::DynamicPartitionerDynamicPartitioner + tf::DynamicPartitioneris_default_wrapper_v tf::DynamicPartitionerloop tf::DynamicPartitionerloop_until + tf::DynamicPartitioneroperator() tf::DynamicPartitionerPartitionerBase tf::DynamicPartitionerPartitionerBase tf::DynamicPartitionerPartitionerBase diff --git a/docs/xml/classtf_1_1Executor.xml b/docs/xml/classtf_1_1Executor.xml index f5e04f8be..c39434c6e 100644 --- a/docs/xml/classtf_1_1Executor.xml +++ b/docs/xml/classtf_1_1Executor.xml @@ -1,14 +1,15 @@ - + tf::Executor - executor.hpp - + taskflow/core/executor.hpp + class friend class FlowBuilder FlowBuilder + tf::Executor::FlowBuilder FlowBuilder @@ -18,13 +19,14 @@ - + class friend class Subflow Subflow + tf::Executor::Subflow Subflow @@ -34,13 +36,14 @@ - + class friend class Runtime Runtime + tf::Executor::Runtime Runtime @@ -50,108 +53,103 @@ - + - - - - const size_t - const size_t tf::Executor::_MAX_STEALS - - _MAX_STEALS - - - - - - - - - - std::mutex - std::mutex tf::Executor::_wsq_mutex + + class + friend class Algorithm - _wsq_mutex + Algorithm + tf::Executor::Algorithm + + Algorithm + - + + + - std::mutex + std::mutex std::mutex tf::Executor::_taskflows_mutex _taskflows_mutex + tf::Executor::_taskflows_mutex - + - - std::atomic< size_t > - std::atomic<size_t> tf::Executor::_num_topologies + + std::vector< Worker > + std::vector<Worker> tf::Executor::_workers - _num_topologies - {0} + _workers + tf::Executor::_workers - + - - std::atomic_flag - std::atomic_flag tf::Executor::_all_spawned + + DefaultNotifier + DefaultNotifier tf::Executor::_notifier - _all_spawned - = ATOMIC_FLAG_INIT + _notifier + tf::Executor::_notifier - + - std::condition_variable + std::condition_variable std::condition_variable tf::Executor::_topology_cv _topology_cv + tf::Executor::_topology_cv - + - std::mutex + std::mutex std::mutex tf::Executor::_topology_mutex _topology_mutex + tf::Executor::_topology_mutex - + size_t size_t tf::Executor::_num_topologies _num_topologies + tf::Executor::_num_topologies {0} @@ -159,124 +157,81 @@ - - - - std::unordered_map< std::thread::id, size_t > - std::unordered_map<std::thread::id, size_t> tf::Executor::_wids - - _wids - - - - - - - - - - std::vector< std::thread > - std::vector<std::thread> tf::Executor::_threads - - _threads - - - - - - - - - - std::vector< Worker > - std::vector<Worker> tf::Executor::_workers - - _workers - - - - - - - + - std::list< Taskflow > + std::list< Taskflow > std::list<Taskflow> tf::Executor::_taskflows _taskflows + tf::Executor::_taskflows - - - - Notifier - Notifier tf::Executor::_notifier - - _notifier - - - - - - - + - - TaskQueue< 
Node * > - TaskQueue<Node*> tf::Executor::_wsq + + Freelist< Node * > + Freelist<Node*> tf::Executor::_buffers - _wsq + _buffers + tf::Executor::_buffers - + - - std::atomic< bool > - std::atomic<bool> tf::Executor::_done + + std::shared_ptr< WorkerInterface > + std::shared_ptr<WorkerInterface> tf::Executor::_worker_interface - _done - {0} + _worker_interface + tf::Executor::_worker_interface - + - std::unordered_set< std::shared_ptr< ObserverInterface > > + std::unordered_set< std::shared_ptr< ObserverInterface > > std::unordered_set<std::shared_ptr<ObserverInterface> > tf::Executor::_observers _observers + tf::Executor::_observers - + - - - + + + tf::Executor::Executor - (size_t N=std::thread::hardware_concurrency()) + (size_t N=std::thread::hardware_concurrency(), std::shared_ptr< WorkerInterface > wix=nullptr) Executor + tf::Executor::Executor size_t N - std::thread::hardware_concurrency() + std::thread::hardware_concurrency() + + + std::shared_ptr< WorkerInterface > + wix + nullptr constructs the executor with N worker threads @@ -287,21 +242,34 @@ N -the number of workers (default std::thread::hardware_concurrency) +number of workers (default std::thread::hardware_concurrency) + + + + +wix + + +interface class instance to configure workers' behaviors -The constructor spawns N worker threads to run tasks in a work-stealing loop. The number of workers must be greater than zero or an exception will be thrown. By default, the number of worker threads is equal to the maximum hardware concurrency returned by std::thread::hardware_concurrency. +The constructor spawns N worker threads to run tasks in a work-stealing loop. The number of workers must be greater than zero or an exception will be thrown. By default, the number of worker threads is equal to the maximum hardware concurrency returned by std::thread::hardware_concurrency. +Users can alter the worker behavior, such as changing thread affinity, via deriving an instance from tf::WorkerInterface. +An exception will be thrown if executor construction fails. + + - + - + tf::Executor::~Executor () ~Executor + tf::Executor::~Executor destructs the executor @@ -310,13 +278,14 @@ The constructor spawns N worker threads to run - + - + tf::Future< void > tf::Future< void > tf::Executor::run (Taskflow &taskflow) run + tf::Executor::run Taskflow & taskflow @@ -348,13 +317,14 @@ This member function executes the given taskflow once and returns a + - + tf::Future< void > tf::Future< void > tf::Executor::run (Taskflow &&taskflow) run + tf::Executor::run Taskflow && taskflow @@ -375,7 +345,7 @@ This member function executes the given taskflow once and returns a a tf::Future that holds the result of the execution This member function executes a moved taskflow once and returns a tf::Future object that eventually holds the result of the execution. The executor will take care of the lifetime of the moved taskflow. -tf::Future<void>future=executor.run(std::move(taskflow)); +tf::Future<void>future=executor.run(std::move(taskflow)); //dosomethingelse future.wait(); @@ -383,7 +353,7 @@ This member function executes a moved taskflow once and returns a + @@ -395,6 +365,7 @@ This member function executes a moved taskflow once and returns a Taskflow & taskflow @@ -427,7 +398,7 @@ This member function executes a moved taskflow once and returns a a tf::Future that holds the result of the execution This member function executes the given taskflow once and invokes the given callable when the execution completes. 
This member function returns a tf::Future object that eventually holds the result of the execution. -tf::Future<void>future=executor.run(taskflow,[](){std::cout<<"done";}); +tf::Future<void>future=executor.run(taskflow,[](){std::cout<<"done";}); //dosomethingelse future.wait(); @@ -438,7 +409,7 @@ This member function executes the given taskflow once and invokes the given call - + @@ -450,6 +421,7 @@ This member function executes the given taskflow once and invokes the given call tf::Future< void > tf::Executor::run (Taskflow &&taskflow, C &&callable) run + tf::Executor::run Taskflow && taskflow @@ -483,7 +455,7 @@ This member function executes the given taskflow once and invokes the given call This member function executes a moved taskflow once and invokes the given callable when the execution completes. This member function returns a tf::Future object that eventually holds the result of the execution. The executor will take care of the lifetime of the moved taskflow. tf::Future<void>future=executor.run( -std::move(taskflow),[](){std::cout<<"done";} +std::move(taskflow),[](){std::cout<<"done";} ); //dosomethingelse future.wait(); @@ -492,13 +464,14 @@ This member function executes a moved taskflow once and invokes the given callab - + - + tf::Future< void > tf::Future< void > tf::Executor::run_n (Taskflow &taskflow, size_t N) run_n + tf::Executor::run_n Taskflow & taskflow @@ -542,13 +515,14 @@ This member function executes the given taskflow N - + - + tf::Future< void > tf::Future< void > tf::Executor::run_n (Taskflow &&taskflow, size_t N) run_n + tf::Executor::run_n Taskflow && taskflow @@ -582,7 +556,7 @@ This member function executes the given taskflow N This member function executes a moved taskflow N times and returns a tf::Future object that eventually holds the result of the execution. The executor will take care of the lifetime of the moved taskflow. tf::Future<void>future=executor.run_n( -std::move(taskflow),2//runthemovedtaskflow2times +std::move(taskflow),2//runthemovedtaskflow2times ); //dosomethingelse future.wait(); @@ -591,7 +565,7 @@ This member function executes a moved taskflow N - + @@ -603,6 +577,7 @@ This member function executes a moved taskflow Ntf::Future< void > tf::Executor::run_n (Taskflow &taskflow, size_t N, C &&callable) run_n + tf::Executor::run_n Taskflow & taskflow @@ -648,7 +623,7 @@ This member function executes a moved taskflow N This member function executes the given taskflow N times and invokes the given callable when the execution completes. This member function returns a tf::Future object that eventually holds the result of the execution. tf::Future<void>future=executor.run( -taskflow,2,[](){std::cout<<"done";}//runstaskflow2timesandinvoke +taskflow,2,[](){std::cout<<"done";}//runstaskflow2timesandinvoke //thelambdatoprint"done" ); //dosomethingelse @@ -661,7 +636,7 @@ This member function executes the given taskflow N - + @@ -673,6 +648,7 @@ This member function executes the given taskflow Ntf::Future< void > tf::Executor::run_n (Taskflow &&taskflow, size_t N, C &&callable) run_n + tf::Executor::run_n Taskflow && taskflow @@ -719,7 +695,7 @@ This member function executes the given taskflow NN times and invokes the given callable when the execution completes. This member function returns a tf::Future object that eventually holds the result of the execution. 
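For instance, a short sketch (assuming `executor` and `taskflow` already exist; note the member function is run_n):

// run the taskflow two times, then invoke the callable once both runs finish
tf::Future<void> future = executor.run_n(
  taskflow, 2, [](){ std::cout << "done with 2 runs\n"; }
);
// do something else ...
future.wait();  // block until the two runs and the callable complete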
tf::Future<void>future=executor.run_n( //runthemovedtaskflow2timesandinvokethelambdatoprint"done" -std::move(taskflow),2,[](){std::cout<<"done";} +std::move(taskflow),2,[](){std::cout<<"done";} ); //dosomethingelse future.wait(); @@ -728,7 +704,7 @@ This member function executes a moved taskflow N - + @@ -740,6 +716,7 @@ This member function executes a moved taskflow Ntf::Future< void > tf::Executor::run_until (Taskflow &taskflow, P &&pred) run_until + tf::Executor::run_until Taskflow & taskflow @@ -773,7 +750,7 @@ This member function executes a moved taskflow N This member function executes the given taskflow multiple times until the predicate returns true. This member function returns a tf::Future object that eventually holds the result of the execution. tf::Future<void>future=executor.run_until( -taskflow,[](){returnrand()%10==0} +taskflow,[](){returnrand()%10==0} ); //dosomethingelse future.wait(); @@ -785,7 +762,7 @@ This member function executes the given taskflow multiple times until the predic - + @@ -797,6 +774,7 @@ This member function executes the given taskflow multiple times until the predic tf::Future< void > tf::Executor::run_until (Taskflow &&taskflow, P &&pred) run_until + tf::Executor::run_until Taskflow && taskflow @@ -830,7 +808,7 @@ This member function executes the given taskflow multiple times until the predic This member function executes a moved taskflow multiple times until the predicate returns true. This member function returns a tf::Future object that eventually holds the result of the execution. The executor will take care of the lifetime of the moved taskflow. tf::Future<void>future=executor.run_until( -std::move(taskflow),[](){returnrand()%10==0} +std::move(taskflow),[](){returnrand()%10==0} ); //dosomethingelse future.wait(); @@ -839,7 +817,7 @@ This member function executes a moved taskflow multiple times until the predicat - + @@ -854,6 +832,7 @@ This member function executes a moved taskflow multiple times until the predicat tf::Future< void > tf::Executor::run_until (Taskflow &taskflow, P &&pred, C &&callable) run_until + tf::Executor::run_until Taskflow & taskflow @@ -899,7 +878,7 @@ This member function executes a moved taskflow multiple times until the predicat This member function executes the given taskflow multiple times until the predicate returns true and then invokes the given callable when the execution completes. This member function returns a tf::Future object that eventually holds the result of the execution. tf::Future<void>future=executor.run_until( -taskflow,[](){returnrand()%10==0},[](){std::cout<<"done";} +taskflow,[](){returnrand()%10==0},[](){std::cout<<"done";} ); //dosomethingelse future.wait(); @@ -911,7 +890,7 @@ This member function executes the given taskflow multiple times until the predic - + @@ -926,6 +905,7 @@ This member function executes the given taskflow multiple times until the predic tf::Future< void > tf::Executor::run_until (Taskflow &&taskflow, P &&pred, C &&callable) run_until + tf::Executor::run_until Taskflow && taskflow @@ -971,8 +951,8 @@ This member function executes the given taskflow multiple times until the predic This member function executes a moved taskflow multiple times until the predicate returns true and then invokes the given callable when the execution completes. This member function returns a tf::Future object that eventually holds the result of the execution. The executor will take care of the lifetime of the moved taskflow. 
tf::Future<void>future=executor.run_until( -std::move(taskflow), -[](){returnrand()%10==0},[](){std::cout<<"done";} +std::move(taskflow), +[](){returnrand()%10==0},[](){std::cout<<"done";} ); //dosomethingelse future.wait(); @@ -981,7 +961,7 @@ This member function executes a moved taskflow multiple times until the predicat - + @@ -993,6 +973,7 @@ This member function executes a moved taskflow multiple times until the predicat void tf::Executor::corun (T &target) corun + tf::Executor::corun T & target @@ -1019,12 +1000,12 @@ This member function executes a moved taskflow multiple times until the predicat -The method runs a target graph which has tf::Graph& T::graph() defined and waits until the execution completes. Unlike the typical flow of calling tf::Executor::run series plus waiting on the result, this method must be called by an internal worker of this executor. The caller worker will participate in the work-stealing loop of the scheduler, therby avoiding potential deadlock caused by blocked waiting. +The method runs a target graph which has tf::Graph& T::graph() defined and waits until the execution completes. Unlike the typical flow of calling tf::Executor::run series plus waiting on the result, this method must be called by an internal worker of this executor. The caller worker will participate in the work-stealing loop of the scheduler, thereby avoiding potential deadlock caused by blocked waiting. tf::Executorexecutor(2); tf::Taskflowtaskflow; -std::array<tf::Taskflow, 1000>others; +std::array<tf::Taskflow, 1000>others; -std::atomic<size_t>counter{0}; +std::atomic<size_t>counter{0}; for(size_tn=0;n<1000;n++){ for(size_ti=0;i<1000;i++){ @@ -1045,7 +1026,7 @@ The method runs a target graph which has + @@ -1057,6 +1038,7 @@ The method runs a target graph which has The method keeps the caller worker running in the work-stealing loop until the stop predicate becomes true. taskflow.emplace([&](){ -std::future<void>fu=std::async([](){std::sleep(100s);}); +std::future<void>fu=std::async([](){std::sleep(100s);}); executor.corun_until([](){ -returnfu.wait_for(std::chrono::seconds(0))==future_status::ready; +returnfu.wait_for(std::chrono::seconds(0))==future_status::ready; }); }); @@ -1097,13 +1079,14 @@ The method keeps the caller worker running in the work-stealing loop until the s - + - + void void tf::Executor::wait_for_all () wait_for_all + tf::Executor::wait_for_all waits for all tasks to complete @@ -1117,68 +1100,102 @@ The method keeps the caller worker running in the work-stealing loop until the s - + - + size_t size_t tf::Executor::num_workers () const noexcept num_workers + tf::Executor::num_workers queries the number of worker threads Each worker represents one unique thread spawned by an executor upon its construction time. 
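Returning to tf::Executor::corun documented above, a minimal self-contained sketch (names illustrative) of co-running another graph from inside a task:

tf::Executor executor(2);
tf::Taskflow taskflow;
tf::Taskflow other;

other.emplace([](){ std::cout << "task in the co-run graph\n"; });

// the worker running this task joins the work-stealing loop while it
// waits for `other` to finish, instead of blocking its thread
taskflow.emplace([&](){ executor.corun(other); });

executor.run(taskflow).wait();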
tf::Executorexecutor(4); -std::cout<<executor.num_workers();//4 +std::cout<<executor.num_workers();//4 - + - + + size_t + size_t tf::Executor::num_waiters + () const noexcept + num_waiters + tf::Executor::num_waiters + +queries the number of workers that are currently not making any stealing attempts + + + + + + + + + size_t + size_t tf::Executor::num_queues + () const noexcept + num_queues + tf::Executor::num_queues + +queries the number of queues used in the work-stealing loop + + + + + + + + size_t size_t tf::Executor::num_topologies () const num_topologies + tf::Executor::num_topologies queries the number of running topologies at the time of this call When a taskflow is submitted to an executor, a topology is created to store runtime metadata of the running taskflow. When the execution of the submitted taskflow finishes, its corresponding topology will be removed from the executor. executor.run(taskflow); -std::cout<<executor.num_topologies();//0or1(taskflowstillrunning) +std::cout<<executor.num_topologies();//0or1(taskflowstillrunning) - + - + size_t size_t tf::Executor::num_taskflows () const num_taskflows + tf::Executor::num_taskflows queries the number of running taskflows with moved ownership -executor.run(std::move(taskflow)); -std::cout<<executor.num_taskflows();//0or1(taskflowstillrunning) +executor.run(std::move(taskflow)); +std::cout<<executor.num_taskflows();//0or1(taskflowstillrunning) - + - + int int tf::Executor::this_worker_id () const this_worker_id + tf::Executor::this_worker_id -queries the id of the caller thread in this executor +queries the id of the caller thread within this executor Each worker has an unique id in the range of 0 to N-1 associated with its parent executor. If the caller thread does not belong to the executor, -1 is returned. @@ -1186,14 +1203,14 @@ The method keeps the caller worker running in the work-stealing loop until the s executor.this_worker_id();//-1(mainthreadisnotaworker) taskflow.emplace([&](){ -std::cout<<executor.this_worker_id();//0,1,2,or3 +std::cout<<executor.this_worker_id();//0,1,2,or3 }); executor.run(taskflow); - + @@ -1206,10 +1223,11 @@ The method keeps the caller worker running in the work-stealing loop until the s ArgsT - std::shared_ptr< Observer > + std::shared_ptr< Observer > std::shared_ptr< Observer > tf::Executor::make_observer (ArgsT &&... args) make_observer + tf::Executor::make_observer ArgsT &&... args @@ -1251,7 +1269,7 @@ Each executor manages a list of observers with shared ownership with callers. Fo - + @@ -1263,8 +1281,9 @@ Each executor manages a list of observers with shared ownership with callers. Fo void tf::Executor::remove_observer (std::shared_ptr< Observer > observer) remove_observer + tf::Executor::remove_observer - std::shared_ptr< Observer > + std::shared_ptr< Observer > observer @@ -1275,13 +1294,14 @@ Each executor manages a list of observers with shared ownership with callers. Fo - + - + size_t size_t tf::Executor::num_observers () const noexcept num_observers + tf::Executor::num_observers queries the number of observers @@ -1289,7 +1309,7 @@ Each executor manages a list of observers with shared ownership with callers. Fo - + @@ -1304,6 +1324,7 @@ Each executor manages a list of observers with shared ownership with callers. Fo auto tf::Executor::async (P &&params, F &&func) async + tf::Executor::async P && params @@ -1353,8 +1374,8 @@ Each executor manages a list of observers with shared ownership with callers. 
Fo a std::future that will hold the result of the execution The method creates a parameterized asynchronous task to run the given function and return a std::future object that eventually will hold the result of the execution. -std::future<int>future=executor.async("name",[](){ -std::cout<<"createanasynchronoustaskwithanameandreturns1\n"; +std::future<int>future=executor.async("name",[](){ +std::cout<<"createanasynchronoustaskwithanameandreturns1\n"; return1; }); future.get(); @@ -1363,7 +1384,7 @@ The method creates a parameterized asynchronous task to run the given function a - + @@ -1375,6 +1396,7 @@ The method creates a parameterized asynchronous task to run the given function a auto tf::Executor::async (F &&func) async + tf::Executor::async F && func @@ -1404,8 +1426,8 @@ The method creates a parameterized asynchronous task to run the given function a a std::future that will hold the result of the execution The method creates an asynchronous task to run the given function and return a std::future object that eventually will hold the result of the return value. -std::future<int>future=executor.async([](){ -std::cout<<"createanasynchronoustaskandreturns1\n"; +std::future<int>future=executor.async([](){ +std::cout<<"createanasynchronoustaskandreturns1\n"; return1; }); future.get(); @@ -1414,7 +1436,7 @@ The method creates an asynchronous task to run the given function and return a < - + @@ -1429,6 +1451,7 @@ The method creates an asynchronous task to run the given function and return a < void tf::Executor::silent_async (P &&params, F &&func) silent_async + tf::Executor::silent_async P && params @@ -1469,7 +1492,7 @@ The method creates an asynchronous task to run the given function and return a < The method creates a parameterized asynchronous task to run the given function without returning any std::future object. This member function is more efficient than tf::Executor::async and is encouraged to use when applications do not need a std::future to acquire the result or synchronize the execution. executor.silent_async("name",[](){ -std::cout<<"createanasynchronoustaskwithanameandnoreturn\n"; +std::cout<<"createanasynchronoustaskwithanameandnoreturn\n"; }); executor.wait_for_all(); @@ -1477,7 +1500,7 @@ The method creates a parameterized asynchronous task to run the given function w - + @@ -1489,6 +1512,7 @@ The method creates a parameterized asynchronous task to run the given function w void tf::Executor::silent_async (F &&func) silent_async + tf::Executor::silent_async F && func @@ -1517,7 +1541,7 @@ The method creates a parameterized asynchronous task to run the given function w The method creates an asynchronous task to run the given function without returning any std::future object. This member function is more efficient than tf::Executor::async and is encouraged to use when applications do not need a std::future to acquire the result or synchronize the execution. executor.silent_async([](){ -std::cout<<"createanasynchronoustaskwithnoreturn\n"; +std::cout<<"createanasynchronoustaskwithnoreturn\n"; }); executor.wait_for_all(); @@ -1525,7 +1549,7 @@ The method creates an asynchronous task to run the given function without return - + @@ -1546,6 +1570,7 @@ The method creates an asynchronous task to run the given function without return tf::AsyncTask tf::Executor::silent_dependent_async (F &&func, Tasks &&... 
tasks) silent_dependent_async + tf::Executor::silent_dependent_async F && func @@ -1555,7 +1580,7 @@ The method creates an asynchronous task to run the given function without return tasks -runs the given function asynchronously when the given dependents finish +runs the given function asynchronously when the given predecessors finish @@ -1595,16 +1620,16 @@ The method creates an asynchronous task to run the given function without return a tf::AsyncTask handle This member function is more efficient than tf::Executor::dependent_async and is encouraged to use when you do not want a std::future to acquire the result or synchronize the execution. The example below creates three asynchronous tasks, A, B, and C, in which task C runs after task A and task B. -tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");}); -tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");}); -executor.silent_dependent_async([](){printf("CrunsafterAandB\n");},A,B); +tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");}); +tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");}); +executor.silent_dependent_async([](){printf("CrunsafterAandB\n");},A,B); executor.wait_for_all(); This member function is thread-safe. - + @@ -1628,6 +1653,7 @@ This member function is more efficient than tasks -runs the given function asynchronously when the given dependents finish +runs the given function asynchronously when the given predecessors finish @@ -1689,10 +1715,10 @@ This member function is more efficient than a tf::AsyncTask handle This member function is more efficient than tf::Executor::dependent_async and is encouraged to use when you do not want a std::future to acquire the result or synchronize the execution. The example below creates three asynchronous tasks, A, B, and C, in which task C runs after task A and task B. Assigned task names will appear in the observers of the executor. -tf::AsyncTaskA=executor.silent_dependent_async("A",[](){printf("A\n");}); -tf::AsyncTaskB=executor.silent_dependent_async("B",[](){printf("B\n");}); +tf::AsyncTaskA=executor.silent_dependent_async("A",[](){printf("A\n");}); +tf::AsyncTaskB=executor.silent_dependent_async("B",[](){printf("B\n");}); executor.silent_dependent_async( -"C",[](){printf("CrunsafterAandB\n");},A,B +"C",[](){printf("CrunsafterAandB\n");},A,B ); executor.wait_for_all(); @@ -1700,7 +1726,7 @@ This member function is more efficient than + @@ -1719,6 +1745,7 @@ This member function is more efficient than last -runs the given function asynchronously when the given range of dependents finish +runs the given function asynchronously when the given range of predecessors finish @@ -1780,12 +1807,12 @@ This member function is more efficient than a tf::AsyncTask handle This member function is more efficient than tf::Executor::dependent_async and is encouraged to use when you do not want a std::future to acquire the result or synchronize the execution. The example below creates three asynchronous tasks, A, B, and C, in which task C runs after task A and task B. 
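Before that example, note that the same interface scales to arbitrary task graphs; a sketch of a small diamond (A before B and C, D after both):

tf::AsyncTask A = executor.silent_dependent_async([](){ std::printf("A\n"); });
tf::AsyncTask B = executor.silent_dependent_async([](){ std::printf("B\n"); }, A);
tf::AsyncTask C = executor.silent_dependent_async([](){ std::printf("C\n"); }, A);
executor.silent_dependent_async([](){ std::printf("D\n"); }, B, C);
executor.wait_for_all();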
-std::array<tf::AsyncTask, 2>array{ -executor.silent_dependent_async([](){printf("A\n");}), -executor.silent_dependent_async([](){printf("B\n");}) +std::array<tf::AsyncTask, 2>array{ +executor.silent_dependent_async([](){printf("A\n");}), +executor.silent_dependent_async([](){printf("B\n");}) }; executor.silent_dependent_async( -[](){printf("CrunsafterAandB\n");},array.begin(),array.end() +[](){printf("CrunsafterAandB\n");},array.begin(),array.end() ); executor.wait_for_all(); @@ -1793,7 +1820,7 @@ This member function is more efficient than + @@ -1815,6 +1842,7 @@ This member function is more efficient than last -runs the given function asynchronously when the given range of dependents finish +runs the given function asynchronously when the given range of predecessors finish @@ -1888,12 +1916,12 @@ This member function is more efficient than a tf::AsyncTask handle This member function is more efficient than tf::Executor::dependent_async and is encouraged to use when you do not want a std::future to acquire the result or synchronize the execution. The example below creates three asynchronous tasks, A, B, and C, in which task C runs after task A and task B. Assigned task names will appear in the observers of the executor. -std::array<tf::AsyncTask, 2>array{ -executor.silent_dependent_async("A",[](){printf("A\n");}), -executor.silent_dependent_async("B",[](){printf("B\n");}) +std::array<tf::AsyncTask, 2>array{ +executor.silent_dependent_async("A",[](){printf("A\n");}), +executor.silent_dependent_async("B",[](){printf("B\n");}) }; executor.silent_dependent_async( -"C",[](){printf("CrunsafterAandB\n");},array.begin(),array.end() +"C",[](){printf("CrunsafterAandB\n");},array.begin(),array.end() ); executor.wait_for_all(); @@ -1901,7 +1929,7 @@ This member function is more efficient than + @@ -1922,6 +1950,7 @@ This member function is more efficient than tasks -runs the given function asynchronously when the given dependents finish +runs the given function asynchronously when the given predecessors finish @@ -1971,11 +2000,11 @@ This member function is more efficient than a pair of a tf::AsyncTask handle and a std::future that holds the result of the execution The example below creates three asynchronous tasks, A, B, and C, in which task C runs after task A and task B. Task C returns a pair of its tf::AsyncTask handle and a std::future<int> that eventually will hold the result of the execution. -tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");}); -tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");}); +tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");}); +tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");}); auto[C,fuC]=executor.dependent_async( [](){ -printf("CrunsafterAandB\n"); +printf("CrunsafterAandB\n"); return1; }, A,B @@ -1987,7 +2016,7 @@ The example below creates three asynchronous tasks, A - + @@ -2011,6 +2040,7 @@ The example below creates three asynchronous tasks, Aauto tf::Executor::dependent_async (P &&params, F &&func, Tasks &&... 
tasks) dependent_async + tf::Executor::dependent_async P && params @@ -2024,7 +2054,7 @@ The example below creates three asynchronous tasks, Atasks -runs the given function asynchronously when the given dependents finish +runs the given function asynchronously when the given predecessors finish @@ -2080,12 +2110,12 @@ The example below creates three asynchronous tasks, Aa pair of a tf::AsyncTask handle and a std::future that holds the result of the execution The example below creates three named asynchronous tasks, A, B, and C, in which task C runs after task A and task B. Task C returns a pair of its tf::AsyncTask handle and a std::future<int> that eventually will hold the result of the execution. Assigned task names will appear in the observers of the executor. -tf::AsyncTaskA=executor.silent_dependent_async("A",[](){printf("A\n");}); -tf::AsyncTaskB=executor.silent_dependent_async("B",[](){printf("B\n");}); +tf::AsyncTaskA=executor.silent_dependent_async("A",[](){printf("A\n");}); +tf::AsyncTaskB=executor.silent_dependent_async("B",[](){printf("B\n");}); auto[C,fuC]=executor.dependent_async( "C", [](){ -printf("CrunsafterAandB\n"); +printf("CrunsafterAandB\n"); return1; }, A,B @@ -2097,7 +2127,7 @@ The example below creates three named asynchronous tasks, A - + @@ -2116,6 +2146,7 @@ The example below creates three named asynchronous tasks, Aauto tf::Executor::dependent_async (F &&func, I first, I last) dependent_async + tf::Executor::dependent_async F && func @@ -2129,7 +2160,7 @@ The example below creates three named asynchronous tasks, Alast -runs the given function asynchronously when the given range of dependents finish +runs the given function asynchronously when the given range of predecessors finish @@ -2177,13 +2208,13 @@ The example below creates three named asynchronous tasks, Aa pair of a tf::AsyncTask handle and a std::future that holds the result of the execution The example below creates three asynchronous tasks, A, B, and C, in which task C runs after task A and task B. Task C returns a pair of its tf::AsyncTask handle and a std::future<int> that eventually will hold the result of the execution. -std::array<tf::AsyncTask, 2>array{ -executor.silent_dependent_async([](){printf("A\n");}), -executor.silent_dependent_async([](){printf("B\n");}) +std::array<tf::AsyncTask, 2>array{ +executor.silent_dependent_async([](){printf("A\n");}), +executor.silent_dependent_async([](){printf("B\n");}) }; auto[C,fuC]=executor.dependent_async( [](){ -printf("CrunsafterAandB\n"); +printf("CrunsafterAandB\n"); return1; }, array.begin(),array.end() @@ -2195,7 +2226,7 @@ The example below creates three asynchronous tasks, A - + @@ -2217,6 +2248,7 @@ The example below creates three asynchronous tasks, Aauto tf::Executor::dependent_async (P &&params, F &&func, I first, I last) dependent_async + tf::Executor::dependent_async P && params @@ -2234,7 +2266,7 @@ The example below creates three asynchronous tasks, Alast -runs the given function asynchronously when the given range of dependents finish +runs the given function asynchronously when the given range of predecessors finish @@ -2298,14 +2330,14 @@ The example below creates three asynchronous tasks, Aa pair of a tf::AsyncTask handle and a std::future that holds the result of the execution The example below creates three named asynchronous tasks, A, B, and C, in which task C runs after task A and task B. Task C returns a pair of its tf::AsyncTask handle and a std::future<int> that eventually will hold the result of the execution. 
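A compact sketch of consuming that future (the returned value 42 is illustrative):

tf::AsyncTask A = executor.silent_dependent_async([](){ std::printf("A\n"); });
auto [B, fuB] = executor.dependent_async([](){ return 42; }, A);

// fuB is a std::future<int>; once it is ready, B has finished,
// which in turn implies its predecessor A has finished
std::printf("B returned %d\n", fuB.get());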
Assigned task names will appear in the observers of the executor. -std::array<tf::AsyncTask, 2>array{ -executor.silent_dependent_async("A",[](){printf("A\n");}), -executor.silent_dependent_async("B",[](){printf("B\n");}) +std::array<tf::AsyncTask, 2>array{ +executor.silent_dependent_async("A",[](){printf("A\n");}), +executor.silent_dependent_async("B",[](){printf("B\n");}) }; auto[C,fuC]=executor.dependent_async( "C", [](){ -printf("CrunsafterAandB\n"); +printf("CrunsafterAandB\n"); return1; }, array.begin(),array.end() @@ -2317,75 +2349,35 @@ The example below creates three named asynchronous tasks, A - + - - - - Worker * - Worker * tf::Executor::_this_worker + + + + void + void tf::Executor::_shutdown () - _this_worker - - - - - - - - - - bool - bool tf::Executor::_wait_for_task - (Worker &, Node *&) - _wait_for_task - - Worker & - worker - - - Node *& - t - - - - - - - - - - - bool - bool tf::Executor::_invoke_module_task_internal - (Worker &, Node *) - _invoke_module_task_internal - - Worker & - - - Node * - + _shutdown + tf::Executor::_shutdown - + - + void void tf::Executor::_observer_prologue (Worker &, Node *) _observer_prologue + tf::Executor::_observer_prologue Worker & - worker Node * - node @@ -2393,20 +2385,19 @@ The example below creates three named asynchronous tasks, A - + - + void void tf::Executor::_observer_epilogue (Worker &, Node *) _observer_epilogue + tf::Executor::_observer_epilogue Worker & - worker Node * - node @@ -2414,16 +2405,16 @@ The example below creates three named asynchronous tasks, A - + - + void void tf::Executor::_spawn (size_t) _spawn + tf::Executor::_spawn size_t - N @@ -2431,20 +2422,19 @@ The example below creates three named asynchronous tasks, A - + - + void void tf::Executor::_exploit_task (Worker &, Node *&) _exploit_task + tf::Executor::_exploit_task Worker & - w Node *& - t @@ -2452,20 +2442,19 @@ The example below creates three named asynchronous tasks, A - + - - void - void tf::Executor::_explore_task + + bool + bool tf::Executor::_explore_task (Worker &, Node *&) _explore_task + tf::Executor::_explore_task Worker & - w Node *& - t @@ -2473,20 +2462,19 @@ The example below creates three named asynchronous tasks, A - + - + void void tf::Executor::_schedule (Worker &, Node *) _schedule + tf::Executor::_schedule Worker & - worker Node * - node @@ -2494,16 +2482,16 @@ The example below creates three named asynchronous tasks, A - + - + void void tf::Executor::_schedule (Node *) _schedule + tf::Executor::_schedule Node * - node @@ -2511,20 +2499,19 @@ The example below creates three named asynchronous tasks, A - + - + void - void tf::Executor::_schedule - (Worker &, const SmallVector< Node * > &) - _schedule + void tf::Executor::_set_up_topology + (Worker *, Topology *) + _set_up_topology + tf::Executor::_set_up_topology - Worker & - worker + Worker * - const SmallVector< Node * > & - nodes + Topology * @@ -2532,16 +2519,19 @@ The example below creates three named asynchronous tasks, A - + - + void - void tf::Executor::_schedule - (const SmallVector< Node * > &) - _schedule + void tf::Executor::_tear_down_topology + (Worker &, Topology *) + _tear_down_topology + tf::Executor::_tear_down_topology + + Worker & + - const SmallVector< Node * > & - nodes + Topology * @@ -2549,20 +2539,22 @@ The example below creates three named asynchronous tasks, A - + - + void - void tf::Executor::_set_up_topology - (Worker *, Topology *) - _set_up_topology + void tf::Executor::_tear_down_async + (Worker &, Node *, Node *&) + _tear_down_async + 
tf::Executor::_tear_down_async - Worker * - worker + Worker & - Topology * - tpg + Node * + + + Node *& @@ -2570,32 +2562,45 @@ The example below creates three named asynchronous tasks, A - + - + void - void tf::Executor::_set_up_graph - (Graph &, Node *, Topology *, int, SmallVector< Node * > &) - _set_up_graph + void tf::Executor::_tear_down_dependent_async + (Worker &, Node *, Node *&) + _tear_down_dependent_async + tf::Executor::_tear_down_dependent_async - Graph & - g + Worker & Node * - parent - Topology * - tpg + Node *& + + + + + + + + + + + void + void tf::Executor::_tear_down_invoke + (Worker &, Node *, Node *&) + _tear_down_invoke + tf::Executor::_tear_down_invoke + + Worker & - int - state + Node * - SmallVector< Node * > & - src + Node *& @@ -2603,20 +2608,47 @@ The example below creates three named asynchronous tasks, A - + - + void - void tf::Executor::_tear_down_topology - (Worker &, Topology *) - _tear_down_topology + void tf::Executor::_increment_topology + () + _increment_topology + tf::Executor::_increment_topology + + + + + + + + + + void + void tf::Executor::_decrement_topology + () + _decrement_topology + tf::Executor::_decrement_topology + + + + + + + + + + void + void tf::Executor::_invoke + (Worker &, Node *) + _invoke + tf::Executor::_invoke Worker & - worker - Topology * - tpg + Node * @@ -2624,13 +2656,17 @@ The example below creates three named asynchronous tasks, A - + - + void - void tf::Executor::_tear_down_async - (Node *) - _tear_down_async + void tf::Executor::_invoke_static_task + (Worker &, Node *) + _invoke_static_task + tf::Executor::_invoke_static_task + + Worker & + Node * @@ -2640,39 +2676,45 @@ The example below creates three named asynchronous tasks, A - + - + void - void tf::Executor::_tear_down_dependent_async - (Worker &, Node *) - _tear_down_dependent_async + void tf::Executor::_invoke_condition_task + (Worker &, Node *, SmallVector< int > &) + _invoke_condition_task + tf::Executor::_invoke_condition_task Worker & Node * + + SmallVector< int > & + - + - + void - void tf::Executor::_tear_down_invoke - (Worker &, Node *) - _tear_down_invoke + void tf::Executor::_invoke_multi_condition_task + (Worker &, Node *, SmallVector< int > &) + _invoke_multi_condition_task + tf::Executor::_invoke_multi_condition_task Worker & - worker Node * - node + + + SmallVector< int > & @@ -2680,46 +2722,82 @@ The example below creates three named asynchronous tasks, A - + - + void - void tf::Executor::_increment_topology - () - _increment_topology + void tf::Executor::_process_dependent_async + (Node *, tf::AsyncTask &, size_t &) + _process_dependent_async + tf::Executor::_process_dependent_async + + Node * + + + tf::AsyncTask & + + + size_t & + - + - + void - void tf::Executor::_decrement_topology - () - _decrement_topology + void tf::Executor::_process_exception + (Worker &, Node *) + _process_exception + tf::Executor::_process_exception + + Worker & + + + Node * + - + - + void - void tf::Executor::_invoke - (Worker &, Node *) - _invoke + void tf::Executor::_schedule_async_task + (Node *) + _schedule_async_task + tf::Executor::_schedule_async_task + + Node * + + + + + + + + + + + void + void tf::Executor::_update_cache + (Worker &, Node *&, Node *) + _update_cache + tf::Executor::_update_cache Worker & - worker + + + Node *& Node * - node @@ -2727,20 +2805,39 @@ The example below creates three named asynchronous tasks, A - + - - void - void tf::Executor::_invoke_static_task + + bool + bool tf::Executor::_wait_for_task + (Worker &, Node *&) + _wait_for_task + 
tf::Executor::_wait_for_task + + Worker & + + + Node *& + + + + + + + + + + + bool + bool tf::Executor::_invoke_subflow_task (Worker &, Node *) - _invoke_static_task + _invoke_subflow_task + tf::Executor::_invoke_subflow_task Worker & - worker Node * - node @@ -2748,20 +2845,19 @@ The example below creates three named asynchronous tasks, A - + - - void - void tf::Executor::_invoke_subflow_task + + bool + bool tf::Executor::_invoke_module_task (Worker &, Node *) - _invoke_subflow_task + _invoke_module_task + tf::Executor::_invoke_module_task Worker & - w Node * - node @@ -2769,24 +2865,22 @@ The example below creates three named asynchronous tasks, A - + - - void - void tf::Executor::_detach_subflow_task + + bool + bool tf::Executor::_invoke_module_task_impl (Worker &, Node *, Graph &) - _detach_subflow_task + _invoke_module_task_impl + tf::Executor::_invoke_module_task_impl Worker & - w Node * - p Graph & - g @@ -2794,24 +2888,39 @@ The example below creates three named asynchronous tasks, A - + - - void - void tf::Executor::_invoke_condition_task - (Worker &, Node *, SmallVector< int > &) - _invoke_condition_task + + bool + bool tf::Executor::_invoke_async_task + (Worker &, Node *) + _invoke_async_task + tf::Executor::_invoke_async_task Worker & - worker Node * - node + + + + + + + + + + + bool + bool tf::Executor::_invoke_dependent_async_task + (Worker &, Node *) + _invoke_dependent_async_task + tf::Executor::_invoke_dependent_async_task + + Worker & - SmallVector< int > & - conds + Node * @@ -2819,13 +2928,14 @@ The example below creates three named asynchronous tasks, A - + - - void - void tf::Executor::_invoke_multi_condition_task - (Worker &, Node *, SmallVector< int > &) - _invoke_multi_condition_task + + bool + bool tf::Executor::_invoke_runtime_task + (Worker &, Node *) + _invoke_runtime_task + tf::Executor::_invoke_runtime_task Worker & worker @@ -2834,44 +2944,46 @@ The example below creates three named asynchronous tasks, ANode * node - - SmallVector< int > & - conds - - + - - void - void tf::Executor::_invoke_module_task - (Worker &, Node *) - _invoke_module_task + + bool + bool tf::Executor::_invoke_runtime_task_impl + (Worker &, Node *, std::function< void(Runtime &)> &) + _invoke_runtime_task_impl + tf::Executor::_invoke_runtime_task_impl Worker & - w + worker Node * node + + std::function< void(Runtime &)> & + work + - + - - void - void tf::Executor::_invoke_async_task - (Worker &, Node *) - _invoke_async_task + + bool + bool tf::Executor::_invoke_runtime_task_impl + (Worker &, Node *, std::function< void(Runtime &, bool)> &) + _invoke_runtime_task_impl + tf::Executor::_invoke_runtime_task_impl Worker & worker @@ -2880,26 +2992,65 @@ The example below creates three named asynchronous tasks, ANode * node + + std::function< void(Runtime &, bool)> & + work + - + - + + + + typename I + + + I + I tf::Executor::_set_up_graph + (I, I, Topology *, Node *) + _set_up_graph + tf::Executor::_set_up_graph + + I + + + I + + + Topology * + + + Node * + + + + + + + + + + + + + typename P + + void - void tf::Executor::_invoke_dependent_async_task - (Worker &, Node *) - _invoke_dependent_async_task + void tf::Executor::_corun_until + (Worker &, P &&) + _corun_until + tf::Executor::_corun_until Worker & - worker - Node * - node + P && @@ -2907,21 +3058,30 @@ The example below creates three named asynchronous tasks, A - + - + + + + typename I + + void - void tf::Executor::_process_async_dependent - (Node *, tf::AsyncTask &, size_t &) - _process_async_dependent + void 
tf::Executor::_corun_graph + (Worker &, Node *, I, I) + _corun_graph + tf::Executor::_corun_graph + + Worker & + Node * - tf::AsyncTask & + I - size_t & + I @@ -2929,19 +3089,27 @@ The example below creates three named asynchronous tasks, A - + - + + + + typename I + + void - void tf::Executor::_process_exception - (Worker &, Node *) - _process_exception + void tf::Executor::_schedule + (Worker &, I, I) + _schedule + tf::Executor::_schedule Worker & - Node * - node + I + + + I @@ -2949,15 +3117,24 @@ The example below creates three named asynchronous tasks, A - + - + + + + typename I + + void - void tf::Executor::_schedule_async_task - (Node *) - _schedule_async_task + void tf::Executor::_schedule + (I, I) + _schedule + tf::Executor::_schedule - Node * + I + + + I @@ -2965,24 +3142,64 @@ The example below creates three named asynchronous tasks, A - + - + + + + typename I + + void - void tf::Executor::_corun_graph - (Worker &, Node *, Graph &) - _corun_graph + void tf::Executor::_schedule_graph_with_parent + (Worker &, I, I, Node *) + _schedule_graph_with_parent + tf::Executor::_schedule_graph_with_parent Worker & - w + + + I + + + I Node * - p + + + + + + + + + + + + typename P + + + typename F + + + auto + auto tf::Executor::_async + (P &&, F &&, Topology *, Node *) + _async + tf::Executor::_async - Graph & - g + P && + + + F && + + + Topology * + + + Node * @@ -2990,25 +3207,33 @@ The example below creates three named asynchronous tasks, A - + - + typename P + + typename F + void - void tf::Executor::_corun_until - (Worker &, P &&) - _corun_until + void tf::Executor::_silent_async + (P &&, F &&, Topology *, Node *) + _silent_async + tf::Executor::_silent_async - Worker & - w + P && - P && - stop_predicate + F && + + + Topology * + + + Node * @@ -3016,89 +3241,97 @@ The example below creates three named asynchronous tasks, A - + - + -class to create an executor for running a taskflow graph +class to create an executor -An executor manages a set of worker threads to run one or multiple taskflows using an efficient work-stealing scheduling algorithm. +An tf::Executor manages a set of worker threads to run tasks using an efficient work-stealing scheduling algorithm. //Declareanexecutorandataskflow tf::Executorexecutor; tf::Taskflowtaskflow; //Addthreetasksintothetaskflow -tf::TaskA=taskflow.emplace([](){std::cout<<"ThisisTaskA\n";}); -tf::TaskB=taskflow.emplace([](){std::cout<<"ThisisTaskB\n";}); -tf::TaskC=taskflow.emplace([](){std::cout<<"ThisisTaskC\n";}); +tf::TaskA=taskflow.emplace([](){std::cout<<"ThisisTaskA\n";}); +tf::TaskB=taskflow.emplace([](){std::cout<<"ThisisTaskB\n";}); +tf::TaskC=taskflow.emplace([](){std::cout<<"ThisisTaskC\n";}); //Buildprecedencebetweentasks A.precede(B,C); tf::Future<void>fu=executor.run(taskflow); -fu.wait();//blockuntiltheexecutioncompletes +fu.wait();//blockuntiltheexecutioncompletes -executor.run(taskflow,[](){std::cout<<"endof1run";}).wait(); +executor.run(taskflow,[](){std::cout<<"endof1run";}).wait(); executor.run_n(taskflow,4); executor.wait_for_all();//blockuntilallassociatedexecutionsfinish -executor.run_n(taskflow,4,[](){std::cout<<"endof4runs";}).wait(); +executor.run_n(taskflow,4,[](){std::cout<<"endof4runs";}).wait(); executor.run_until(taskflow,[cnt=0]()mutable{return++cnt==10;}); -All the run methods are thread-safe. You can submit multiple taskflows at the same time to an executor from different threads. +All executor methods are thread-safe. 
For example, you can submit multiple taskflows to an executor concurrently from different threads, while other threads simultaneously create asynchronous tasks. +std::threadt1([&](){executor.run(taskflow);}; +std::threadt2([&](){executor.async([](){std::cout<<"asynctaskfromt2\n";});}); +executor.async([&](){std::cout<<"asynctaskfromthemainthread\n";}); + +To know more about tf::Executor, please refer to Executor. + + - + - tf::Executor_all_spawned - tf::Executor_corun_graph + tf::Executor_async + tf::Executor_buffers + tf::Executor_corun_graph tf::Executor_corun_until tf::Executor_decrement_topology - tf::Executor_detach_subflow_task - tf::Executor_done tf::Executor_exploit_task - tf::Executor_explore_task + tf::Executor_explore_task tf::Executor_increment_topology tf::Executor_invoke - tf::Executor_invoke_async_task + tf::Executor_invoke_async_task tf::Executor_invoke_condition_task - tf::Executor_invoke_dependent_async_task - tf::Executor_invoke_module_task - tf::Executor_invoke_module_task_internal + tf::Executor_invoke_dependent_async_task + tf::Executor_invoke_module_task + tf::Executor_invoke_module_task_impl tf::Executor_invoke_multi_condition_task + tf::Executor_invoke_runtime_task + tf::Executor_invoke_runtime_task_impl + tf::Executor_invoke_runtime_task_impl tf::Executor_invoke_static_task - tf::Executor_invoke_subflow_task - tf::Executor_MAX_STEALS - tf::Executor_notifier - tf::Executor_num_topologies + tf::Executor_invoke_subflow_task + tf::Executor_notifier tf::Executor_num_topologies tf::Executor_observer_epilogue tf::Executor_observer_prologue tf::Executor_observers - tf::Executor_process_async_dependent + tf::Executor_process_dependent_async tf::Executor_process_exception tf::Executor_schedule tf::Executor_schedule - tf::Executor_schedule - tf::Executor_schedule + tf::Executor_schedule + tf::Executor_schedule tf::Executor_schedule_async_task - tf::Executor_set_up_graph + tf::Executor_schedule_graph_with_parent + tf::Executor_set_up_graph tf::Executor_set_up_topology + tf::Executor_shutdown + tf::Executor_silent_async tf::Executor_spawn tf::Executor_taskflows tf::Executor_taskflows_mutex - tf::Executor_tear_down_async - tf::Executor_tear_down_dependent_async - tf::Executor_tear_down_invoke + tf::Executor_tear_down_async + tf::Executor_tear_down_dependent_async + tf::Executor_tear_down_invoke tf::Executor_tear_down_topology - tf::Executor_this_worker - tf::Executor_threads tf::Executor_topology_cv tf::Executor_topology_mutex + tf::Executor_update_cache tf::Executor_wait_for_task - tf::Executor_wids + tf::Executor_worker_interface tf::Executor_workers - tf::Executor_wsq - tf::Executor_wsq_mutex + tf::ExecutorAlgorithm tf::Executorasync tf::Executorasync tf::Executorcorun @@ -3107,12 +3340,14 @@ The example below creates three named asynchronous tasks, Atf::Executordependent_async tf::Executordependent_async tf::Executordependent_async - tf::ExecutorExecutor + tf::ExecutorExecutor tf::ExecutorFlowBuilder tf::Executormake_observer tf::Executornum_observers + tf::Executornum_queues tf::Executornum_taskflows tf::Executornum_topologies + tf::Executornum_waiters tf::Executornum_workers tf::Executorremove_observer tf::Executorrun diff --git a/docs/xml/classtf_1_1FlowBuilder.xml b/docs/xml/classtf_1_1FlowBuilder.xml index 568b59841..a09e2cd8b 100644 --- a/docs/xml/classtf_1_1FlowBuilder.xml +++ b/docs/xml/classtf_1_1FlowBuilder.xml @@ -1,16 +1,17 @@ - + tf::FlowBuilder tf::Subflow tf::Taskflow - flow_builder.hpp - + taskflow/core/flow_builder.hpp + class friend class Executor Executor 
+ tf::FlowBuilder::Executor Executor @@ -20,15 +21,16 @@ - + - - + + Graph & Graph& tf::FlowBuilder::_graph _graph + tf::FlowBuilder::_graph associated graph object @@ -36,15 +38,16 @@ - + - - + + tf::FlowBuilder::FlowBuilder (Graph &graph) FlowBuilder + tf::FlowBuilder::FlowBuilder Graph & graph @@ -56,7 +59,7 @@ - + @@ -72,6 +75,7 @@ Task tf::FlowBuilder::emplace (C &&callable) emplace + tf::FlowBuilder::emplace C && callable @@ -85,7 +89,7 @@ C -callable type constructible from std::function<void()> +callable type constructible from std::function<void()> @@ -103,11 +107,67 @@ The following example creates a static task. tf::Taskstatic_task=taskflow.emplace([](){}); -Please refer to Static Tasking for details. +Please refer to Static Tasking for details. + + + + + + + + + + + typename C + + + std::enable_if_t< is_runtime_task_v< C >, void > * + nullptr + + + Task + Task tf::FlowBuilder::emplace + (C &&callable) + emplace + tf::FlowBuilder::emplace + + C && + callable + + +creates a runtime task + + + + +C + + +callable type constructible from std::function<void(tf::Runtime&)> + + + + + +callable + + +callable to construct a runtime task + + + +a tf::Task handle + +The following example creates a runtime task. +tf::Taskstatic_task=taskflow.emplace([](tf::Runtime&){}); + +Please refer to Runtime Tasking for details. + + - + @@ -123,6 +183,7 @@ The following example creates a static task. Task tf::FlowBuilder::emplace (C &&callable) emplace + tf::FlowBuilder::emplace C && callable @@ -136,7 +197,7 @@ The following example creates a static task. C -callable type constructible from std::function<void(tf::Subflow&)> +callable type constructible from std::function<void(tf::Subflow&)> @@ -157,11 +218,13 @@ The following example creates a dynamic task (tf::Taskstatic_task2=sf.emplace([](){}); }); -Please refer to Subflow Tasking for details. +Please refer to Subflow Tasking for details. + + - + @@ -177,6 +240,7 @@ The following example creates a dynamic task (Task tf::FlowBuilder::emplace (C &&callable) emplace + tf::FlowBuilder::emplace C && callable @@ -190,7 +254,7 @@ The following example creates a dynamic task (C -callable type constructible from std::function<int()> +callable type constructible from std::function<int()> @@ -211,19 +275,21 @@ The following example creates an if-else block using one condition task and thre auto[init,cond,yes,no]=taskflow.emplace( [](){}, [](){return0;}, -[](){std::cout<<"yes\n";}, -[](){std::cout<<"no\n";} +[](){std::cout<<"yes\n";}, +[](){std::cout<<"no\n";} ); //executesyesifcondreturns0,ornoifcondreturns1 cond.precede(yes,no); cond.succeed(init); -Please refer to Conditional Tasking for details. +Please refer to Conditional Tasking for details. 
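Beyond the one-liner shown for the runtime-task overload documented earlier in this class, a runtime task can use its tf::Runtime handle to spawn work dynamically. A hedged sketch, assuming a taskflow and executor in scope; silent_async and corun_all follow the Runtime Tasking documentation and their exact names may differ across Taskflow versions:

// a runtime task receives a tf::Runtime handle into the running executor
taskflow.emplace([](tf::Runtime& rt){
  // dynamically spawn two asynchronous tasks from inside the task
  rt.silent_async([](){ std::cout << "child 1\n"; });
  rt.silent_async([](){ std::cout << "child 2\n"; });
  rt.corun_all();  // assumed API: co-run until both children finish
});

executor.run(taskflow).wait();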
+ + - + @@ -239,6 +305,7 @@ The following example creates an if-else block using one condition task and thre Task tf::FlowBuilder::emplace (C &&callable) emplace + tf::FlowBuilder::emplace C && callable @@ -252,7 +319,7 @@ The following example creates an if-else block using one condition task and thre C -callable type constructible from std::function<tf::SmallVector<int>()> +callable type constructible from std::function<tf::SmallVector<int>()> @@ -273,20 +340,22 @@ The following example creates a multi-condition task that selectively jumps to t auto[init,cond,branch1,branch2,branch3]=taskflow.emplace( [](){}, [](){returntf::SmallVector{0,2};}, -[](){std::cout<<"branch1\n";}, -[](){std::cout<<"branch2\n";}, -[](){std::cout<<"branch3\n";} +[](){std::cout<<"branch1\n";}, +[](){std::cout<<"branch2\n";}, +[](){std::cout<<"branch3\n";} ); //executesbranch1andbranch3whencondreturns0and2 cond.precede(branch1,branch2,branch3); cond.succeed(init); -Please refer to Conditional Tasking for details. +Please refer to Conditional Tasking for details. + + - + @@ -304,6 +373,7 @@ The following example creates a multi-condition task that selectively jumps to t auto tf::FlowBuilder::emplace (C &&... callables) emplace + tf::FlowBuilder::emplace C &&... callables @@ -334,22 +404,23 @@ The following example creates a multi-condition task that selectively jumps to t The method returns a tuple of tasks each corresponding to the given callable target. You can use structured binding to get the return tasks one by one. The following example creates four static tasks and assign them to A, B, C, and D using structured binding. auto[A,B,C,D]=taskflow.emplace( -[](){std::cout<<"A";}, -[](){std::cout<<"B";}, -[](){std::cout<<"C";}, -[](){std::cout<<"D";} +[](){std::cout<<"A";}, +[](){std::cout<<"B";}, +[](){std::cout<<"C";}, +[](){std::cout<<"D";} ); - + void void tf::FlowBuilder::erase (Task task) erase + tf::FlowBuilder::erase Task task @@ -368,10 +439,10 @@ The method returns a tuple of tasks each corresponding to the given callable tar Removes a task and its input and output dependencies from the graph associated with the flow builder. If the task does not belong to the graph, nothing will happen. -tf::TaskA=taskflow.emplace([](){std::cout<<"A";}); -tf::TaskB=taskflow.emplace([](){std::cout<<"B";}); -tf::TaskC=taskflow.emplace([](){std::cout<<"C";}); -tf::TaskD=taskflow.emplace([](){std::cout<<"D";}); +tf::TaskA=taskflow.emplace([](){std::cout<<"A";}); +tf::TaskB=taskflow.emplace([](){std::cout<<"B";}); +tf::TaskC=taskflow.emplace([](){std::cout<<"C";}); +tf::TaskD=taskflow.emplace([](){std::cout<<"D";}); A.precede(B,C,D); //eraseAfromthetaskflowanditsdependenciestoB,C,andD @@ -380,7 +451,7 @@ Removes a task and its input and output dependencies from the graph associated w - + @@ -392,6 +463,7 @@ Removes a task and its input and output dependencies from the graph associated w Task tf::FlowBuilder::composed_of (T &object) composed_of + tf::FlowBuilder::composed_of T & object @@ -422,11 +494,11 @@ Removes a task and its input and output dependencies from the graph associated w The example below demonstrates a taskflow composition using the composed_of method. 
tf::Taskflowt1,t2; -t1.emplace([](){std::cout<<"t1";}); +t1.emplace([](){std::cout<<"t1";}); //t2ispartiallycomposedoft1 tf::Taskcomp=t2.composed_of(t1); -tf::Taskinit=t2.emplace([](){std::cout<<"t2";}); +tf::Taskinit=t2.emplace([](){std::cout<<"t2";}); init.precede(comp); The taskflow object t2 is composed of another taskflow object t1, preceded by another static task init. When taskflow t2 is submitted to an executor, init will run first and then comp which spawns its definition in taskflow t1. @@ -437,7 +509,7 @@ The example below demonstrates a taskflow composition using the MyObj(){ tf::FlowBuilderbuilder(graph); tf::Tasktask=builder.emplace([](){ -std::cout<<"atask\n";//statictask +std::cout<<"atask\n";//statictask }); } Graph&graph(){returngraph;} @@ -446,17 +518,20 @@ The example below demonstrates a taskflow composition using the MyObjobj; tf::Taskcomp=taskflow.composed_of(obj); -Please refer to Composable Tasking for details. +Please refer to Composable Tasking for details. + + - + Task Task tf::FlowBuilder::placeholder () placeholder + tf::FlowBuilder::placeholder creates a placeholder task @@ -479,15 +554,16 @@ A placeholder task maps to a node in the taskflow graph, but it does not have an - + void void tf::FlowBuilder::linearize (std::vector< Task > &tasks) linearize + tf::FlowBuilder::linearize - std::vector< Task > & + std::vector< Task > & tasks @@ -504,25 +580,26 @@ A placeholder task maps to a node in the taskflow graph, but it does not have an This member function creates linear dependencies over a vector of tasks. -tf::TaskA=taskflow.emplace([](){std::cout<<"A";}); -tf::TaskB=taskflow.emplace([](){std::cout<<"B";}); -tf::TaskC=taskflow.emplace([](){std::cout<<"C";}); -tf::TaskD=taskflow.emplace([](){std::cout<<"D";}); -std::vector<tf::Task>tasks{A,B,C,D} +tf::TaskA=taskflow.emplace([](){std::cout<<"A";}); +tf::TaskB=taskflow.emplace([](){std::cout<<"B";}); +tf::TaskC=taskflow.emplace([](){std::cout<<"C";}); +tf::TaskD=taskflow.emplace([](){std::cout<<"D";}); +std::vector<tf::Task>tasks{A,B,C,D} taskflow.linearize(tasks);//A->B->C->D - + void void tf::FlowBuilder::linearize (std::initializer_list< Task > tasks) linearize + tf::FlowBuilder::linearize - std::initializer_list< Task > + std::initializer_list< Task > tasks @@ -539,16 +616,16 @@ This member function creates linear dependencies over a vector of tasks. This member function creates linear dependencies over a list of tasks. -tf::TaskA=taskflow.emplace([](){std::cout<<"A";}); -tf::TaskB=taskflow.emplace([](){std::cout<<"B";}); -tf::TaskC=taskflow.emplace([](){std::cout<<"C";}); -tf::TaskD=taskflow.emplace([](){std::cout<<"D";}); +tf::TaskA=taskflow.emplace([](){std::cout<<"A";}); +tf::TaskB=taskflow.emplace([](){std::cout<<"B";}); +tf::TaskC=taskflow.emplace([](){std::cout<<"C";}); +tf::TaskD=taskflow.emplace([](){std::cout<<"D";}); taskflow.linearize({A,B,C,D});//A->B->C->D - + @@ -563,13 +640,14 @@ This member function creates linear dependencies over a list of tasks. typename P - DefaultPartitioner + DefaultPartitioner Task Task tf::FlowBuilder::for_each (B first, E last, C callable, P part=P()) for_each + tf::FlowBuilder::for_each B first @@ -620,7 +698,7 @@ This member function creates linear dependencies over a list of tasks. 
P -partitioner type (default tf::DefaultPartitioner) +partitioner type (default tf::DefaultPartitioner) @@ -664,12 +742,14 @@ The task spawns asynchronous tasks that applies the callable object to each obje callable(*itr); } -Iterators are templated to enable stateful range using std::reference_wrapper. The callable needs to take a single argument of the dereferenced iterator type. -Please refer to Parallel Iterations for details. +Iterators can be made stateful by using std::reference_wrapper The callable needs to take a single argument of the dereferenced iterator type. +Please refer to Parallel Iterations for details. + + - + @@ -687,13 +767,14 @@ The task spawns asynchronous tasks that applies the callable object to each obje typename P - DefaultPartitioner + DefaultPartitioner Task Task tf::FlowBuilder::for_each_index (B first, E last, S step, C callable, P part=P()) for_each_index + tf::FlowBuilder::for_each_index B first @@ -716,7 +797,7 @@ The task spawns asynchronous tasks that applies the callable object to each obje P() -constructs an STL-styled index-based parallel-for task +constructs an index-based parallel-for task @@ -756,7 +837,7 @@ The task spawns asynchronous tasks that applies the callable object to each obje P -partitioner type (default tf::DefaultPartitioner) +partitioner type (default tf::DefaultPartitioner) @@ -814,12 +895,124 @@ The task spawns asynchronous tasks that applies the callable object to each inde callable(i); } -Iterators are templated to enable stateful range using std::reference_wrapper. The callable needs to take a single argument of the integral index type. -Please refer to Parallel Iterations for details. +Iterators can be made stateful by using std::reference_wrapper The callable needs to take a single argument of the integral index type. +Please refer to Parallel Iterations for details. + + + + + + + + + + + typename R + + + typename C + + + typename P + DefaultPartitioner + + + Task + Task tf::FlowBuilder::for_each_by_index + (R range, C callable, P part=P()) + for_each_by_index + tf::FlowBuilder::for_each_by_index + + R + range + + + C + callable + + + P + part + P() + + +constructs an index range-based parallel-for task + + + + +R + + +index range type (tf::IndexRange) + + + + +C + + +callable type + + + + +P + + +partitioner type (default tf::DefaultPartitioner) + + + + + +range + + +index range + + + + +callable + + +callable object to apply to each valid index + + + + +part + + +partitioning algorithm to schedule parallel iterations + + + +a tf::Task handle + +The task spawns asynchronous tasks that applies the callable object to in the range [first, last) with the step size. +//[0,17)withastepsizeof2usingtf::IndexRange +tf::IndexRange<int>range(0,17,2); + +//parallelizethesequence[0,2,4,6,8,10,12,14,16] +taskflow.for_each_by_index(range,[](tf::IndexRange<int>range){ +//iterateeachindexinthesubrange +for(inti=range.begin();i<range.end();i+=range.step_size()){ +printf("iterate%d\n",i); +} +}); + +executor.run(taskflow).wait(); + +The callable needs to take a single argument of type tf::IndexRange. +Please refer to Parallel Iterations for details. 
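A whitespace-restored version of the for_each_by_index example above, runnable as-is given a tf::Taskflow taskflow and tf::Executor executor in scope:

// iterate 0, 2, 4, ..., 16 in parallel, chunked by the scheduler
tf::IndexRange<int> range(0, 17, 2);

taskflow.for_each_by_index(range, [](tf::IndexRange<int> subrange){
  // each invocation receives a contiguous subrange of the index range
  for(int i = subrange.begin(); i < subrange.end(); i += subrange.step_size()) {
    printf("iterate %d\n", i);
  }
});

executor.run(taskflow).wait();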
+ + - + @@ -837,10 +1030,10 @@ The task spawns asynchronous tasks that applies the callable object to each inde typename P - DefaultPartitioner + DefaultPartitioner - std::enable_if_t< is_partitioner_v< std::decay_t< P >>, void > * + std::enable_if_t< is_partitioner_v< std::decay_t< P > >, void > * nullptr @@ -848,6 +1041,7 @@ The task spawns asynchronous tasks that applies the callable object to each inde Task tf::FlowBuilder::transform (B first1, E last1, O d_first, C c, P part=P()) transform + tf::FlowBuilder::transform B first1 @@ -910,7 +1104,7 @@ The task spawns asynchronous tasks that applies the callable object to each inde P -partitioner type (default tf::DefaultPartitioner) +partitioner type (default tf::DefaultPartitioner) @@ -962,12 +1156,14 @@ The task spawns asynchronous tasks that applies the callable object to an input *d_first++=c(*first1++); } -Iterators are templated to enable stateful range using std::reference_wrapper. The callable needs to take a single argument of the dereferenced iterator type. -Please refer to Parallel Transforms for details. +Iterators can be made stateful by using std::reference_wrapper The callable needs to take a single argument of the dereferenced iterator type. +Please refer to Parallel Transforms for details. + + - + @@ -988,10 +1184,10 @@ The task spawns asynchronous tasks that applies the callable object to an input typename P - DefaultPartitioner + DefaultPartitioner - std::enable_if_t<!is_partitioner_v< std::decay_t< C >>, void > * + std::enable_if_t<!is_partitioner_v< std::decay_t< C > >, void > * nullptr @@ -999,6 +1195,7 @@ The task spawns asynchronous tasks that applies the callable object to an input Task tf::FlowBuilder::transform (B1 first1, E1 last1, B2 first2, O d_first, C c, P part=P()) transform + tf::FlowBuilder::transform B1 first1 @@ -1073,7 +1270,7 @@ The task spawns asynchronous tasks that applies the callable object to an input P -partitioner type (default tf::DefaultPartitioner) +partitioner type (default tf::DefaultPartitioner) @@ -1133,12 +1330,14 @@ The task spawns asynchronous tasks that applies the callable object to two input *d_first++=c(*first1++,*first2++); } -Iterators are templated to enable stateful range using std::reference_wrapper. The callable needs to take two arguments of dereferenced elements from the two input ranges. -Please refer to Parallel Transforms for details. +Iterators can be made stateful by using std::reference_wrapper The callable needs to take two arguments of dereferenced elements from the two input ranges. +Please refer to Parallel Transforms for details. + + - + @@ -1156,13 +1355,14 @@ The task spawns asynchronous tasks that applies the callable object to two input typename P - DefaultPartitioner + DefaultPartitioner Task Task tf::FlowBuilder::reduce (B first, E last, T &init, O bop, P part=P()) reduce + tf::FlowBuilder::reduce B first @@ -1185,7 +1385,7 @@ The task spawns asynchronous tasks that applies the callable object to two input P() -constructs an STL-styled parallel-reduce task +constructs an STL-styled parallel-reduction task @@ -1225,7 +1425,7 @@ The task spawns asynchronous tasks that applies the callable object to two input P -partitioner type (default tf::DefaultPartitioner) +partitioner type (default tf::DefaultPartitioner) @@ -1277,12 +1477,181 @@ The task spawns asynchronous tasks to perform parallel reduction over init=bop(init,*itr); } -Iterators are templated to enable stateful range using std::reference_wrapper. -Please refer to Parallel Reduction for details. 
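A minimal parallel-reduction sketch matching the tf::FlowBuilder::reduce semantics above (note that the initial value participates in the reduction); taskflow and executor are assumed in scope:

std::vector<int> data(1000, 1);
int sum = 10;  // the initial value participates in the reduction

taskflow.reduce(data.begin(), data.end(), sum,
  [](int a, int b){ return a + b; }   // binary reduction operator
);

executor.run(taskflow).wait();
assert(sum == 1010);  // 1000 ones plus the initial value 10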
+Iterators can be made stateful by using std::reference_wrapper +Please refer to Parallel Reduction for details. + + + + + + + + + + + typename R + + + typename T + + + typename L + + + typename G + + + typename P + DefaultPartitioner + + + Task + Task tf::FlowBuilder::reduce_by_index + (R range, T &init, L lop, G gop, P part=P()) + reduce_by_index + tf::FlowBuilder::reduce_by_index + + R + range + + + T & + init + + + L + lop + + + G + gop + + + P + part + P() + + +constructs an index range-based parallel-reduction task + + + + +R + + +index range type (tf::IndexRange) + + + + +T + + +result type + + + + +L + + +local reducer type + + + + +G + + +global reducer type + + + + +P + + +partitioner type (default tf::DefaultPartitioner) + + + + + +range + + +index range + + + + +init + + +initial value of the reduction and the storage for the reduced result + + + + +lop + + +binary operator that will be applied locally per worker + + + + +gop + + +binary operator that will be applied globally among worker + + + + +part + + +partitioning algorithm to schedule parallel iterations + + + +a tf::Task handle + +The task spawns asynchronous tasks to perform parallel reduction over a range with init. The reduced result is store in init. Unlike the iterator-based reduction, index range-based reduction is particularly useful for applications that benefit from SIMD optimizations or other range-based processing strategies. +constsize_tN=1000000; +std::vector<int>data(N);//uninitializeddatavector +intres=1;//reswillparticipateinthereduction + +taskflow.reduce_by_index( +tf::IndexRange<size_t>(0,N,1), +//finalresult +res, +//localreducer +[&](tf::IndexRange<size_t>subrange,std::optional<int>running_total)->int{ +intresidual=running_total?*running_total:0.0; +for(size_ti=subrange.begin();i<subrange.end();i+=subrange.step_size()){ +data[i]=1.0; +residual+=data[i]; +} +printf("partialsum=%lf\n",residual); +returnresidual; +}, +//globalreducer +std::plus<int>() +); +executor.run(taskflow).wait(); +assert(res=N+1); + +Range can be made stateful by using std::reference_wrapper. +Please refer to Parallel Reduction for details. + + - + @@ -1303,10 +1672,10 @@ The task spawns asynchronous tasks to perform parallel reduction over typename P - DefaultPartitioner + DefaultPartitioner - std::enable_if_t< is_partitioner_v< std::decay_t< P >>, void > * + std::enable_if_t< is_partitioner_v< std::decay_t< P > >, void > * nullptr @@ -1314,6 +1683,7 @@ The task spawns asynchronous tasks to perform parallel reduction over Task tf::FlowBuilder::transform_reduce (B first, E last, T &init, BOP bop, UOP uop, P part=P()) transform_reduce + tf::FlowBuilder::transform_reduce B first @@ -1388,7 +1758,7 @@ The task spawns asynchronous tasks to perform parallel reduction over P -partitioner type (default tf::DefaultPartitioner) +partitioner type (default tf::DefaultPartitioner) @@ -1448,12 +1818,14 @@ The task spawns asynchronous tasks to perform parallel reduction over init=bop(init,uop(*itr)); } -Iterators are templated to enable stateful range using std::reference_wrapper. -Please refer to Parallel Reduction for details. +Iterators can be made stateful by using std::reference_wrapper +Please refer to Parallel Reduction for details. 
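The reduce_by_index example rendered earlier mixes int and double literals and asserts with = where == is intended; below is a corrected sketch under the same documented semantics, with taskflow and executor assumed in scope:

const size_t N = 1000000;
std::vector<int> data(N);  // uninitialized data vector
int res = 1;               // res participates in the reduction

taskflow.reduce_by_index(
  tf::IndexRange<size_t>(0, N, 1),
  res,
  // local reducer: initializes and sums a subrange on one worker
  [&](tf::IndexRange<size_t> subrange, std::optional<int> running_total) -> int {
    int residual = running_total ? *running_total : 0;
    for(size_t i = subrange.begin(); i < subrange.end(); i += subrange.step_size()) {
      data[i] = 1;
      residual += data[i];
    }
    return residual;
  },
  // global reducer: combines partial sums and the initial value
  std::plus<int>()
);

executor.run(taskflow).wait();
assert(res == static_cast<int>(N) + 1);  // N ones plus the initial value 1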
+ + - + @@ -1477,10 +1849,10 @@ The task spawns asynchronous tasks to perform parallel reduction over typename P - DefaultPartitioner + DefaultPartitioner - std::enable_if_t<!is_partitioner_v< std::decay_t< BOP_T >>, void > * + std::enable_if_t<!is_partitioner_v< std::decay_t< BOP_T > >, void > * nullptr @@ -1488,6 +1860,7 @@ The task spawns asynchronous tasks to perform parallel reduction over Task tf::FlowBuilder::transform_reduce (B1 first1, E1 last1, B2 first2, T &init, BOP_R bop_r, BOP_T bop_t, P part=P()) transform_reduce + tf::FlowBuilder::transform_reduce B1 first1 @@ -1574,7 +1947,7 @@ The task spawns asynchronous tasks to perform parallel reduction over P -partitioner type (default tf::DefaultPartitioner) +partitioner type (default tf::DefaultPartitioner) @@ -1642,14 +2015,16 @@ The task spawns asynchronous tasks to perform parallel reduction over init=bop_r(init,bop_t(*itr1,*itr2)); } -Iterators are templated to enable stateful range using std::reference_wrapper. -Please refer to Parallel Reduction for details. +Iterators can be made stateful by using std::reference_wrapper +Please refer to Parallel Reduction for details. + + - + - + typename B @@ -1663,19 +2038,12 @@ The task spawns asynchronous tasks to perform parallel reduction over typename BOP - - typename P - DefaultPartitioner - - - std::enable_if_t< is_partitioner_v< std::decay_t< P >>, void > * - nullptr - Task Task tf::FlowBuilder::inclusive_scan - (B first, E last, D d_first, BOP bop, P part=P()) + (B first, E last, D d_first, BOP bop) inclusive_scan + tf::FlowBuilder::inclusive_scan B first @@ -1692,11 +2060,6 @@ The task spawns asynchronous tasks to perform parallel reduction over BOP bop - - P - part - P() - creates an STL-styled parallel inclusive-scan task @@ -1730,15 +2093,7 @@ The task spawns asynchronous tasks to perform parallel reduction over BOP -summation operator type - - - - -P - - -partitioner type (default tf::DefaultPartitioner) +summation operator type @@ -1771,36 +2126,30 @@ The task spawns asynchronous tasks to perform parallel reduction over bop -function to perform summation - - - - -part - - -partitioning algorithm to schedule parallel iterations +function to perform summation Performs the cumulative sum (aka prefix sum, aka scan) of the input range and writes the result to the output range. Each element of the output range contains the running total of all earlier elements using the given binary operator for summation. This function generates an inclusive scan, meaning that the N-th element of the output range is the sum of the first N input elements, so the N-th input element is included. -std::vector<int>input={1,2,3,4,5}; +std::vector<int>input={1,2,3,4,5}; taskflow.inclusive_scan( -input.begin(),input.end(),input.begin(),std::plus<int>{} +input.begin(),input.end(),input.begin(),std::plus<int>{} ); executor.run(taskflow).wait(); //inputis{1,3,6,10,15} -Iterators are templated to enable stateful range using std::reference_wrapper. -Please refer to Parallel Scan for details. +Iterators can be made stateful by using std::reference_wrapper +Please refer to Parallel Scan for details. 
+ + - + - + typename B @@ -1817,19 +2166,12 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr typename T - - typename P - DefaultPartitioner - - - std::enable_if_t<!is_partitioner_v< std::decay_t< T >>, void > * - nullptr - Task Task tf::FlowBuilder::inclusive_scan - (B first, E last, D d_first, BOP bop, T init, P part=P()) + (B first, E last, D d_first, BOP bop, T init) inclusive_scan + tf::FlowBuilder::inclusive_scan B first @@ -1850,11 +2192,6 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr T init - - P - part - P() - creates an STL-styled parallel inclusive-scan task with an initial value @@ -1896,15 +2233,7 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr T -initial value type - - - - -P - - -partitioner type (default tf::DefaultPartitioner) +initial value type @@ -1945,36 +2274,30 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr init -initial value - - - - -part - - -partitioning algorithm to schedule parallel iterations +initial value Performs the cumulative sum (aka prefix sum, aka scan) of the input range and writes the result to the output range. Each element of the output range contains the running total of all earlier elements (and the initial value) using the given binary operator for summation. This function generates an inclusive scan, meaning the N-th element of the output range is the sum of the first N input elements, so the N-th input element is included. -std::vector<int>input={1,2,3,4,5}; +std::vector<int>input={1,2,3,4,5}; taskflow.inclusive_scan( -input.begin(),input.end(),input.begin(),std::plus<int>{},-1 +input.begin(),input.end(),input.begin(),std::plus<int>{},-1 ); executor.run(taskflow).wait(); //inputis{0,2,5,9,14} -Iterators are templated to enable stateful range using std::reference_wrapper. -Please refer to Parallel Scan for details. +Iterators can be made stateful by using std::reference_wrapper +Please refer to Parallel Scan for details. + + - + - + typename B @@ -1991,15 +2314,12 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr typename BOP - - typename P - DefaultPartitioner - Task Task tf::FlowBuilder::exclusive_scan - (B first, E last, D d_first, T init, BOP bop, P part=P()) + (B first, E last, D d_first, T init, BOP bop) exclusive_scan + tf::FlowBuilder::exclusive_scan B first @@ -2020,11 +2340,6 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr BOP bop - - P - part - P() - creates an STL-styled parallel exclusive-scan task @@ -2066,15 +2381,7 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr BOP -summation operator type - - - - -P - - -partitioner type (default tf::DefaultPartitioner) +summation operator type @@ -2115,36 +2422,30 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr bop -function to perform summation - - - - -part - - -partitioning algorithm to schedule parallel iterations +function to perform summation Performs the cumulative sum (aka prefix sum, aka scan) of the input range and writes the result to the output range. Each element of the output range contains the running total of all earlier elements (and the initial value) using the given binary operator for summation. This function generates an exclusive scan, meaning the N-th element of the output range is the sum of the first N-1 input elements, so the N-th input element is not included. 
-std::vector<int>input={1,2,3,4,5}; +std::vector<int>input={1,2,3,4,5}; taskflow.exclusive_scan( -input.begin(),input.end(),input.begin(),-1,std::plus<int>{} +input.begin(),input.end(),input.begin(),-1,std::plus<int>{} ); executor.run(taskflow).wait(); //inputis{-1,0,2,5,9} -Iterators are templated to enable stateful range using std::reference_wrapper. -Please refer to Parallel Scan for details. +Iterators can be made stateful by using std::reference_wrapper +Please refer to Parallel Scan for details. + + - + - + typename B @@ -2161,19 +2462,12 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr typename UOP - - typename P - DefaultPartitioner - - - std::enable_if_t< is_partitioner_v< std::decay_t< P >>, void > * - nullptr - Task Task tf::FlowBuilder::transform_inclusive_scan - (B first, E last, D d_first, BOP bop, UOP uop, P part=P()) + (B first, E last, D d_first, BOP bop, UOP uop) transform_inclusive_scan + tf::FlowBuilder::transform_inclusive_scan B first @@ -2194,11 +2488,6 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr UOP uop - - P - part - P() - creates an STL-styled parallel transform-inclusive scan task @@ -2240,15 +2529,7 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr UOP -transform operator type - - - - -P - - -partitioner type (default tf::DefaultPartitioner) +transform operator type @@ -2289,37 +2570,31 @@ Performs the cumulative sum (aka prefix sum, aka scan) of the input range and wr uop -function to transform elements of the input range - - - - -part - - -partitioning algorithm to schedule parallel iterations +function to transform elements of the input range Write the cumulative sum (aka prefix sum, aka scan) of the input range to the output range. Each element of the output range contains the running total of all earlier elements using uop to transform the input elements and using bop for summation. This function generates an inclusive scan, meaning the Nth element of the output range is the sum of the first N input elements, so the Nth input element is included. -std::vector<int>input={1,2,3,4,5}; +std::vector<int>input={1,2,3,4,5}; taskflow.transform_inclusive_scan( -input.begin(),input.end(),input.begin(),std::plus<int>{}, +input.begin(),input.end(),input.begin(),std::plus<int>{}, [](intitem){return-item;} ); executor.run(taskflow).wait(); //inputis{-1,-3,-6,-10,-15} -Iterators are templated to enable stateful range using std::reference_wrapper. -Please refer to Parallel Scan for details. +Iterators can be made stateful by using std::reference_wrapper +Please refer to Parallel Scan for details. 
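The inclusive/exclusive distinction documented above, shown side by side on the same input with the same operator (taskflow and executor assumed in scope; the two scan tasks are independent and may run concurrently):

std::vector<int> in = {1, 2, 3, 4, 5};
std::vector<int> inc(5), exc(5);

taskflow.inclusive_scan(in.begin(), in.end(), inc.begin(), std::plus<int>{});
taskflow.exclusive_scan(in.begin(), in.end(), exc.begin(), -1, std::plus<int>{});
executor.run(taskflow).wait();

// inc is { 1, 3, 6, 10, 15}  (N-th output includes the N-th input)
// exc is {-1, 0, 2,  5,  9}  (N-th output excludes the N-th input; -1 is init)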
+ + - + - + typename B @@ -2339,19 +2614,12 @@ Write the cumulative sum (aka prefix sum, aka scan) of the input range to the ou typename T - - typename P - DefaultPartitioner - - - std::enable_if_t<!is_partitioner_v< std::decay_t< T >>, void > * - nullptr - Task Task tf::FlowBuilder::transform_inclusive_scan - (B first, E last, D d_first, BOP bop, UOP uop, T init, P part=P()) + (B first, E last, D d_first, BOP bop, UOP uop, T init) transform_inclusive_scan + tf::FlowBuilder::transform_inclusive_scan B first @@ -2376,11 +2644,6 @@ Write the cumulative sum (aka prefix sum, aka scan) of the input range to the ou T init - - P - part - P() - creates an STL-styled parallel transform-inclusive scan task @@ -2430,15 +2693,7 @@ Write the cumulative sum (aka prefix sum, aka scan) of the input range to the ou T -initial value type - - - - -P - - -partitioner type (default tf::DefaultPartitioner) +initial value type @@ -2487,23 +2742,15 @@ Write the cumulative sum (aka prefix sum, aka scan) of the input range to the ou init -initial value - - - - -part - - -partitioning algorithm to schedule parallel iterations +initial value Write the cumulative sum (aka prefix sum, aka scan) of the input range to the output range. Each element of the output range contains the running total of all earlier elements (including an initial value) using uop to transform the input elements and using bop for summation. This function generates an inclusive scan, meaning the Nth element of the output range is the sum of the first N input elements, so the Nth input element is included. -std::vector<int>input={1,2,3,4,5}; +std::vector<int>input={1,2,3,4,5}; taskflow.transform_inclusive_scan( -input.begin(),input.end(),input.begin(),std::plus<int>{}, +input.begin(),input.end(),input.begin(),std::plus<int>{}, [](intitem){return-item;}, -1 ); @@ -2511,14 +2758,16 @@ Write the cumulative sum (aka prefix sum, aka scan) of the input range to the ou //inputis{-2,-4,-7,-11,-16} -Iterators are templated to enable stateful range using std::reference_wrapper. -Please refer to Parallel Scan for details. +Iterators can be made stateful by using std::reference_wrapper +Please refer to Parallel Scan for details. + + - + - + typename B @@ -2538,15 +2787,12 @@ Write the cumulative sum (aka prefix sum, aka scan) of the input range to the ou typename UOP - - typename P - DefaultPartitioner - Task Task tf::FlowBuilder::transform_exclusive_scan - (B first, E last, D d_first, T init, BOP bop, UOP uop, P part=P()) + (B first, E last, D d_first, T init, BOP bop, UOP uop) transform_exclusive_scan + tf::FlowBuilder::transform_exclusive_scan B first @@ -2571,11 +2817,6 @@ Write the cumulative sum (aka prefix sum, aka scan) of the input range to the ou UOP uop - - P - part - P() - creates an STL-styled parallel transform-exclusive scan task @@ -2625,15 +2866,7 @@ Write the cumulative sum (aka prefix sum, aka scan) of the input range to the ou T -initial value type - - - - -P - - -partitioner type (default tf::DefaultPartitioner) +initial value type @@ -2682,35 +2915,29 @@ Write the cumulative sum (aka prefix sum, aka scan) of the input range to the ou init -initial value - - - - -part - - -partitioning algorithm to schedule parallel iterations +initial value Write the cumulative sum (aka prefix sum, aka scan) of the input range to the output range. Each element of the output range contains the running total of all earlier elements (including an initial value) using uop to transform the input elements and using bop for summation. 
This function generates an exclusive scan, meaning the Nth element of the output range is the sum of the first N-1 input elements, so the Nth input element is not included. -std::vector<int>input={1,2,3,4,5}; +std::vector<int>input={1,2,3,4,5}; taskflow.transform_exclusive_scan( -input.begin(),input.end(),input.begin(),-1,std::plus<int>{}, +input.begin(),input.end(),input.begin(),-1,std::plus<int>{}, [](intitem){return-item;} ); executor.run(taskflow).wait(); //inputis{-1,-2,-4,-7,-11} -Iterators are templated to enable stateful range using std::reference_wrapper. -Please refer to Parallel Scan for details. +Iterators can be made stateful by using std::reference_wrapper +Please refer to Parallel Scan for details. + + - + @@ -2728,13 +2955,14 @@ Write the cumulative sum (aka prefix sum, aka scan) of the input range to the ou typename P - DefaultPartitioner + DefaultPartitioner Task Task tf::FlowBuilder::find_if (B first, E last, T &result, UOP predicate, P part=P()) find_if + tf::FlowBuilder::find_if B first @@ -2838,7 +3066,7 @@ Write the cumulative sum (aka prefix sum, aka scan) of the input range to the ou part -partitioning algorithm (default tf::DefaultPartitioner) +partitioning algorithm (default tf::DefaultPartitioner) @@ -2853,19 +3081,19 @@ Returns an iterator to the first element in the range [first, la } For example, the code below find the element that satisfies the given criteria (value plus one is equal to 23) from an input range of 10 elements: -std::vector<int>input={1,6,9,10,22,5,7,8,9,11}; -std::vector<int>::iteratorresult; +std::vector<int>input={1,6,9,10,22,5,7,8,9,11}; +std::vector<int>::iteratorresult; taskflow.find_if( input.begin(),input.end(),[](inti){returni+1=23;},result ); executor.run(taskflow).wait(); assert(*result==22); -Iterators are templated to enable stateful range using std::reference_wrapper. +Iterators can be made stateful by using std::reference_wrapper - + @@ -2883,13 +3111,14 @@ Returns an iterator to the first element in the range [first, la typename P - DefaultPartitioner + DefaultPartitioner Task Task tf::FlowBuilder::find_if_not (B first, E last, T &result, UOP predicate, P part=P()) find_if_not + tf::FlowBuilder::find_if_not B first @@ -2993,7 +3222,7 @@ Returns an iterator to the first element in the range [first, la part -partitioning algorithm (default tf::DefaultPartitioner) +partitioning algorithm (default tf::DefaultPartitioner) @@ -3008,19 +3237,19 @@ Returns an iterator to the first element in the range [first, la } For example, the code below find the element that satisfies the given criteria (value is not equal to 1) from an input range of 10 elements: -std::vector<int>input={1,1,1,1,22,1,1,1,1,1}; -std::vector<int>::iteratorresult; +std::vector<int>input={1,1,1,1,22,1,1,1,1,1}; +std::vector<int>::iteratorresult; taskflow.find_if_not( input.begin(),input.end(),[](inti){returni==1;},result ); executor.run(taskflow).wait(); assert(*result==22); -Iterators are templated to enable stateful range using std::reference_wrapper. 
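A corrected version of the find_if example above: the rendered predicate uses = where == is intended, and the declared signature takes the result iterator before the predicate while the rendered example swaps them. This sketch follows the declared signature, with taskflow and executor assumed in scope:

std::vector<int> input = {1, 6, 9, 10, 22, 5, 7, 8, 9, 11};
std::vector<int>::iterator result;

taskflow.find_if(
  input.begin(), input.end(),
  result,                            // receives an iterator to the first match
  [](int i){ return i + 1 == 23; }   // predicate: value plus one equals 23
);

executor.run(taskflow).wait();
assert(*result == 22);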
+Iterators can be made stateful by using std::reference_wrapper - + @@ -3044,6 +3273,7 @@ Returns an iterator to the first element in the range [first, la Task tf::FlowBuilder::min_element (B first, E last, T &result, C comp, P part) min_element + tf::FlowBuilder::min_element B first @@ -3146,7 +3376,7 @@ Returns an iterator to the first element in the range [first, la part -partitioning algorithm (default tf::DefaultPartitioner) +partitioning algorithm (default tf::DefaultPartitioner) @@ -3164,19 +3394,19 @@ Finds the smallest element in the [first, last) returnsmallest; For example, the code below find the smallest element from an input range of 10 elements. -std::vector<int>input={1,1,1,1,1,-1,1,1,1,1}; -std::vector<int>::iteratorresult; +std::vector<int>input={1,1,1,1,1,-1,1,1,1,1}; +std::vector<int>::iteratorresult; taskflow.min_element( -input.begin(),input.end(),std::less<int>(),result +input.begin(),input.end(),std::less<int>(),result ); executor.run(taskflow).wait(); assert(*result==-1); -Iterators are templated to enable stateful range using std::reference_wrapper. +Iterators can be made stateful by using std::reference_wrapper - + @@ -3200,6 +3430,7 @@ Finds the smallest element in the [first, last) Task tf::FlowBuilder::max_element (B first, E last, T &result, C comp, P part) max_element + tf::FlowBuilder::max_element B first @@ -3302,7 +3533,7 @@ Finds the smallest element in the [first, last) part -partitioning algorithm (default tf::DefaultPartitioner) +partitioning algorithm (default tf::DefaultPartitioner) @@ -3320,19 +3551,19 @@ Finds the largest element in the [first, last) returnlargest; For example, the code below find the largest element from an input range of 10 elements. -std::vector<int>input={1,1,1,1,1,2,1,1,1,1}; -std::vector<int>::iteratorresult; +std::vector<int>input={1,1,1,1,1,2,1,1,1,1}; +std::vector<int>::iteratorresult; taskflow.max_element( -input.begin(),input.end(),std::less<int>(),result +input.begin(),input.end(),std::less<int>(),result ); executor.run(taskflow).wait(); assert(*result==2); -Iterators are templated to enable stateful range using std::reference_wrapper. +Iterators can be made stateful by using std::reference_wrapper - + @@ -3350,6 +3581,7 @@ Finds the largest element in the [first, last) Task tf::FlowBuilder::sort (B first, E last, C cmp) sort + tf::FlowBuilder::sort B first @@ -3417,12 +3649,14 @@ Finds the largest element in the [first, last) The task spawns asynchronous tasks to sort elements in the range [first, last) in parallel. -Iterators are templated to enable stateful range using std::reference_wrapper. -Please refer to Parallel Sort for details. +Iterators can be made stateful by using std::reference_wrapper +Please refer to Parallel Sort for details. + + - + @@ -3437,6 +3671,7 @@ The task spawns asynchronous tasks to sort elements in the range Task tf::FlowBuilder::sort (B first, E last) sort + tf::FlowBuilder::sort B first @@ -3484,15 +3719,17 @@ The task spawns asynchronous tasks to sort elements in the range The task spawns asynchronous tasks to parallel sort elements in the range [first, last) using the std::less<T> comparator, where T is the dereferenced iterator type. -Iterators are templated to enable stateful range using std::reference_wrapper. -Please refer to Parallel Sort for details. +Iterators can be made stateful by using std::reference_wrapper +Please refer to Parallel Sort for details. 
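Minimal usage of the two sort overloads documented above (taskflow and executor assumed in scope; tf::Taskflow::clear resets the graph between runs):

std::vector<int> data = {5, 1, 4, 2, 3};

// overload with a custom comparator: sorts in descending order
taskflow.sort(data.begin(), data.end(), std::greater<int>{});
executor.run(taskflow).wait();
// data is {5, 4, 3, 2, 1}

// default overload: sorts ascending with std::less<int>
taskflow.clear();
taskflow.sort(data.begin(), data.end());
executor.run(taskflow).wait();
// data is {1, 2, 3, 4, 5}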
+ + - + - - + + @@ -3503,6 +3740,7 @@ The task spawns asynchronous tasks to parallel sort elements in the range void tf::FlowBuilder::_linearize (L &) _linearize + tf::FlowBuilder::_linearize L & keys @@ -3513,9 +3751,9 @@ The task spawns asynchronous tasks to parallel sort elements in the range - + - + class to build a task dependency graph @@ -3541,6 +3779,9 @@ The task spawns asynchronous tasks to parallel sort elements in the range + + + @@ -3551,9 +3792,11 @@ The task spawns asynchronous tasks to parallel sort elements in the range + + - + tf::FlowBuilder_graph tf::FlowBuilder_linearize @@ -3562,30 +3805,33 @@ The task spawns asynchronous tasks to parallel sort elements in the range tf::FlowBuilderemplace tf::FlowBuilderemplace tf::FlowBuilderemplace + tf::FlowBuilderemplace tf::FlowBuilderemplace tf::FlowBuildererase - tf::FlowBuilderexclusive_scan + tf::FlowBuilderexclusive_scan tf::FlowBuilderExecutor tf::FlowBuilderfind_if tf::FlowBuilderfind_if_not tf::FlowBuilderFlowBuilder tf::FlowBuilderfor_each + tf::FlowBuilderfor_each_by_index tf::FlowBuilderfor_each_index - tf::FlowBuilderinclusive_scan - tf::FlowBuilderinclusive_scan + tf::FlowBuilderinclusive_scan + tf::FlowBuilderinclusive_scan tf::FlowBuilderlinearize tf::FlowBuilderlinearize tf::FlowBuildermax_element tf::FlowBuildermin_element tf::FlowBuilderplaceholder tf::FlowBuilderreduce + tf::FlowBuilderreduce_by_index tf::FlowBuildersort tf::FlowBuildersort tf::FlowBuildertransform tf::FlowBuildertransform - tf::FlowBuildertransform_exclusive_scan - tf::FlowBuildertransform_inclusive_scan - tf::FlowBuildertransform_inclusive_scan + tf::FlowBuildertransform_exclusive_scan + tf::FlowBuildertransform_inclusive_scan + tf::FlowBuildertransform_inclusive_scan tf::FlowBuildertransform_reduce tf::FlowBuildertransform_reduce diff --git a/docs/xml/classtf_1_1Future.xml b/docs/xml/classtf_1_1Future.xml index 2374d5384..5a34a5493 100644 --- a/docs/xml/classtf_1_1Future.xml +++ b/docs/xml/classtf_1_1Future.xml @@ -1,20 +1,21 @@ - + tf::Future std::future< T > - taskflow.hpp + taskflow/core/taskflow.hpp typename T - + class friend class Executor Executor + tf::Future::Executor Executor @@ -24,13 +25,14 @@ - + class friend class Subflow Subflow + tf::Future::Subflow Subflow @@ -40,13 +42,14 @@ - + class friend class Runtime Runtime + tf::Future::Runtime Runtime @@ -56,30 +59,32 @@ - + - - + + - std::weak_ptr< Topology > + std::weak_ptr< Topology > std::weak_ptr<Topology> tf::Future< T >::_topology _topology + tf::Future::_topology - + - - + + tf::Future< T >::Future ()=default Future + tf::Future::Future default constructor @@ -87,13 +92,14 @@ - + tf::Future< T >::Future (const Future &)=delete Future + tf::Future::Future const Future & @@ -104,13 +110,14 @@ - + tf::Future< T >::Future (Future &&)=default Future + tf::Future::Future Future && @@ -121,13 +128,14 @@ - + - + Future & - Future& tf::Future< T >::operator= + Future & tf::Future< T >::operator= (const Future &)=delete operator= + tf::Future::operator= const Future & @@ -138,13 +146,14 @@ - + - + Future & - Future& tf::Future< T >::operator= + Future & tf::Future< T >::operator= (Future &&)=default operator= + tf::Future::operator= Future && @@ -155,40 +164,56 @@ - + bool bool tf::Future< T >::cancel () cancel + tf::Future::cancel cancels the execution of the running taskflow associated with this future object true if the execution can be cancelled or false if the execution has already completed -When you request a cancellation, the executor will stop scheduling any tasks onwards. 
Tasks that are already running will continue to finish (non-preemptive). You can call tf::Future::wait to wait for the cancellation to complete. +When you request a cancellation, the executor will stop scheduling any tasks onwards. Tasks that are already running will continue to finish as their executions are non-preemptive. You can call tf::Future::wait to wait for the cancellation to complete.
+// create a taskflow of four tasks and submit it to an executor
+taskflow.emplace(
+  [](){ std::cout << "Task A\n"; },
+  [](){ std::cout << "Task B\n"; },
+  [](){ std::cout << "Task C\n"; },
+  [](){ std::cout << "Task D\n"; }
+);
+auto future = executor.run(taskflow);
+
+// cancel the execution of the taskflow and wait until it finishes all running tasks
+future.cancel();
+future.wait();
+
+In the above example, we submit a taskflow of four tasks to the executor and then issue a cancellation to stop its execution. Because the cancellation races with the executor runtime, any number of the tasks, from none to all four, may have completed by the time it takes effect. - + - - + + tf::Future< T >::Future (std::future< T > &&, std::weak_ptr< Topology >=std::weak_ptr< Topology >()) Future + tf::Future::Future - std::future< T > && + std::future< T > && f - std::weak_ptr< Topology > + std::weak_ptr< Topology > p - std::weak_ptr< Topology >() + std::weak_ptr< Topology >() @@ -196,20 +221,20 @@ - + class to access the result of an execution -tf::Future is a derived class from std::future that will eventually hold the execution result of a submitted taskflow (tf::Executor::run) In addition to the base methods inherited from std::future, you can call tf::Future::cancel to cancel the execution of the running taskflow associated with this future object. The following example cancels a submission of a taskflow that contains 1000 tasks each running one second. +tf::Future is a derived class from std::future that will eventually hold the execution result of a submitted taskflow (tf::Executor::run series). In addition to the base methods inherited from std::future, you can call tf::Future::cancel to cancel the execution of the running taskflow associated with this future object. The following example cancels a submission of a taskflow that contains 1000 tasks each running one second.
tf::Executorexecutor; tf::Taskflowtaskflow; for(inti=0;i<1000;i++){ taskflow.emplace([](){ -std::this_thread::sleep_for(std::chrono::seconds(1)); +std::this_thread::sleep_for(std::chrono::seconds(1)); }); } @@ -245,7 +270,7 @@ When you request a cancellation, the executor will stop scheduling any tasks onw - + tf::Future_topology tf::Futurecancel @@ -254,8 +279,8 @@ When you request a cancellation, the executor will stop scheduling any tasks onw tf::FutureFuture tf::FutureFuture tf::FutureFuture - tf::Futureoperator= - tf::Futureoperator= + tf::Futureoperator= + tf::Futureoperator= tf::FutureRuntime tf::FutureSubflow diff --git a/docs/xml/classtf_1_1Graph.xml b/docs/xml/classtf_1_1Graph.xml index e6bb18cc3..5495f1630 100644 --- a/docs/xml/classtf_1_1Graph.xml +++ b/docs/xml/classtf_1_1Graph.xml @@ -1,14 +1,16 @@ - + tf::Graph - graph.hpp - + std::vector< std::unique_ptr< Node > > + taskflow/core/graph.hpp + class friend class Node Node + tf::Graph::Node Node @@ -18,13 +20,14 @@ - + class friend class FlowBuilder FlowBuilder + tf::Graph::FlowBuilder FlowBuilder @@ -34,13 +37,14 @@ - + class friend class Subflow Subflow + tf::Graph::Subflow Subflow @@ -50,13 +54,14 @@ - + class friend class Taskflow Taskflow + tf::Graph::Taskflow Taskflow @@ -66,13 +71,14 @@ - + class friend class Executor Executor + tf::Graph::Executor Executor @@ -82,30 +88,16 @@ - + - - - - std::vector< Node * > - std::vector<Node*> tf::Graph::_nodes - - _nodes - - - - - - - - - - + + tf::Graph::Graph ()=default Graph + tf::Graph::Graph constructs a graph object @@ -113,13 +105,14 @@ - + tf::Graph::Graph (const Graph &)=delete Graph + tf::Graph::Graph const Graph & @@ -130,16 +123,16 @@ - + - + tf::Graph::Graph - (Graph &&) + (Graph &&)=default Graph + tf::Graph::Graph Graph && - other constructs a graph using move semantics @@ -148,27 +141,14 @@ - + - - - tf::Graph::~Graph - () - ~Graph - -destructs the graph object - - - - - - - - + Graph & - Graph& tf::Graph::operator= + Graph & tf::Graph::operator= (const Graph &)=delete operator= + tf::Graph::operator= const Graph & @@ -179,16 +159,16 @@ - + - + Graph & Graph & tf::Graph::operator= - (Graph &&) + (Graph &&)=default operator= + tf::Graph::operator= Graph && - other assigns a graph using move semantics @@ -197,100 +177,16 @@ - - - - bool - bool tf::Graph::empty - () const - empty - -queries if the graph is empty - - - - - - - - - size_t - size_t tf::Graph::size - () const - size - -queries the number of nodes in the graph - - - - - - - - - void - void tf::Graph::clear - () - clear - -clears the graph - - - - - - - - - - - void - void tf::Graph::_clear - () - _clear - - - - - - - - - - void - void tf::Graph::_clear_detached - () - _clear_detached - - - - - - - - - - void - void tf::Graph::_merge - (Graph &&) - _merge - - Graph && - g - - - - - - - - + + + void void tf::Graph::_erase (Node *) _erase + tf::Graph::_erase Node * node @@ -301,9 +197,9 @@ - + - + typename ... @@ -312,9 +208,10 @@ Node * - Node* tf::Graph::_emplace_back + Node * tf::Graph::_emplace_back (ArgsT &&...) _emplace_back + tf::Graph::_emplace_back ArgsT && ... @@ -325,9 +222,9 @@ - + - + typename ... @@ -336,9 +233,10 @@ Node * - Node* tf::Graph::_emplace_back + Node * tf::Graph::_emplace_back (ArgsT &&... args) _emplace_back + tf::Graph::_emplace_back ArgsT &&... args @@ -349,9 +247,9 @@ - + - + class to create a graph object @@ -359,29 +257,43 @@ A graph is the ultimate storage for a task dependency graph and is the main gateway to interact with an executor. 
A graph manages a set of nodes in a global object pool that animates and recycles node objects efficiently without going through repetitive and expensive memory allocations and deallocations. This class is mainly used for creating an opaque graph object in a custom class to interact with the executor through taskflow composition. A graph object is move-only. - + + + + + + + + + + + + + + + + + + + + + + + - tf::Graph_clear - tf::Graph_clear_detached - tf::Graph_emplace_back - tf::Graph_emplace_back + tf::Graph_emplace_back + tf::Graph_emplace_back tf::Graph_erase - tf::Graph_merge - tf::Graph_nodes - tf::Graphclear - tf::Graphempty tf::GraphExecutor tf::GraphFlowBuilder tf::GraphGraph tf::GraphGraph - tf::GraphGraph + tf::GraphGraph tf::GraphNode - tf::Graphoperator= - tf::Graphoperator= - tf::Graphsize + tf::Graphoperator= + tf::Graphoperator= tf::GraphSubflow tf::GraphTaskflow - tf::Graph~Graph diff --git a/docs/xml/classtf_1_1GuidedPartitioner.xml b/docs/xml/classtf_1_1GuidedPartitioner.xml index 5feecffd6..6821fdd00 100644 --- a/docs/xml/classtf_1_1GuidedPartitioner.xml +++ b/docs/xml/classtf_1_1GuidedPartitioner.xml @@ -1,21 +1,22 @@ - + tf::GuidedPartitioner tf::PartitionerBase< DefaultClosureWrapper > - partitioner.hpp + taskflow/algorithm/partitioner.hpp typename C - DefaultClosureWrapper + DefaultClosureWrapper - + - constexpr PartitionerType + PartitionerType static constexpr PartitionerType tf::GuidedPartitioner< C >::type () type + tf::GuidedPartitioner::type queries the partition type (dynamic) @@ -23,15 +24,16 @@ - + - - + + tf::GuidedPartitioner< C >::GuidedPartitioner ()=default GuidedPartitioner + tf::GuidedPartitioner::GuidedPartitioner default constructor @@ -39,13 +41,14 @@ - + tf::GuidedPartitioner< C >::GuidedPartitioner (size_t sz) GuidedPartitioner + tf::GuidedPartitioner::GuidedPartitioner size_t sz @@ -57,13 +60,14 @@ - + tf::GuidedPartitioner< C >::GuidedPartitioner (size_t sz, C &&closure) GuidedPartitioner + tf::GuidedPartitioner::GuidedPartitioner size_t sz @@ -79,10 +83,10 @@ - + - - + + @@ -97,6 +101,7 @@ void tf::GuidedPartitioner< C >::loop (size_t N, size_t W, std::atomic< size_t > &next, F &&func) const loop + tf::GuidedPartitioner::loop size_t N @@ -106,7 +111,7 @@ W - std::atomic< size_t > & + std::atomic< size_t > & next @@ -119,7 +124,7 @@ - + @@ -135,6 +140,7 @@ void tf::GuidedPartitioner< C >::loop_until (size_t N, size_t W, std::atomic< size_t > &next, F &&func) const loop_until + tf::GuidedPartitioner::loop_until size_t N @@ -144,7 +150,7 @@ W - std::atomic< size_t > & + std::atomic< size_t > & next @@ -157,11 +163,11 @@ - + - + -class to construct a guided partitioner for scheduling parallel algorithms +class to create a guided partitioner for scheduling parallel algorithms @@ -169,17 +175,18 @@ C -closure wrapper type (default tf::DefaultClosureWrapper) +closure wrapper type (default tf::DefaultClosureWrapper) -The size of a partition is proportional to the number of unassigned iterations divided by the number of workers, and the size will gradually decrease to the given chunk size. The last partition may be smaller than the chunk size. + +The size of a partition is proportional to the number of unassigned iterations divided by the number of workers, and the size will gradually decrease to the given chunk size. The last partition may be smaller than the chunk size. In addition to partition size, the application can specify a closure wrapper for a guided partitioner. 
A closure wrapper allows the application to wrapper a partitioned task (i.e., closure) with a custom function object that performs additional tasks. For example: -std::atomic<int>count=0; +std::atomic<int>count=0; tf::Taskflowtaskflow; taskflow.for_each_index(0,100,1, [](){ -printf("%d\n",i); +printf("%d\n",i); }, tf::GuidedPartitioner(0,[](auto&&closure){ //dosomethingbeforeinvokingthepartitionedtask @@ -196,15 +203,15 @@ The size of a partition is proportional to the number of unassigned iterations d - - - + + + @@ -213,8 +220,9 @@ The size of a partition is proportional to the number of unassigned iterations d - - + + + @@ -222,27 +230,36 @@ The size of a partition is proportional to the number of unassigned iterations d + + + + + _closure_wrapper + - + tf::GuidedPartitioner_chunk_size tf::GuidedPartitioner_closure_wrapper tf::GuidedPartitionerchunk_size tf::GuidedPartitionerchunk_size - tf::GuidedPartitionerclosure_wrapper + tf::GuidedPartitionerclosure_wrapper + tf::GuidedPartitionerclosure_wrapper tf::GuidedPartitionerclosure_wrapper tf::GuidedPartitionerclosure_wrapper_type tf::GuidedPartitionerGuidedPartitioner tf::GuidedPartitionerGuidedPartitioner tf::GuidedPartitionerGuidedPartitioner + tf::GuidedPartitioneris_default_wrapper_v tf::GuidedPartitionerloop tf::GuidedPartitionerloop_until + tf::GuidedPartitioneroperator() tf::GuidedPartitionerPartitionerBase tf::GuidedPartitionerPartitionerBase tf::GuidedPartitionerPartitionerBase diff --git a/docs/xml/classtf_1_1IndexRange.xml b/docs/xml/classtf_1_1IndexRange.xml new file mode 100644 index 000000000..dd6037dcc --- /dev/null +++ b/docs/xml/classtf_1_1IndexRange.xml @@ -0,0 +1,371 @@ + + + + tf::IndexRange + taskflow/utility/iterator.hpp + + + typename T + + + + + T + using tf::IndexRange< T >::index_type = T + + index_type + tf::IndexRange::index_type + +alias for the index type used in the range + + + + + + + + + + + T + T tf::IndexRange< T >::_beg + + _beg + tf::IndexRange::_beg + + + + + + + + + + T + T tf::IndexRange< T >::_end + + _end + tf::IndexRange::_end + + + + + + + + + + T + T tf::IndexRange< T >::_step_size + + _step_size + tf::IndexRange::_step_size + + + + + + + + + + + + + tf::IndexRange< T >::IndexRange + ()=default + IndexRange + tf::IndexRange::IndexRange + +constructs an index range object without any initialization + + + + + + + + + + tf::IndexRange< T >::IndexRange + (T beg, T end, T step_size) + IndexRange + tf::IndexRange::IndexRange + + T + beg + + + T + end + + + T + step_size + + +constructs an IndexRange object + + + + +beg + + +starting index of the range + + + + +end + + +ending index of the range (exclusive) + + + + +step_size + + +step size between consecutive indices in the range + + + + + + + + + + + T + T tf::IndexRange< T >::begin + () const + begin + tf::IndexRange::begin + +queries the starting index of the range + + + + + + + + + T + T tf::IndexRange< T >::end + () const + end + tf::IndexRange::end + +queries the ending index of the range + + + + + + + + + T + T tf::IndexRange< T >::step_size + () const + step_size + tf::IndexRange::step_size + +queries the step size of the range + + + + + + + + + IndexRange< T > & + IndexRange< T > & tf::IndexRange< T >::reset + (T begin, T end, T step_size) + reset + tf::IndexRange::reset + + T + begin + + + T + end + + + T + step_size + + +updates the range with the new starting index, ending index, and step size + + + + + + + + + IndexRange< T > & + IndexRange< T > & tf::IndexRange< T >::begin + (T new_begin) + begin + tf::IndexRange::begin + + T + 
new_begin + + +updates the starting index of the range + + + + + + + + + IndexRange< T > & + IndexRange< T > & tf::IndexRange< T >::end + (T new_end) + end + tf::IndexRange::end + + T + new_end + + +updates the ending index of the range + + + + + + + + + IndexRange< T > & + IndexRange< T > & tf::IndexRange< T >::step_size + (T new_step_size) + step_size + tf::IndexRange::step_size + + T + new_step_size + + +updates the step size of the range + + + + + + + + + size_t + size_t tf::IndexRange< T >::size + () const + size + tf::IndexRange::size + +queries the number of elements in the range + + +The number of elements is equivalent to the number of iterations in the range. For instance, the range [0, 10) with step size of 2 will iterate five elements, 0, 2, 4, 6, and 8. + + + + + + + IndexRange + IndexRange tf::IndexRange< T >::discrete_domain + (size_t part_beg, size_t part_end) const + discrete_domain + tf::IndexRange::discrete_domain + + size_t + part_beg + + + size_t + part_end + + +returns a range from the given discrete domain + + + + +part_beg + + +starting index of the discrete domain + + + + +part_end + + +ending index of the discrete domain + + + +a new IndexRange object representing the given discrete domain + +The discrete domain of a range refers to a counter-based sequence indexed from 0 to N, where N is the size (i.e., number of iterated elements) of the range. For example, a discrete domain of the range [0, 10) with a step size of 2 corresponds to the sequence 0, 1, 2, 3, and 4, which map to the range elements 0, 2, 4, 6, and 8. +For a partitioned domain [part_beg, part_end), this function returns the corresponding range. For instance, the partitioned domain [2, 5) for the above example returns the range [4, 10) with the same step size of 2. +Users must ensure the specified domain is valid with respect to the range. + + + + + + + + + +class to create an index range of integral indices with a step size + + +This class provides functionality for managing a range of indices, where the range is defined by a starting index, an ending index, and a step size. The indices must be of an integral type. For example, the range [0, 10) with a step size 2 represents the five elements, 0, 2, 4, 6, and 8. + + +T + + +the integral type of the indices + + + +It is user's responsibility to ensure the given range is valid. 
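A short sketch exercising tf::IndexRange's size() and discrete_domain() exactly as described above:

// the range [0, 10) with a step size of 2: elements 0, 2, 4, 6, 8
tf::IndexRange<int> range(0, 10, 2);
assert(range.size() == 5);

// the discrete domain [2, 5) selects counters 2, 3, 4, which map back
// to the elements 4, 6, 8, i.e., the subrange [4, 10) with step size 2
tf::IndexRange<int> sub = range.discrete_domain(2, 5);
assert(sub.begin() == 4 && sub.end() == 10 && sub.step_size() == 2);

// setters allow in-place adjustment
range.reset(0, 17, 2);   // now iterates 0, 2, ..., 16
assert(range.size() == 9);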
diff --git a/docs/xml/classtf_1_1Node.xml b/docs/xml/classtf_1_1Node.xml
index 933a71e06..a3ba0f994 100644
--- a/docs/xml/classtf_1_1Node.xml
+++ b/docs/xml/classtf_1_1Node.xml

Updated Doxygen page for the internal class tf::Node. The recoverable changes:

- A new inner handle tf::Node::Runtime joins Async, Condition, DependentAsync, Module, MultiCondition, Semaphores, Static, and Subflow, and handle_t becomes std::variant<Placeholder, Static, Runtime, Subflow, Condition, MultiCondition, Module, Async, DependentAsync>.
- The AsyncState enum (UNFINISHED, LOCKED, FINISHED) and the constexpr state flags CONDITIONED, DETACHED, ACQUIRED, READY, and EXCEPTION are removed; node and execution states are now tracked by nstate_t _nstate{NSTATE::NONE} and std::atomic<estate_t> _estate{ESTATE::NONE}. The variant-index constants PLACEHOLDER, STATIC, RUNTIME (new), SUBFLOW, CONDITION, MULTI_CONDITION, MODULE, ASYNC, and DEPENDENT_ASYNC remain defined via get_index_v.
- New friend classes AnchorGuard and PreemptionGuard join Graph, Task, AsyncTask, TaskView, Taskflow, Executor, FlowBuilder, Subflow, and Runtime.
- The separate edge lists _successors and _dependents are merged into a single SmallVector<Node*, 4> _edges with a companion counter _num_successors; TF_ENABLE_POOLABLE_ON_THIS, _priority, and the explicit destructor ~Node() are removed.
- The constructors now take (nstate_t, estate_t, const TaskParams& or const DefaultTaskParams&, Topology*, Node*, size_t, Args&&...) in place of the former name/priority-based overloads.
- Query methods are renamed for consistency: num_dependents becomes num_predecessors, and num_strong_dependents and num_weak_dependents become num_strong_dependencies and num_weak_dependencies; num_successors and name are unchanged.
- Helper methods are reworked: _process_exception becomes _rethrow_exception, _release_all now takes a SmallVector<Node*>& argument instead of returning one, and _is_preempted, _remove_successors, and _remove_predecessors are added alongside _is_cancelled, _is_conditioner, _acquire_all, _precede, and _set_up_join_counter.
diff --git a/docs/xml/classtf_1_1ObserverInterface.xml b/docs/xml/classtf_1_1ObserverInterface.xml
index 1618e2f7f..7e4ce906f 100644
--- a/docs/xml/classtf_1_1ObserverInterface.xml
+++ b/docs/xml/classtf_1_1ObserverInterface.xml

Updated Doxygen page for tf::ObserverInterface, the class to derive an executor observer (derived classes: tf::ChromeObserver and tf::TFProfObserver). The include path changes from observer.hpp to taskflow/core/observer.hpp, and fully qualified names are added to the virtual destructor and the pure virtual methods set_up(size_t num_workers), on_entry(WorkerView wv, TaskView task_view), and on_exit(WorkerView wv, TaskView task_view). The embedded usage example reads:

struct MyObserver : public tf::ObserverInterface {

  MyObserver(const std::string& name) {
    std::cout << "constructing observer " << name << '\n';
  }

  void set_up(size_t num_workers) override final {
    std::cout << "setting up observer with " << num_workers << " workers\n";
  }

  void on_entry(WorkerView w, tf::TaskView tv) override final {
    std::ostringstream oss;
    oss << "worker " << w.id() << " ready to run " << tv.name() << '\n';
    std::cout << oss.str();
  }

  void on_exit(WorkerView w, tf::TaskView tv) override final {
    std::ostringstream oss;
    oss << "worker " << w.id() << " finished running " << tv.name() << '\n';
    std::cout << oss.str();
  }
};

// ...

// create a custom observer
std::shared_ptr<MyObserver> observer = executor.make_observer<MyObserver>("MyObserver");

// run the taskflow
executor.run(taskflow).wait();

diff --git a/docs/xml/classtf_1_1PartitionerBase.xml b/docs/xml/classtf_1_1PartitionerBase.xml
index 7dd590373..eea3be2a4 100644
--- a/docs/xml/classtf_1_1PartitionerBase.xml
+++ b/docs/xml/classtf_1_1PartitionerBase.xml

Updated Doxygen page for tf::PartitionerBase<C>, the class to derive a partitioner for scheduling parallel algorithms (the template parameter C is the closure wrapper type and defaults to DefaultClosureWrapper; the include path is now taskflow/algorithm/partitioner.hpp). The class provides base methods to derive a partitioner that can be used to schedule parallel iterations. The recoverable changes and documentation:

- New static member is_default_wrapper_v = std::is_same_v<C, DefaultClosureWrapper>, indicating whether the given closure wrapper is the default (i.e., empty) wrapper.
- Existing members: closure_wrapper_type (the closure type), _chunk_size{0}, _closure_wrapper, a default constructor, and constructors taking (size_t chunk_size) and (size_t chunk_size, C&& closure_wrapper).
- chunk_size() queries the chunk size; chunk_size(size_t cz) updates it.
- closure_wrapper() now has both a const overload (immutable access) and a new mutable overload, in addition to closure_wrapper(F&& fn) for assigning a new wrapper.
- A new member template TF_FORCE_INLINE decltype(auto) operator()(F&& callable) wraps the given callable with the associated closure wrapper.

Depending on the application, the partitioning algorithm can substantially impact performance. If a parallel-iteration workload contains a regular work unit per iteration, tf::StaticPartitioner can deliver the best performance; if the work unit per iteration is irregular and unbalanced, tf::GuidedPartitioner or tf::DynamicPartitioner can outperform tf::StaticPartitioner. In most situations, tf::GuidedPartitioner delivers decent performance and is therefore used as the default partitioner. Passing a partition size of 0 lets the Taskflow runtime automatically determine the partition size for the given partitioner. In addition to the partition size, the application can specify a closure wrapper that wraps a partitioned task (i.e., closure) with a custom function object performing additional tasks. For example:

std::atomic<int> count = 0;
tf::Taskflow taskflow;
taskflow.for_each_index(0, 100, 1,
  [](int i) { printf("%d\n", i); },
  tf::StaticPartitioner(0, [](auto&& closure) {
    // do something before invoking the partitioned task
    // ...
    closure();
    // do something after invoking the partitioned task
    // ...
  })
);
executor.run(taskflow).wait();

The default closure wrapper (tf::DefaultClosureWrapper) does nothing but invoke the partitioned task (closure).
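As a concrete illustration of the wrapper hook, the hedged sketch below times each partitioned task with std::chrono; the partitioner choice and the timing body are illustrative, not part of the documented API:

#include <taskflow/taskflow.hpp>
#include <chrono>
#include <cstdio>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  taskflow.for_each_index(0, 100, 1,
    [](int i) { std::printf("%d\n", i); },
    // chunk size 0 lets the runtime pick; the wrapper times each closure
    tf::StaticPartitioner(0, [](auto&& closure) {
      auto beg = std::chrono::steady_clock::now();
      closure();  // invoke the partitioned task
      auto end = std::chrono::steady_clock::now();
      auto us = std::chrono::duration_cast<std::chrono::microseconds>(end - beg).count();
      std::printf("partition took %lld us\n", static_cast<long long>(us));
    })
  );

  executor.run(taskflow).wait();

  return 0;
}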
diff --git a/docs/xml/classtf_1_1Pipe.xml b/docs/xml/classtf_1_1Pipe.xml
index 8a1da7006..4d2ca7adf 100644
--- a/docs/xml/classtf_1_1Pipe.xml
+++ b/docs/xml/classtf_1_1Pipe.xml

Updated Doxygen page for tf::Pipe<C>, the class to create a pipe object for a pipeline stage (C defaults to std::function<void(tf::Pipeflow&)>; the include path is now taskflow/algorithm/pipeline.hpp). The documented API:

- callable_t: alias of the callable type.
- Friend classes Pipeline and ScalablePipeline; private members _type and _callable.
- Pipe() is the default constructor; Pipe(PipeType d, C&& callable) constructs a pipe with the given direction (tf::PipeType::SERIAL or tf::PipeType::PARALLEL) and the given callable.
- type() queries the type of the pipe and type(PipeType type) assigns a new type.
- callable(U&& callable) assigns a new callable to the pipe with universal forwarding; the callable must be constructible from std::function<void(tf::Pipeflow&)>.

A pipe represents a stage of a pipeline and can be either parallel or serial. The pipeflow object passed to the callable is used to query the statistics of a scheduling token in the pipeline, such as pipe, line, and token numbers.

diff --git a/docs/xml/classtf_1_1Pipeflow.xml b/docs/xml/classtf_1_1Pipeflow.xml
index 23ea31453..a1ce406cb 100644
--- a/docs/xml/classtf_1_1Pipeflow.xml
+++ b/docs/xml/classtf_1_1Pipeflow.xml

Updated Doxygen page for tf::Pipeflow, the class to create a pipeflow object used by the pipe callable. The documented API:

- Friend classes Pipeline, ScalablePipeline, and DataPipeline; private members _line, _pipe, _token, _stop, _num_deferrals, and std::unordered_set<size_t> _dependents.
- Pipeflow() is the default constructor.
- line(), pipe(), and token() query the line, pipe, and token identifiers of the present token.
- stop() stops the pipeline scheduling and may only be called at the first pipe.
- num_deferrals() queries the number of deferrals, and defer(size_t token) defers the current scheduling token to the given token.

A Pipeflow represents a scheduling token in the pipeline scheduling framework. It is created by the pipeline scheduler at runtime and passed to the pipe callable, so users can query the present statistics of that token, including the line, pipe, and token identifiers, and build their application algorithms on top of them. The embedded example reads:

tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
  std::cout << "token id = " << pf.token()
            << " at line = " << pf.line()
            << " at pipe = " << pf.pipe()
            << '\n';
}};

Pipeflow can only be created privately by tf::Pipeline and used through the pipe callable.
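The page gives no example for defer; the sketch below shows one plausible use under the documented semantics (a token calls defer at the first pipe to postpone itself until another token finishes, and the callable runs again afterwards with an incremented num_deferrals). The token values are illustrative:

#include <taskflow/taskflow.hpp>
#include <cstdio>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  tf::Pipeline pl(4,
    tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
      if(pf.token() == 10) {      // generate ten tokens, then stop
        pf.stop();
        return;
      }
      // postpone token 5 until token 2 has run through the pipeline;
      // on re-execution, num_deferrals() is 1 and the token proceeds
      if(pf.token() == 5 && pf.num_deferrals() == 0) {
        pf.defer(2);
        return;
      }
      std::printf("token %zu on line %zu\n", pf.token(), pf.line());
    }},
    tf::Pipe{tf::PipeType::PARALLEL, [](tf::Pipeflow& pf) {
      std::printf("second pipe sees token %zu\n", pf.token());
    }}
  );

  taskflow.composed_of(pl);
  executor.run(taskflow).wait();

  return 0;
}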
diff --git a/docs/xml/classtf_1_1Pipeline.xml b/docs/xml/classtf_1_1Pipeline.xml
index 90aafaa97..def3600bb 100644
--- a/docs/xml/classtf_1_1Pipeline.xml
+++ b/docs/xml/classtf_1_1Pipeline.xml

Updated Doxygen page for tf::Pipeline<Ps...>, the class to create a pipeline scheduling framework (include path now taskflow/algorithm/pipeline.hpp). The documented API:

- Private members: _graph, _num_tokens, std::tuple<Ps...> _pipes, std::array<PipeMeta, sizeof...(Ps)> _meta, std::vector<std::array<Line, sizeof...(Ps)>> _lines, std::vector<Task> _tasks, std::vector<Pipeflow> _pipeflows, std::queue<std::pair<size_t, size_t>> _ready_tokens, std::unordered_map<size_t, std::vector<size_t>> _token_dependencies, std::unordered_map<size_t, DeferredPipeflow> _deferred_tokens, and _longest_deferral = 0.
- Pipeline(size_t num_lines, Ps&&... ps) and Pipeline(size_t num_lines, std::tuple<Ps...>&& ps) construct a pipeline of up to num_lines parallel lines from the given pipes.
- num_lines() queries the number of parallel lines; num_pipes() (no longer constexpr) queries the number of pipes; reset() resets the pipeline; num_tokens() queries the number of generated tokens; graph() obtains the graph object associated with the pipeline construct.
- Private helpers: _gen_meta, _on_pipe, _build, _check_dependents, _construct_deferred_tokens, and _resolve_token_dependencies.

Internally, tf::Pipeline uses std::tuple to store the given sequence of pipes; the definition of each pipe can be different, completely decided by the compiler to optimize the object layout. After a pipeline is constructed, it is not possible to change its pipes; if an application needs to change the pipes, it should use tf::ScalablePipeline. The class description keeps its example of a pipeline with four concurrent lines and three serial pipes over a custom data buffer (std::array<std::array<int, num_pipes>, num_lines>), composed into a taskflow between init and stop tasks. The example schedules five tokens over four parallel lines in a circular fashion; at each pipe stage, the program propagates the result to the next pipe by adding one to the value stored in the buffer, and the pipeline scheduler generates five scheduling tokens and then stops.
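The example itself is only partially visible in this hunk; a minimal compilable sketch of the same pattern (the buffer layout and pipe bodies are illustrative) might look like:

#include <taskflow/taskflow.hpp>
#include <array>
#include <iostream>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  const size_t num_lines = 4;
  const size_t num_pipes = 3;

  // custom data storage shared across pipes
  std::array<std::array<int, num_pipes>, num_lines> buffer;

  tf::Pipeline pipeline(num_lines,
    // the first pipe generates five scheduling tokens
    tf::Pipe{tf::PipeType::SERIAL, [&](tf::Pipeflow& pf) {
      if(pf.token() == 5) { pf.stop(); return; }
      buffer[pf.line()][pf.pipe()] = static_cast<int>(pf.token());
    }},
    // each later pipe adds one to the result of the previous pipe
    tf::Pipe{tf::PipeType::SERIAL, [&](tf::Pipeflow& pf) {
      buffer[pf.line()][pf.pipe()] = buffer[pf.line()][pf.pipe() - 1] + 1;
    }},
    tf::Pipe{tf::PipeType::SERIAL, [&](tf::Pipeflow& pf) {
      buffer[pf.line()][pf.pipe()] = buffer[pf.line()][pf.pipe() - 1] + 1;
    }}
  );

  // compose the pipeline into the taskflow between init and stop tasks
  tf::Task init = taskflow.emplace([](){ std::cout << "ready\n"; });
  tf::Task task = taskflow.composed_of(pipeline);
  tf::Task stop = taskflow.emplace([](){ std::cout << "stopped\n"; });
  init.precede(task);
  task.precede(stop);

  executor.run(taskflow).wait();

  return 0;
}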
diff --git a/docs/xml/classtf_1_1PreemptionGuard.xml b/docs/xml/classtf_1_1PreemptionGuard.xml
new file mode 100644
index 000000000..c449704a9
--- /dev/null
+++ b/docs/xml/classtf_1_1PreemptionGuard.xml

New Doxygen page for the internal class tf::PreemptionGuard: a scope guard that holds a Runtime& _runtime reference, constructed with PreemptionGuard(Runtime& runtime) and torn down in ~PreemptionGuard(); its copy and move constructors and assignment operators are all deleted. The page carries no prose description.

diff --git a/docs/xml/classtf_1_1RandomPartitioner.xml b/docs/xml/classtf_1_1RandomPartitioner.xml
index 348f5cd6c..3c079dd18 100644
--- a/docs/xml/classtf_1_1RandomPartitioner.xml
+++ b/docs/xml/classtf_1_1RandomPartitioner.xml

Updated Doxygen page for tf::RandomPartitioner<C>, the class to construct a random partitioner for scheduling parallel algorithms (C is the closure wrapper type, defaulting to tf::DefaultClosureWrapper; the base class is tf::PartitionerBase<DefaultClosureWrapper>; the include path is now taskflow/algorithm/partitioner.hpp). The documented API:

- Private members _alpha{0.01f} and _beta, whose initializer is rewritten from {0.5f} to the equivalent {0.50f}.
- static constexpr PartitionerType type() queries the partition type (dynamic).
- Constructors: a default constructor, RandomPartitioner(size_t sz), RandomPartitioner(size_t sz, C&& closure), RandomPartitioner(float alpha, float beta), and RandomPartitioner(float alpha, float beta, C&& closure).
- alpha() and beta() query the two parameters; chunk_size_range(size_t N, size_t W) returns the chunk-size range for N iterations over W workers; loop and loop_until drive the partition loop through an std::atomic<size_t>& next counter and a callable.
- The member index additionally lists the inherited mutable closure_wrapper overload, is_default_wrapper_v, and operator().

Similar to tf::DynamicPartitioner, the partitioner splits iterations into many partitions, each with a random chunk size in the range c = [alpha * N * W, beta * N * W]. By default, alpha is 0.01 and beta is 0.5, respectively. In addition to the partition size, the application can specify a closure wrapper that wraps a partitioned task (i.e., closure) with a custom function object performing additional tasks. For example:

std::atomic<int> count = 0;
tf::Taskflow taskflow;
taskflow.for_each_index(0, 100, 1,
  [](int i) { printf("%d\n", i); },
  tf::RandomPartitioner(0, [](auto&& closure) {
    // do something before invoking the partitioned task
    // ...
    closure();
    // do something after invoking the partitioned task
    // ...
  })
);
executor.run(taskflow).wait();
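For completeness, a hedged sketch of the (alpha, beta) constructor; the values 0.05 and 0.25 are illustrative rather than defaults, and the explicit <> avoids relying on class template argument deduction for this overload:

#include <taskflow/taskflow.hpp>
#include <vector>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;
  std::vector<int> data(1000, 0);

  // each chunk size is drawn randomly from the range controlled by
  // alpha = 0.05 and beta = 0.25
  taskflow.for_each(data.begin(), data.end(),
    [](int& v) { ++v; },
    tf::RandomPartitioner<>(0.05f, 0.25f)
  );

  executor.run(taskflow).wait();

  return 0;
}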
diff --git a/docs/xml/classtf_1_1Runtime.xml b/docs/xml/classtf_1_1Runtime.xml
index e2851b50d..6d32e60c5 100644
--- a/docs/xml/classtf_1_1Runtime.xml
+++ b/docs/xml/classtf_1_1Runtime.xml

Updated Doxygen page for tf::Runtime, the class to include a runtime object in a task. The include path changes from graph.hpp to taskflow/core/runtime.hpp, and tf::Subflow is no longer listed as a derived class. The recoverable changes and documentation:

- New friend classes PreemptionGuard and Algorithm join Executor and FlowBuilder, and a new member bool _preempted{false} joins _executor, _worker, and _parent. The private helpers _async and _silent_async are removed, as is the destructor ~Runtime(), which used to issue a corun_all to finish all spawned asynchronous tasks. The private constructor Runtime(Executor&, Worker&, Node*) remains, with its parameters renamed to executor, worker, and parent.
- executor() obtains the running executor of the runtime task; worker(), now placed next to it, acquires a reference to the underlying worker.
- schedule(Task task) immediately schedules an active task, i.e., a task in a running taskflow, to the task queue of the associated worker. Consider the following example:

tf::Task A, B, C, D;
std::tie(A, B, C, D) = taskflow.emplace(
  [](){ return 0; },
  [&C](tf::Runtime& rt){  // C must be captured by reference
    std::cout << "B\n";
    rt.schedule(C);
  },
  [](){ std::cout << "C\n"; },
  [](){ std::cout << "D\n"; }
);
A.precede(B, C, D);
executor.run(taskflow).wait();

The executor first runs the condition task A, which returns 0 to inform the scheduler to go to the runtime task B. During the execution of B, it directly schedules task C without going through the normal taskflow graph scheduling process; at this moment, task C is active because its parent taskflow is running. When the taskflow finishes, we will see both B and C in the output. This method can only be called by the parent worker of this runtime, or the behavior is undefined.

- async(F&& f) and async(P&& params, F&& f) create asynchronous tasks that pertain to the runtime object, unlike tf::Executor::async. The documentation now directs applications to issue tf::Runtime::corun, rather than corun_all, to wait for spawned tasks, and the fire-and-forget loop in its example now uses silent_async:

std::atomic<int> counter(0);
taskflow.emplace([&](tf::Runtime& rt){
  auto fu1 = rt.async([&](){ counter++; });
  auto fu2 = rt.async([&](){ counter++; });
  fu1.get();
  fu2.get();

  // spawn 100 asynchronous tasks from the worker of the runtime
  for(int i = 0; i < 100; i++) {
    rt.silent_async([&](){ counter++; });
  }

  // wait for the 100 asynchronous tasks to finish
  rt.corun();
  assert(counter == 102);
});

async remains thread-safe and can be called by multiple workers that hold a reference to the runtime. silent_async(F&& f) and silent_async(P&& params, F&& f) keep their behavior (more efficient than tf::Runtime::async when no value is returned), while the unchecked variants silent_async_unchecked(F&&) and silent_async_unchecked(P&&, F&&), which bypassed the caller-worker check, are removed.
- corun(T&& target) co-runs a corunnable target, which must define tf::Graph& T::graph(); the former wording that also accepted a subflow task is dropped:

tf::Taskflow taskflow1, taskflow2;
taskflow1.emplace([](){ std::cout << "running taskflow 1\n"; });
taskflow2.emplace([&](tf::Runtime& rt){
  std::cout << "running taskflow 2\n";
  rt.corun(taskflow1);
});
executor.run(taskflow2).wait();

Although corun blocks until the operation completes, the caller thread (worker) is not blocked (e.g., sleeping or holding any lock); instead, it joins the work-stealing loop of the executor and returns when all tasks in the target complete.
- corun_until(P&& predicate) is removed. A new overload corun() co-runs all tasks spawned by this runtime with other workers until they finish, and corun_all() is now documented as equivalent to tf::Runtime::corun, kept only as a legacy alias. A new method is_cancelled() verifies if the task has been cancelled. Each of the corun-family methods can only be called by the parent worker of this runtime, or the behavior is undefined.

The class description now reads: a runtime object allows users to interact with the scheduling runtime inside a task (or the parent task of this runtime), such as scheduling an active task, spawning an asynchronous task, and co-running a graph target; it is associated with the worker and the executor that run its parent task. To understand how Taskflow schedules a runtime task, please refer to Runtime Tasking.
diff --git a/docs/xml/classtf_1_1ScalablePipeline.xml b/docs/xml/classtf_1_1ScalablePipeline.xml
index 44cf7878c..126c3596f 100644
--- a/docs/xml/classtf_1_1ScalablePipeline.xml
+++ b/docs/xml/classtf_1_1ScalablePipeline.xml

Updated Doxygen page for tf::ScalablePipeline<P>, the class to create a scalable pipeline object (include path now taskflow/algorithm/pipeline.hpp). The documented API:

- pipe_t = typename std::iterator_traits<P>::value_type: the pipe type.
- Private members: _graph, _num_tokens{0}, std::vector<P> _pipes, std::vector<Task> _tasks, std::vector<Pipeflow> _pipeflows, std::unique_ptr<Line[]> _lines, _ready_tokens, _token_dependencies, _deferred_tokens, and _longest_deferral = 0.
- Constructors: a default constructor; ScalablePipeline(size_t num_lines), which constructs an empty pipeline with the given number of lines (an empty scalable pipeline does not have any pipes and needs to be reset to a valid range of pipes before running); and ScalablePipeline(size_t num_lines, P first, P last), which constructs a pipeline from the given range of pipes. Copy construction and copy assignment are deleted; move construction and move assignment are provided.
- num_lines(), num_pipes(), num_tokens(), and graph() query the number of parallel lines, the number of pipes, the number of generated tokens, and the graph object associated with the pipeline construct; reset() resets the pipeline, reset(P first, P last) assigns the pipeline to a new range of pipes, and reset(size_t num_lines, P first, P last) additionally resets the number of parallel lines.
- Private helpers: _check_dependents, _construct_deferred_tokens, _resolve_token_dependencies, _on_pipe, _build, and _line.

A scalable pipeline is a composable graph object whose pipes, unlike those of tf::Pipeline, can change at runtime. The embedded example builds a vector of three serial pipes of type tf::Pipe<std::function<void(tf::Pipeflow&)>>, creates a scalable pipeline of four lines over pipes.begin() and pipes.end(), composes it into a taskflow between init and stop tasks, dumps and runs the taskflow, and then resets the same pipeline object to five serial pipes and runs it again. The example schedules five tokens over four parallel lines in a circular fashion, first going through three serial pipes and then five. Each pipe has the same type of tf::Pipe<std::function<void(tf::Pipeflow&)>> and is kept in a vector that is amenable to change.
We construct the scalable pipeline using two range iterators pointing to the beginning and the end of the vector. At each pipe stage, the program propagates the result to the next pipe by adding one to the result stored in a custom data storage, buffer. The pipeline scheduler will generate five scheduling tokens and then stop. A scalable pipeline is move-only. - + tf::ScalablePipeline_build tf::ScalablePipeline_check_dependents @@ -743,7 +774,7 @@ A scalable pipeline is a composable graph object for users to create a tf::ScalablePipelinenum_lines tf::ScalablePipelinenum_pipes tf::ScalablePipelinenum_tokens - tf::ScalablePipelineoperator= + tf::ScalablePipelineoperator= tf::ScalablePipelineoperator= tf::ScalablePipelinepipe_t tf::ScalablePipelinereset diff --git a/docs/xml/classtf_1_1Semaphore.xml b/docs/xml/classtf_1_1Semaphore.xml index 624b0899f..34506fea6 100644 --- a/docs/xml/classtf_1_1Semaphore.xml +++ b/docs/xml/classtf_1_1Semaphore.xml @@ -1,15 +1,15 @@ - + tf::Semaphore - tf::CriticalSection - semaphore.hpp - + taskflow/core/semaphore.hpp + class friend class Node Node + tf::Semaphore::Node Node @@ -19,62 +19,115 @@ - + - - - - std::mutex + + class + friend class Executor + + Executor + tf::Semaphore::Executor + + Executor + + + + + + + + + + + + + std::mutex std::mutex tf::Semaphore::_mtx _mtx + tf::Semaphore::_mtx + + + + + + + + + + size_t + size_t tf::Semaphore::_max_value + + _max_value + tf::Semaphore::_max_value + {0} - + - + size_t - size_t tf::Semaphore::_counter + size_t tf::Semaphore::_cur_value - _counter + _cur_value + tf::Semaphore::_cur_value + {0} - + - - std::vector< Node * > - std::vector<Node*> tf::Semaphore::_waiters + + SmallVector< Node * > + SmallVector<Node*> tf::Semaphore::_waiters _waiters + tf::Semaphore::_waiters + + + + + + + + + + + + + tf::Semaphore::Semaphore + ()=default + Semaphore + tf::Semaphore::Semaphore +constructs a default semaphore +A default semaphore has the value of zero. Users can call tf::Semaphore::reset to reassign a new value to the semaphore. - + - - - + tf::Semaphore::Semaphore - (size_t max_workers) + (size_t max_value) Semaphore + tf::Semaphore::Semaphore size_t - max_workers + max_value -constructs a semaphore with the given counter +constructs a semaphore with the given value (i.e., counter) A semaphore creates a constraint that limits the maximum concurrency, i.e., the number of workers, in a set of tasks. 
@@ -83,29 +136,80 @@
 - 
 + 
 - 
 + size_t
 - size_t tf::Semaphore::count
 () const
 - count
 + value
 + tf::Semaphore::value
 
-queries the counter value (not thread-safe during the run)
+queries the current counter value
 
 
 
 - 
 + 
 
 - 
 - 
 + + size_t
 + size_t tf::Semaphore::max_value
 + () const
 + max_value
 + tf::Semaphore::max_value
 + 
 +queries the maximum allowable value of this semaphore
 + 
 + 
 + 
 + 
 + 
 + 
 + 
 + 
 + void
 + void tf::Semaphore::reset
 + ()
 + reset
 + tf::Semaphore::reset
 + 
 +resets the semaphore to a clean state
 + 
 + 
 + 
 + 
 + 
 + 
 + 
 + 
 + void
 + void tf::Semaphore::reset
 + (size_t new_max_value)
 + reset
 + tf::Semaphore::reset
 + 
 + size_t
 + new_max_value
 + 
 +resets the semaphore to a clean state with the given new maximum value
 + 
 + 
 + 
 + 
 + 
 + 
 + 
 + 
 + bool
 bool tf::Semaphore::_try_acquire_or_wait
 (Node *)
 _try_acquire_or_wait
 + tf::Semaphore::_try_acquire_or_wait
 
 Node *
 me
 
@@ -116,22 +220,27 @@
 - 
 + 
 
 - std::vector< Node * >
 - std::vector< Node * > tf::Semaphore::_release
 - ()
 + 
 + void
 + void tf::Semaphore::_release
 + (SmallVector< Node * > &)
 _release
 + tf::Semaphore::_release
 + 
 + SmallVector< Node * > &
 + dst
 + 
 
 
 
 - 
 + 
 
 - 
 + 
 
class to create a semaphore object for building a concurrency constraint
 
 tf::Semaphoresemaphore(1);//createasemaphorewithinitialcount1
 
-std::vector<tf::Task>tasks{
-taskflow.emplace([](){std::cout<<"A"<<std::endl;}),
+SmallVector<tf::Task>tasks{
+taskflow.emplace([](){std::cout<<"A"<<std::endl;}),
 taskflow.emplace([](){std::cout<<"B"<<std::endl;}),
-taskflow.emplace([](){std::cout<<"C"<<std::endl;}),
+taskflow.emplace([](){std::cout<<"C"<<std::endl;}),
 taskflow.emplace([](){std::cout<<"D"<<std::endl;}),
-taskflow.emplace([](){std::cout<<"E"<<std::endl;})
+taskflow.emplace([](){std::cout<<"E"<<std::endl;})
 };
 
 for(auto&task:tasks){//eachtaskacquiresandreleasethesemaphore
-task.acquire(semaphore);
-task.release(semaphore);
+task.acquire(semaphore);
+task.release(semaphore);
 }
 
 executor.run(taskflow).wait();
 
The above example creates five tasks with no dependencies between them. Under normal circumstances, the five tasks would be executed concurrently. However, this example has a semaphore with initial count 1, and all tasks need to acquire that semaphore before running and release that semaphore after they are done. This arrangement limits the number of concurrently running tasks to only one.
 
 - 
 - 
 - 
 - 
 - 
 - 
 - 
 - 
 - 
 - 
 - 
 - 
 - + 
 - tf::Semaphore_counter
 + tf::Semaphore_cur_value
 + tf::Semaphore_max_value
 tf::Semaphore_mtx
 - tf::Semaphore_release
 + tf::Semaphore_release
 tf::Semaphore_try_acquire_or_wait
 - tf::Semaphore_waiters
 + tf::Semaphore_waiters
 + tf::SemaphoreExecutor
 + tf::Semaphoremax_value
 tf::SemaphoreNode
 - tf::SemaphoreSemaphore
 + tf::Semaphorereset
 + tf::Semaphorereset
 + tf::SemaphoreSemaphore
 + tf::SemaphoreSemaphore
 + tf::Semaphorevalue
 
diff --git a/docs/xml/classtf_1_1SmallVector.xml b/docs/xml/classtf_1_1SmallVector.xml
index 11bb68cb8..9aee8e7a1 100644
--- a/docs/xml/classtf_1_1SmallVector.xml
+++ b/docs/xml/classtf_1_1SmallVector.xml
@@ -1,9 +1,9 @@
- 
+ 
 tf::SmallVector
 tf::SmallVectorImpl< T >
 - small_vector.hpp
 + taskflow/utility/small_vector.hpp
 
 typename T
 
 
 unsigned N
 2
 
 - 
 + 
 SmallVectorStorage< T, N >
 SmallVectorStorage<T, N> tf::SmallVector< T, N >::Storage
 
 Storage
 + tf::SmallVector::Storage
 
Inline space for elements which aren't stored in the base class.
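Stepping back to the tf::Semaphore hunks just above: value, max_value, and the two reset overloads are introduced without a sample. Here is a short hedged sketch of how they compose, written as a fragment in the style of the surrounding examples and assuming only the declarations in this diff (not verified against a build).

tf::Semaphore semaphore(4);           // a semaphore whose maximum value is 4
assert(semaphore.max_value() == 4);   // queries the maximum allowable value
std::cout << semaphore.value();       // queries the current counter value
semaphore.reset();                    // restores a clean state, same maximum
semaphore.reset(8);                   // clean state with a new maximum of 8
assert(semaphore.max_value() == 8);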
@@ -28,15 +29,16 @@ - + - - + + tf::SmallVector< T, N >::SmallVector () SmallVector + tf::SmallVector::SmallVector constructs an empty vector @@ -44,13 +46,14 @@ - + tf::SmallVector< T, N >::SmallVector (size_t Size, const T &Value=T()) SmallVector + tf::SmallVector::SmallVector size_t Size @@ -67,7 +70,7 @@ - + @@ -79,6 +82,7 @@ tf::SmallVector< T, N >::SmallVector (ItTy S, ItTy E) SmallVector + tf::SmallVector::SmallVector ItTy S @@ -94,15 +98,16 @@ - + tf::SmallVector< T, N >::SmallVector (std::initializer_list< T > IL) SmallVector + tf::SmallVector::SmallVector - std::initializer_list< T > + std::initializer_list< T > IL @@ -112,13 +117,14 @@ - + tf::SmallVector< T, N >::SmallVector (const SmallVector &RHS) SmallVector + tf::SmallVector::SmallVector const SmallVector & RHS @@ -130,13 +136,14 @@ - + tf::SmallVector< T, N >::SmallVector (SmallVector &&RHS) SmallVector + tf::SmallVector::SmallVector SmallVector && RHS @@ -148,13 +155,14 @@ - + - + const SmallVector & - const SmallVector& tf::SmallVector< T, N >::operator= + const SmallVector & tf::SmallVector< T, N >::operator= (const SmallVector &RHS) operator= + tf::SmallVector::operator= const SmallVector & RHS @@ -166,13 +174,14 @@ - + - + const SmallVector & - const SmallVector& tf::SmallVector< T, N >::operator= + const SmallVector & tf::SmallVector< T, N >::operator= (SmallVector &&RHS) operator= + tf::SmallVector::operator= SmallVector && RHS @@ -184,13 +193,14 @@ - + tf::SmallVector< T, N >::SmallVector (SmallVectorImpl< T > &&RHS) SmallVector + tf::SmallVector::SmallVector SmallVectorImpl< T > && RHS @@ -202,13 +212,14 @@ - + - + const SmallVector & - const SmallVector& tf::SmallVector< T, N >::operator= + const SmallVector & tf::SmallVector< T, N >::operator= (SmallVectorImpl< T > &&RHS) operator= + tf::SmallVector::operator= SmallVectorImpl< T > && RHS @@ -220,15 +231,16 @@ - + - + const SmallVector & - const SmallVector& tf::SmallVector< T, N >::operator= + const SmallVector & tf::SmallVector< T, N >::operator= (std::initializer_list< T > IL) operator= + tf::SmallVector::operator= - std::initializer_list< T > + std::initializer_list< T > IL @@ -238,9 +250,9 @@ - + - + class to define a vector optimized for small array @@ -266,58 +278,58 @@ The class defines a C++ STL-styled vector (a variable-sized array) optimized for The class is stripped from the LLVM codebase. 
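The tf::SmallVector description above carries no usage sample. Below is a brief hedged sketch of the STL-styled interface it lists (push_back, size, iteration), with the inline capacity N set to 4; the heap-growth comment restates the documented small-buffer design rather than measured behavior.

tf::SmallVector<int, 4> vec;  // inline storage for up to 4 elements
for(int i = 0; i < 4; ++i) {
  vec.push_back(i);           // stays in the inline buffer
}
vec.push_back(4);             // exceeds N: the vector grows onto the heap
assert(vec.size() == 5);
for(int v : vec) {
  std::cout << v << ' ';      // prints: 0 1 2 3 4
}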
- - - + + + + + + + - - - + + + + + + + - - - - - - + + - - - - - - - + + + - + tf::SmallVectorappend tf::SmallVectorappend @@ -335,14 +347,14 @@ The class defines a C++ STL-styled vector (a variable-sized array) optimized for tf::SmallVectorcapacity_ptr tf::SmallVectorCapacityX tf::SmallVectorclear - tf::SmallVectorconst_iterator - tf::SmallVectorconst_pointer - tf::SmallVectorconst_reference - tf::SmallVectorconst_reverse_iterator + tf::SmallVectorconst_iterator + tf::SmallVectorconst_pointer + tf::SmallVectorconst_reference + tf::SmallVectorconst_reverse_iterator tf::SmallVectordata tf::SmallVectordata tf::SmallVectordestroy_range - tf::SmallVectordifference_type + tf::SmallVectordifference_type tf::SmallVectoremplace_back tf::SmallVectorempty tf::SmallVectorend @@ -361,38 +373,38 @@ The class defines a C++ STL-styled vector (a variable-sized array) optimized for tf::SmallVectorinsert tf::SmallVectorinsert tf::SmallVectorisSmall - tf::SmallVectoriterator + tf::SmallVectoriterator tf::SmallVectormax_size tf::SmallVectoroperator!= tf::SmallVectoroperator< - tf::SmallVectoroperator= - tf::SmallVectoroperator= - tf::SmallVectoroperator= - tf::SmallVectoroperator= + tf::SmallVectoroperator= + tf::SmallVectoroperator= + tf::SmallVectoroperator= + tf::SmallVectoroperator= tf::SmallVectoroperator= tf::SmallVectoroperator== tf::SmallVectoroperator[] tf::SmallVectoroperator[] - tf::SmallVectorpointer + tf::SmallVectorpointer tf::SmallVectorpop_back tf::SmallVectorpop_back_val tf::SmallVectorpush_back tf::SmallVectorpush_back tf::SmallVectorrbegin tf::SmallVectorrbegin - tf::SmallVectorreference + tf::SmallVectorreference tf::SmallVectorrend tf::SmallVectorrend tf::SmallVectorreserve tf::SmallVectorresetToSmall tf::SmallVectorresize tf::SmallVectorresize - tf::SmallVectorreverse_iterator + tf::SmallVectorreverse_iterator tf::SmallVectorset_size tf::SmallVectorsetEnd tf::SmallVectorsize tf::SmallVectorsize_in_bytes - tf::SmallVectorsize_type + tf::SmallVectorsize_type tf::SmallVectorSmallVector tf::SmallVectorSmallVector tf::SmallVectorSmallVector @@ -408,7 +420,7 @@ The class defines a C++ STL-styled vector (a variable-sized array) optimized for tf::SmallVectorswap tf::SmallVectoruninitialized_copy tf::SmallVectoruninitialized_move - tf::SmallVectorvalue_type + tf::SmallVectorvalue_type tf::SmallVector~SmallVectorImpl diff --git a/docs/xml/classtf_1_1SmallVectorBase.xml b/docs/xml/classtf_1_1SmallVectorBase.xml index a15f5c502..5a1ec30c9 100644 --- a/docs/xml/classtf_1_1SmallVectorBase.xml +++ b/docs/xml/classtf_1_1SmallVectorBase.xml @@ -1,54 +1,60 @@ - + tf::SmallVectorBase - + tf::SmallVectorTemplateCommon< Node * > + tf::SmallVectorTemplateCommon< tf::Semaphore * > + void * void* tf::SmallVectorBase::BeginX BeginX + tf::SmallVectorBase::BeginX - + void * void * tf::SmallVectorBase::EndX EndX + tf::SmallVectorBase::EndX - + void * void * tf::SmallVectorBase::CapacityX CapacityX + tf::SmallVectorBase::CapacityX - + - - + + tf::SmallVectorBase::SmallVectorBase (void *FirstEl, size_t Size) SmallVectorBase + tf::SmallVectorBase::SmallVectorBase void * FirstEl @@ -63,13 +69,14 @@ - + void void tf::SmallVectorBase::grow_pod (void *FirstEl, size_t MinSizeInBytes, size_t TSize) grow_pod + tf::SmallVectorBase::grow_pod void * FirstEl @@ -89,15 +96,16 @@ - + - - + + size_t size_t tf::SmallVectorBase::size_in_bytes () const size_in_bytes + tf::SmallVectorBase::size_in_bytes This returns size()*sizeof(T). 
@@ -105,13 +113,14 @@ - + size_t size_t tf::SmallVectorBase::capacity_in_bytes () const capacity_in_bytes + tf::SmallVectorBase::capacity_in_bytes capacity_in_bytes - This returns capacity()*sizeof(T). @@ -119,27 +128,43 @@ - + bool bool tf::SmallVectorBase::empty () const empty + tf::SmallVectorBase::empty - + - + - + + + + + + + + + + + + + + + + tf::SmallVectorBaseBeginX tf::SmallVectorBasecapacity_in_bytes diff --git a/docs/xml/classtf_1_1SmallVectorImpl.xml b/docs/xml/classtf_1_1SmallVectorImpl.xml index caacb24f4..c997a8635 100644 --- a/docs/xml/classtf_1_1SmallVectorImpl.xml +++ b/docs/xml/classtf_1_1SmallVectorImpl.xml @@ -1,78 +1,84 @@ - + tf::SmallVectorImpl tf::SmallVectorTemplateBase< T, IsPod< T >::value > - tf::SmallVector< Node * > + tf::SmallVector< Node *, 4 > tf::SmallVector< tf::Semaphore * > + tf::SmallVector< Node * > tf::SmallVector< T, N > typename T - - + + SmallVectorTemplateBase< T, IsPod< T >::value > - typedef SmallVectorTemplateBase<T, IsPod<T>::value> tf::SmallVectorImpl< T >::SuperClass + SmallVectorTemplateBase<T, IsPod<T>::value> tf::SmallVectorImpl< T >::SuperClass SuperClass + tf::SmallVectorImpl::SuperClass - + - - - + + + SuperClass::iterator - typedef SuperClass::iterator tf::SmallVectorImpl< T >::iterator + SuperClass::iterator tf::SmallVectorImpl< T >::iterator iterator + tf::SmallVectorImpl::iterator - + - + SuperClass::const_iterator - typedef SuperClass::const_iterator tf::SmallVectorImpl< T >::const_iterator + SuperClass::const_iterator tf::SmallVectorImpl< T >::const_iterator const_iterator + tf::SmallVectorImpl::const_iterator - + - + SuperClass::size_type - typedef SuperClass::size_type tf::SmallVectorImpl< T >::size_type + SuperClass::size_type tf::SmallVectorImpl< T >::size_type size_type + tf::SmallVectorImpl::size_type - + - - + + tf::SmallVectorImpl< T >::SmallVectorImpl (const SmallVectorImpl &)=delete SmallVectorImpl + tf::SmallVectorImpl::SmallVectorImpl const SmallVectorImpl & @@ -82,15 +88,16 @@ - + - - + + tf::SmallVectorImpl< T >::SmallVectorImpl (unsigned N) SmallVectorImpl + tf::SmallVectorImpl::SmallVectorImpl unsigned N @@ -101,41 +108,44 @@ - + - - + + tf::SmallVectorImpl< T >::~SmallVectorImpl () ~SmallVectorImpl + tf::SmallVectorImpl::~SmallVectorImpl - + void void tf::SmallVectorImpl< T >::clear () clear + tf::SmallVectorImpl::clear - + void void tf::SmallVectorImpl< T >::resize (size_type N) resize + tf::SmallVectorImpl::resize size_type N @@ -146,13 +156,14 @@ - + void void tf::SmallVectorImpl< T >::resize (size_type N, const T &NV) resize + tf::SmallVectorImpl::resize size_type N @@ -167,13 +178,14 @@ - + void void tf::SmallVectorImpl< T >::reserve (size_type N) reserve + tf::SmallVectorImpl::reserve size_type N @@ -184,26 +196,28 @@ - + T T tf::SmallVectorImpl< T >::pop_back_val () pop_back_val + tf::SmallVectorImpl::pop_back_val - + void void tf::SmallVectorImpl< T >::swap (SmallVectorImpl &RHS) swap + tf::SmallVectorImpl::swap SmallVectorImpl & RHS @@ -214,7 +228,7 @@ - + @@ -226,6 +240,7 @@ void tf::SmallVectorImpl< T >::append (in_iter in_start, in_iter in_end) append + tf::SmallVectorImpl::append in_iter in_start @@ -241,13 +256,14 @@ - + void void tf::SmallVectorImpl< T >::append (size_type NumInputs, const T &Elt) append + tf::SmallVectorImpl::append size_type NumInputs @@ -263,15 +279,16 @@ - + void void tf::SmallVectorImpl< T >::append (std::initializer_list< T > IL) append + tf::SmallVectorImpl::append - std::initializer_list< T > + std::initializer_list< T > IL @@ -280,13 +297,14 @@ - + void void 
tf::SmallVectorImpl< T >::assign (size_type NumElts, const T &Elt) assign + tf::SmallVectorImpl::assign size_type NumElts @@ -301,15 +319,16 @@ - + void void tf::SmallVectorImpl< T >::assign (std::initializer_list< T > IL) assign + tf::SmallVectorImpl::assign - std::initializer_list< T > + std::initializer_list< T > IL @@ -318,13 +337,14 @@ - + iterator iterator tf::SmallVectorImpl< T >::erase (const_iterator CI) erase + tf::SmallVectorImpl::erase const_iterator CI @@ -335,13 +355,14 @@ - + iterator iterator tf::SmallVectorImpl< T >::erase (const_iterator CS, const_iterator CE) erase + tf::SmallVectorImpl::erase const_iterator CS @@ -356,13 +377,14 @@ - + iterator iterator tf::SmallVectorImpl< T >::insert (iterator I, T &&Elt) insert + tf::SmallVectorImpl::insert iterator I @@ -377,13 +399,14 @@ - + iterator iterator tf::SmallVectorImpl< T >::insert (iterator I, const T &Elt) insert + tf::SmallVectorImpl::insert iterator I @@ -398,13 +421,14 @@ - + iterator iterator tf::SmallVectorImpl< T >::insert (iterator I, size_type NumToInsert, const T &Elt) insert + tf::SmallVectorImpl::insert iterator I @@ -423,7 +447,7 @@ - + @@ -435,6 +459,7 @@ iterator tf::SmallVectorImpl< T >::insert (iterator I, ItTy From, ItTy To) insert + tf::SmallVectorImpl::insert iterator I @@ -453,19 +478,20 @@ - + void void tf::SmallVectorImpl< T >::insert (iterator I, std::initializer_list< T > IL) insert + tf::SmallVectorImpl::insert iterator I - std::initializer_list< T > + std::initializer_list< T > IL @@ -474,7 +500,7 @@ - + @@ -488,6 +514,7 @@ void tf::SmallVectorImpl< T >::emplace_back (ArgTypes &&... Args) emplace_back + tf::SmallVectorImpl::emplace_back ArgTypes &&... Args @@ -498,13 +525,14 @@ - + SmallVectorImpl & SmallVectorImpl< T > & tf::SmallVectorImpl< T >::operator= (const SmallVectorImpl &RHS) operator= + tf::SmallVectorImpl::operator= const SmallVectorImpl & RHS @@ -515,13 +543,14 @@ - + SmallVectorImpl & SmallVectorImpl< T > & tf::SmallVectorImpl< T >::operator= (SmallVectorImpl &&RHS) operator= + tf::SmallVectorImpl::operator= SmallVectorImpl && RHS @@ -532,13 +561,14 @@ - + bool bool tf::SmallVectorImpl< T >::operator== (const SmallVectorImpl &RHS) const operator== + tf::SmallVectorImpl::operator== const SmallVectorImpl & RHS @@ -549,13 +579,14 @@ - + bool bool tf::SmallVectorImpl< T >::operator!= (const SmallVectorImpl &RHS) const operator!= + tf::SmallVectorImpl::operator!= const SmallVectorImpl & RHS @@ -566,13 +597,14 @@ - + bool bool tf::SmallVectorImpl< T >::operator< (const SmallVectorImpl &RHS) const operator< + tf::SmallVectorImpl::operator< const SmallVectorImpl & RHS @@ -583,13 +615,14 @@ - + void void tf::SmallVectorImpl< T >::set_size (size_type N) set_size + tf::SmallVectorImpl::set_size size_type N @@ -603,17 +636,30 @@ - + - + - - - + + + + + + + + + + + + + + + + @@ -622,53 +668,46 @@ + + + - - - - - - - - - - - - - - - - - + + + + + - - - + + + + + - + tf::SmallVectorImplappend tf::SmallVectorImplappend @@ -686,14 +725,14 @@ tf::SmallVectorImplcapacity_ptr tf::SmallVectorImplCapacityX tf::SmallVectorImplclear - tf::SmallVectorImplconst_iterator - tf::SmallVectorImplconst_pointer - tf::SmallVectorImplconst_reference - tf::SmallVectorImplconst_reverse_iterator + tf::SmallVectorImplconst_iterator + tf::SmallVectorImplconst_pointer + tf::SmallVectorImplconst_reference + tf::SmallVectorImplconst_reverse_iterator tf::SmallVectorImpldata tf::SmallVectorImpldata tf::SmallVectorImpldestroy_range - tf::SmallVectorImpldifference_type + tf::SmallVectorImpldifference_type 
tf::SmallVectorImplemplace_back tf::SmallVectorImplempty tf::SmallVectorImplend @@ -712,7 +751,7 @@ tf::SmallVectorImplinsert tf::SmallVectorImplinsert tf::SmallVectorImplisSmall - tf::SmallVectorImpliterator + tf::SmallVectorImpliterator tf::SmallVectorImplmax_size tf::SmallVectorImploperator!= tf::SmallVectorImploperator< @@ -721,36 +760,36 @@ tf::SmallVectorImploperator== tf::SmallVectorImploperator[] tf::SmallVectorImploperator[] - tf::SmallVectorImplpointer + tf::SmallVectorImplpointer tf::SmallVectorImplpop_back tf::SmallVectorImplpop_back_val tf::SmallVectorImplpush_back tf::SmallVectorImplpush_back tf::SmallVectorImplrbegin tf::SmallVectorImplrbegin - tf::SmallVectorImplreference + tf::SmallVectorImplreference tf::SmallVectorImplrend tf::SmallVectorImplrend tf::SmallVectorImplreserve tf::SmallVectorImplresetToSmall tf::SmallVectorImplresize tf::SmallVectorImplresize - tf::SmallVectorImplreverse_iterator + tf::SmallVectorImplreverse_iterator tf::SmallVectorImplset_size tf::SmallVectorImplsetEnd tf::SmallVectorImplsize tf::SmallVectorImplsize_in_bytes - tf::SmallVectorImplsize_type + tf::SmallVectorImplsize_type tf::SmallVectorImplSmallVectorBase tf::SmallVectorImplSmallVectorImpl tf::SmallVectorImplSmallVectorImpl tf::SmallVectorImplSmallVectorTemplateBase tf::SmallVectorImplSmallVectorTemplateCommon - tf::SmallVectorImplSuperClass + tf::SmallVectorImplSuperClass tf::SmallVectorImplswap tf::SmallVectorImpluninitialized_copy tf::SmallVectorImpluninitialized_move - tf::SmallVectorImplvalue_type + tf::SmallVectorImplvalue_type tf::SmallVectorImpl~SmallVectorImpl diff --git a/docs/xml/classtf_1_1SmallVectorTemplateBase.xml b/docs/xml/classtf_1_1SmallVectorTemplateBase.xml index 1f486acd0..2e529364d 100644 --- a/docs/xml/classtf_1_1SmallVectorTemplateBase.xml +++ b/docs/xml/classtf_1_1SmallVectorTemplateBase.xml @@ -1,8 +1,10 @@ - + tf::SmallVectorTemplateBase tf::SmallVectorTemplateCommon< T > + tf::SmallVectorImpl< Node * > + tf::SmallVectorImpl< tf::Semaphore * > typename T @@ -13,12 +15,13 @@ isPodLike - + tf::SmallVectorTemplateBase< T, isPodLike >::SmallVectorTemplateBase (size_t Size) SmallVectorTemplateBase + tf::SmallVectorTemplateBase::SmallVectorTemplateBase size_t Size @@ -29,13 +32,14 @@ - + void void tf::SmallVectorTemplateBase< T, isPodLike >::grow (size_t MinSize=0) grow + tf::SmallVectorTemplateBase::grow size_t MinSize @@ -48,15 +52,16 @@ - + - - + + void static void tf::SmallVectorTemplateBase< T, isPodLike >::destroy_range (T *S, T *E) destroy_range + tf::SmallVectorTemplateBase::destroy_range T * S @@ -71,7 +76,7 @@ - + @@ -86,6 +91,7 @@ static void tf::SmallVectorTemplateBase< T, isPodLike >::uninitialized_move (It1 I, It1 E, It2 Dest) uninitialized_move + tf::SmallVectorTemplateBase::uninitialized_move It1 I @@ -105,7 +111,7 @@ - + @@ -120,6 +126,7 @@ static void tf::SmallVectorTemplateBase< T, isPodLike >::uninitialized_copy (It1 I, It1 E, It2 Dest) uninitialized_copy + tf::SmallVectorTemplateBase::uninitialized_copy It1 I @@ -139,15 +146,16 @@ - + - - + + void void tf::SmallVectorTemplateBase< T, isPodLike >::push_back (const T &Elt) push_back + tf::SmallVectorTemplateBase::push_back const T & Elt @@ -158,13 +166,14 @@ - + void void tf::SmallVectorTemplateBase< T, isPodLike >::push_back (T &&Elt) push_back + tf::SmallVectorTemplateBase::push_back T && Elt @@ -175,47 +184,53 @@ - + void void tf::SmallVectorTemplateBase< T, isPodLike >::pop_back () pop_back + tf::SmallVectorTemplateBase::pop_back - + - + - - - - - + + + + + + + + + + - - + + @@ -224,8 +239,13 @@ + 
+ + + + - + tf::SmallVectorTemplateBaseback tf::SmallVectorTemplateBaseback @@ -237,14 +257,14 @@ tf::SmallVectorTemplateBasecapacity_ptr tf::SmallVectorTemplateBasecapacity_ptr tf::SmallVectorTemplateBaseCapacityX - tf::SmallVectorTemplateBaseconst_iterator - tf::SmallVectorTemplateBaseconst_pointer - tf::SmallVectorTemplateBaseconst_reference - tf::SmallVectorTemplateBaseconst_reverse_iterator + tf::SmallVectorTemplateBaseconst_iterator + tf::SmallVectorTemplateBaseconst_pointer + tf::SmallVectorTemplateBaseconst_reference + tf::SmallVectorTemplateBaseconst_reverse_iterator tf::SmallVectorTemplateBasedata tf::SmallVectorTemplateBasedata tf::SmallVectorTemplateBasedestroy_range - tf::SmallVectorTemplateBasedifference_type + tf::SmallVectorTemplateBasedifference_type tf::SmallVectorTemplateBaseempty tf::SmallVectorTemplateBaseend tf::SmallVectorTemplateBaseend @@ -255,31 +275,31 @@ tf::SmallVectorTemplateBasegrow_pod tf::SmallVectorTemplateBasegrow_pod tf::SmallVectorTemplateBaseisSmall - tf::SmallVectorTemplateBaseiterator + tf::SmallVectorTemplateBaseiterator tf::SmallVectorTemplateBasemax_size tf::SmallVectorTemplateBaseoperator[] tf::SmallVectorTemplateBaseoperator[] - tf::SmallVectorTemplateBasepointer + tf::SmallVectorTemplateBasepointer tf::SmallVectorTemplateBasepop_back tf::SmallVectorTemplateBasepush_back tf::SmallVectorTemplateBasepush_back tf::SmallVectorTemplateBaserbegin tf::SmallVectorTemplateBaserbegin - tf::SmallVectorTemplateBasereference + tf::SmallVectorTemplateBasereference tf::SmallVectorTemplateBaserend tf::SmallVectorTemplateBaserend tf::SmallVectorTemplateBaseresetToSmall - tf::SmallVectorTemplateBasereverse_iterator + tf::SmallVectorTemplateBasereverse_iterator tf::SmallVectorTemplateBasesetEnd tf::SmallVectorTemplateBasesize tf::SmallVectorTemplateBasesize_in_bytes - tf::SmallVectorTemplateBasesize_type + tf::SmallVectorTemplateBasesize_type tf::SmallVectorTemplateBaseSmallVectorBase tf::SmallVectorTemplateBaseSmallVectorTemplateBase tf::SmallVectorTemplateBaseSmallVectorTemplateCommon tf::SmallVectorTemplateBaseuninitialized_copy tf::SmallVectorTemplateBaseuninitialized_move - tf::SmallVectorTemplateBasevalue_type + tf::SmallVectorTemplateBasevalue_type diff --git a/docs/xml/classtf_1_1SmallVectorTemplateBase_3_01T_00_01true_01_4.xml b/docs/xml/classtf_1_1SmallVectorTemplateBase_3_01T_00_01true_01_4.xml index 6f07d9383..a1fdc6a8c 100644 --- a/docs/xml/classtf_1_1SmallVectorTemplateBase_3_01T_00_01true_01_4.xml +++ b/docs/xml/classtf_1_1SmallVectorTemplateBase_3_01T_00_01true_01_4.xml @@ -1,5 +1,5 @@ - + tf::SmallVectorTemplateBase< T, true > tf::SmallVectorTemplateCommon< T > @@ -8,12 +8,13 @@ typename T - + tf::SmallVectorTemplateBase< T, true >::SmallVectorTemplateBase (size_t Size) SmallVectorTemplateBase + tf::SmallVectorTemplateBase< T, true >::SmallVectorTemplateBase size_t Size @@ -24,13 +25,14 @@ - + void void tf::SmallVectorTemplateBase< T, true >::grow (size_t MinSize=0) grow + tf::SmallVectorTemplateBase< T, true >::grow size_t MinSize @@ -43,15 +45,16 @@ - + - - + + void static void tf::SmallVectorTemplateBase< T, true >::destroy_range (T *, T *) destroy_range + tf::SmallVectorTemplateBase< T, true >::destroy_range T * @@ -64,7 +67,7 @@ - + @@ -79,6 +82,7 @@ static void tf::SmallVectorTemplateBase< T, true >::uninitialized_move (It1 I, It1 E, It2 Dest) uninitialized_move + tf::SmallVectorTemplateBase< T, true >::uninitialized_move It1 I @@ -98,7 +102,7 @@ - + @@ -113,6 +117,7 @@ static void tf::SmallVectorTemplateBase< T, true >::uninitialized_copy 
(It1 I, It1 E, It2 Dest) uninitialized_copy + tf::SmallVectorTemplateBase< T, true >::uninitialized_copy It1 I @@ -132,7 +137,7 @@ - + @@ -147,6 +152,7 @@ static void tf::SmallVectorTemplateBase< T, true >::uninitialized_copy (T1 *I, T1 *E, T2 *Dest, typename std::enable_if< std::is_same< typename std::remove_const< T1 >::type, T2 >::value >::type *=nullptr) uninitialized_copy + tf::SmallVectorTemplateBase< T, true >::uninitialized_copy T1 * I @@ -160,7 +166,7 @@ Dest - typename std::enable_if< std::is_same< typename std::remove_const< T1 >::type, T2 >::value >::type * + typename std::enable_if< std::is_same< typename std::remove_const< T1 >::type, T2 >::value >::type * nullptr @@ -170,15 +176,16 @@ - + - - + + void void tf::SmallVectorTemplateBase< T, true >::push_back (const T &Elt) push_back + tf::SmallVectorTemplateBase< T, true >::push_back const T & Elt @@ -189,32 +196,28 @@ - + void void tf::SmallVectorTemplateBase< T, true >::pop_back () pop_back + tf::SmallVectorTemplateBase< T, true >::pop_back - + - + - - - - - @@ -223,13 +226,13 @@ - - + + @@ -238,8 +241,13 @@ + + + + + - + tf::SmallVectorTemplateBase< T, true >back tf::SmallVectorTemplateBase< T, true >back @@ -251,14 +259,14 @@ tf::SmallVectorTemplateBase< T, true >capacity_ptr tf::SmallVectorTemplateBase< T, true >capacity_ptr tf::SmallVectorTemplateBase< T, true >CapacityX - tf::SmallVectorTemplateBase< T, true >const_iterator - tf::SmallVectorTemplateBase< T, true >const_pointer - tf::SmallVectorTemplateBase< T, true >const_reference - tf::SmallVectorTemplateBase< T, true >const_reverse_iterator + tf::SmallVectorTemplateBase< T, true >const_iterator + tf::SmallVectorTemplateBase< T, true >const_pointer + tf::SmallVectorTemplateBase< T, true >const_reference + tf::SmallVectorTemplateBase< T, true >const_reverse_iterator tf::SmallVectorTemplateBase< T, true >data tf::SmallVectorTemplateBase< T, true >data tf::SmallVectorTemplateBase< T, true >destroy_range - tf::SmallVectorTemplateBase< T, true >difference_type + tf::SmallVectorTemplateBase< T, true >difference_type tf::SmallVectorTemplateBase< T, true >empty tf::SmallVectorTemplateBase< T, true >end tf::SmallVectorTemplateBase< T, true >end @@ -269,31 +277,31 @@ tf::SmallVectorTemplateBase< T, true >grow_pod tf::SmallVectorTemplateBase< T, true >grow_pod tf::SmallVectorTemplateBase< T, true >isSmall - tf::SmallVectorTemplateBase< T, true >iterator + tf::SmallVectorTemplateBase< T, true >iterator tf::SmallVectorTemplateBase< T, true >max_size tf::SmallVectorTemplateBase< T, true >operator[] tf::SmallVectorTemplateBase< T, true >operator[] - tf::SmallVectorTemplateBase< T, true >pointer + tf::SmallVectorTemplateBase< T, true >pointer tf::SmallVectorTemplateBase< T, true >pop_back tf::SmallVectorTemplateBase< T, true >push_back tf::SmallVectorTemplateBase< T, true >rbegin tf::SmallVectorTemplateBase< T, true >rbegin - tf::SmallVectorTemplateBase< T, true >reference + tf::SmallVectorTemplateBase< T, true >reference tf::SmallVectorTemplateBase< T, true >rend tf::SmallVectorTemplateBase< T, true >rend tf::SmallVectorTemplateBase< T, true >resetToSmall - tf::SmallVectorTemplateBase< T, true >reverse_iterator + tf::SmallVectorTemplateBase< T, true >reverse_iterator tf::SmallVectorTemplateBase< T, true >setEnd tf::SmallVectorTemplateBase< T, true >size tf::SmallVectorTemplateBase< T, true >size_in_bytes - tf::SmallVectorTemplateBase< T, true >size_type + tf::SmallVectorTemplateBase< T, true >size_type tf::SmallVectorTemplateBase< T, true >SmallVectorBase tf::SmallVectorTemplateBase< 
T, true >SmallVectorTemplateBase tf::SmallVectorTemplateBase< T, true >SmallVectorTemplateCommon tf::SmallVectorTemplateBase< T, true >uninitialized_copy tf::SmallVectorTemplateBase< T, true >uninitialized_copy tf::SmallVectorTemplateBase< T, true >uninitialized_move - tf::SmallVectorTemplateBase< T, true >value_type + tf::SmallVectorTemplateBase< T, true >value_type diff --git a/docs/xml/classtf_1_1SmallVectorTemplateCommon.xml b/docs/xml/classtf_1_1SmallVectorTemplateCommon.xml index cb72540cd..0852794b1 100644 --- a/docs/xml/classtf_1_1SmallVectorTemplateCommon.xml +++ b/docs/xml/classtf_1_1SmallVectorTemplateCommon.xml @@ -1,9 +1,11 @@ - + tf::SmallVectorTemplateCommon tf::SmallVectorBase tf::SmallVectorTemplateBase< T, IsPod< T >::value > + tf::SmallVectorTemplateBase< Node *, IsPod< Node * >::value > + tf::SmallVectorTemplateBase< tf::Semaphore *, IsPod< tf::Semaphore * >::value > tf::SmallVectorTemplateCommon::AlignedUnionType @@ -14,167 +16,179 @@ void - - + + AlignedUnionType< T > - typedef AlignedUnionType<T> tf::SmallVectorTemplateCommon< T, typename >::U + AlignedUnionType<T> tf::SmallVectorTemplateCommon< T, typename >::U U + tf::SmallVectorTemplateCommon::U - + - - - + + + size_t - typedef size_t tf::SmallVectorTemplateCommon< T, typename >::size_type + size_t tf::SmallVectorTemplateCommon< T, typename >::size_type size_type + tf::SmallVectorTemplateCommon::size_type - + - + ptrdiff_t - typedef ptrdiff_t tf::SmallVectorTemplateCommon< T, typename >::difference_type + ptrdiff_t tf::SmallVectorTemplateCommon< T, typename >::difference_type difference_type + tf::SmallVectorTemplateCommon::difference_type - + - + T - typedef T tf::SmallVectorTemplateCommon< T, typename >::value_type + T tf::SmallVectorTemplateCommon< T, typename >::value_type value_type + tf::SmallVectorTemplateCommon::value_type - + - + T * - typedef T* tf::SmallVectorTemplateCommon< T, typename >::iterator + T* tf::SmallVectorTemplateCommon< T, typename >::iterator iterator + tf::SmallVectorTemplateCommon::iterator - + - + const T * - typedef const T* tf::SmallVectorTemplateCommon< T, typename >::const_iterator + const T* tf::SmallVectorTemplateCommon< T, typename >::const_iterator const_iterator + tf::SmallVectorTemplateCommon::const_iterator - + - - std::reverse_iterator< const_iterator > - typedef std::reverse_iterator<const_iterator> tf::SmallVectorTemplateCommon< T, typename >::const_reverse_iterator + + std::reverse_iterator< const_iterator > + std::reverse_iterator<const_iterator> tf::SmallVectorTemplateCommon< T, typename >::const_reverse_iterator const_reverse_iterator + tf::SmallVectorTemplateCommon::const_reverse_iterator - + - - std::reverse_iterator< iterator > - typedef std::reverse_iterator<iterator> tf::SmallVectorTemplateCommon< T, typename >::reverse_iterator + + std::reverse_iterator< iterator > + std::reverse_iterator<iterator> tf::SmallVectorTemplateCommon< T, typename >::reverse_iterator reverse_iterator + tf::SmallVectorTemplateCommon::reverse_iterator - + - + T & - typedef T& tf::SmallVectorTemplateCommon< T, typename >::reference + T& tf::SmallVectorTemplateCommon< T, typename >::reference reference + tf::SmallVectorTemplateCommon::reference - + - + const T & - typedef const T& tf::SmallVectorTemplateCommon< T, typename >::const_reference + const T& tf::SmallVectorTemplateCommon< T, typename >::const_reference const_reference + tf::SmallVectorTemplateCommon::const_reference - + - + T * - typedef T* tf::SmallVectorTemplateCommon< T, typename >::pointer + T* tf::SmallVectorTemplateCommon< 
T, typename >::pointer pointer + tf::SmallVectorTemplateCommon::pointer - + - + const T * - typedef const T* tf::SmallVectorTemplateCommon< T, typename >::const_pointer + const T* tf::SmallVectorTemplateCommon< T, typename >::const_pointer const_pointer + tf::SmallVectorTemplateCommon::const_pointer - + - - + + @@ -188,6 +202,7 @@ friend struct SmallVectorStorage SmallVectorStorage + tf::SmallVectorTemplateCommon::SmallVectorStorage SmallVectorStorage @@ -197,30 +212,32 @@ - + - - + + U U tf::SmallVectorTemplateCommon< T, typename >::FirstEl FirstEl + tf::SmallVectorTemplateCommon::FirstEl - + - - + + tf::SmallVectorTemplateCommon< T, typename >::SmallVectorTemplateCommon (size_t Size) SmallVectorTemplateCommon + tf::SmallVectorTemplateCommon::SmallVectorTemplateCommon size_t Size @@ -231,13 +248,14 @@ - + void void tf::SmallVectorTemplateCommon< T, typename >::grow_pod (size_t MinSizeInBytes, size_t TSize) grow_pod + tf::SmallVectorTemplateCommon::grow_pod size_t MinSizeInBytes @@ -252,13 +270,14 @@ - + bool bool tf::SmallVectorTemplateCommon< T, typename >::isSmall () const isSmall + tf::SmallVectorTemplateCommon::isSmall @@ -266,13 +285,14 @@ - + void void tf::SmallVectorTemplateCommon< T, typename >::resetToSmall () resetToSmall + tf::SmallVectorTemplateCommon::resetToSmall Put this vector in a state of being small. @@ -280,13 +300,14 @@ - + void void tf::SmallVectorTemplateCommon< T, typename >::setEnd (T *P) setEnd + tf::SmallVectorTemplateCommon::setEnd T * P @@ -297,171 +318,184 @@ - + iterator iterator tf::SmallVectorTemplateCommon< T, typename >::capacity_ptr () capacity_ptr + tf::SmallVectorTemplateCommon::capacity_ptr - + const_iterator const_iterator tf::SmallVectorTemplateCommon< T, typename >::capacity_ptr () const capacity_ptr + tf::SmallVectorTemplateCommon::capacity_ptr - + - - + + iterator iterator tf::SmallVectorTemplateCommon< T, typename >::begin () begin + tf::SmallVectorTemplateCommon::begin - + const_iterator const_iterator tf::SmallVectorTemplateCommon< T, typename >::begin () const begin + tf::SmallVectorTemplateCommon::begin - + iterator iterator tf::SmallVectorTemplateCommon< T, typename >::end () end + tf::SmallVectorTemplateCommon::end - + const_iterator const_iterator tf::SmallVectorTemplateCommon< T, typename >::end () const end + tf::SmallVectorTemplateCommon::end - + - reverse_iterator + reverse_iterator reverse_iterator tf::SmallVectorTemplateCommon< T, typename >::rbegin () rbegin + tf::SmallVectorTemplateCommon::rbegin - + - const_reverse_iterator + const_reverse_iterator const_reverse_iterator tf::SmallVectorTemplateCommon< T, typename >::rbegin () const rbegin + tf::SmallVectorTemplateCommon::rbegin - + - reverse_iterator + reverse_iterator reverse_iterator tf::SmallVectorTemplateCommon< T, typename >::rend () rend + tf::SmallVectorTemplateCommon::rend - + - const_reverse_iterator + const_reverse_iterator const_reverse_iterator tf::SmallVectorTemplateCommon< T, typename >::rend () const rend + tf::SmallVectorTemplateCommon::rend - + size_type size_type tf::SmallVectorTemplateCommon< T, typename >::size () const size + tf::SmallVectorTemplateCommon::size - + size_type size_type tf::SmallVectorTemplateCommon< T, typename >::max_size () const max_size + tf::SmallVectorTemplateCommon::max_size - + size_t size_t tf::SmallVectorTemplateCommon< T, typename >::capacity () const capacity + tf::SmallVectorTemplateCommon::capacity Return the total number of elements in the currently allocated buffer. 
@@ -469,13 +503,14 @@ - + pointer pointer tf::SmallVectorTemplateCommon< T, typename >::data () data + tf::SmallVectorTemplateCommon::data Return a pointer to the vector's buffer, even if empty(). @@ -483,13 +518,14 @@ - + const_pointer const_pointer tf::SmallVectorTemplateCommon< T, typename >::data () const data + tf::SmallVectorTemplateCommon::data Return a pointer to the vector's buffer, even if empty(). @@ -497,13 +533,14 @@ - + reference reference tf::SmallVectorTemplateCommon< T, typename >::operator[] (size_type idx) operator[] + tf::SmallVectorTemplateCommon::operator[] size_type idx @@ -514,13 +551,14 @@ - + const_reference const_reference tf::SmallVectorTemplateCommon< T, typename >::operator[] (size_type idx) const operator[] + tf::SmallVectorTemplateCommon::operator[] size_type idx @@ -531,69 +569,76 @@ - + reference reference tf::SmallVectorTemplateCommon< T, typename >::front () front + tf::SmallVectorTemplateCommon::front - + const_reference const_reference tf::SmallVectorTemplateCommon< T, typename >::front () const front + tf::SmallVectorTemplateCommon::front - + reference reference tf::SmallVectorTemplateCommon< T, typename >::back () back + tf::SmallVectorTemplateCommon::back - + const_reference const_reference tf::SmallVectorTemplateCommon< T, typename >::back () const back + tf::SmallVectorTemplateCommon::back - + - + - - - + + + + + + @@ -601,21 +646,28 @@ - - + + + + - - + + + + + + + - + tf::SmallVectorTemplateCommonback tf::SmallVectorTemplateCommonback @@ -627,13 +679,13 @@ tf::SmallVectorTemplateCommoncapacity_ptr tf::SmallVectorTemplateCommoncapacity_ptr tf::SmallVectorTemplateCommonCapacityX - tf::SmallVectorTemplateCommonconst_iterator - tf::SmallVectorTemplateCommonconst_pointer - tf::SmallVectorTemplateCommonconst_reference - tf::SmallVectorTemplateCommonconst_reverse_iterator + tf::SmallVectorTemplateCommonconst_iterator + tf::SmallVectorTemplateCommonconst_pointer + tf::SmallVectorTemplateCommonconst_reference + tf::SmallVectorTemplateCommonconst_reverse_iterator tf::SmallVectorTemplateCommondata tf::SmallVectorTemplateCommondata - tf::SmallVectorTemplateCommondifference_type + tf::SmallVectorTemplateCommondifference_type tf::SmallVectorTemplateCommonempty tf::SmallVectorTemplateCommonend tf::SmallVectorTemplateCommonend @@ -644,27 +696,27 @@ tf::SmallVectorTemplateCommongrow_pod tf::SmallVectorTemplateCommongrow_pod tf::SmallVectorTemplateCommonisSmall - tf::SmallVectorTemplateCommoniterator + tf::SmallVectorTemplateCommoniterator tf::SmallVectorTemplateCommonmax_size tf::SmallVectorTemplateCommonoperator[] tf::SmallVectorTemplateCommonoperator[] - tf::SmallVectorTemplateCommonpointer + tf::SmallVectorTemplateCommonpointer tf::SmallVectorTemplateCommonrbegin tf::SmallVectorTemplateCommonrbegin - tf::SmallVectorTemplateCommonreference + tf::SmallVectorTemplateCommonreference tf::SmallVectorTemplateCommonrend tf::SmallVectorTemplateCommonrend tf::SmallVectorTemplateCommonresetToSmall - tf::SmallVectorTemplateCommonreverse_iterator + tf::SmallVectorTemplateCommonreverse_iterator tf::SmallVectorTemplateCommonsetEnd tf::SmallVectorTemplateCommonsize tf::SmallVectorTemplateCommonsize_in_bytes - tf::SmallVectorTemplateCommonsize_type + tf::SmallVectorTemplateCommonsize_type tf::SmallVectorTemplateCommonSmallVectorBase tf::SmallVectorTemplateCommonSmallVectorStorage tf::SmallVectorTemplateCommonSmallVectorTemplateCommon - tf::SmallVectorTemplateCommonU - tf::SmallVectorTemplateCommonvalue_type + tf::SmallVectorTemplateCommonU + 
tf::SmallVectorTemplateCommonvalue_type diff --git a/docs/xml/classtf_1_1StaticPartitioner.xml b/docs/xml/classtf_1_1StaticPartitioner.xml index 2a958827e..3586ec8f6 100644 --- a/docs/xml/classtf_1_1StaticPartitioner.xml +++ b/docs/xml/classtf_1_1StaticPartitioner.xml @@ -1,21 +1,22 @@ - + tf::StaticPartitioner tf::PartitionerBase< DefaultClosureWrapper > - partitioner.hpp + taskflow/algorithm/partitioner.hpp typename C - DefaultClosureWrapper + DefaultClosureWrapper - + - constexpr PartitionerType + PartitionerType static constexpr PartitionerType tf::StaticPartitioner< C >::type () type + tf::StaticPartitioner::type queries the partition type (static) @@ -23,15 +24,16 @@ - + - - + + tf::StaticPartitioner< C >::StaticPartitioner ()=default StaticPartitioner + tf::StaticPartitioner::StaticPartitioner default constructor @@ -39,13 +41,14 @@ - + tf::StaticPartitioner< C >::StaticPartitioner (size_t sz) StaticPartitioner + tf::StaticPartitioner::StaticPartitioner size_t sz @@ -57,13 +60,14 @@ - + tf::StaticPartitioner< C >::StaticPartitioner (size_t sz, C &&closure) StaticPartitioner + tf::StaticPartitioner::StaticPartitioner size_t sz @@ -79,13 +83,14 @@ - + size_t size_t tf::StaticPartitioner< C >::adjusted_chunk_size (size_t N, size_t W, size_t w) const adjusted_chunk_size + tf::StaticPartitioner::adjusted_chunk_size size_t N @@ -106,10 +111,10 @@ - + - - + + @@ -124,6 +129,7 @@ void tf::StaticPartitioner< C >::loop (size_t N, size_t W, size_t curr_b, size_t chunk_size, F &&func) loop + tf::StaticPartitioner::loop size_t N @@ -150,7 +156,7 @@ - + @@ -166,6 +172,7 @@ void tf::StaticPartitioner< C >::loop_until (size_t N, size_t W, size_t curr_b, size_t chunk_size, F &&func) loop_until + tf::StaticPartitioner::loop_until size_t N @@ -192,9 +199,9 @@ - + - + class to construct a static partitioner for scheduling parallel algorithms @@ -204,23 +211,23 @@ C -closure wrapper type (default tf::DefaultClosureWrapper) +closure wrapper type (default tf::DefaultClosureWrapper) The partitioner divides iterations into chunks and distributes chunks to workers in order. If the chunk size is not specified (default 0), the partitioner resorts to a chunk size that equally distributes iterations into workers. -std::vector<int>data={1,2,3,4,5,6,7,8,9,10} +std::vector<int>data={1,2,3,4,5,6,7,8,9,10} taskflow.for_each( data.begin(),data.end(),[](inti){},StaticPartitioner(0) ); executor.run(taskflow).run(); In addition to partition size, the application can specify a closure wrapper for a static partitioner. A closure wrapper allows the application to wrapper a partitioned task (i.e., closure) with a custom function object that performs additional tasks. 
For example: -std::atomic<int>count=0; +std::atomic<int>count=0; tf::Taskflowtaskflow; taskflow.for_each_index(0,100,1, [](){ -printf("%d\n",i); +printf("%d\n",i); }, tf::StaticPartitioner(0,[](auto&&closure){ //dosomethingbeforeinvokingthepartitionedtask @@ -237,12 +244,6 @@ The partitioner divides iterations into chunks and distributes chunks to workers - - - - - - @@ -252,14 +253,18 @@ The partitioner divides iterations into chunks and distributes chunks to workers - - + + + + + + @@ -268,20 +273,32 @@ The partitioner divides iterations into chunks and distributes chunks to workers + + _closure_wrapper + + + + + + + - + tf::StaticPartitioner_chunk_size tf::StaticPartitioner_closure_wrapper tf::StaticPartitioneradjusted_chunk_size tf::StaticPartitionerchunk_size tf::StaticPartitionerchunk_size - tf::StaticPartitionerclosure_wrapper + tf::StaticPartitionerclosure_wrapper + tf::StaticPartitionerclosure_wrapper tf::StaticPartitionerclosure_wrapper tf::StaticPartitionerclosure_wrapper_type + tf::StaticPartitioneris_default_wrapper_v tf::StaticPartitionerloop tf::StaticPartitionerloop_until + tf::StaticPartitioneroperator() tf::StaticPartitionerPartitionerBase tf::StaticPartitionerPartitionerBase tf::StaticPartitionerPartitionerBase diff --git a/docs/xml/classtf_1_1Subflow.xml b/docs/xml/classtf_1_1Subflow.xml index c733c0974..dc699edd2 100644 --- a/docs/xml/classtf_1_1Subflow.xml +++ b/docs/xml/classtf_1_1Subflow.xml @@ -1,16 +1,16 @@ - + tf::Subflow tf::FlowBuilder - tf::Runtime - flow_builder.hpp - + taskflow/core/flow_builder.hpp + class friend class Executor Executor + tf::Subflow::Executor Executor @@ -20,13 +20,14 @@ - + class friend class FlowBuilder FlowBuilder + tf::Subflow::FlowBuilder FlowBuilder @@ -36,47 +37,60 @@ - + - - class - friend class Runtime + + + + Executor & + Executor& tf::Subflow::_executor - Runtime - - Runtime - + _executor + tf::Subflow::_executor - + - - - - bool - bool tf::Subflow::_joinable + + Worker & + Worker& tf::Subflow::_worker + + _worker + tf::Subflow::_worker + + + + + + + + + + Node * + Node* tf::Subflow::_parent - _joinable - {true} + _parent + tf::Subflow::_parent - + - - - + + + void void tf::Subflow::join () join + tf::Subflow::join enables the subflow to join its parent task @@ -91,87 +105,116 @@ - + - - void - void tf::Subflow::detach - () - detach + + bool + bool tf::Subflow::joinable + () const noexcept + joinable + tf::Subflow::joinable -enables the subflow to detach from its parent task +queries if the subflow is joinable -Performs an immediate action to detach the subflow. Once the subflow is detached, it is considered finished and you may not modify the subflow anymore. +This member function queries if the subflow is joinable. When a subflow is joined, it becomes not joinable. taskflow.emplace([](tf::Subflow&sf){ sf.emplace([](){}); -sf.detach(); +std::cout<<sf.joinable()<<'\n';//true +sf.join(); +std::cout<<sf.joinable()<<'\n';//false }); - -Only the worker that spawns this subflow can detach it. 
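The hunk above removes tf::Subflow::detach; the retain interface documented in the next hunk is the closest replacement for keeping a subflow's graph alive after execution. A hedged sketch of that workflow follows, assuming the retain semantics described below (illustrative only, not code from this diff).

taskflow.emplace([](tf::Subflow& sf){
  sf.emplace([](){ std::cout << "B1\n"; });
  sf.retain(true);           // keep the subflow graph after it is joined
  sf.join();                 // explicit join; a joinable subflow is otherwise
                             // joined implicitly when the task returns
});
executor.run(taskflow).wait();
taskflow.dump(std::cout);    // the retained subflow appears in the dump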
+ - + - + + Executor & + Executor & tf::Subflow::executor + () noexcept + executor + tf::Subflow::executor + +acquires the associated executor + + + + + + + + + Graph & + Graph & tf::Subflow::graph + () + graph + tf::Subflow::graph + +acquires the associated graph + + + + + + + + void - void tf::Subflow::reset - (bool clear_graph=true) - reset + void tf::Subflow::retain + (bool flag) noexcept + retain + tf::Subflow::retain bool - clear_graph - true + flag -resets the subflow to a joinable state +specifies whether to keep the subflow after it is joined -clear_graph +flag -specifies whether to clear the associated graph (default true) +true to retain the subflow after it is joined; false to discard it -Clears the underlying task graph depending on the given variable clear_graph (default true) and then updates the subflow to a joinable state. +By default, the runtime automatically clears a spawned subflow once it is joined. Setting this flag to true allows the application to retain the subflow's structure for post-execution analysis like visualization. - + - + bool - bool tf::Subflow::joinable - () const noexcept - joinable + bool tf::Subflow::retain + () const + retain + tf::Subflow::retain -queries if the subflow is joinable +queries if the subflow will be retained after it is joined -This member function queries if the subflow is joinable. When a subflow is joined or detached, it becomes not joinable. -taskflow.emplace([](tf::Subflow&sf){ -sf.emplace([](){}); -std::cout<<sf.joinable()<<'\n';//true -sf.join(); -std::cout<<sf.joinable()<<'\n';//false -}); - +true if the subflow will be retained after it is joined; false otherwise + + - + - - + + tf::Subflow::Subflow (Executor &, Worker &, Node *, Graph &) Subflow + tf::Subflow::Subflow Executor & executor @@ -194,14 +237,63 @@ Clears the underlying task graph depending on the given variable - + + + + + tf::Subflow::Subflow + ()=delete + Subflow + tf::Subflow::Subflow + + + + + + + + + + + tf::Subflow::Subflow + (const Subflow &)=delete + Subflow + tf::Subflow::Subflow + + const Subflow & + + + + + + + + - + + + tf::Subflow::Subflow + (Subflow &&)=delete + Subflow + tf::Subflow::Subflow + + Subflow && + + + + + + + + + + class to construct a subflow graph from the execution of a dynamic task -tf::Subflow is a derived class from tf::Runtime with a specialized mechanism to manage the execution of a child graph. By default, a subflow automatically joins its parent node. You may explicitly join or detach a subflow by calling tf::Subflow::join or tf::Subflow::detach, respectively. The following example creates a taskflow graph that spawns a subflow from the execution of task B, and the subflow contains three tasks, B1, B2, and B3, where B3 runs after B1 and B2. +tf::Subflow is spawned from the execution of a task to dynamically manage a child graph that may depend on runtime variables. You can explicitly join a subflow by calling tf::Subflow::join, respectively. By default, the Taskflow runtime will implicitly join a subflow it is is joinable. +The following example creates a taskflow graph that spawns a subflow from the execution of task B, and the subflow contains three tasks, B1, B2, and B3, where B3 runs after B1 and B2. 
//createthreestatictasks tf::TaskA=taskflow.emplace([](){}).name("A"); tf::TaskC=taskflow.emplace([](){}).name("C"); @@ -227,20 +319,17 @@ Clears the underlying task graph depending on the given variable tf::FlowBuilder - - - - - - + + + @@ -248,51 +337,46 @@ Clears the underlying task graph depending on the given variable _graph - - - - + + - - - + + tf::Subflow_executor tf::Subflow_graph - tf::Subflow_joinable - tf::Subflowasync - tf::Subflowasync + tf::Subflow_parent + tf::Subflow_worker tf::Subflowcomposed_of - tf::Subflowcorun - tf::Subflowcorun_all - tf::Subflowcorun_until - tf::Subflowdetach + tf::Subflowemplace tf::Subflowemplace tf::Subflowemplace tf::Subflowemplace tf::Subflowemplace tf::Subflowemplace tf::Subflowerase - tf::Subflowexclusive_scan - tf::Subflowexecutor + tf::Subflowexclusive_scan tf::SubflowExecutor + tf::Subflowexecutor tf::Subflowfind_if tf::Subflowfind_if_not tf::SubflowFlowBuilder tf::SubflowFlowBuilder tf::Subflowfor_each + tf::Subflowfor_each_by_index tf::Subflowfor_each_index - tf::Subflowinclusive_scan - tf::Subflowinclusive_scan + tf::Subflowgraph + tf::Subflowinclusive_scan + tf::Subflowinclusive_scan tf::Subflowjoin tf::Subflowjoinable tf::Subflowlinearize @@ -301,25 +385,22 @@ Clears the underlying task graph depending on the given variable tf::Subflowmin_element tf::Subflowplaceholder tf::Subflowreduce - tf::Subflowreset - tf::SubflowRuntime - tf::Subflowschedule - tf::Subflowsilent_async - tf::Subflowsilent_async - tf::Subflowsilent_async_unchecked - tf::Subflowsilent_async_unchecked + tf::Subflowreduce_by_index + tf::Subflowretain + tf::Subflowretain tf::Subflowsort tf::Subflowsort tf::SubflowSubflow + tf::SubflowSubflow + tf::SubflowSubflow + tf::SubflowSubflow tf::Subflowtransform tf::Subflowtransform - tf::Subflowtransform_exclusive_scan - tf::Subflowtransform_inclusive_scan - tf::Subflowtransform_inclusive_scan + tf::Subflowtransform_exclusive_scan + tf::Subflowtransform_inclusive_scan + tf::Subflowtransform_inclusive_scan tf::Subflowtransform_reduce tf::Subflowtransform_reduce - tf::Subflowworker - tf::Subflow~Runtime diff --git a/docs/xml/classtf_1_1TFProfManager.xml b/docs/xml/classtf_1_1TFProfManager.xml index 6e72663a3..8f34e3410 100644 --- a/docs/xml/classtf_1_1TFProfManager.xml +++ b/docs/xml/classtf_1_1TFProfManager.xml @@ -1,13 +1,14 @@ - + tf::TFProfManager - + class friend class Executor Executor + tf::TFProfManager::Executor Executor @@ -17,69 +18,74 @@ - + - - + + - const std::string + const std::string const std::string tf::TFProfManager::_fpath _fpath + tf::TFProfManager::_fpath - + - std::mutex + std::mutex std::mutex tf::TFProfManager::_mutex _mutex + tf::TFProfManager::_mutex - + - std::vector< std::shared_ptr< TFProfObserver > > + std::vector< std::shared_ptr< TFProfObserver > > std::vector<std::shared_ptr<TFProfObserver> > tf::TFProfManager::_observers _observers + tf::TFProfManager::_observers - + - - + + tf::TFProfManager::~TFProfManager () ~TFProfManager + tf::TFProfManager::~TFProfManager - + tf::TFProfManager::TFProfManager (const TFProfManager &)=delete TFProfManager + tf::TFProfManager::TFProfManager const TFProfManager & @@ -89,13 +95,14 @@ - + - + TFProfManager & - TFProfManager& tf::TFProfManager::operator= + TFProfManager & tf::TFProfManager::operator= (const TFProfManager &)=delete operator= + tf::TFProfManager::operator= const TFProfManager & @@ -105,15 +112,16 @@ - + void void tf::TFProfManager::dump (std::ostream &ostream) const dump + tf::TFProfManager::dump - std::ostream & + std::ostream & ostream @@ -122,45 
+130,48 @@ - + - - + + TFProfManager & TFProfManager & tf::TFProfManager::get () get + tf::TFProfManager::get - + - - + + tf::TFProfManager::TFProfManager () TFProfManager + tf::TFProfManager::TFProfManager - + void void tf::TFProfManager::_manage (std::shared_ptr< TFProfObserver > observer) _manage + tf::TFProfManager::_manage - std::shared_ptr< TFProfObserver > + std::shared_ptr< TFProfObserver > observer @@ -169,14 +180,14 @@ - + - + - + tf::TFProfManager_fpath tf::TFProfManager_manage @@ -185,7 +196,7 @@ tf::TFProfManagerdump tf::TFProfManagerExecutor tf::TFProfManagerget - tf::TFProfManageroperator= + tf::TFProfManageroperator= tf::TFProfManagerTFProfManager tf::TFProfManagerTFProfManager tf::TFProfManager~TFProfManager diff --git a/docs/xml/classtf_1_1TFProfObserver.xml b/docs/xml/classtf_1_1TFProfObserver.xml index b66d4c97c..248c878b6 100644 --- a/docs/xml/classtf_1_1TFProfObserver.xml +++ b/docs/xml/classtf_1_1TFProfObserver.xml @@ -1,18 +1,19 @@ - + tf::TFProfObserver tf::ObserverInterface - observer.hpp + taskflow/core/observer.hpp tf::TFProfObserver::Summary tf::TFProfObserver::TaskSummary tf::TFProfObserver::WorkerSummary - + class friend class Executor Executor + tf::TFProfObserver::Executor Executor @@ -22,13 +23,14 @@ - + class friend class TFProfManager TFProfManager + tf::TFProfObserver::TFProfManager TFProfManager @@ -38,45 +40,48 @@ - + - - + + Timeline Timeline tf::TFProfObserver::_timeline _timeline + tf::TFProfObserver::_timeline - + - std::vector< std::stack< observer_stamp_t > > + std::vector< std::stack< observer_stamp_t > > std::vector<std::stack<observer_stamp_t> > tf::TFProfObserver::_stacks _stacks + tf::TFProfObserver::_stacks - + - - + + void void tf::TFProfObserver::dump (std::ostream &ostream) const dump + tf::TFProfObserver::dump - std::ostream & + std::ostream & ostream @@ -86,13 +91,14 @@ - + - std::string + std::string std::string tf::TFProfObserver::dump () const dump + tf::TFProfObserver::dump dumps the timelines into a JSON string @@ -100,15 +106,16 @@ - + void void tf::TFProfObserver::summary (std::ostream &ostream) const summary + tf::TFProfObserver::summary - std::ostream & + std::ostream & ostream @@ -118,13 +125,14 @@ - + - std::string + std::string std::string tf::TFProfObserver::summary () const summary + tf::TFProfObserver::summary returns the summary report in a string @@ -132,13 +140,14 @@ - + void void tf::TFProfObserver::clear () clear + tf::TFProfObserver::clear clears the timeline data @@ -146,13 +155,14 @@ - + size_t size_t tf::TFProfObserver::num_tasks () const num_tasks + tf::TFProfObserver::num_tasks queries the number of tasks observed @@ -160,13 +170,14 @@ - + size_t size_t tf::TFProfObserver::num_workers () const num_workers + tf::TFProfObserver::num_workers queries the number of observed workers @@ -174,15 +185,16 @@ - + - - + + void void tf::TFProfObserver::set_up (size_t num_workers) override final set_up + tf::TFProfObserver::set_up set_up size_t @@ -205,13 +217,14 @@ - + void void tf::TFProfObserver::on_entry (WorkerView, TaskView) override final on_entry + tf::TFProfObserver::on_entry on_entry WorkerView @@ -246,13 +259,14 @@ - + void void tf::TFProfObserver::on_exit (WorkerView, TaskView) override final on_exit + tf::TFProfObserver::on_exit on_exit WorkerView @@ -287,9 +301,9 @@ - + - + class to create an observer based on the built-in taskflow profiler format @@ -302,13 +316,13 @@ //... 
//createacustomobserver -std::shared_ptr<tf::TFProfObserver>observer=executor.make_observer<tf::TFProfObserver>(); +std::shared_ptr<tf::TFProfObserver>observer=executor.make_observer<tf::TFProfObserver>(); //runthetaskflow executor.run(taskflow).wait(); //dumpthethreadactivitiestoTaskflowProfilerformat. -observer->dump(std::cout); +observer->dump(std::cout); @@ -335,7 +349,7 @@ - + tf::TFProfObserver_stacks tf::TFProfObserver_timeline diff --git a/docs/xml/classtf_1_1Task.xml b/docs/xml/classtf_1_1Task.xml index 4fcfdcf36..4e28fd33c 100644 --- a/docs/xml/classtf_1_1Task.xml +++ b/docs/xml/classtf_1_1Task.xml @@ -1,14 +1,15 @@ - + tf::Task - task.hpp - + taskflow/core/task.hpp + class friend class FlowBuilder FlowBuilder + tf::Task::FlowBuilder FlowBuilder @@ -18,13 +19,14 @@ - + class friend class Runtime Runtime + tf::Task::Runtime Runtime @@ -34,13 +36,14 @@ - + class friend class Taskflow Taskflow + tf::Task::Taskflow Taskflow @@ -50,13 +53,14 @@ - + class friend class TaskView TaskView + tf::Task::TaskView TaskView @@ -66,13 +70,14 @@ - + class friend class Executor Executor + tf::Task::Executor Executor @@ -82,15 +87,16 @@ - + - - + + Node * Node* tf::Task::_node _node + tf::Task::_node {nullptr} @@ -98,29 +104,32 @@ - + - - + + tf::Task::Task ()=default Task + tf::Task::Task constructs an empty task +An empty task is not associated with any node in a taskflow. - + tf::Task::Task (const Task &other) Task + tf::Task::Task const Task & other @@ -129,160 +138,286 @@ constructs the task with the copy of the other task + + +other + + +the other task to copy + + + +tf::Taskflowtaskflow; +tf::TaskA=taskflow.emplace([](){std::cout<<"TaskA\n";}); +tf::TaskB(A); +assert(B==A);//Now,BandArefertothesameunderlyingnode + - + - + Task & Task & tf::Task::operator= - (const Task &) + (const Task &other) operator= + tf::Task::operator= const Task & - rhs + other replaces the contents with a copy of the other task + + +other + + +the other task to copy + + + +tf::TaskA=taskflow.emplace([](){std::cout<<"A\n";}); +tf::TaskB; +B=A;//BnowreferstothesamenodeasA + - + Task & Task & tf::Task::operator= (std::nullptr_t) operator= + tf::Task::operator= - std::nullptr_t + std::nullptr_t ptr replaces the contents with a null pointer +tf::TaskA=taskflow.emplace([](){std::cout<<"A\n";}); +A=nullptr;//Anolongerreferstoanynode + - + bool bool tf::Task::operator== (const Task &rhs) const operator== + tf::Task::operator== const Task & rhs -compares if two tasks are associated with the same graph node +compares if two tasks are associated with the same taskflow node + + +rhs + + +the other task to compare with + + + +true if both tasks refer to the same node; false otherwise + +tf::TaskA=taskflow.emplace([](){std::cout<<"A\n";}); +tf::TaskB=A; +assert(A==B);//AandBrefertothesamenode + - + bool bool tf::Task::operator!= (const Task &rhs) const operator!= + tf::Task::operator!= const Task & rhs -compares if two tasks are not associated with the same graph node +compares if two tasks are not associated with the same taskflow node + + +rhs + + +the other task to compare with + + + +true if they refer to different nodes; false otherwise + +tf::TaskA=taskflow.emplace([](){std::cout<<"A\n";}); +tf::TaskB=taskflow.emplace([](){std::cout<<"B\n";}); +assert(A!=B);//AandBrefertodifferentnodes + - + - const std::string & + const std::string & const std::string & tf::Task::name () const name + tf::Task::name queries the name of the task +the name of the task as a constant string reference + +tf::Tasktask=taskflow.emplace([](){}); 
+task.name("MyTask"); +std::cout<<"Taskname:"<<task.name()<<std::endl; + - + size_t size_t tf::Task::num_successors () const num_successors + tf::Task::num_successors queries the number of successors of the task +the number of successor tasks. + +tf::TaskA=taskflow.emplace([](){}); +tf::TaskB=taskflow.emplace([](){}); +A.precede(B);//BisasuccessorofA +std::cout<<"Ahas"<<A.num_successors()<<"successor(s)."<<std::endl; + - + - + size_t - size_t tf::Task::num_dependents + size_t tf::Task::num_predecessors () const - num_dependents + num_predecessors + tf::Task::num_predecessors queries the number of predecessors of the task +the number of predecessor tasks + +tf::TaskA=taskflow.emplace([](){}); +tf::TaskB=taskflow.emplace([](){}); +A.precede(B);//AisapredecessorofB +std::cout<<"Bhas"<<B.num_predecessors()<<"predecessor(s)."<<std::endl; + - + - + size_t - size_t tf::Task::num_strong_dependents + size_t tf::Task::num_strong_dependencies () const - num_strong_dependents + num_strong_dependencies + tf::Task::num_strong_dependencies -queries the number of strong dependents of the task +queries the number of strong dependencies of the task +the number of strong dependencies to this task + +A strong dependency is a preceding link from one non-condition task to another task. For instance, task cond below has one strong dependency, while tasks yes and no each have one weak dependency. +auto[init,cond,yes,no]=taskflow.emplace( +[](){}, +[](){return0;}, +[](){std::cout<<"yes\n";}, +[](){std::cout<<"no\n";} +); +cond.succeed(init) +.precede(yes,no);//executesyesifcondreturns0 +//executesnoifcondreturns1 + + + +To understand how Taskflow schedule tasks under strong and weak dependencies, please refer to Conditional Tasking. + + - + - + size_t - size_t tf::Task::num_weak_dependents + size_t tf::Task::num_weak_dependencies () const - num_weak_dependents + num_weak_dependencies + tf::Task::num_weak_dependencies -queries the number of weak dependents of the task +queries the number of weak dependencies of the task +the number of weak dependencies to this task + +A weak dependency is a preceding link from one condition task to another task. For instance, task cond below has one strong dependency, while tasks yes and no each have one weak dependency. +auto[init,cond,yes,no]=taskflow.emplace( +[](){}, +[](){return0;}, +[](){std::cout<<"yes\n";}, +[](){std::cout<<"no\n";} +); +cond.succeed(init) +.precede(yes,no);//executesyesifcondreturns0 +//executesnoifcondreturns1 + + + +To understand how Taskflow schedule tasks under strong and weak dependencies, please refer to Conditional Tasking. + + - + Task & Task & tf::Task::name (const std::string &name) name + tf::Task::name - const std::string & + const std::string & name @@ -294,17 +429,19 @@ name -a std::string acceptable string +a std::string *this - +tf::Tasktask=taskflow.emplace([](){}).name("foo"); +assert(task.name*)=="foo"); + - + @@ -316,6 +453,7 @@ Task & tf::Task::work (C &&callable) work + tf::Task::work C && callable @@ -344,11 +482,17 @@ *this - +A tf::Task is polymorphic. Once created, you can reassign it to a different callable of a different task type using tf::Task::work. 
For example, the code below creates a static task and reworks it to a subflow task: +tf::Tasktask=taskflow.emplace([](){}).name("statictask"); +task.work([](tf::Subflow&sf){ +tf::Taskstask1=sf.emplace([](){}); +tf::Taskstask2=sf.emplace([](){}); +}).name("subflowtask"); + - + @@ -360,6 +504,7 @@ Task & tf::Task::composed_of (T &object) composed_of + tf::Task::composed_of T & object @@ -388,11 +533,14 @@ *this - +The example below creates a module task from a taskflow: +task.composed_of(taskflow); + +To understand how Taskflow schedules a module task including how to create a schedulable graph, pleas refer to Create a Custom Composable Graph. - + @@ -406,6 +554,7 @@ Task & tf::Task::precede (Ts &&... tasks) precede + tf::Task::precede Ts &&... tasks @@ -434,11 +583,17 @@ *this - +The example below creates a taskflow of two tasks, where task1 runs before task2. +auto[task1,task2]=taskflow.emplace( +[](){std::cout<<"task1\n";}, +[](){std::cout<<"task2\n";} +); +task1.precede(task2); + - + @@ -452,6 +607,7 @@ Task & tf::Task::succeed (Ts &&... tasks) succeed + tf::Task::succeed Ts &&... tasks @@ -480,53 +636,248 @@ *this - +The example below creates a taskflow of two tasks, where task1 runs before task2. +auto[task1,task2]=taskflow.emplace( +[](){std::cout<<"task1\n";}, +[](){std::cout<<"task2\n";} +); +task2.succeed(task1); + + + + + + + + + + typename... + Ts + Ts + + + Task & + Task & tf::Task::remove_predecessors + (Ts &&... tasks) + remove_predecessors + tf::Task::remove_predecessors + + Ts &&... + tasks + + +removes predecessor links from other tasks to this + + + + +Ts + + +parameter pack + + + + + +tasks + + +one or multiple tasks + + + +*this + +This method removes the dependency links where the given tasks are predecessors of this task (i.e., tasks -> this). It ensures both sides of the dependency are updated to maintain graph consistency. +tf::TaskA=taskflow.emplace([](){}); +tf::TaskB=taskflow.emplace([](){}); +tf::TaskC=taskflow.emplace([](){}); +//createalinearchainoftasks,A->B->C +B.succeed(A) +.precede(C); +assert(B.num_successors()==1&&C.num_predecessors()==1); + +//removeCfromB'ssuccessorlist +C.remove_predecessors(B); +assert(B.num_successors()==0&&C.num_predecessors()==0); + - + + + + + + typename... + Ts + Ts + + + Task & + Task & tf::Task::remove_successors + (Ts &&... tasks) + remove_successors + tf::Task::remove_successors + + Ts &&... + tasks + + +removes successor links from this to other tasks + + + + +Ts + + +parameter pack + + + + + +tasks + + +one or multiple tasks + + + +*this + +This method removes the dependency links where this task is a predecessor of the given tasks (i.e., this -> tasks). It ensures both sides of the dependency are updated to maintain graph consistency. +tf::TaskA=taskflow.emplace([](){}); +tf::TaskB=taskflow.emplace([](){}); +tf::TaskC=taskflow.emplace([](){}); +//createalinearchainoftasks,A->B->C +B.succeed(A) +.precede(C); +assert(B.num_successors()==1&&C.num_predecessors()==1); + +//removeCfromB'ssuccessorlist +B.remove_successors(C); +assert(B.num_successors()==0&&C.num_predecessors()==0); + + + + + Task & Task & tf::Task::release (Semaphore &semaphore) release + tf::Task::release Semaphore & semaphore -makes the task release this semaphore +makes the task release the given semaphore +To know more about tf::Semaphore, please refer to Limit the Maximum Concurrency. 
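A minimal sketch of the acquire/release pairing this enables, assuming a tf::Semaphore constructed with a maximum concurrency value of 1:

tf::Semaphore semaphore(1);  // at most one task may hold it at a time
tf::Task A = taskflow.emplace([](){ std::cout << "A\n"; });
tf::Task B = taskflow.emplace([](){ std::cout << "B\n"; });
// A and B have no dependency, yet they never run concurrently:
// each acquires the semaphore before it starts and releases it when it finishes
A.acquire(semaphore).release(semaphore);
B.acquire(semaphore).release(semaphore);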
+ + - + + + + + + typename I + + + Task & + Task & tf::Task::release + (I first, I last) + release + tf::Task::release + + I + first + + + I + last + + +makes the task release the given range of semaphores + + +To know more about tf::Semaphore, please refer to Limit the Maximum Concurrency. + + + + + + Task & Task & tf::Task::acquire (Semaphore &semaphore) acquire + tf::Task::acquire Semaphore & semaphore -makes the task acquire this semaphore +makes the task acquire the given semaphore +To know more about tf::Semaphore, please refer to Limit the Maximum Concurrency. + + - + + + + + + typename I + + + Task & + Task & tf::Task::acquire + (I first, I last) + acquire + tf::Task::acquire + + I + first + + + I + last + + +makes the task acquire the given range of semaphores + + +To know more about tf::Semaphore, please refer to Limit the Maximum Concurrency. + + + + + + Task & Task & tf::Task::data (void *data) data + tf::Task::data void * data @@ -540,88 +891,62 @@ data -pointer to user data +pointer to user data -The following example shows how to attach user data to a task and run the task iteratively while changing the data value: +*this + +The following example shows how to attach a user data to a task and retrieve it during the execution of the task. tf::Executorexecutor; tf::Taskflowtaskflow("attachdatatoatask"); -intdata; +intdata;//userdata -//createataskandattachitthedata +//createataskandattachitauserdata autoA=taskflow.placeholder(); A.data(&data).work([A](){ autod=*static_cast<int*>(A.data()); -std::cout<<"datais"<<d<<std::endl; +std::cout<<"datais"<<d<<std::endl; }); //runthetaskflowiterativelywithchangingdata for(data=0;data<10;data++){ executor.run(taskflow).wait(); } - -*this - - + - - - - Task & - Task & tf::Task::priority - (TaskPriority p) - priority - - TaskPriority - p - - -assigns a priority value to the task - - -A priority value can be one of the following three levels, tf::TaskPriority::HIGH (numerically equivalent to 0), tf::TaskPriority::NORMAL (numerically equivalent to 1), and tf::TaskPriority::LOW (numerically equivalent to 2). The smaller the priority value, the higher the priority. - - - - - - - TaskPriority - TaskPriority tf::Task::priority - () const - priority - -queries the priority value of the task - - - - - - + void void tf::Task::reset () reset + tf::Task::reset resets the task handle to null +Resetting a task will remove its associated taskflow node and make it an empty task. +tf::Tasktask=taskflow.emplace([](){}); +assert(task.empty()==false); +task.reset(); +assert(task.empty()==true); + - + void void tf::Task::reset_work () reset_work + tf::Task::reset_work resets the associated work to a placeholder @@ -629,35 +954,51 @@ The following example shows how to attach user data to a task and run the task i - + bool bool tf::Task::empty () const empty + tf::Task::empty -queries if the task handle points to a task node +queries if the task handle is associated with a taskflow node +true if the task is not associated with any taskflow node; otherwise false + +tf::Tasktask; +assert(task.empty()==true); + +Note that an empty task is not equal to a placeholder task. A placeholder task is created from tf::Taskflow::placeholder and is associated with a taskflow node, but its work is not assigned yet. 
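To make the distinction concrete, a short sketch contrasting an empty task with a placeholder task:

tf::Task empty;                                 // default-constructed: refers to no node
tf::Task placeholder = taskflow.placeholder();  // refers to a node, but has no work yet
assert(empty.empty() == true);
assert(placeholder.empty() == false);
assert(placeholder.has_work() == false);        // placeholder still awaits a callable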
- + bool bool tf::Task::has_work () const has_work + tf::Task::has_work queries if the task has a work assigned +true if the task has a work assigned (not placeholder); otherwise false + +tf::Tasktask=taskflow.placeholder(); +assert(task.has_work()==false); +//assignastatictaskcallabletothistask +task.work([](){}); +assert(task.has_work()==true); + - + @@ -669,6 +1010,7 @@ The following example shows how to attach user data to a task and run the task i void tf::Task::for_each_successor (V &&visitor) const for_each_successor + tf::Task::for_each_successor V && visitor @@ -677,101 +1019,250 @@ The following example shows how to attach user data to a task and run the task i applies an visitor callable to each successor of the task + + +V + + +a callable type (function, lambda, etc.) that accepts a tf::Task handle + + + + + +visitor + + +visitor to apply to each subflow task + + + +This method allows you to traverse and inspect successor tasks of this task. For instance, the code below iterates the two successors (task2 and task3) of task1. +auto[task1,task2,task3]=taskflow.emplace( +[](){std::cout<<"task1\n";}, +[](){std::cout<<"task2\n";}, +[](){std::cout<<"task3\n";} +}); +task1.precede(task2,task3); +task1.for_each_successor([](tf::Tasksuccessor){ +std::cout<<"successortask"<<successor.name()<<'\n'; +}); + - + - + typename V void - void tf::Task::for_each_dependent + void tf::Task::for_each_predecessor (V &&visitor) const - for_each_dependent + for_each_predecessor + tf::Task::for_each_predecessor V && visitor -applies an visitor callable to each dependents of the task +applies an visitor callable to each predecessor of the task + + +V + + +a callable type (function, lambda, etc.) that accepts a tf::Task handle + + + + + +visitor + + +visitor to apply to each predecessor task + + + +This method allows you to traverse and inspect predecessor tasks of this task. For instance, the code below iterates the two predecessors (task2 and task3) of task1. +auto[task1,task2,task3]=taskflow.emplace( +[](){std::cout<<"task1\n";}, +[](){std::cout<<"task2\n";}, +[](){std::cout<<"task3\n";} +}); +task1.succeed(task2,task3); +task1.for_each_predecessor([](tf::Taskpredecessor){ +std::cout<<"predecessortask"<<predecessor.name()<<'\n'; +}); + + + + + + + + + + typename V + + + void + void tf::Task::for_each_subflow_task + (V &&visitor) const + for_each_subflow_task + tf::Task::for_each_subflow_task + + V && + visitor + + +applies an visitor callable to each subflow task + + + + +V + + +a callable type (function, lambda, etc.) that accepts a tf::Task handle + + + + + +visitor + + +visitor to apply to each subflow task + + + +This method allows you to traverse and inspect tasks within a subflow. It only applies to a subflow task. +tf::Tasktask=taskflow.emplace([](tf::Subflow&sf){ +tf::Taskstask1=sf.emplace([](){}).name("stask1"); +tf::Taskstask2=sf.emplace([](){}).name("stask2"); +}); +//Iteratetasksinthesubflowandprinteachsubflowtask. +task.for_each_subflow_task([](tf::Taskstask){ +std::cout<<"subflowtask"<<stask.name()<<'\n'; +}); + - + size_t size_t tf::Task::hash_value () const hash_value + tf::Task::hash_value obtains a hash value of the underlying node +the hash value of the underlying node + +The method returns std::hash on the underlying node pointer. 
+tf::Tasktask=taskflow.emplace([](){}); +std::cout<<"hashvalueoftaskis"<<task.hash_value()<<'\n'; + - + TaskType TaskType tf::Task::type () const type + tf::Task::type returns the task type +A task can be one of the types defined in tf::TaskType and can be printed in a human-readable form using tf::to_string. +autotask=taskflow.emplace([](){}).name("task"); +std::cout<<task.name()<<"type=["<<tf::to_string(task.type())<<"]\n"; + - + void void tf::Task::dump (std::ostream &ostream) const dump + tf::Task::dump - std::ostream & + std::ostream & ostream dumps the task through an output stream +The method dumps the name and the type of this task through std::cout. +task.dump(std::cout); + - + void * void * tf::Task::data () const data + tf::Task::data queries pointer to user data +C-styled pointer to the attached user data by tf::Task::data(void* data) + +The following example shows how to attach a user data to a task and retrieve it during the execution of the task. +tf::Executorexecutor; +tf::Taskflowtaskflow("attachdatatoatask"); + +intdata;//userdata + +//createataskandattachitauserdata +autoA=taskflow.placeholder(); +A.data(&data).work([A](){ +autod=*static_cast<int*>(A.data()); +std::cout<<"datais"<<d<<std::endl; +}); + +//runthetaskflowiterativelywithchangingdata +for(data=0;data<10;data++){ +executor.run(taskflow).wait(); +} + - + - - + + tf::Task::Task (Node *) Task + tf::Task::Task Node * node @@ -782,19 +1273,55 @@ The following example shows how to attach user data to a task and run the task i - + - + -class to create a task handle over a node in a taskflow graph +class to create a task handle over a taskflow node -A task is a wrapper over a node in a taskflow graph. It provides a set of methods for users to access and modify the attributes of the associated node in the taskflow graph. A task is very lightweight object (i.e., only storing a node pointer) that can be trivially copied around, and it does not own the lifetime of the associated node. +A task points to a node in a taskflow graph and provides a set of methods for users to access and modify attributes of the associated node, such as dependencies, callable, names, and so on. A task is a very lightweight object (i.e., it only stores a node pointer) and can be trivially copied around. +//createtwotaskswithonedependency +autotask1=taskflow.emplace([](){}).name("task1"); +autotask2=taskflow.emplace([](){}).name("task2"); +task1.precede(task2); + +//dumpthetaskinformationthroughstd::cout +task1.dump(std::cout); + +A task created from a taskflow can be one of the following types: +tf::TaskType::STATIC - Static Tasking +tf::TaskType::CONDITION - Conditional Tasking +tf::TaskType::RUNTIME - Runtime Tasking +tf::TaskType::SUBFLOW - Subflow Tasking +tf::TaskType::MODULE - Composable Tasking + + +tf::Tasktask1=taskflow.emplace([](){}).name("statictask"); +tf::Tasktask2=taskflow.emplace([](){return3;}).name("conditiontask"); +tf::Tasktask3=taskflow.emplace([](tf::Runtime&){}).name("runtimetask"); +tf::Tasktask4=taskflow.emplace([](tf::Subflow&sf){ +tf::Taskstask1=sf.emplace([](){}); +tf::Taskstask2=sf.emplace([](){}); +}).name("subflowtask"); +tf::Tasktask5=taskflow.composed_of(taskflow2).name("moduletask"); + +A tf::Task is polymorphic. Once created, you can assign a different task type to it using tf::Task::work. 
For example, the code below creates a static task and then reworks it to a subflow task: +tf::Tasktask=taskflow.emplace([](){}).name("statictask"); +task.work([](tf::Subflow&sf){ +tf::Taskstask1=sf.emplace([](){}); +tf::Taskstask2=sf.emplace([](){}); +}).name("subflowtask"); + +tf::Task does not own the lifetime of the associated node. Accessing the attributes of the associated node after the taskflow has been destroyed can result in undefined behavior. + + - + tf::Task_node tf::Taskacquire + tf::Taskacquire tf::Taskcomposed_of tf::Taskdata tf::Taskdata @@ -802,24 +1329,26 @@ The following example shows how to attach user data to a task and run the task i tf::Taskempty tf::TaskExecutor tf::TaskFlowBuilder - tf::Taskfor_each_dependent + tf::Taskfor_each_predecessor + tf::Taskfor_each_subflow_task tf::Taskfor_each_successor tf::Taskhas_work tf::Taskhash_value tf::Taskname tf::Taskname - tf::Tasknum_dependents - tf::Tasknum_strong_dependents + tf::Tasknum_predecessors + tf::Tasknum_strong_dependencies tf::Tasknum_successors - tf::Tasknum_weak_dependents + tf::Tasknum_weak_dependencies tf::Taskoperator!= - tf::Taskoperator= + tf::Taskoperator= tf::Taskoperator= tf::Taskoperator== tf::Taskprecede - tf::Taskpriority - tf::Taskpriority tf::Taskrelease + tf::Taskrelease + tf::Taskremove_predecessors + tf::Taskremove_successors tf::Taskreset tf::Taskreset_work tf::TaskRuntime diff --git a/docs/xml/classtf_1_1TaskParams.xml b/docs/xml/classtf_1_1TaskParams.xml new file mode 100644 index 000000000..7e3a009f0 --- /dev/null +++ b/docs/xml/classtf_1_1TaskParams.xml @@ -0,0 +1,62 @@ + + + + tf::TaskParams + taskflow/core/graph.hpp + + + std::string + std::string tf::TaskParams::name + + name + tf::TaskParams::name + +name of the task + + + + + + + + + void * + void* tf::TaskParams::data + + data + tf::TaskParams::data + {nullptr} + +C-styled pointer to user data. 
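As a hypothetical usage sketch (only the two members documented above are assumed here; which emplace/async overloads consume a tf::TaskParams varies across Taskflow versions):

int user_data = 42;
tf::TaskParams params;
params.name = "my-task";   // documented member: name of the task
params.data = &user_data;  // documented member: C-styled pointer to user data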
+ + + + + + + + + +class to create a task parameter object + + + + + + + + + + + + name + + + + + + tf::TaskParamsdata + tf::TaskParamsname + + + diff --git a/docs/xml/classtf_1_1TaskQueue.xml b/docs/xml/classtf_1_1TaskQueue.xml deleted file mode 100644 index 9de6b50ce..000000000 --- a/docs/xml/classtf_1_1TaskQueue.xml +++ /dev/null @@ -1,445 +0,0 @@ - - - - tf::TaskQueue - tsq.hpp - tf::TaskQueue::Array - - - typename T - - - unsigned - TF_MAX_PRIORITY - TF_MAX_PRIORITY - static_cast<unsigned>(TaskPriority::MAX) - - - - - CachelineAligned< std::atomic< int64_t > > - CachelineAligned<std::atomic<int64_t> > tf::TaskQueue< T, TF_MAX_PRIORITY >::_top[TF_MAX_PRIORITY] - [TF_MAX_PRIORITY] - _top - - - - - - - - - - CachelineAligned< std::atomic< int64_t > > - CachelineAligned<std::atomic<int64_t> > tf::TaskQueue< T, TF_MAX_PRIORITY >::_bottom[TF_MAX_PRIORITY] - [TF_MAX_PRIORITY] - _bottom - - - - - - - - - - std::atomic< Array * > - std::atomic<Array*> tf::TaskQueue< T, TF_MAX_PRIORITY >::_array[TF_MAX_PRIORITY] - [TF_MAX_PRIORITY] - _array - - - - - - - - - - std::vector< Array * > - std::vector<Array*> tf::TaskQueue< T, TF_MAX_PRIORITY >::_garbage[TF_MAX_PRIORITY] - [TF_MAX_PRIORITY] - _garbage - - - - - - - - - - - - - tf::TaskQueue< T, TF_MAX_PRIORITY >::TaskQueue - (int64_t capacity=512) - TaskQueue - - int64_t - capacity - 512 - - -constructs the queue with a given capacity - - - - -capacity - - -the capacity of the queue (must be power of 2) - - - - - - - - - - - - tf::TaskQueue< T, TF_MAX_PRIORITY >::~TaskQueue - () - ~TaskQueue - -destructs the queue - - - - - - - - - bool - bool tf::TaskQueue< T, TF_MAX_PRIORITY >::empty - () const noexcept - empty - -queries if the queue is empty at the time of this call - - - - - - - - - bool - bool tf::TaskQueue< T, TF_MAX_PRIORITY >::empty - (unsigned priority) const noexcept - empty - - unsigned - priority - - -queries if the queue is empty at a specific priority value - - - - - - - - - size_t - size_t tf::TaskQueue< T, TF_MAX_PRIORITY >::size - () const noexcept - size - -queries the number of items at the time of this call - - - - - - - - - size_t - size_t tf::TaskQueue< T, TF_MAX_PRIORITY >::size - (unsigned priority) const noexcept - size - - unsigned - priority - - -queries the number of items with the given priority at the time of this call - - - - - - - - - int64_t - int64_t tf::TaskQueue< T, TF_MAX_PRIORITY >::capacity - () const noexcept - capacity - -queries the capacity of the queue - - - - - - - - - int64_t - int64_t tf::TaskQueue< T, TF_MAX_PRIORITY >::capacity - (unsigned priority) const noexcept - capacity - - unsigned - priority - - -queries the capacity of the queue at a specific priority value - - - - - - - - - TF_FORCE_INLINE void - TF_FORCE_INLINE void tf::TaskQueue< T, TF_MAX_PRIORITY >::push - (T item, unsigned priority) - push - - T - item - - - unsigned - priority - - -inserts an item to the queue - - - - -item - - -the item to push to the queue - - - - -priority - - -priority value of the item to push (default = 0) - - - -Only the owner thread can insert an item to the queue. The operation can trigger the queue to resize its capacity if more space is required. - - - - - - - T - T tf::TaskQueue< T, TF_MAX_PRIORITY >::pop - () - pop - -pops out an item from the queue - - -Only the owner thread can pop out an item from the queue. The return can be a nullptr if this operation failed (empty queue). 
- - - - - - - TF_FORCE_INLINE T - TF_FORCE_INLINE T tf::TaskQueue< T, TF_MAX_PRIORITY >::pop - (unsigned priority) - pop - - unsigned - priority - - -pops out an item with a specific priority value from the queue - - - - -priority - - -priority of the item to pop - - - -Only the owner thread can pop out an item from the queue. The return can be a nullptr if this operation failed (empty queue). - - - - - - - T - T tf::TaskQueue< T, TF_MAX_PRIORITY >::steal - () - steal - -steals an item from the queue - - -Any threads can try to steal an item from the queue. The return can be a nullptr if this operation failed (not necessary empty). - - - - - - - T - T tf::TaskQueue< T, TF_MAX_PRIORITY >::steal - (unsigned priority) - steal - - unsigned - priority - - -steals an item with a specific priority value from the queue - - - - -priority - - -priority of the item to steal - - - -Any threads can try to steal an item from the queue. The return can be a nullptr if this operation failed (not necessary empty). - - - - - - - - - TF_NO_INLINE Array * - TF_NO_INLINE TaskQueue< T, TF_MAX_PRIORITY >::Array * tf::TaskQueue< T, TF_MAX_PRIORITY >::resize_array - (Array *a, unsigned p, std::int64_t b, std::int64_t t) - resize_array - - Array * - a - - - unsigned - p - - - std::int64_t - b - - - std::int64_t - t - - - - - - - - - - - -class to create a lock-free unbounded single-producer multiple-consumer queue - - - - -T - - -data type (must be a pointer type) - - - - -TF_MAX_PRIORITY - - -maximum level of the priority - - - -This class implements the work-stealing queue described in the paper, Correct and Efficient Work-Stealing for Weak Memory Models, and extends it to include priority. -Only the queue owner can perform pop and push operations, while others can steal data from the queue simultaneously. Priority starts from zero (highest priority) to the template value TF_MAX_PRIORITY-1 (lowest priority). All operations are associated with priority values to indicate the corresponding queues to which an operation is applied. -The default template value, TF_MAX_PRIORITY, is TaskPriority::MAX which applies only three priority levels to the task queue. -auto[A,B,C,D,E]=taskflow.emplace( -[](){}, -[&](){ -std::cout<<"TaskB:"<<counter++<<'\n';//0 -}, -[&](){ -std::cout<<"TaskC:"<<counter++<<'\n';//2 -}, -[&](){ -std::cout<<"TaskD:"<<counter++<<'\n';//1 -}, -[](){} -); - -A.precede(B,C,D); -E.succeed(B,C,D); - -B.priority(tf::TaskPriority::HIGH); -C.priority(tf::TaskPriority::LOW); -D.priority(tf::TaskPriority::NORMAL); - -executor.run(taskflow).wait(); - -In the above example, we have a task graph of five tasks, A, B, C, D, and E, in which B, C, and D can run in simultaneously when A finishes. Since we only uses one worker thread in the executor, we can deterministically run B first, then D, and C in order of their priority values. 
The output is as follows: -TaskB:0 -TaskD:1 -TaskC:2 - - - - - tf::TaskQueue_array - tf::TaskQueue_bottom - tf::TaskQueue_garbage - tf::TaskQueue_top - tf::TaskQueuecapacity - tf::TaskQueuecapacity - tf::TaskQueueempty - tf::TaskQueueempty - tf::TaskQueuepop - tf::TaskQueuepop - tf::TaskQueuepush - tf::TaskQueueresize_array - tf::TaskQueuesize - tf::TaskQueuesize - tf::TaskQueuesteal - tf::TaskQueuesteal - tf::TaskQueueTaskQueue - tf::TaskQueue~TaskQueue - - - diff --git a/docs/xml/classtf_1_1TaskView.xml b/docs/xml/classtf_1_1TaskView.xml index 6a80e0802..ee6bf5936 100644 --- a/docs/xml/classtf_1_1TaskView.xml +++ b/docs/xml/classtf_1_1TaskView.xml @@ -1,14 +1,15 @@ - + tf::TaskView - task.hpp - + taskflow/core/task.hpp + class friend class Executor Executor + tf::TaskView::Executor Executor @@ -18,30 +19,32 @@ - + - - + + const Node & const Node& tf::TaskView::_node _node + tf::TaskView::_node - + - - + + - const std::string & + const std::string & const std::string & tf::TaskView::name () const name + tf::TaskView::name queries the name of the task @@ -49,13 +52,14 @@ - + size_t size_t tf::TaskView::num_successors () const num_successors + tf::TaskView::num_successors queries the number of successors of the task @@ -63,13 +67,14 @@ - + - + size_t - size_t tf::TaskView::num_dependents + size_t tf::TaskView::num_predecessors () const - num_dependents + num_predecessors + tf::TaskView::num_predecessors queries the number of predecessors of the task @@ -77,35 +82,37 @@ - + - + size_t - size_t tf::TaskView::num_strong_dependents + size_t tf::TaskView::num_strong_dependencies () const - num_strong_dependents + num_strong_dependencies + tf::TaskView::num_strong_dependencies -queries the number of strong dependents of the task +queries the number of strong dependencies of the task - + - + size_t - size_t tf::TaskView::num_weak_dependents + size_t tf::TaskView::num_weak_dependencies () const - num_weak_dependents + num_weak_dependencies + tf::TaskView::num_weak_dependencies -queries the number of weak dependents of the task +queries the number of weak dependencies of the task - + @@ -117,6 +124,7 @@ void tf::TaskView::for_each_successor (V &&visitor) const for_each_successor + tf::TaskView::for_each_successor V && visitor @@ -125,39 +133,79 @@ applies an visitor callable to each successor of the task + + +V + + +a callable type (function, lambda, etc.) that accepts a tf::Task handle + + + + + +visitor + + +visitor to apply to each subflow task + + + +This method allows you to traverse and inspect successor tasks of this task. - + - + typename V void - void tf::TaskView::for_each_dependent + void tf::TaskView::for_each_predecessor (V &&visitor) const - for_each_dependent + for_each_predecessor + tf::TaskView::for_each_predecessor V && visitor -applies an visitor callable to each dependents of the task +applies an visitor callable to each predecessor of the task + + +V + + +a callable type (function, lambda, etc.) that accepts a tf::Task handle + + + + + +visitor + + +visitor to apply to each predecessor task + + + +This method allows you to traverse and inspect predecessor tasks of this task. 
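Because a tf::TaskView is only reachable from an observer, a minimal sketch of this traversal (assuming the visitor receives tf::TaskView objects, mirroring the tf::Task counterpart) looks like:

struct MyObserver : public tf::ObserverInterface {
  void set_up(size_t) override final {}
  void on_entry(tf::WorkerView, tf::TaskView tv) override final {
    // print each predecessor of the task this worker is about to run
    tv.for_each_predecessor([](tf::TaskView p){
      std::cout << "predecessor: " << p.name() << '\n';
    });
  }
  void on_exit(tf::WorkerView, tf::TaskView) override final {}
};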
- + TaskType TaskType tf::TaskView::type () const type + tf::TaskView::type queries the task type @@ -165,13 +213,14 @@ - + size_t size_t tf::TaskView::hash_value () const hash_value + tf::TaskView::hash_value obtains a hash value of the underlying node @@ -179,15 +228,16 @@ - + - - + + tf::TaskView::TaskView (const Node &) TaskView + tf::TaskView::TaskView const Node & node @@ -198,13 +248,14 @@ - + tf::TaskView::TaskView (const TaskView &)=default TaskView + tf::TaskView::TaskView const TaskView & @@ -214,26 +265,26 @@ - + - + class to access task information from the observer interface - + tf::TaskView_node tf::TaskViewExecutor - tf::TaskViewfor_each_dependent + tf::TaskViewfor_each_predecessor tf::TaskViewfor_each_successor tf::TaskViewhash_value tf::TaskViewname - tf::TaskViewnum_dependents - tf::TaskViewnum_strong_dependents + tf::TaskViewnum_predecessors + tf::TaskViewnum_strong_dependencies tf::TaskViewnum_successors - tf::TaskViewnum_weak_dependents + tf::TaskViewnum_weak_dependencies tf::TaskViewTaskView tf::TaskViewTaskView tf::TaskViewtype diff --git a/docs/xml/classtf_1_1Taskflow.xml b/docs/xml/classtf_1_1Taskflow.xml index 12e6dd644..70695fb8e 100644 --- a/docs/xml/classtf_1_1Taskflow.xml +++ b/docs/xml/classtf_1_1Taskflow.xml @@ -1,16 +1,17 @@ - + tf::Taskflow tf::FlowBuilder - taskflow.hpp + taskflow/core/taskflow.hpp tf::Taskflow::Dumper - + class friend class Topology Topology + tf::Taskflow::Topology Topology @@ -20,13 +21,14 @@ - + class friend class Executor Executor + tf::Taskflow::Executor Executor @@ -36,13 +38,14 @@ - + class friend class FlowBuilder FlowBuilder + tf::Taskflow::FlowBuilder FlowBuilder @@ -52,84 +55,107 @@ - + - - + + class + friend class Subflow + + Subflow + tf::Taskflow::Subflow + + Subflow + + + + + + + + + + + - std::mutex + std::mutex std::mutex tf::Taskflow::_mutex _mutex + tf::Taskflow::_mutex - + - std::string + std::string std::string tf::Taskflow::_name _name + tf::Taskflow::_name - + Graph Graph tf::Taskflow::_graph _graph + tf::Taskflow::_graph - + - std::queue< std::shared_ptr< Topology > > + std::queue< std::shared_ptr< Topology > > std::queue<std::shared_ptr<Topology> > tf::Taskflow::_topologies _topologies + tf::Taskflow::_topologies - + - std::optional< std::list< Taskflow >::iterator > + std::optional< std::list< Taskflow >::iterator > std::optional<std::list<Taskflow>::iterator> tf::Taskflow::_satellite _satellite + tf::Taskflow::_satellite - + - - + + tf::Taskflow::Taskflow (const std::string &name) Taskflow + tf::Taskflow::Taskflow - const std::string & + const std::string & name @@ -137,18 +163,19 @@ tf::Taskflowtaskflow("MyTaskflow"); -std::cout<<taskflow.name();//"MyTaskflow" +std::cout<<taskflow.name();//"MyTaskflow" - + tf::Taskflow::Taskflow () Taskflow + tf::Taskflow::Taskflow constructs a taskflow @@ -156,13 +183,14 @@ - + tf::Taskflow::Taskflow (Taskflow &&rhs) Taskflow + tf::Taskflow::Taskflow Taskflow && rhs @@ -172,20 +200,23 @@ Constructing a taskflow taskflow1 from a moved taskflow taskflow2 will migrate the graph of taskflow2 to taskflow1. After the move, taskflow2 will become empty. -tf::Taskflowtaskflow1(std::move(taskflow2)); +tf::Taskflowtaskflow1(std::move(taskflow2)); assert(taskflow2.empty()); -Notice that taskflow2 should not be running in an executor during the move operation, or the behavior is undefined. +You should avoid moving a taskflow that is currently running on an executor. Doing so results in undefined behavior. 
+ + - + Taskflow & Taskflow & tf::Taskflow::operator= (Taskflow &&rhs) operator= + tf::Taskflow::operator= Taskflow && rhs @@ -195,20 +226,23 @@ Moving a taskflow taskflow2 to another taskflow taskflow1 will destroy the existing graph of taskflow1 and assign it the graph of taskflow2. After the move, taskflow2 will become empty. -taskflow1=std::move(taskflow2); +taskflow1=std::move(taskflow2); assert(taskflow2.empty()); -Notice that both taskflow1 and taskflow2 should not be running in an executor during the move operation, or the behavior is undefined. +You should avoid moving a taskflow that is currently running on an executor. Doing so results in undefined behavior. + + - + tf::Taskflow::~Taskflow ()=default ~Taskflow + tf::Taskflow::~Taskflow default destructor @@ -228,124 +262,156 @@ - + void void tf::Taskflow::dump (std::ostream &ostream) const dump + tf::Taskflow::dump - std::ostream & + std::ostream & ostream -dumps the taskflow to a DOT format through a std::ostream target +dumps the taskflow to a DOT format through a std::ostream target -taskflow.dump(std::cout);//dumpthegraphtothestandardoutput +taskflow.dump(std::cout);//dumpthegraphtothestandardoutput -std::ofstreamofs("output.dot"); +std::ofstreamofs("output.dot"); taskflow.dump(ofs);//dumpthegraphtothefileoutput.dot For dynamically spawned tasks, such as module tasks, subflow tasks, and GPU tasks, you need to run the taskflow first before you can dump the entire graph. tf::Taskparent=taskflow.emplace([](tf::Subflowsf){ -sf.emplace([](){std::cout<<"child\n";}); +sf.emplace([](){std::cout<<"child\n";}); }); -taskflow.dump(std::cout);//thisdumpsonlytheparenttasks +taskflow.dump(std::cout);//thisdumpsonlytheparenttasks executor.run(taskflow).wait(); -taskflow.dump(std::cout);//thisdumpsbothparentandchildtasks +taskflow.dump(std::cout);//thisdumpsbothparentandchildtasks - + - std::string + std::string std::string tf::Taskflow::dump () const dump + tf::Taskflow::dump -dumps the taskflow to a std::string of DOT format +dumps the taskflow to a std::string of DOT format This method is similar to tf::Taskflow::dump(std::ostream& ostream), but returning a string of the graph in DOT format. - + size_t size_t tf::Taskflow::num_tasks () const num_tasks + tf::Taskflow::num_tasks -queries the number of tasks +queries the number of tasks in this taskflow +The number of tasks in this taskflow is defined at the first level of hierarchy. Tasks that are created dynamically, such as those via tf::Subflow, are not counted. +tf::Taskflowtaskflow; +automy_task=taskflow.emplace([](){}); +assert(taskflow.num_tasks()==1); + +//reassignmy_tasktoasubflowoffourtasks +my_task.work([](tf::Subflow&sf){ +sf.emplace( +[](){std::cout<<"TaskA\n";}, +[](){std::cout<<"TaskB\n";}, +[](){std::cout<<"TaskC\n";}, +[](){std::cout<<"TaskD\n";} +); +}); + +//subflowtaskswillnotbecounted +assert(taskflow.num_tasks()==1); + - + bool bool tf::Taskflow::empty () const empty + tf::Taskflow::empty -queries the emptiness of the taskflow +queries if this taskflow is empty (has no tasks) -An empty taskflow has no tasks. That is the return of tf::Taskflow::num_tasks is zero. +An empty taskflow has no tasks, i.e., the return of tf::Taskflow::num_tasks is 0. 
+tf::Taskflowtaskflow; +assert(taskflow.empty()==true); +taskflow.emplace([](){}); +assert(taskflow.empty()==false); + - + void void tf::Taskflow::name (const std::string &) name + tf::Taskflow::name - const std::string & + const std::string & name -assigns a name to the taskflow +assigns a new name to this taskflow -taskflow.name("assignanothername"); +taskflow.name("foo"); +assert(taskflow.name()=="foo"); - + - const std::string & + const std::string & const std::string & tf::Taskflow::name () const name + tf::Taskflow::name -queries the name of the taskflow +queries the name of this taskflow -std::cout<<"mynameis:"<<taskflow.name(); +tf::Taskflowtaskflow("foo"); +assert(taskflow.name()=="foo"); - + void void tf::Taskflow::clear () clear + tf::Taskflow::clear clears the associated task dependency graph @@ -354,7 +420,7 @@ - + @@ -366,29 +432,31 @@ void tf::Taskflow::for_each_task (V &&visitor) const for_each_task + tf::Taskflow::for_each_task V && visitor -applies a visitor to each task in the taskflow +applies a visitor to each task in this taskflow A visitor is a callable that takes an argument of type tf::Task and returns nothing. The following example iterates each task in a taskflow and prints its name: taskflow.for_each_task([](tf::Tasktask){ -std::cout<<task.name()<<'\n'; +std::cout<<task.name()<<'\n'; }); - + void void tf::Taskflow::remove_dependency (Task from, Task to) remove_dependency + tf::Taskflow::remove_dependency Task from @@ -418,7 +486,8 @@ -tf::Taskflowtaskflow; +Removing the depencency from task from to task to is equivalent to removing to from the succcessor list of from and removing from from the predecessor list of to. +tf::Taskflowtaskflow; autoa=taskflow.placeholder().name("a"); autob=taskflow.placeholder().name("b"); autoc=taskflow.placeholder().name("c"); @@ -426,43 +495,48 @@ a.precede(b,c,d); assert(a.num_successors()==3); -assert(b.num_dependents()==1); -assert(c.num_dependents()==1); -assert(d.num_dependents()==1); +assert(b.num_predecessors()==1); +assert(c.num_predecessors()==1); +assert(d.num_predecessors()==1); taskflow.remove_dependency(a,b); assert(a.num_successors()==2); -assert(b.num_dependents()==0); - +assert(b.num_predecessors()==0); + +For performance reason, Taskflow does not store the graph using linked lists but vectors with contiguous space. Therefore, removing tasks or dependencies incurs linear time complexity proportional to the size of the graph and the dependency count of a task. + + - + Graph & Graph & tf::Taskflow::graph () graph + tf::Taskflow::graph returns a reference to the underlying graph object -A graph object (of type tf::Graph) is the ultimate storage for the task dependency graph and should only be used as an opaque data structure to interact with the executor (e.g., composition). +A graph object is of type tf::Graph and stores a task dependency graph that can be executed by an tf::Executor. 
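For example, composition reuses the underlying graph of another taskflow (a sketch using the documented tf::Taskflow::composed_of, which wraps that graph as a single module task):

tf::Taskflow f1, f2;
f1.emplace([](){ std::cout << "inside f1\n"; });
tf::Task module = f2.composed_of(f1);  // f2 schedules f1's underlying graph as one task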
- + - - + + void void tf::Taskflow::_dump (std::ostream &, const Graph *) const _dump + tf::Taskflow::_dump - std::ostream & + std::ostream & os @@ -475,15 +549,16 @@ - + void void tf::Taskflow::_dump (std::ostream &, const Node *, Dumper &) const _dump + tf::Taskflow::_dump - std::ostream & + std::ostream & os @@ -500,15 +575,16 @@ - + void void tf::Taskflow::_dump (std::ostream &, const Graph *, Dumper &) const _dump + tf::Taskflow::_dump - std::ostream & + std::ostream & os @@ -525,30 +601,30 @@ - + - + class to create a taskflow object A taskflow manages a task dependency graph where each task represents a callable object (e.g., lambda, std::function) and an edge represents a dependency between two tasks. A task is one of the following types: -static task : the callable constructible from std::function<void()> -subflow task : the callable constructible from std::function<void(tf::Subflow&)> -condition task : the callable constructible from std::function<int()> +static task : the callable constructible from std::function<void()> +subflow task : the callable constructible from std::function<void(tf::Subflow&)> +condition task : the callable constructible from std::function<int()> multi-condition task: the callable constructible from std::function<tf::SmallVector<int>()> -module task : the task constructed from tf::Taskflow::composed_of std::function<void(tf::Runtime&)> +module task : the task constructed from tf::Taskflow::composed_of std::function<void(tf::Runtime&)> Each task is a basic computation unit and is run by one worker thread from an executor. The following example creates a simple taskflow graph of four static tasks, A, B, C, and D, where A runs before B and C and D runs after B and C. tf::Executorexecutor; tf::Taskflowtaskflow("simple"); -tf::TaskA=taskflow.emplace([](){std::cout<<"TaskA\n";}); -tf::TaskB=taskflow.emplace([](){std::cout<<"TaskB\n";}); -tf::TaskC=taskflow.emplace([](){std::cout<<"TaskC\n";}); -tf::TaskD=taskflow.emplace([](){std::cout<<"TaskD\n";}); +tf::TaskA=taskflow.emplace([](){std::cout<<"TaskA\n";}); +tf::TaskB=taskflow.emplace([](){std::cout<<"TaskB\n";}); +tf::TaskC=taskflow.emplace([](){std::cout<<"TaskC\n";}); +tf::TaskD=taskflow.emplace([](){std::cout<<"TaskD\n";}); A.precede(B,C);//ArunsbeforeBandC D.succeed(B,C);//DrunsafterBandC @@ -571,6 +647,9 @@ + + + @@ -581,6 +660,8 @@ + + @@ -589,7 +670,7 @@ - + tf::Taskflow_dump tf::Taskflow_dump @@ -607,21 +688,23 @@ tf::Taskflowemplace tf::Taskflowemplace tf::Taskflowemplace + tf::Taskflowemplace tf::Taskflowemplace tf::Taskflowempty tf::Taskflowerase - tf::Taskflowexclusive_scan + tf::Taskflowexclusive_scan tf::TaskflowExecutor tf::Taskflowfind_if tf::Taskflowfind_if_not tf::TaskflowFlowBuilder tf::TaskflowFlowBuilder tf::Taskflowfor_each + tf::Taskflowfor_each_by_index tf::Taskflowfor_each_index tf::Taskflowfor_each_task tf::Taskflowgraph - tf::Taskflowinclusive_scan - tf::Taskflowinclusive_scan + tf::Taskflowinclusive_scan + tf::Taskflowinclusive_scan tf::Taskflowlinearize tf::Taskflowlinearize tf::Taskflowmax_element @@ -632,18 +715,20 @@ tf::Taskflowoperator= tf::Taskflowplaceholder tf::Taskflowreduce + tf::Taskflowreduce_by_index tf::Taskflowremove_dependency tf::Taskflowsort tf::Taskflowsort + tf::TaskflowSubflow tf::TaskflowTaskflow tf::TaskflowTaskflow tf::TaskflowTaskflow tf::TaskflowTopology tf::Taskflowtransform tf::Taskflowtransform - tf::Taskflowtransform_exclusive_scan - tf::Taskflowtransform_inclusive_scan - tf::Taskflowtransform_inclusive_scan + tf::Taskflowtransform_exclusive_scan + 
tf::Taskflowtransform_inclusive_scan + tf::Taskflowtransform_inclusive_scan tf::Taskflowtransform_reduce tf::Taskflowtransform_reduce tf::Taskflow~Taskflow diff --git a/docs/xml/classtf_1_1UnboundedTaskQueue.xml b/docs/xml/classtf_1_1UnboundedTaskQueue.xml new file mode 100644 index 000000000..7a369674b --- /dev/null +++ b/docs/xml/classtf_1_1UnboundedTaskQueue.xml @@ -0,0 +1,315 @@ + + + + tf::UnboundedTaskQueue + taskflow/core/tsq.hpp + tf::UnboundedTaskQueue::Array + + + typename T + + + + + std::atomic< int64_t > + std::atomic<int64_t> tf::UnboundedTaskQueue< T >::_top + + _top + tf::UnboundedTaskQueue::_top + + + + + + + + + + std::atomic< int64_t > + std::atomic<int64_t> tf::UnboundedTaskQueue< T >::_bottom + + _bottom + tf::UnboundedTaskQueue::_bottom + + + + + + + + + + std::atomic< Array * > + std::atomic<Array*> tf::UnboundedTaskQueue< T >::_array + + _array + tf::UnboundedTaskQueue::_array + + + + + + + + + + std::vector< Array * > + std::vector<Array*> tf::UnboundedTaskQueue< T >::_garbage + + _garbage + tf::UnboundedTaskQueue::_garbage + + + + + + + + + + + + + tf::UnboundedTaskQueue< T >::UnboundedTaskQueue + (int64_t LogSize=TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE) + UnboundedTaskQueue + tf::UnboundedTaskQueue::UnboundedTaskQueue + + int64_t + LogSize + TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE + + +constructs the queue with the given size in the base-2 logarithm + + + + +LogSize + + +the base-2 logarithm of the queue size + + + + + + + + + + + + tf::UnboundedTaskQueue< T >::~UnboundedTaskQueue + () + ~UnboundedTaskQueue + tf::UnboundedTaskQueue::~UnboundedTaskQueue + +destructs the queue + + + + + + + + + bool + bool tf::UnboundedTaskQueue< T >::empty + () const noexcept + empty + tf::UnboundedTaskQueue::empty + +queries if the queue is empty at the time of this call + + + + + + + + + size_t + size_t tf::UnboundedTaskQueue< T >::size + () const noexcept + size + tf::UnboundedTaskQueue::size + +queries the number of items at the time of this call + + + + + + + + + int64_t + int64_t tf::UnboundedTaskQueue< T >::capacity + () const noexcept + capacity + tf::UnboundedTaskQueue::capacity + +queries the capacity of the queue + + + + + + + + + void + void tf::UnboundedTaskQueue< T >::push + (T item) + push + tf::UnboundedTaskQueue::push + + T + item + + +inserts an item to the queue + + + + +item + + +the item to push to the queue + + + +Only the owner thread can insert an item to the queue. The operation can trigger the queue to resize its capacity if more space is required. + + + + + + + T + T tf::UnboundedTaskQueue< T >::pop + () + pop + tf::UnboundedTaskQueue::pop + +pops out an item from the queue + + +Only the owner thread can pop out an item from the queue. The return can be a nullptr if this operation failed (empty queue). + + + + + + + T + T tf::UnboundedTaskQueue< T >::steal + () + steal + tf::UnboundedTaskQueue::steal + +steals an item from the queue + + +Any threads can try to steal an item from the queue. The return can be a nullptr if this operation failed (not necessary empty). + + + + + + + T + T tf::UnboundedTaskQueue< T >::steal_with_hint + (size_t &num_empty_steals) + steal_with_hint + tf::UnboundedTaskQueue::steal_with_hint + + size_t & + num_empty_steals + + +attempts to steal a task with a hint mechanism + + + + +num_empty_steals + + +a reference to a counter tracking consecutive empty steal attempts + + + +This function tries to steal a task from the queue. If the steal attempt is successful, the stolen task is returned. 
Additionally, if the queue is empty, the provided counter num_empty_steals is incremented; otherwise, num_empty_steals is reset to zero. + + + + + + + + + Array * + UnboundedTaskQueue< T >::Array * tf::UnboundedTaskQueue< T >::resize_array + (Array *a, int64_t b, int64_t t) + resize_array + tf::UnboundedTaskQueue::resize_array + + Array * + a + + + int64_t + b + + + int64_t + t + + + + + + + + + + + +class to create a lock-free unbounded work-stealing queue + + + + +T + + +data type (must be a pointer type) + + + + +This class implements the work-stealing queue described in the paper, Correct and Efficient Work-Stealing for Weak Memory Models. +Only the queue owner can perform pop and push operations, while others can steal data from the queue simultaneously. + + + + tf::UnboundedTaskQueue_array + tf::UnboundedTaskQueue_bottom + tf::UnboundedTaskQueue_garbage + tf::UnboundedTaskQueue_top + tf::UnboundedTaskQueuecapacity + tf::UnboundedTaskQueueempty + tf::UnboundedTaskQueuepop + tf::UnboundedTaskQueuepush + tf::UnboundedTaskQueueresize_array + tf::UnboundedTaskQueuesize + tf::UnboundedTaskQueuesteal + tf::UnboundedTaskQueuesteal_with_hint + tf::UnboundedTaskQueueUnboundedTaskQueue + tf::UnboundedTaskQueue~UnboundedTaskQueue + + + diff --git a/docs/xml/classtf_1_1Worker.xml b/docs/xml/classtf_1_1Worker.xml index fcbbaf39f..3eaaf4d68 100644 --- a/docs/xml/classtf_1_1Worker.xml +++ b/docs/xml/classtf_1_1Worker.xml @@ -1,14 +1,15 @@ - + tf::Worker - worker.hpp - + taskflow/core/worker.hpp + class friend class Executor Executor + tf::Worker::Executor Executor @@ -18,13 +19,31 @@ - + + + + class + friend class Runtime + + Runtime + tf::Worker::Runtime + + Runtime + + + + + + + + class friend class WorkerView WorkerView + tf::Worker::WorkerView WorkerView @@ -34,122 +53,132 @@ - + + + + + + std::atomic< bool > + std::atomic<bool> tf::Worker::_done + + _done + tf::Worker::_done + {false} + + + + + + + - - size_t size_t tf::Worker::_id _id + tf::Worker::_id - + size_t size_t tf::Worker::_vtm _vtm + tf::Worker::_vtm - + Executor * Executor* tf::Worker::_executor _executor + tf::Worker::_executor + {nullptr} - + - - std::thread * - std::thread* tf::Worker::_thread + + DefaultNotifier::Waiter * + DefaultNotifier::Waiter* tf::Worker::_waiter - _thread + _waiter + tf::Worker::_waiter - + - - Notifier::Waiter * - Notifier::Waiter* tf::Worker::_waiter + + std::thread + std::thread tf::Worker::_thread - _waiter + _thread + tf::Worker::_thread - + - std::default_random_engine + std::default_random_engine std::default_random_engine tf::Worker::_rdgen _rdgen - { std::random_device{}() } + tf::Worker::_rdgen - + - - TaskQueue< Node * > - TaskQueue<Node*> tf::Worker::_wsq + + BoundedTaskQueue< Node * > + BoundedTaskQueue<Node*> tf::Worker::_wsq _wsq + tf::Worker::_wsq - - - - Node * - Node* tf::Worker::_cache - - _cache - - - - - - - + - - + + size_t size_t tf::Worker::id () const id + tf::Worker::id queries the worker id associated with its parent executor @@ -158,27 +187,14 @@ - - - - std::thread * - std::thread* tf::Worker::thread - () const - thread - -acquires a pointer access to the underlying thread - - - - - - + size_t size_t tf::Worker::queue_size () const queue_size + tf::Worker::queue_size queries the size of the queue (i.e., number of enqueued tasks to run) associated with the worker @@ -186,13 +202,14 @@ - + size_t size_t tf::Worker::queue_capacity () const queue_capacity + tf::Worker::queue_capacity queries the current capacity of the queue @@ -200,30 +217,62 @@ - + + + + Executor * + Executor * 
tf::Worker::executor + () + executor + tf::Worker::executor + +acquires the associated executor + + + + + + + + + std::thread & + std::thread & tf::Worker::thread + () + thread + tf::Worker::thread + +acquires the associated thread + + + + + + - + class to create a worker in an executor -The class is primarily used by the executor to perform work-stealing algorithm. Users can access a worker object and alter its property (e.g., changing the thread affinity in a POSIX-like system) using tf::WorkerInterface. +The class is primarily used by the executor to perform work-stealing algorithm. Users can access a worker object and alter its property (e.g., changing the thread affinity in a POSIX-like system) using tf::WorkerInterface. - + - tf::Worker_cache + tf::Worker_done tf::Worker_executor tf::Worker_id tf::Worker_rdgen - tf::Worker_thread + tf::Worker_thread tf::Worker_vtm - tf::Worker_waiter - tf::Worker_wsq + tf::Worker_waiter + tf::Worker_wsq tf::WorkerExecutor + tf::Workerexecutor tf::Workerid tf::Workerqueue_capacity tf::Workerqueue_size - tf::Workerthread + tf::WorkerRuntime + tf::Workerthread tf::WorkerWorkerView diff --git a/docs/xml/classtf_1_1WorkerInterface.xml b/docs/xml/classtf_1_1WorkerInterface.xml new file mode 100644 index 000000000..56ed38018 --- /dev/null +++ b/docs/xml/classtf_1_1WorkerInterface.xml @@ -0,0 +1,136 @@ + + + + tf::WorkerInterface + taskflow/core/worker.hpp + + + + virtual tf::WorkerInterface::~WorkerInterface + ()=default + ~WorkerInterface + tf::WorkerInterface::~WorkerInterface + +default destructor + + + + + + + + + void + virtual void tf::WorkerInterface::scheduler_prologue + (Worker &worker)=0 + scheduler_prologue + tf::WorkerInterface::scheduler_prologue + + Worker & + worker + + +method to call before a worker enters the scheduling loop + + + + +worker + + +a reference to the worker + + + +The method is called by the constructor of an executor. + + + + + + + void + virtual void tf::WorkerInterface::scheduler_epilogue + (Worker &worker, std::exception_ptr ptr)=0 + scheduler_epilogue + tf::WorkerInterface::scheduler_epilogue + + Worker & + worker + + + std::exception_ptr + ptr + + +method to call after a worker leaves the scheduling loop + + + + +worker + + +a reference to the worker + + + + +ptr + + +an pointer to the exception thrown by the scheduling loop + + + +The method is called by the constructor of an executor. + + + + + + + +class to configure worker behavior in an executor + + +The tf::WorkerInterface class allows users to customize worker properties when creating an executor. Examples include binding workers to specific CPU cores or invoking custom methods before and after a worker enters or leaves the work-stealing loop. When you create an executor, it spawns a set of workers to execute tasks with the following logic: +for(size_tn=0;n<num_workers;n++){ +create_thread([](Worker&worker) + +//pre-processingexecutor-specificworkerinformation +//... + +//entertheschedulingloop +//Here,WorkerInterface::scheduler_prologueisinvoked,ifany +worker_interface->scheduler_prologue(worker); + +try{ +while(1){ +perform_work_stealing_algorithm(); +if(stop){ +break; +} +} +}catch(...){ +exception_ptr=std::current_exception(); +} + +//leavestheschedulingloopandjoinsthisworkerthread +//Here,WorkerInterface::scheduler_epilogueisinvoked,ifany +worker_interface->scheduler_epilogue(worker,exception_ptr); +); +} + +tf::WorkerInterface::scheduler_prologue and tf::WorkerInterface::scheduler_eiplogue are invoked by each worker simultaneously. 
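A minimal sketch of a custom worker interface under this contract (how the object is handed to the executor differs across versions, so registration is omitted):

class PinnedWorkers : public tf::WorkerInterface {
public:
  void scheduler_prologue(tf::Worker& w) override {
    // e.g., bind this worker thread to a CPU core before it starts stealing
    std::printf("worker %zu enters the scheduling loop\n", w.id());
  }
  void scheduler_epilogue(tf::Worker& w, std::exception_ptr ptr) override {
    if(ptr) { /* an exception escaped the loop; inspect or rethrow it here */ }
    std::printf("worker %zu leaves the scheduling loop\n", w.id());
  }
};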
+ + + + + + tf::WorkerInterfacescheduler_epilogue + tf::WorkerInterfacescheduler_prologue + tf::WorkerInterface~WorkerInterface + + + diff --git a/docs/xml/classtf_1_1WorkerView.xml b/docs/xml/classtf_1_1WorkerView.xml index d7bf77b09..1176cb248 100644 --- a/docs/xml/classtf_1_1WorkerView.xml +++ b/docs/xml/classtf_1_1WorkerView.xml @@ -1,13 +1,15 @@ - + tf::WorkerView - + taskflow/core/worker.hpp + class friend class Executor Executor + tf::WorkerView::Executor Executor @@ -17,30 +19,32 @@ - + - - + + const Worker & const Worker& tf::WorkerView::_worker _worker + tf::WorkerView::_worker - + - - + + size_t size_t tf::WorkerView::id () const id + tf::WorkerView::id queries the worker id associated with its parent executor @@ -49,13 +53,14 @@ - + size_t size_t tf::WorkerView::queue_size () const queue_size + tf::WorkerView::queue_size queries the size of the queue (i.e., number of pending tasks to run) associated with the worker @@ -63,13 +68,14 @@ - + size_t size_t tf::WorkerView::queue_capacity () const queue_capacity + tf::WorkerView::queue_capacity queries the current capacity of the queue @@ -77,15 +83,16 @@ - + - - + + tf::WorkerView::WorkerView (const Worker &) WorkerView + tf::WorkerView::WorkerView const Worker & w @@ -96,13 +103,14 @@ - + tf::WorkerView::WorkerView (const WorkerView &)=default WorkerView + tf::WorkerView::WorkerView const WorkerView & @@ -112,16 +120,16 @@ - + - + -class to create an immutable view of a worker in an executor +class to create an immutable view of a worker An executor keeps a set of internal worker threads to run tasks. A worker view provides users an immutable interface to observe when a worker runs a task, and the view object is only accessible from an observer derived from tf::ObserverInterface. - + tf::WorkerView_worker tf::WorkerViewExecutor diff --git a/docs/xml/classtf_1_1cudaDeviceAllocator.xml b/docs/xml/classtf_1_1cudaDeviceAllocator.xml index feeceae5d..7c5fc3625 100644 --- a/docs/xml/classtf_1_1cudaDeviceAllocator.xml +++ b/docs/xml/classtf_1_1cudaDeviceAllocator.xml @@ -1,20 +1,20 @@ - - + + tf::cudaDeviceAllocator - cuda_memory.hpp tf::cudaDeviceAllocator::rebind typename T - + T using tf::cudaDeviceAllocator< T >::value_type = T value_type + tf::cudaDeviceAllocator::value_type element type @@ -22,13 +22,14 @@ - + T * using tf::cudaDeviceAllocator< T >::pointer = T* pointer + tf::cudaDeviceAllocator::pointer element pointer type @@ -36,13 +37,14 @@ - + T & using tf::cudaDeviceAllocator< T >::reference = T& reference + tf::cudaDeviceAllocator::reference element reference type @@ -50,13 +52,14 @@ - + const T * using tf::cudaDeviceAllocator< T >::const_pointer = const T* const_pointer + tf::cudaDeviceAllocator::const_pointer const element pointer type @@ -64,13 +67,14 @@ - + const T & using tf::cudaDeviceAllocator< T >::const_reference = const T& const_reference + tf::cudaDeviceAllocator::const_reference constant element reference type @@ -78,13 +82,14 @@ - + - std::size_t + std::size_t using tf::cudaDeviceAllocator< T >::size_type = std::size_t size_type + tf::cudaDeviceAllocator::size_type size type @@ -92,13 +97,14 @@ - + - std::ptrdiff_t + std::ptrdiff_t using tf::cudaDeviceAllocator< T >::difference_type = std::ptrdiff_t difference_type + tf::cudaDeviceAllocator::difference_type pointer difference type @@ -106,15 +112,16 @@ - + - - + + tf::cudaDeviceAllocator< T >::cudaDeviceAllocator () noexcept cudaDeviceAllocator + tf::cudaDeviceAllocator::cudaDeviceAllocator Constructs a device allocator object. 
@@ -122,15 +129,16 @@ - + tf::cudaDeviceAllocator< T >::cudaDeviceAllocator (const cudaDeviceAllocator &) noexcept cudaDeviceAllocator + tf::cudaDeviceAllocator::cudaDeviceAllocator - const cudaDeviceAllocator & + const cudaDeviceAllocator & Constructs a device allocator object from another device allocator object. @@ -139,7 +147,7 @@ - + @@ -151,8 +159,9 @@ tf::cudaDeviceAllocator< T >::cudaDeviceAllocator (const cudaDeviceAllocator< U > &) noexcept cudaDeviceAllocator + tf::cudaDeviceAllocator::cudaDeviceAllocator - const cudaDeviceAllocator< U > & + const cudaDeviceAllocator< U > & Constructs a device allocator object from another device allocator object with a different element type. @@ -161,13 +170,14 @@ - + tf::cudaDeviceAllocator< T >::~cudaDeviceAllocator () noexcept ~cudaDeviceAllocator + tf::cudaDeviceAllocator::~cudaDeviceAllocator Destructs the device allocator object. @@ -175,15 +185,16 @@ - + - pointer + pointer pointer tf::cudaDeviceAllocator< T >::address (reference x) address + tf::cudaDeviceAllocator::address - reference + reference x @@ -206,15 +217,16 @@ - + - const_pointer + const_pointer const_pointer tf::cudaDeviceAllocator< T >::address (const_reference x) const address + tf::cudaDeviceAllocator::address - const_reference + const_reference x @@ -237,15 +249,16 @@ - + - pointer + pointer pointer tf::cudaDeviceAllocator< T >::allocate (size_type n, const void *=0) allocate + tf::cudaDeviceAllocator::allocate - size_type + size_type n @@ -258,7 +271,7 @@ Attempts to allocate a block of storage with a size large enough to contain n elements of member type, value_type, and returns a pointer to the first element. The storage is aligned appropriately for object of type value_type, but they are not constructed. -The block of storage is allocated using cudaMalloc and throws std::bad_alloc if it cannot allocate the total amount of storage requested. +The block of storage is allocated using cudaMalloc and throws std::bad_alloc if it cannot allocate the total amount of storage requested. n @@ -274,19 +287,20 @@ - + void void tf::cudaDeviceAllocator< T >::deallocate (pointer ptr, size_type) deallocate + tf::cudaDeviceAllocator::deallocate - pointer + pointer ptr - size_type + size_type Releases a block of storage previously allocated with member allocate and not yet released. @@ -306,36 +320,38 @@ - + - size_type + size_type size_type tf::cudaDeviceAllocator< T >::max_size () const noexcept max_size + tf::cudaDeviceAllocator::max_size returns the maximum number of elements that could potentially be allocated by this allocator A call to member allocate with the value returned by this function can still fail to allocate the requested storage. 
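As a brief sketch of the allocate/deallocate pair documented above (the returned pointer refers to device memory and must not be dereferenced on the host):

tf::cudaDeviceAllocator<float> alloc;
float* dptr = alloc.allocate(1024);  // backed by cudaMalloc; throws std::bad_alloc on failure
// ... pass dptr to kernels; never dereference it on the host ...
alloc.deallocate(dptr, 1024);        // releases the block previously obtained from allocate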
-the nubmer of elements that might be allcoated as maximum by a call to member allocate +the number of elements that might be allocated as maximum by a call to member allocate - + void void tf::cudaDeviceAllocator< T >::construct (pointer, const_reference) construct + tf::cudaDeviceAllocator::construct - pointer + pointer - const_reference + const_reference ignored to avoid de-referencing device pointer from the host @@ -344,15 +360,16 @@ - + void void tf::cudaDeviceAllocator< T >::destroy (pointer) destroy + tf::cudaDeviceAllocator::destroy - pointer + pointer ignored to avoid de-referencing device pointer from the host @@ -361,7 +378,7 @@ - + @@ -373,8 +390,9 @@ bool tf::cudaDeviceAllocator< T >::operator== (const cudaDeviceAllocator< U > &) const noexcept operator== + tf::cudaDeviceAllocator::operator== - const cudaDeviceAllocator< U > & + const cudaDeviceAllocator< U > & compares two allocator of different types using == @@ -384,7 +402,7 @@ - + @@ -396,8 +414,9 @@ bool tf::cudaDeviceAllocator< T >::operator!= (const cudaDeviceAllocator< U > &) const noexcept operator!= + tf::cudaDeviceAllocator::operator!= - const cudaDeviceAllocator< U > & + const cudaDeviceAllocator< U > & compares two allocator of different types using != @@ -407,25 +426,14 @@ - + - + -class to create a CUDA device allocator - - -T - - -element type - - - -A cudaDeviceAllocator enables device-specific allocation for standard library containers. It is typically passed as template parameter when declaring standard library containers (e.g. std::vector). - + tf::cudaDeviceAllocatoraddress tf::cudaDeviceAllocatoraddress diff --git a/docs/xml/classtf_1_1cudaDeviceVector.xml b/docs/xml/classtf_1_1cudaDeviceVector.xml index e2def5cf6..8bb12151b 100644 --- a/docs/xml/classtf_1_1cudaDeviceVector.xml +++ b/docs/xml/classtf_1_1cudaDeviceVector.xml @@ -1,5 +1,5 @@ - + tf::cudaDeviceVector @@ -7,12 +7,13 @@ typename T - + T * T* tf::cudaDeviceVector< T >::_data _data + tf::cudaDeviceVector::_data {nullptr} @@ -20,13 +21,14 @@ - + size_t size_t tf::cudaDeviceVector< T >::_N _N + tf::cudaDeviceVector::_N {0} @@ -34,28 +36,30 @@ - + - - + + tf::cudaDeviceVector< T >::cudaDeviceVector ()=default cudaDeviceVector + tf::cudaDeviceVector::cudaDeviceVector - + tf::cudaDeviceVector< T >::cudaDeviceVector (size_t N) cudaDeviceVector + tf::cudaDeviceVector::cudaDeviceVector size_t N @@ -66,13 +70,14 @@ - + tf::cudaDeviceVector< T >::cudaDeviceVector (cudaDeviceVector &&rhs) cudaDeviceVector + tf::cudaDeviceVector::cudaDeviceVector cudaDeviceVector && rhs @@ -83,26 +88,28 @@ - + tf::cudaDeviceVector< T >::~cudaDeviceVector () ~cudaDeviceVector + tf::cudaDeviceVector::~cudaDeviceVector - + - + cudaDeviceVector & - cudaDeviceVector& tf::cudaDeviceVector< T >::operator= + cudaDeviceVector & tf::cudaDeviceVector< T >::operator= (cudaDeviceVector &&rhs) operator= + tf::cudaDeviceVector::operator= cudaDeviceVector && rhs @@ -113,52 +120,56 @@ - + size_t size_t tf::cudaDeviceVector< T >::size () const size + tf::cudaDeviceVector::size - + - + T * - T* tf::cudaDeviceVector< T >::data + T * tf::cudaDeviceVector< T >::data () data + tf::cudaDeviceVector::data - + - + const T * - const T* tf::cudaDeviceVector< T >::data + const T * tf::cudaDeviceVector< T >::data () const data + tf::cudaDeviceVector::data - + tf::cudaDeviceVector< T >::cudaDeviceVector (const cudaDeviceVector &)=delete cudaDeviceVector + tf::cudaDeviceVector::cudaDeviceVector const cudaDeviceVector & @@ -168,13 +179,14 @@ - + - + cudaDeviceVector & - cudaDeviceVector& 
tf::cudaDeviceVector< T >::operator= + cudaDeviceVector & tf::cudaDeviceVector< T >::operator= (const cudaDeviceVector &)=delete operator= + tf::cudaDeviceVector::operator= const cudaDeviceVector & @@ -184,14 +196,14 @@ - + - + - + tf::cudaDeviceVector_data tf::cudaDeviceVector_N @@ -199,10 +211,10 @@ tf::cudaDeviceVectorcudaDeviceVector tf::cudaDeviceVectorcudaDeviceVector tf::cudaDeviceVectorcudaDeviceVector - tf::cudaDeviceVectordata - tf::cudaDeviceVectordata - tf::cudaDeviceVectoroperator= - tf::cudaDeviceVectoroperator= + tf::cudaDeviceVectordata + tf::cudaDeviceVectordata + tf::cudaDeviceVectoroperator= + tf::cudaDeviceVectoroperator= tf::cudaDeviceVectorsize tf::cudaDeviceVector~cudaDeviceVector diff --git a/docs/xml/classtf_1_1cudaEvent.xml b/docs/xml/classtf_1_1cudaEvent.xml deleted file mode 100644 index cf5f9945f..000000000 --- a/docs/xml/classtf_1_1cudaEvent.xml +++ /dev/null @@ -1,94 +0,0 @@ - - - - tf::cudaEvent - cudaObject< cudaEvent_t, cudaEventCreator, cudaEventDeleter > - cuda_stream.hpp - - - - tf::cudaEvent::cudaEvent - (cudaEvent_t event) - cudaEvent - - cudaEvent_t - event - - -constructs an RAII-styled CUDA event object from the given CUDA event - - - - - - - - - - tf::cudaEvent::cudaEvent - ()=default - cudaEvent - -constructs an RAII-styled CUDA event object - - - - - - - - - - tf::cudaEvent::cudaEvent - (unsigned int flag) - cudaEvent - - unsigned int - flag - - -constructs an RAII-styled CUDA event object with the given flag - - - - - - - - - -class to create an RAII-styled wrapper over a native CUDA event - - -A cudaEvent object is an RAII-styled wrapper over a native CUDA event (cudaEvent_t). A cudaEvent object is move-only. - - - - - - - - - - - - - - - - - - - - - - - - - - tf::cudaEventcudaEvent - tf::cudaEventcudaEvent - tf::cudaEventcudaEvent - - - diff --git a/docs/xml/classtf_1_1cudaEventBase.xml b/docs/xml/classtf_1_1cudaEventBase.xml new file mode 100644 index 000000000..49750b9fd --- /dev/null +++ b/docs/xml/classtf_1_1cudaEventBase.xml @@ -0,0 +1,199 @@ + + + + tf::cudaEventBase + std::unique_ptr< std::remove_pointer_t< cudaEvent_t >, Deleter > + taskflow/cuda/cuda_stream.hpp + + + typename Creator + + + typename Deleter + + + + + std::unique_ptr< std::remove_pointer_t< cudaEvent_t >, Deleter > + using tf::cudaEventBase< Creator, Deleter >::base_type = std::unique_ptr<std::remove_pointer_t<cudaEvent_t>, Deleter> + + base_type + tf::cudaEventBase::base_type + +base type for the underlying unique pointer + + +This alias provides a shorthand for the underlying std::unique_ptr type that manages CUDA event resources with an associated deleter. + + + + + + + + + + + typename... + ArgsT + ArgsT + + + + tf::cudaEventBase< Creator, Deleter >::cudaEventBase + (ArgsT &&... args) + cudaEventBase + tf::cudaEventBase::cudaEventBase + + ArgsT &&... 
+ args + + +constructs a cudaEvent object by passing the given arguments to the event creator + + +Constructs a cudaEvent object by passing the given arguments to the event creator + + +args + + +arguments to pass to the event creator + + + + + + + + + + + + tf::cudaEventBase< Creator, Deleter >::cudaEventBase + (cudaEventBase &&)=default + cudaEventBase + tf::cudaEventBase::cudaEventBase + + cudaEventBase && + + +constructs a cudaEvent from the given rhs using move semantics + + + + + + + + + cudaEventBase & + cudaEventBase & tf::cudaEventBase< Creator, Deleter >::operator= + (cudaEventBase &&)=default + operator= + tf::cudaEventBase::operator= + + cudaEventBase && + + +assigns the rhs to *this using move semantics + + + + + + + + + + + + tf::cudaEventBase< Creator, Deleter >::cudaEventBase + (const cudaEventBase &)=delete + cudaEventBase + tf::cudaEventBase::cudaEventBase + + const cudaEventBase & + + + + + + + + + + + cudaEventBase & + cudaEventBase & tf::cudaEventBase< Creator, Deleter >::operator= + (const cudaEventBase &)=delete + operator= + tf::cudaEventBase::operator= + + const cudaEventBase & + + + + + + + + + + + +class to create a CUDA event with unique ownership + + + + +Creator + + +functor to create the event (used in constructor) + + + + +Deleter + + +functor to delete the event (used in destructor) + + + +The cudaEventBase class encapsulates a cudaEvent_t using std::unique_ptr, ensuring that CUDA events are properly created and destroyed with unique ownership. + + + + + + + + + + + + + + + + + + + + + + + + + + tf::cudaEventBasebase_type + tf::cudaEventBasecudaEventBase + tf::cudaEventBasecudaEventBase + tf::cudaEventBasecudaEventBase + tf::cudaEventBaseoperator= + tf::cudaEventBaseoperator= + + + diff --git a/docs/xml/classtf_1_1cudaEventCreator.xml b/docs/xml/classtf_1_1cudaEventCreator.xml new file mode 100644 index 000000000..1bf2a9baa --- /dev/null +++ b/docs/xml/classtf_1_1cudaEventCreator.xml @@ -0,0 +1,73 @@ + + + + tf::cudaEventCreator + taskflow/cuda/cuda_stream.hpp + + + cudaEvent_t + cudaEvent_t tf::cudaEventCreator::operator() + () const + operator() + tf::cudaEventCreator::operator() + +creates a new cudaEvent_t object using cudaEventCreate + + + + + + + + + cudaEvent_t + cudaEvent_t tf::cudaEventCreator::operator() + (unsigned int flag) const + operator() + tf::cudaEventCreator::operator() + + unsigned int + flag + + +creates a new cudaEvent_t object using cudaEventCreate with the given flag + + + + + + + + + cudaEvent_t + cudaEvent_t tf::cudaEventCreator::operator() + (cudaEvent_t event) const + operator() + tf::cudaEventCreator::operator() + + cudaEvent_t + event + + +returns the given cudaEvent_t object + + + + + + + + + +class to create functors that construct CUDA events + + + + + + tf::cudaEventCreatoroperator() + tf::cudaEventCreatoroperator() + tf::cudaEventCreatoroperator() + + + diff --git a/docs/xml/classtf_1_1cudaEventDeleter.xml b/docs/xml/classtf_1_1cudaEventDeleter.xml new file mode 100644 index 000000000..f5669364f --- /dev/null +++ b/docs/xml/classtf_1_1cudaEventDeleter.xml @@ -0,0 +1,37 @@ + + + + tf::cudaEventDeleter + taskflow/cuda/cuda_stream.hpp + + + void + void tf::cudaEventDeleter::operator() + (cudaEvent_t event) const + operator() + tf::cudaEventDeleter::operator() + + cudaEvent_t + event + + +deletes the given cudaEvent_t object using cudaEventDestroy + + + + + + + + + +class to create a functor that deletes a CUDA event + + + + + + tf::cudaEventDeleteroperator() + + + diff --git a/docs/xml/classtf_1_1cudaExecutionPolicy.xml
b/docs/xml/classtf_1_1cudaExecutionPolicy.xml deleted file mode 100644 index bbaefac90..000000000 --- a/docs/xml/classtf_1_1cudaExecutionPolicy.xml +++ /dev/null @@ -1,418 +0,0 @@ - - - - tf::cudaExecutionPolicy - cuda_execution_policy.hpp - - - unsigned - NT - NT - - - unsigned - VT - VT - - - - - const unsigned - const unsigned tf::cudaExecutionPolicy< NT, VT >::nt - - nt - = NT - -static constant for getting the number of threads per block - - - - - - - - - const unsigned - const unsigned tf::cudaExecutionPolicy< NT, VT >::vt - - vt - = VT - -static constant for getting the number of work units per thread - - - - - - - - - const unsigned - const unsigned tf::cudaExecutionPolicy< NT, VT >::nv - - nv - = NT*VT - -static constant for getting the number of elements to process per block - - - - - - - - - - - cudaStream_t - cudaStream_t tf::cudaExecutionPolicy< NT, VT >::_stream - - _stream - {0} - - - - - - - - - - - - - tf::cudaExecutionPolicy< NT, VT >::cudaExecutionPolicy - ()=default - cudaExecutionPolicy - -constructs an execution policy object with default stream - - - - - - - - - - tf::cudaExecutionPolicy< NT, VT >::cudaExecutionPolicy - (cudaStream_t s) - cudaExecutionPolicy - - cudaStream_t - s - - -constructs an execution policy object with the given stream - - - - - - - - - cudaStream_t - cudaStream_t tf::cudaExecutionPolicy< NT, VT >::stream - () noexcept - stream - -queries the associated stream - - - - - - - - - void - void tf::cudaExecutionPolicy< NT, VT >::stream - (cudaStream_t stream) noexcept - stream - - cudaStream_t - stream - - -assigns a stream - - - - - - - - - - - unsigned - static unsigned tf::cudaExecutionPolicy< NT, VT >::num_blocks - (unsigned N) - num_blocks - - unsigned - N - - -queries the number of blocks to accommodate N elements - - - - - - - - - - - typename T - - - unsigned - unsigned tf::cudaExecutionPolicy< NT, VT >::reduce_bufsz - (unsigned count) - reduce_bufsz - - unsigned - count - - -queries the buffer size in bytes needed to call reduce kernels - - - - -T - - -value type - - - - - -count - - -number of elements to reduce - - - -The function is used to allocate a buffer for calling tf::cuda_reduce, tf::cuda_uninitialized_reduce, tf::cuda_transform_reduce, and tf::cuda_uninitialized_transform_reduce. - - - - - - - - - typename T - - - unsigned - unsigned tf::cudaExecutionPolicy< NT, VT >::min_element_bufsz - (unsigned count) - min_element_bufsz - - unsigned - count - - -queries the buffer size in bytes needed to call tf::cuda_min_element - - - - -T - - -value type - - - - - -count - - -number of elements to search - - - -The function is used to decide the buffer size in bytes for calling tf::cuda_min_element. - - - - - - - - - typename T - - - unsigned - unsigned tf::cudaExecutionPolicy< NT, VT >::max_element_bufsz - (unsigned count) - max_element_bufsz - - unsigned - count - - -queries the buffer size in bytes needed to call tf::cuda_max_element - - - - -T - - -value type - - - - - -count - - -number of elements to search - - - -The function is used to decide the buffer size in bytes for calling tf::cuda_max_element. 
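A short sketch of the policy interface documented above, assuming 512 threads per block and 7 work units per thread (NT a power of two, VT odd, as the docs recommend):

using policy_t = tf::cudaExecutionPolicy<512, 7>;  // nv = 512*7 elements per block

cudaStream_t stream;
cudaStreamCreate(&stream);
policy_t policy(stream);                              // associate the policy with the stream

unsigned blocks = policy_t::num_blocks(1000000);      // blocks needed to cover 1M elements
unsigned bytes  = policy.reduce_bufsz<int>(1000000);  // scratch bytes for tf::cuda_reduce on 1M ints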
- - - - - - - - - typename T - - - unsigned - unsigned tf::cudaExecutionPolicy< NT, VT >::scan_bufsz - (unsigned count) - scan_bufsz - - unsigned - count - - -queries the buffer size in bytes needed to call scan kernels - - - - -T - - -value type - - - - - -count - - -number of elements to scan - - - -The function is used to allocate a buffer for calling tf::cuda_inclusive_scan, tf::cuda_exclusive_scan, tf::cuda_transform_inclusive_scan, and tf::cuda_transform_exclusive_scan. - - - - - - - unsigned - unsigned tf::cudaExecutionPolicy< NT, VT >::merge_bufsz - (unsigned a_count, unsigned b_count) - merge_bufsz - - unsigned - a_count - - - unsigned - b_count - - -queries the buffer size in bytes needed for CUDA merge algorithms - - - - -a_count - - -number of elements in the first vector to merge - - - - -b_count - - -number of elements in the second vector to merge - - - -The buffer size of merge algorithm does not depend on the data type. The buffer is purely used only for storing temporary indices (of type unsigned) required during the merge process. -The function is used to allocate a buffer for calling tf::cuda_merge and tf::cuda_merge_by_key. - - - - - - - -class to define execution policy for CUDA standard algorithms - - - - -NT - - -number of threads per block - - - - -VT - - -number of work units per thread - - - -Execution policy configures the kernel execution parameters in CUDA algorithms. The first template argument, NT, the number of threads per block should always be a power-of-two number. The second template argument, VT, the number of work units per thread is recommended to be an odd number to avoid bank conflict. -Details can be referred to Execution Policy. - - - - tf::cudaExecutionPolicy_stream - tf::cudaExecutionPolicycudaExecutionPolicy - tf::cudaExecutionPolicycudaExecutionPolicy - tf::cudaExecutionPolicymax_element_bufsz - tf::cudaExecutionPolicymerge_bufsz - tf::cudaExecutionPolicymin_element_bufsz - tf::cudaExecutionPolicynt - tf::cudaExecutionPolicynum_blocks - tf::cudaExecutionPolicynv - tf::cudaExecutionPolicyreduce_bufsz - tf::cudaExecutionPolicyscan_bufsz - tf::cudaExecutionPolicystream - tf::cudaExecutionPolicystream - tf::cudaExecutionPolicyvt - - - diff --git a/docs/xml/classtf_1_1cudaFlow.xml b/docs/xml/classtf_1_1cudaFlow.xml deleted file mode 100644 index 70cb734c9..000000000 --- a/docs/xml/classtf_1_1cudaFlow.xml +++ /dev/null @@ -1,1796 +0,0 @@ - - - - tf::cudaFlow - cudaflow.hpp - - - cudaFlowGraph - cudaFlowGraph tf::cudaFlow::_cfg - - _cfg - - - - - - - - - - cudaGraphExec - cudaGraphExec tf::cudaFlow::_exe - - _exe - {nullptr} - - - - - - - - - - - - - tf::cudaFlow::cudaFlow - () - cudaFlow - -constructs a cudaFlow - - - - - - - - - - tf::cudaFlow::~cudaFlow - ()=default - ~cudaFlow - -destroys the cudaFlow and its associated native CUDA graph and executable graph - - - - - - - - - - tf::cudaFlow::cudaFlow - (cudaFlow &&)=default - cudaFlow - - cudaFlow && - - -default move constructor - - - - - - - - - cudaFlow & - cudaFlow& tf::cudaFlow::operator= - (cudaFlow &&)=default - operator= - - cudaFlow && - - -default move assignment operator - - - - - - - - - bool - bool tf::cudaFlow::empty - () const - empty - -queries the emptiness of the graph - - - - - - - - - size_t - size_t tf::cudaFlow::num_tasks - () const - num_tasks - -queries the number of tasks - - - - - - - - - void - void tf::cudaFlow::clear - () - clear - -clears the cudaFlow object - - - - - - - - - void - void tf::cudaFlow::dump - (std::ostream &os) const - dump - - std::ostream & - os - 
- -dumps the cudaFlow graph into a DOT format through an output stream - - - - - - - - - void - void tf::cudaFlow::dump_native_graph - (std::ostream &os) const - dump_native_graph - - std::ostream & - os - - -dumps the native CUDA graph into a DOT format through an output stream - - -The native CUDA graph may be different from the upper-level cudaFlow graph when flow capture is involved. - - - - - - - cudaTask - cudaTask tf::cudaFlow::noop - () - noop - -creates a no-operation task - - -a tf::cudaTask handle - -An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges. - - - - - - - - - typename C - - - cudaTask - cudaTask tf::cudaFlow::host - (C &&callable) - host - - C && - callable - - -creates a host task that runs a callable on the host - - - - -C - - -callable type - - - - - -callable - - -a callable object with neither arguments nor return (i.e., constructible from std::function<void()>) - - - -a tf::cudaTask handle - -A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc). - - - - - - - - - typename C - - - void - void tf::cudaFlow::host - (cudaTask task, C &&callable) - host - - cudaTask - task - - - C && - callable - - -updates parameters of a host task - - -The method is similar to tf::cudaFlow::host but operates on a task of type tf::cudaTaskType::HOST. - - - - - - - - - typename F - - - typename... - ArgsT - ArgsT - - - cudaTask - cudaTask tf::cudaFlow::kernel - (dim3 g, dim3 b, size_t s, F f, ArgsT... args) - kernel - - dim3 - g - - - dim3 - b - - - size_t - s - - - F - f - - - ArgsT... - args - - -creates a kernel task - - - - -F - - -kernel function type - - - - -ArgsT - - -kernel function parameters type - - - - - -g - - -configured grid - - - - -b - - -configured block - - - - -s - - -configured shared memory size in bytes - - - - -f - - -kernel function - - - - -args - - -arguments to forward to the kernel function by copy - - - -a tf::cudaTask handle - - - - - - - - - - - typename F - - - typename... - ArgsT - ArgsT - - - void - void tf::cudaFlow::kernel - (cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args) - kernel - - cudaTask - task - - - dim3 - g - - - dim3 - b - - - size_t - shm - - - F - f - - - ArgsT... - args - - -updates parameters of a kernel task - - -The method is similar to tf::cudaFlow::kernel but operates on a task of type tf::cudaTaskType::KERNEL. The kernel function name must NOT change. - - - - - - - cudaTask - cudaTask tf::cudaFlow::memset - (void *dst, int v, size_t count) - memset - - void * - dst - - - int - v - - - size_t - count - - -creates a memset task that fills untyped data with a byte value - - - - -dst - - -pointer to the destination device memory area - - - - -v - - -value to set for each byte of specified memory - - - - -count - - -size in bytes to set - - - -a tf::cudaTask handle - -A memset task fills the first count bytes of device memory area pointed by dst with the byte value v. - - - - - - - void - void tf::cudaFlow::memset - (cudaTask task, void *dst, int ch, size_t count) - memset - - cudaTask - task - - - void * - dst - - - int - ch - - - size_t - count - - -updates parameters of a memset task - - -The method is similar to tf::cudaFlow::memset but operates on a task of type tf::cudaTaskType::MEMSET. 
The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory. - - - - - - - cudaTask - cudaTask tf::cudaFlow::memcpy - (void *tgt, const void *src, size_t bytes) - memcpy - - void * - tgt - - - const void * - src - - - size_t - bytes - - -creates a memcpy task that copies untyped data in bytes - - - - -tgt - - -pointer to the target memory block - - - - -src - - -pointer to the source memory block - - - - -bytes - - -bytes to copy - - - -a tf::cudaTask handle - -A memcpy task transfers bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs. - - - - - - - void - void tf::cudaFlow::memcpy - (cudaTask task, void *tgt, const void *src, size_t bytes) - memcpy - - cudaTask - task - - - void * - tgt - - - const void * - src - - - size_t - bytes - - -updates parameters of a memcpy task - - -The method is similar to tf::cudaFlow::memcpy but operates on a task of type tf::cudaTaskType::MEMCPY. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory. - - - - - - - - - typename T - - - std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * - nullptr - - - cudaTask - cudaTask tf::cudaFlow::zero - (T *dst, size_t count) - zero - - T * - dst - - - size_t - count - - -creates a memset task that sets a typed memory block to zero - - - - -T - - -element type (size of T must be either 1, 2, or 4) - - - - - -dst - - -pointer to the destination device memory area - - - - -count - - -number of elements - - - -a tf::cudaTask handle - -A zero task zeroes the first count elements of type T in a device memory area pointed by dst. - - - - - - - - - typename T - - - std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * - nullptr - - - void - void tf::cudaFlow::zero - (cudaTask task, T *dst, size_t count) - zero - - cudaTask - task - - - T * - dst - - - size_t - count - - -updates parameters of a memset task to a zero task - - -The method is similar to tf::cudaFlow::zero but operates on a task of type tf::cudaTaskType::MEMSET. -The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory. - - - - - - - - - typename T - - - std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * - nullptr - - - cudaTask - cudaTask tf::cudaFlow::fill - (T *dst, T value, size_t count) - fill - - T * - dst - - - T - value - - - size_t - count - - -creates a memset task that fills a typed memory block with a value - - - - -T - - -element type (size of T must be either 1, 2, or 4) - - - - - -dst - - -pointer to the destination device memory area - - - - -value - - -value to fill for each element of type T - - - - -count - - -number of elements - - - -a tf::cudaTask handle - -A fill task fills the first count elements of type T with value in a device memory area pointed by dst. The value to fill is interpreted in type T rather than byte. 
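A sketch that combines the memory tasks documented above; gpu_x, gpu_y, and cpu_x are hypothetical int buffers of N elements:

taskflow.emplace([&](tf::cudaFlow& cf){
  tf::cudaTask z = cf.zero(gpu_x, N);         // gpu_x[0..N) = 0
  tf::cudaTask f = cf.fill(gpu_y, 7, N);      // gpu_y[0..N) = 7, interpreted as int, not byte
  tf::cudaTask c = cf.copy(cpu_x, gpu_y, N);  // copy N ints from device back to host
  f.precede(c);                               // the fill completes before the device-to-host copy
});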
- - - - - - - - - typename T - - - std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * - nullptr - - - void - void tf::cudaFlow::fill - (cudaTask task, T *dst, T value, size_t count) - fill - - cudaTask - task - - - T * - dst - - - T - value - - - size_t - count - - -updates parameters of a memset task to a fill task - - -The method is similar to tf::cudaFlow::fill but operates on a task of type tf::cudaTaskType::MEMSET. -The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory. - - - - - - - - - typename T - - - std::enable_if_t<!std::is_same_v< T, void >, void > * - nullptr - - - cudaTask - cudaTask tf::cudaFlow::copy - (T *tgt, const T *src, size_t num) - copy - - T * - tgt - - - const T * - src - - - size_t - num - - -creates a memcopy task that copies typed data - - - - -T - - -element type (non-void) - - - - - -tgt - - -pointer to the target memory block - - - - -src - - -pointer to the source memory block - - - - -num - - -number of elements to copy - - - -a tf::cudaTask handle - -A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs. - - - - - - - - - typename T - - - std::enable_if_t<!std::is_same_v< T, void >, void > * - nullptr - - - void - void tf::cudaFlow::copy - (cudaTask task, T *tgt, const T *src, size_t num) - copy - - cudaTask - task - - - T * - tgt - - - const T * - src - - - size_t - num - - -updates parameters of a memcpy task to a copy task - - -The method is similar to tf::cudaFlow::copy but operates on a task of type tf::cudaTaskType::MEMCPY. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory. - - - - - - - void - void tf::cudaFlow::run - (cudaStream_t stream) - run - - cudaStream_t - stream - - -offloads the cudaFlow onto a GPU asynchronously via a stream - - - - -stream - - -stream for performing this operation - - - -Offloads the present cudaFlow onto a GPU asynchronously via the given stream. -An offloaded cudaFlow forces the underlying graph to be instantiated. After the instantiation, you should not modify the graph topology but update node parameters. - - - - - - - cudaGraph_t - cudaGraph_t tf::cudaFlow::native_graph - () - native_graph - -acquires a reference to the underlying CUDA graph - - - - - - - - - cudaGraphExec_t - cudaGraphExec_t tf::cudaFlow::native_executable - () - native_executable - -acquires a reference to the underlying CUDA graph executable - - - - - - - - - - - typename C - - - cudaTask - cudaTask tf::cudaFlow::single_task - (C c) - single_task - - C - c - - -runs a callable with only a single kernel thread - - - - -C - - -callable type - - - - - -c - - -callable to run by a single kernel thread - - - -a tf::cudaTask handle - - - - - - - - - - - typename C - - - void - void tf::cudaFlow::single_task - (cudaTask task, C c) - single_task - - cudaTask - task - - - C - c - - -updates a single-threaded kernel task - - -This method is similar to cudaFlow::single_task but operates on an existing task. 
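For instance, a single-threaded kernel is a convenient way to set a device-side scalar; gpu_flag is a hypothetical int* and the device lambda assumes nvcc's --extended-lambda flag:

taskflow.emplace([&](tf::cudaFlow& cf){
  // exactly one kernel thread flips the device-side flag
  cf.single_task([gpu_flag] __device__ () { *gpu_flag = 1; });
});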
- - - - - - - - - typename I - - - typename C - - - cudaTask - cudaTask tf::cudaFlow::for_each - (I first, I last, C callable) - for_each - - I - first - - - I - last - - - C - callable - - -applies a callable to each dereferenced element of the data array - - - - -I - - -iterator type - - - - -C - - -callable type - - - - - -first - - -iterator to the beginning (inclusive) - - - - -last - - -iterator to the end (exclusive) - - - - -callable - - -a callable object to apply to the dereferenced iterator - - - -a tf::cudaTask handle - -This method is equivalent to the parallel execution of the following loop on a GPU: -for(autoitr=first;itr!=last;itr++){ -callable(*itr); -} - - - - - - - - - - typename I - - - typename C - - - void - void tf::cudaFlow::for_each - (cudaTask task, I first, I last, C callable) - for_each - - cudaTask - task - - - I - first - - - I - last - - - C - callable - - -updates parameters of a kernel task created from tf::cudaFlow::for_each - - -The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each. - - - - - - - - - typename I - - - typename C - - - cudaTask - cudaTask tf::cudaFlow::for_each_index - (I first, I last, I step, C callable) - for_each_index - - I - first - - - I - last - - - I - step - - - C - callable - - -applies a callable to each index in the range with the step size - - - - -I - - -index type - - - - -C - - -callable type - - - - - -first - - -beginning index - - - - -last - - -last index - - - - -step - - -step size - - - - -callable - - -the callable to apply to each element in the data array - - - -a tf::cudaTask handle - -This method is equivalent to the parallel execution of the following loop on a GPU: -//stepispositive[first,last) -for(autoi=first;i<last;i+=step){ -callable(i); -} - -//stepisnegative[first,last) -for(autoi=first;i>last;i+=step){ -callable(i); -} - - - - - - - - - - typename I - - - typename C - - - void - void tf::cudaFlow::for_each_index - (cudaTask task, I first, I last, I step, C callable) - for_each_index - - cudaTask - task - - - I - first - - - I - last - - - I - step - - - C - callable - - -updates parameters of a kernel task created from tf::cudaFlow::for_each_index - - -The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each_index. 
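A sketch of the index-based loop above; gpu_data is a hypothetical float* of N (int) elements, and the device lambda again assumes --extended-lambda:

taskflow.emplace([&](tf::cudaFlow& cf){
  // equivalent to: for(int i=0; i<N; i+=2) gpu_data[i] = 0.0f;
  cf.for_each_index(0, N, 2, [gpu_data] __device__ (int i){
    gpu_data[i] = 0.0f;
  });
});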
- - - - - - - - - typename I - - - typename O - - - typename C - - - cudaTask - cudaTask tf::cudaFlow::transform - (I first, I last, O output, C op) - transform - - I - first - - - I - last - - - O - output - - - C - op - - -applies a callable to a source range and stores the result in a target range - - - - -I - - -input iterator type - - - - -O - - -output iterator type - - - - -C - - -unary operator type - - - - - -first - - -iterator to the beginning of the input range - - - - -last - - -iterator to the end of the input range - - - - -output - - -iterator to the beginning of the output range - - - - -op - - -the operator to apply to transform each element in the range - - - -a tf::cudaTask handle - -This method is equivalent to the parallel execution of the following loop on a GPU: -while(first!=last){ -*output++=callable(*first++); -} - - - - - - - - - - typename I - - - typename O - - - typename C - - - void - void tf::cudaFlow::transform - (cudaTask task, I first, I last, O output, C c) - transform - - cudaTask - task - - - I - first - - - I - last - - - O - output - - - C - c - - -updates parameters of a kernel task created from tf::cudaFlow::transform - - -The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each. - - - - - - - - - typename I1 - - - typename I2 - - - typename O - - - typename C - - - cudaTask - cudaTask tf::cudaFlow::transform - (I1 first1, I1 last1, I2 first2, O output, C op) - transform - - I1 - first1 - - - I1 - last1 - - - I2 - first2 - - - O - output - - - C - op - - -creates a task to perform parallel transforms over two ranges of items - - - - -I1 - - -first input iterator type - - - - -I2 - - -second input iterator type - - - - -O - - -output iterator type - - - - -C - - -unary operator type - - - - - -first1 - - -iterator to the beginning of the input range - - - - -last1 - - -iterator to the end of the input range - - - - -first2 - - -iterato - - - - -output - - -iterator to the beginning of the output range - - - - -op - - -binary operator to apply to transform each pair of items in the two input ranges - - - -cudaTask handle - -This method is equivalent to the parallel execution of the following loop on a GPU: -while(first1!=last1){ -*output++=op(*first1++,*first2++); -} - - - - - - - - - - typename I1 - - - typename I2 - - - typename O - - - typename C - - - void - void tf::cudaFlow::transform - (cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c) - transform - - cudaTask - task - - - I1 - first1 - - - I1 - last1 - - - I2 - first2 - - - O - output - - - C - c - - -updates parameters of a kernel task created from tf::cudaFlow::transform - - -The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each. - - - - - - - - - typename C - - - cudaTask - cudaTask tf::cudaFlow::capture - (C &&callable) - capture - - C && - callable - - -constructs a subflow graph through tf::cudaFlowCapturer - - - - -C - - -callable type constructible from std::function<void(tf::cudaFlowCapturer&)> - - - - - -callable - - -the callable to construct a capture flow - - - -a tf::cudaTask handle - -A captured subflow forms a sub-graph to the cudaFlow and can be used to capture custom (or third-party) kernels that cannot be directly constructed from the cudaFlow. 
-Example usage: -taskflow.emplace([&](tf::cudaFlow&cf){ - -tf::cudaTaskmy_kernel=cf.kernel(my_arguments); - -//createaflowcapturertocapturecustomkernels -tf::cudaTaskmy_subflow=cf.capture([&](tf::cudaFlowCapturer&capturer){ -capturer.on([&](cudaStream_tstream){ -invoke_custom_kernel_with_stream(stream,custom_arguments); -}); -}); - -my_kernel.precede(my_subflow); -}); - - - - - - - - - - typename C - - - void - void tf::cudaFlow::capture - (cudaTask task, C callable) - capture - - cudaTask - task - - - C - callable - - -updates the captured child graph - - -The method is similar to tf::cudaFlow::capture but operates on a task of type tf::cudaTaskType::SUBFLOW. The new captured graph must be topologically identical to the original captured graph. - - - - - - - -class to create a cudaFlow task dependency graph - - -A cudaFlow is a high-level interface over CUDA Graph to perform GPU operations using the task dependency graph model. The class provides a set of methods for creating and launch different tasks on one or multiple CUDA devices, for instance, kernel tasks, data transfer tasks, and memory operation tasks. The following example creates a cudaFlow of two kernel tasks, task1 and task2, where task1 runs before task2. -tf::Taskflowtaskflow; -tf::Executorexecutor; - -taskflow.emplace([&](tf::cudaFlow&cf){ -//createtwokerneltasks -tf::cudaTasktask1=cf.kernel(grid1,block1,shm_size1,kernel1,args1); -tf::cudaTasktask2=cf.kernel(grid2,block2,shm_size2,kernel2,args2); - -//kernel1runsbeforekernel2 -task1.precede(task2); -}); - -executor.run(taskflow).wait(); - -A cudaFlow is a task (tf::Task) created from tf::Taskflow and will be run by one worker thread in the executor. That is, the callable that describes a cudaFlow will be executed sequentially. Inside a cudaFlow task, different GPU tasks (tf::cudaTask) may run in parallel scheduled by the CUDA runtime. -Please refer to GPU Tasking (cudaFlow) for details. 
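Besides being embedded in a taskflow, the docs above also describe offloading a cudaFlow directly through tf::cudaFlow::run; a sketch with a hypothetical kernel my_kernel and device buffers d_in/d_out:

tf::cudaFlow cf;
cf.kernel(dim3(64), dim3(256), 0, my_kernel, d_in, d_out);  // 64 blocks x 256 threads

cudaStream_t stream;
cudaStreamCreate(&stream);
cf.run(stream);                 // instantiates the CUDA graph and launches it asynchronously
cudaStreamSynchronize(stream);  // wait for completion before consuming d_out
cudaStreamDestroy(stream);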
- - - - tf::cudaFlow_cfg - tf::cudaFlow_exe - tf::cudaFlowcapture - tf::cudaFlowcapture - tf::cudaFlowclear - tf::cudaFlowcopy - tf::cudaFlowcopy - tf::cudaFlowcudaFlow - tf::cudaFlowcudaFlow - tf::cudaFlowdump - tf::cudaFlowdump_native_graph - tf::cudaFlowempty - tf::cudaFlowfill - tf::cudaFlowfill - tf::cudaFlowfor_each - tf::cudaFlowfor_each - tf::cudaFlowfor_each_index - tf::cudaFlowfor_each_index - tf::cudaFlowhost - tf::cudaFlowhost - tf::cudaFlowkernel - tf::cudaFlowkernel - tf::cudaFlowmemcpy - tf::cudaFlowmemcpy - tf::cudaFlowmemset - tf::cudaFlowmemset - tf::cudaFlownative_executable - tf::cudaFlownative_graph - tf::cudaFlownoop - tf::cudaFlownum_tasks - tf::cudaFlowoperator= - tf::cudaFlowrun - tf::cudaFlowsingle_task - tf::cudaFlowsingle_task - tf::cudaFlowtransform - tf::cudaFlowtransform - tf::cudaFlowtransform - tf::cudaFlowtransform - tf::cudaFlowzero - tf::cudaFlowzero - tf::cudaFlow~cudaFlow - - - diff --git a/docs/xml/classtf_1_1cudaFlowCapturer.xml b/docs/xml/classtf_1_1cudaFlowCapturer.xml deleted file mode 100644 index c5f0acdb6..000000000 --- a/docs/xml/classtf_1_1cudaFlowCapturer.xml +++ /dev/null @@ -1,1675 +0,0 @@ - - - - tf::cudaFlowCapturer - cuda_capturer.hpp - tf::cudaFlowCapturer::External - tf::cudaFlowCapturer::Internal - - - std::variant< External, Internal > - using tf::cudaFlowCapturer::handle_t = std::variant<External, Internal> - - handle_t - - - - - - - - - - std::variant< cudaFlowRoundRobinOptimizer, cudaFlowSequentialOptimizer, cudaFlowLinearOptimizer > - using tf::cudaFlowCapturer::Optimizer = std::variant< cudaFlowRoundRobinOptimizer, cudaFlowSequentialOptimizer, cudaFlowLinearOptimizer > - - Optimizer - - - - - - - - - - - - class - friend class cudaFlow - - cudaFlow - - cudaFlow - - - - - - - - - - - class - friend class Executor - - Executor - - Executor - - - - - - - - - - - - - cudaFlowGraph - cudaFlowGraph tf::cudaFlowCapturer::_cfg - - _cfg - - - - - - - - - - Optimizer - Optimizer tf::cudaFlowCapturer::_optimizer - - _optimizer - - - - - - - - - - cudaGraphExec - cudaGraphExec tf::cudaFlowCapturer::_exe - - _exe - {nullptr} - - - - - - - - - - - - - tf::cudaFlowCapturer::cudaFlowCapturer - ()=default - cudaFlowCapturer - -constrcts a standalone cudaFlowCapturer - - -A standalone cudaFlow capturer does not go through any taskflow and can be run by the caller thread using tf::cudaFlowCapturer::run. 
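A sketch of that standalone usage, with a hypothetical kernel my_custom_kernel and launch shape:

tf::cudaFlowCapturer capturer;
capturer.on([&](cudaStream_t s){
  // asynchronous CUDA calls issued on s are recorded into the captured graph
  my_custom_kernel<<<grid, block, 0, s>>>(args);
});

cudaStream_t stream;
cudaStreamCreate(&stream);
capturer.run(stream);           // instantiate and launch the captured graph
cudaStreamSynchronize(stream);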
- - - - - - - - tf::cudaFlowCapturer::~cudaFlowCapturer - ()=default - ~cudaFlowCapturer - -destructs the cudaFlowCapturer - - - - - - - - - - tf::cudaFlowCapturer::cudaFlowCapturer - (cudaFlowCapturer &&)=default - cudaFlowCapturer - - cudaFlowCapturer && - - -default move constructor - - - - - - - - - cudaFlowCapturer & - cudaFlowCapturer& tf::cudaFlowCapturer::operator= - (cudaFlowCapturer &&)=default - operator= - - cudaFlowCapturer && - - -default move assignment operator - - - - - - - - - bool - bool tf::cudaFlowCapturer::empty - () const - empty - -queries the emptiness of the graph - - - - - - - - - size_t - size_t tf::cudaFlowCapturer::num_tasks - () const - num_tasks - -queries the number of tasks - - - - - - - - - void - void tf::cudaFlowCapturer::clear - () - clear - -clear this cudaFlow capturer - - - - - - - - - void - void tf::cudaFlowCapturer::dump - (std::ostream &os) const - dump - - std::ostream & - os - - -dumps the cudaFlow graph into a DOT format through an output stream - - - - - - - - - void - void tf::cudaFlowCapturer::dump_native_graph - (std::ostream &os) const - dump_native_graph - - std::ostream & - os - - -dumps the native captured graph into a DOT format through an output stream - - - - - - - - - - - typename C - - - std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * - nullptr - - - cudaTask - cudaTask tf::cudaFlowCapturer::on - (C &&callable) - on - - C && - callable - - -captures a sequential CUDA operations from the given callable - - - - -C - - -callable type constructible with std::function<void(cudaStream_t)> - - - - - -callable - - -a callable to capture CUDA operations with the stream - - - -This methods applies a stream created by the flow to capture a sequence of CUDA operations defined in the callable. - - - - - - - - - typename C - - - std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * - nullptr - - - void - void tf::cudaFlowCapturer::on - (cudaTask task, C &&callable) - on - - cudaTask - task - - - C && - callable - - -updates a capture task to another sequential CUDA operations - - -The method is similar to cudaFlowCapturer::on but operates on an existing task. - - - - - - - cudaTask - cudaTask tf::cudaFlowCapturer::noop - () - noop - -captures a no-operation task - - -a tf::cudaTask handle - -An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges. - - - - - - - void - void tf::cudaFlowCapturer::noop - (cudaTask task) - noop - - cudaTask - task - - -updates a task to a no-operation task - - -The method is similar to tf::cudaFlowCapturer::noop but operates on an existing task. - - - - - - - cudaTask - cudaTask tf::cudaFlowCapturer::memcpy - (void *dst, const void *src, size_t count) - memcpy - - void * - dst - - - const void * - src - - - size_t - count - - -copies data between host and device asynchronously through a stream - - - - -dst - - -destination memory address - - - - -src - - -source memory address - - - - -count - - -size in bytes to copy - - - -The method captures a cudaMemcpyAsync operation through an internal stream. 
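For example, with a hypothetical capturer and device/host buffers dptr and hptr of N floats:

// captured as one task: copy N floats from host to device
tf::cudaTask h2d = capturer.memcpy(dptr, hptr, N * sizeof(float));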
- - - - - - - void - void tf::cudaFlowCapturer::memcpy - (cudaTask task, void *dst, const void *src, size_t count) - memcpy - - cudaTask - task - - - void * - dst - - - const void * - src - - - size_t - count - - -updates a capture task to a memcpy operation - - -The method is similar to cudaFlowCapturer::memcpy but operates on an existing task. - - - - - - - - - typename T - - - std::enable_if_t<!std::is_same_v< T, void >, void > * - nullptr - - - cudaTask - cudaTask tf::cudaFlowCapturer::copy - (T *tgt, const T *src, size_t num) - copy - - T * - tgt - - - const T * - src - - - size_t - num - - -captures a copy task of typed data - - - - -T - - -element type (non-void) - - - - - -tgt - - -pointer to the target memory block - - - - -src - - -pointer to the source memory block - - - - -num - - -number of elements to copy - - - -cudaTask handle - -A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs. - - - - - - - - - typename T - - - std::enable_if_t<!std::is_same_v< T, void >, void > * - nullptr - - - void - void tf::cudaFlowCapturer::copy - (cudaTask task, T *tgt, const T *src, size_t num) - copy - - cudaTask - task - - - T * - tgt - - - const T * - src - - - size_t - num - - -updates a capture task to a copy operation - - -The method is similar to cudaFlowCapturer::copy but operates on an existing task. - - - - - - - cudaTask - cudaTask tf::cudaFlowCapturer::memset - (void *ptr, int v, size_t n) - memset - - void * - ptr - - - int - v - - - size_t - n - - -initializes or sets GPU memory to the given value byte by byte - - - - -ptr - - -pointer to GPU mempry - - - - -v - - -value to set for each byte of the specified memory - - - - -n - - -size in bytes to set - - - -The method captures a cudaMemsetAsync operation through an internal stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value. - - - - - - - void - void tf::cudaFlowCapturer::memset - (cudaTask task, void *ptr, int value, size_t n) - memset - - cudaTask - task - - - void * - ptr - - - int - value - - - size_t - n - - -updates a capture task to a memset operation - - -The method is similar to cudaFlowCapturer::memset but operates on an existing task. - - - - - - - - - typename F - - - typename... - ArgsT - ArgsT - - - cudaTask - cudaTask tf::cudaFlowCapturer::kernel - (dim3 g, dim3 b, size_t s, F f, ArgsT &&... args) - kernel - - dim3 - g - - - dim3 - b - - - size_t - s - - - F - f - - - ArgsT &&... - args - - -captures a kernel - - - - -F - - -kernel function type - - - - -ArgsT - - -kernel function parameters type - - - - - -g - - -configured grid - - - - -b - - -configured block - - - - -s - - -configured shared memory size in bytes - - - - -f - - -kernel function - - - - -args - - -arguments to forward to the kernel function by copy - - - -cudaTask handle - - - - - - - - - - - typename F - - - typename... - ArgsT - ArgsT - - - void - void tf::cudaFlowCapturer::kernel - (cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT &&... args) - kernel - - cudaTask - task - - - dim3 - g - - - dim3 - b - - - size_t - s - - - F - f - - - ArgsT &&... - args - - -updates a capture task to a kernel operation - - -The method is similar to cudaFlowCapturer::kernel but operates on an existing task. 
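These task-taking overloads enable an update-and-rerun pattern; a sketch with hypothetical names, following the rule above that the graph topology must stay fixed while node parameters may change:

tf::cudaTask t = capturer.kernel(grid, block, 0, my_kernel, d_in);
capturer.run(stream);                                    // first launch

capturer.kernel(t, grid, block, 0, my_kernel, d_other);  // rebind the argument; same kernel function
capturer.run(stream);                                    // relaunch with the updated parameter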
- - - - - - - - - typename C - - - cudaTask - cudaTask tf::cudaFlowCapturer::single_task - (C c) - single_task - - C - c - - -capturers a kernel to runs the given callable with only one thread - - - - -C - - -callable type - - - - - -c - - -callable to run by a single kernel thread - - - - - - - - - - - - - typename C - - - void - void tf::cudaFlowCapturer::single_task - (cudaTask task, C c) - single_task - - cudaTask - task - - - C - c - - -updates a capture task to a single-threaded kernel - - -This method is similar to cudaFlowCapturer::single_task but operates on an existing task. - - - - - - - - - typename I - - - typename C - - - cudaTask - cudaTask tf::cudaFlowCapturer::for_each - (I first, I last, C callable) - for_each - - I - first - - - I - last - - - C - callable - - -captures a kernel that applies a callable to each dereferenced element of the data array - - - - -I - - -iterator type - - - - -C - - -callable type - - - - - -first - - -iterator to the beginning - - - - -last - - -iterator to the end - - - - -callable - - -a callable object to apply to the dereferenced iterator - - - -cudaTask handle - -This method is equivalent to the parallel execution of the following loop on a GPU: -for(autoitr=first;itr!=last;i++){ -callable(*itr); -} - - - - - - - - - - typename I - - - typename C - - - void - void tf::cudaFlowCapturer::for_each - (cudaTask task, I first, I last, C callable) - for_each - - cudaTask - task - - - I - first - - - I - last - - - C - callable - - -updates a capture task to a for-each kernel task - - -This method is similar to cudaFlowCapturer::for_each but operates on an existing task. - - - - - - - - - typename I - - - typename C - - - cudaTask - cudaTask tf::cudaFlowCapturer::for_each_index - (I first, I last, I step, C callable) - for_each_index - - I - first - - - I - last - - - I - step - - - C - callable - - -captures a kernel that applies a callable to each index in the range with the step size - - - - -I - - -index type - - - - -C - - -callable type - - - - - -first - - -beginning index - - - - -last - - -last index - - - - -step - - -step size - - - - -callable - - -the callable to apply to each element in the data array - - - -cudaTask handle - -This method is equivalent to the parallel execution of the following loop on a GPU: -//stepispositive[first,last) -for(autoi=first;i<last;i+=step){ -callable(i); -} - -//stepisnegative[first,last) -for(autoi=first;i>last;i+=step){ -callable(i); -} - - - - - - - - - - typename I - - - typename C - - - void - void tf::cudaFlowCapturer::for_each_index - (cudaTask task, I first, I last, I step, C callable) - for_each_index - - cudaTask - task - - - I - first - - - I - last - - - I - step - - - C - callable - - -updates a capture task to a for-each-index kernel task - - -This method is similar to cudaFlowCapturer::for_each_index but operates on an existing task. 
- - - - - - - - - typename I - - - typename O - - - typename C - - - cudaTask - cudaTask tf::cudaFlowCapturer::transform - (I first, I last, O output, C op) - transform - - I - first - - - I - last - - - O - output - - - C - op - - -captures a kernel that transforms an input range to an output range - - - - -I - - -input iterator type - - - - -O - - -output iterator type - - - - -C - - -unary operator type - - - - - -first - - -iterator to the beginning of the input range - - - - -last - - -iterator to the end of the input range - - - - -output - - -iterator to the beginning of the output range - - - - -op - - -unary operator to apply to transform each item in the range - - - -cudaTask handle - -This method is equivalent to the parallel execution of the following loop on a GPU: -while(first!=last){ -*output++=op(*first++); -} - - - - - - - - - - typename I - - - typename O - - - typename C - - - void - void tf::cudaFlowCapturer::transform - (cudaTask task, I first, I last, O output, C op) - transform - - cudaTask - task - - - I - first - - - I - last - - - O - output - - - C - op - - -updates a capture task to a transform kernel task - - -This method is similar to cudaFlowCapturer::transform but operates on an existing task. - - - - - - - - - typename I1 - - - typename I2 - - - typename O - - - typename C - - - cudaTask - cudaTask tf::cudaFlowCapturer::transform - (I1 first1, I1 last1, I2 first2, O output, C op) - transform - - I1 - first1 - - - I1 - last1 - - - I2 - first2 - - - O - output - - - C - op - - -captures a kernel that transforms two input ranges to an output range - - - - -I1 - - -first input iterator type - - - - -I2 - - -second input iterator type - - - - -O - - -output iterator type - - - - -C - - -unary operator type - - - - - -first1 - - -iterator to the beginning of the input range - - - - -last1 - - -iterator to the end of the input range - - - - -first2 - - -iterato - - - - -output - - -iterator to the beginning of the output range - - - - -op - - -binary operator to apply to transform each pair of items in the two input ranges - - - -cudaTask handle - -This method is equivalent to the parallel execution of the following loop on a GPU: -while(first1!=last1){ -*output++=op(*first1++,*first2++); -} - - - - - - - - - - typename I1 - - - typename I2 - - - typename O - - - typename C - - - void - void tf::cudaFlowCapturer::transform - (cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op) - transform - - cudaTask - task - - - I1 - first1 - - - I1 - last1 - - - I2 - first2 - - - O - output - - - C - op - - -updates a capture task to a transform kernel task - - -This method is similar to cudaFlowCapturer::transform but operates on an existing task. - - - - - - - - - typename OPT - - - typename... - ArgsT - ArgsT - - - OPT & - OPT & tf::cudaFlowCapturer::make_optimizer - (ArgsT &&... args) - make_optimizer - - ArgsT &&... - args - - -selects a different optimization algorithm - - - - -OPT - - -optimizer type - - - - -ArgsT - - -arguments types - - - - - -args - - -arguments to forward to construct the optimizer - - - -a reference to the optimizer - -We currently supports the following optimization algorithms to capture a user-described cudaFlow: -tf::cudaFlowSequentialOptimizer -tf::cudaFlowRoundRobinOptimizer -tf::cudaFlowLinearOptimizer - - -By default, tf::cudaFlowCapturer uses the round-robin optimization algorithm with four streams to transform a user-level graph into a native CUDA graph. 
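For example, to widen the default four-stream round-robin transformation to eight streams:

tf::cudaFlowCapturer capturer;
// replaces the default optimizer and returns a reference to the new one
auto& opt = capturer.make_optimizer<tf::cudaFlowRoundRobinOptimizer>(8);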
- - - - - - - cudaGraph_t - cudaGraph_t tf::cudaFlowCapturer::capture - () - capture - -captures the cudaFlow and turns it into a CUDA Graph - - - - - - - - - void - void tf::cudaFlowCapturer::run - (cudaStream_t stream) - run - - cudaStream_t - stream - - -offloads the cudaFlowCapturer onto a GPU asynchronously via a stream - - - - -stream - - -stream for performing this operation - - - -Offloads the present cudaFlowCapturer onto a GPU asynchronously via the given stream. -An offloaded cudaFlowCapturer forces the underlying graph to be instantiated. After the instantiation, you should not modify the graph topology but update node parameters. - - - - - - - cudaGraph_t - cudaGraph_t tf::cudaFlowCapturer::native_graph - () - native_graph - -acquires a reference to the underlying CUDA graph - - - - - - - - - cudaGraphExec_t - cudaGraphExec_t tf::cudaFlowCapturer::native_executable - () - native_executable - -acquires a reference to the underlying CUDA graph executable - - - - - - - - - -class to create a cudaFlow graph using stream capture - - -The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except users can call the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous CUDA operations through the given stream. The following example creates a CUDA graph that captures two kernel tasks, task_1 and task_2, where task_1 runs before task_2. -taskflow.emplace([](tf::cudaFlowCapturer&capturer){ - -//capturemy_kernel_1throughthegivenstreammanagedbythecapturer -autotask_1=capturer.on([&](cudaStream_tstream){ -my_kernel_1<<<grid_1,block_1,shm_size_1,stream>>>(my_parameters_1); -}); - -//capturemy_kernel_2throughthegivenstreammanagedbythecapturer -autotask_2=capturer.on([&](cudaStream_tstream){ -my_kernel_2<<<grid_2,block_2,shm_size_2,stream>>>(my_parameters_2); -}); - -task_1.precede(task_2); -}); - -Similar to tf::cudaFlow, a cudaFlowCapturer is a task (tf::Task) created from tf::Taskflow and will be run by one worker thread in the executor. That is, the callable that describes a cudaFlowCapturer will be executed sequentially. Inside a cudaFlow capturer task, different GPU tasks (tf::cudaTask) may run in parallel depending on the selected optimization algorithm. By default, we use tf::cudaFlowRoundRobinOptimizer to transform a user-level graph into a native CUDA graph. -Please refer to GPU Tasking (cudaFlowCapturer) for details. 
- - - - tf::cudaFlowCapturer_cfg - tf::cudaFlowCapturer_exe - tf::cudaFlowCapturer_optimizer - tf::cudaFlowCapturercapture - tf::cudaFlowCapturerclear - tf::cudaFlowCapturercopy - tf::cudaFlowCapturercopy - tf::cudaFlowCapturercudaFlow - tf::cudaFlowCapturercudaFlowCapturer - tf::cudaFlowCapturercudaFlowCapturer - tf::cudaFlowCapturerdump - tf::cudaFlowCapturerdump_native_graph - tf::cudaFlowCapturerempty - tf::cudaFlowCapturerExecutor - tf::cudaFlowCapturerfor_each - tf::cudaFlowCapturerfor_each - tf::cudaFlowCapturerfor_each_index - tf::cudaFlowCapturerfor_each_index - tf::cudaFlowCapturerhandle_t - tf::cudaFlowCapturerkernel - tf::cudaFlowCapturerkernel - tf::cudaFlowCapturermake_optimizer - tf::cudaFlowCapturermemcpy - tf::cudaFlowCapturermemcpy - tf::cudaFlowCapturermemset - tf::cudaFlowCapturermemset - tf::cudaFlowCapturernative_executable - tf::cudaFlowCapturernative_graph - tf::cudaFlowCapturernoop - tf::cudaFlowCapturernoop - tf::cudaFlowCapturernum_tasks - tf::cudaFlowCaptureron - tf::cudaFlowCaptureron - tf::cudaFlowCaptureroperator= - tf::cudaFlowCapturerOptimizer - tf::cudaFlowCapturerrun - tf::cudaFlowCapturersingle_task - tf::cudaFlowCapturersingle_task - tf::cudaFlowCapturertransform - tf::cudaFlowCapturertransform - tf::cudaFlowCapturertransform - tf::cudaFlowCapturertransform - tf::cudaFlowCapturer~cudaFlowCapturer - - - diff --git a/docs/xml/classtf_1_1cudaFlowLinearOptimizer.xml b/docs/xml/classtf_1_1cudaFlowLinearOptimizer.xml deleted file mode 100644 index d5c77f874..000000000 --- a/docs/xml/classtf_1_1cudaFlowLinearOptimizer.xml +++ /dev/null @@ -1,97 +0,0 @@ - - - - tf::cudaFlowLinearOptimizer - tf::cudaFlowOptimizerBase - cuda_optimizer.hpp - - - class - friend class cudaFlowCapturer - - cudaFlowCapturer - - cudaFlowCapturer - - - - - - - - - - - - - - tf::cudaFlowLinearOptimizer::cudaFlowLinearOptimizer - ()=default - cudaFlowLinearOptimizer - -constructs a linear optimizer - - - - - - - - - - - cudaGraph_t - cudaGraph_t tf::cudaFlowLinearOptimizer::_optimize - (cudaFlowGraph &graph) - _optimize - - cudaFlowGraph & - graph - - - - - - - - - - - -class to capture a linear CUDA graph using a sequential stream - - -A linear capturing algorithm is a special case of tf::cudaFlowSequentialOptimizer and assumes the input task graph to be a single linear chain of tasks (i.e., a straight line). This assumption allows faster optimization during the capturing process. If the input task graph is not a linear chain, the behavior is undefined. 
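For example, a graph that satisfies this single-chain assumption could opt into the linear optimizer as sketched below (the device/host buffers, size, and kernel are hypothetical placeholders):

taskflow.emplace([](tf::cudaFlowCapturer& capturer){
  // safe here: the three tasks below form a straight line
  capturer.make_optimizer<tf::cudaFlowLinearOptimizer>();
  auto h2d = capturer.memcpy(d_data, h_data, bytes);
  auto run = capturer.on([&](cudaStream_t s){
    my_kernel<<<grid, block, 0, s>>>(d_data);
  });
  auto d2h = capturer.memcpy(h_data, d_data, bytes);
  h2d.precede(run);  // h2d -> run -> d2h
  run.precede(d2h);
});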
- - - - - - - - - - - - - - - - - - - - - - - - - - tf::cudaFlowLinearOptimizer_levelize - tf::cudaFlowLinearOptimizer_optimize - tf::cudaFlowLinearOptimizer_toposort - tf::cudaFlowLinearOptimizercudaFlowCapturer - tf::cudaFlowLinearOptimizercudaFlowLinearOptimizer - - - diff --git a/docs/xml/classtf_1_1cudaFlowOptimizerBase.xml b/docs/xml/classtf_1_1cudaFlowOptimizerBase.xml deleted file mode 100644 index a14697bbb..000000000 --- a/docs/xml/classtf_1_1cudaFlowOptimizerBase.xml +++ /dev/null @@ -1,78 +0,0 @@ - - - - tf::cudaFlowOptimizerBase - tf::cudaFlowLinearOptimizer - tf::cudaFlowRoundRobinOptimizer - tf::cudaFlowSequentialOptimizer - - - std::vector< cudaFlowNode * > - std::vector< cudaFlowNode * > tf::cudaFlowOptimizerBase::_toposort - (cudaFlowGraph &) - _toposort - - cudaFlowGraph & - graph - - - - - - - - - - - std::vector< std::vector< cudaFlowNode * > > - std::vector< std::vector< cudaFlowNode * > > tf::cudaFlowOptimizerBase::_levelize - (cudaFlowGraph &) - _levelize - - cudaFlowGraph & - graph - - - - - - - - - - - -class to provide helper common methods for optimization algorithms - - - - - - - - - - - - - - - - - - - - - - - - - - - - - tf::cudaFlowOptimizerBase_levelize - tf::cudaFlowOptimizerBase_toposort - - - diff --git a/docs/xml/classtf_1_1cudaFlowRoundRobinOptimizer.xml b/docs/xml/classtf_1_1cudaFlowRoundRobinOptimizer.xml deleted file mode 100644 index 66aeb51b1..000000000 --- a/docs/xml/classtf_1_1cudaFlowRoundRobinOptimizer.xml +++ /dev/null @@ -1,189 +0,0 @@ - - - - tf::cudaFlowRoundRobinOptimizer - tf::cudaFlowOptimizerBase - cuda_optimizer.hpp - - - class - friend class cudaFlowCapturer - - cudaFlowCapturer - - cudaFlowCapturer - - - - - - - - - - - - - size_t - size_t tf::cudaFlowRoundRobinOptimizer::_num_streams - - _num_streams - {4} - - - - - - - - - - - - - tf::cudaFlowRoundRobinOptimizer::cudaFlowRoundRobinOptimizer - ()=default - cudaFlowRoundRobinOptimizer - -constructs a round-robin optimizer with 4 streams by default - - - - - - - - - - tf::cudaFlowRoundRobinOptimizer::cudaFlowRoundRobinOptimizer - (size_t num_streams) - cudaFlowRoundRobinOptimizer - - size_t - num_streams - - -constructs a round-robin optimizer with the given number of streams - - - - - - - - - size_t - size_t tf::cudaFlowRoundRobinOptimizer::num_streams - () const - num_streams - -queries the number of streams used by the optimizer - - - - - - - - - void - void tf::cudaFlowRoundRobinOptimizer::num_streams - (size_t n) - num_streams - - size_t - n - - -sets the number of streams used by the optimizer - - - - - - - - - - - cudaGraph_t - cudaGraph_t tf::cudaFlowRoundRobinOptimizer::_optimize - (cudaFlowGraph &graph) - _optimize - - cudaFlowGraph & - graph - - - - - - - - - - - void - void tf::cudaFlowRoundRobinOptimizer::_reset - (std::vector< std::vector< cudaFlowNode * >> &graph) - _reset - - std::vector< std::vector< cudaFlowNode * >> & - graph - - - - - - - - - - - -class to capture a CUDA graph using a round-robin algorithm - - -A round-robin capturing algorithm levelizes the user-described graph and assign streams to nodes in a round-robin order level by level. 
The algorithm is based on the following paper published in Euro-Par 2021: -Dian-Lun Lin and Tsung-Wei Huang, "Efficient GPU Computation using Task Graph Parallelism," European Conference on Parallel and Distributed Computing (Euro-Par), 2021 - - -The round-robin optimization algorithm is best suited for large cudaFlow graphs that compose hundreds or thousands of GPU operations (e.g., kernels and memory copies) with many of them being able to run in parallel. You can configure the number of streams for the optimizer to adjust the maximum kernel concurrency in the captured CUDA graph. - - - - - - - - - - - - - - - - - - - - - - - - - - tf::cudaFlowRoundRobinOptimizer_levelize - tf::cudaFlowRoundRobinOptimizer_num_streams - tf::cudaFlowRoundRobinOptimizer_optimize - tf::cudaFlowRoundRobinOptimizer_reset - tf::cudaFlowRoundRobinOptimizer_toposort - tf::cudaFlowRoundRobinOptimizercudaFlowCapturer - tf::cudaFlowRoundRobinOptimizercudaFlowRoundRobinOptimizer - tf::cudaFlowRoundRobinOptimizercudaFlowRoundRobinOptimizer - tf::cudaFlowRoundRobinOptimizernum_streams - tf::cudaFlowRoundRobinOptimizernum_streams - - - diff --git a/docs/xml/classtf_1_1cudaFlowSequentialOptimizer.xml b/docs/xml/classtf_1_1cudaFlowSequentialOptimizer.xml deleted file mode 100644 index 62e003296..000000000 --- a/docs/xml/classtf_1_1cudaFlowSequentialOptimizer.xml +++ /dev/null @@ -1,97 +0,0 @@ - - - - tf::cudaFlowSequentialOptimizer - tf::cudaFlowOptimizerBase - cuda_optimizer.hpp - - - class - friend class cudaFlowCapturer - - cudaFlowCapturer - - cudaFlowCapturer - - - - - - - - - - - - - - tf::cudaFlowSequentialOptimizer::cudaFlowSequentialOptimizer - ()=default - cudaFlowSequentialOptimizer - -constructs a sequential optimizer - - - - - - - - - - - cudaGraph_t - cudaGraph_t tf::cudaFlowSequentialOptimizer::_optimize - (cudaFlowGraph &graph) - _optimize - - cudaFlowGraph & - graph - - - - - - - - - - - -class to capture a CUDA graph using a sequential stream - - -A sequential capturing algorithm finds a topological order of the described graph and captures dependent GPU tasks using a single stream. All GPU tasks run sequentially without breaking inter-task dependencies. - - - - - - - - - - - - - - - - - - - - - - - - - - tf::cudaFlowSequentialOptimizer_levelize - tf::cudaFlowSequentialOptimizer_optimize - tf::cudaFlowSequentialOptimizer_toposort - tf::cudaFlowSequentialOptimizercudaFlowCapturer - tf::cudaFlowSequentialOptimizercudaFlowSequentialOptimizer - - - diff --git a/docs/xml/classtf_1_1cudaGraphBase.xml b/docs/xml/classtf_1_1cudaGraphBase.xml new file mode 100644 index 000000000..4621ee577 --- /dev/null +++ b/docs/xml/classtf_1_1cudaGraphBase.xml @@ -0,0 +1,1303 @@ + + + + tf::cudaGraphBase + std::unique_ptr< std::remove_pointer_t< cudaGraph_t >, cudaGraphDeleter > + taskflow/cuda/cuda_graph.hpp + + + typename Creator + + + typename Deleter + + + + + std::unique_ptr< std::remove_pointer_t< cudaGraph_t >, Deleter > + using tf::cudaGraphBase< Creator, Deleter >::base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter> + + base_type + tf::cudaGraphBase::base_type + +base std::unique_ptr type + + + + + + + + + + + + + typename... + ArgsT + ArgsT + + + + tf::cudaGraphBase< Creator, Deleter >::cudaGraphBase + (ArgsT &&... args) + cudaGraphBase + tf::cudaGraphBase::cudaGraphBase + + ArgsT &&... 
+ args + + +constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator + + +Constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator + + +args + + +arguments to pass to the executable CUDA graph creator + + + + + + + + + + + + tf::cudaGraphBase< Creator, Deleter >::cudaGraphBase + (cudaGraphBase &&)=default + cudaGraphBase + tf::cudaGraphBase::cudaGraphBase + + cudaGraphBase && + + +constructs a cudaGraph from the given rhs using move semantics + + + + + + + + + cudaGraphBase & + cudaGraphBase & tf::cudaGraphBase< Creator, Deleter >::operator= + (cudaGraphBase &&)=default + operator= + tf::cudaGraphBase::operator= + + cudaGraphBase && + + +assign the rhs to *this using move semantics + + + + + + + + + size_t + size_t tf::cudaGraphBase< Creator, Deleter >::num_nodes + () const + num_nodes + tf::cudaGraphBase::num_nodes + +queries the number of nodes in a native CUDA graph + + + + + + + + + size_t + size_t tf::cudaGraphBase< Creator, Deleter >::num_edges + () const + num_edges + tf::cudaGraphBase::num_edges + +queries the number of edges in a native CUDA graph + + + + + + + + + bool + bool tf::cudaGraphBase< Creator, Deleter >::empty + () const + empty + tf::cudaGraphBase::empty + +queries if the graph is empty + + + + + + + + + void + void tf::cudaGraphBase< Creator, Deleter >::dump + (std::ostream &os) + dump + tf::cudaGraphBase::dump + + std::ostream & + os + + +dumps the CUDA graph to a DOT format through the given output stream + + + + +os + + +target output stream + + + + + + + + + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::noop + () + noop + tf::cudaGraphBase::noop + +creates a no-operation task + + +a tf::cudaTask handle + +An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges. + + + + + + + + + typename C + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::host + (C &&callable, void *user_data) + host + tf::cudaGraphBase::host + + C && + callable + + + void * + user_data + + +creates a host task that runs a callable on the host + + + + +C + + +callable type + + + + + +callable + + +a callable object with neither arguments nor return (i.e., constructible from std::function<void()>) + + + + +user_data + + +a pointer to the user data + + + +a tf::cudaTask handle + +A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc). + + + + + + + + + typename F + + + typename... + ArgsT + ArgsT + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::kernel + (dim3 g, dim3 b, size_t s, F f, ArgsT... args) + kernel + tf::cudaGraphBase::kernel + + dim3 + g + + + dim3 + b + + + size_t + s + + + F + f + + + ArgsT... 
+ args + + +creates a kernel task + + + + +F + + +kernel function type + + + + +ArgsT + + +kernel function parameters type + + + + + +g + + +configured grid + + + + +b + + +configured block + + + + +s + + +configured shared memory size in bytes + + + + +f + + +kernel function + + + + +args + + +arguments to forward to the kernel function by copy + + + +a tf::cudaTask handle + + + + + + + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::memset + (void *dst, int v, size_t count) + memset + tf::cudaGraphBase::memset + + void * + dst + + + int + v + + + size_t + count + + +creates a memset task that fills untyped data with a byte value + + + + +dst + + +pointer to the destination device memory area + + + + +v + + +value to set for each byte of specified memory + + + + +count + + +size in bytes to set + + + +a tf::cudaTask handle + +A memset task fills the first count bytes of device memory area pointed by dst with the byte value v. + + + + + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::memcpy + (void *tgt, const void *src, size_t bytes) + memcpy + tf::cudaGraphBase::memcpy + + void * + tgt + + + const void * + src + + + size_t + bytes + + +creates a memcpy task that copies untyped data in bytes + + + + +tgt + + +pointer to the target memory block + + + + +src + + +pointer to the source memory block + + + + +bytes + + +bytes to copy + + + +a tf::cudaTask handle + +A memcpy task transfers bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs. + + + + + + + + + typename T + + + std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * + nullptr + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::zero + (T *dst, size_t count) + zero + tf::cudaGraphBase::zero + + T * + dst + + + size_t + count + + +creates a memset task that sets a typed memory block to zero + + + + +T + + +element type (size of T must be either 1, 2, or 4) + + + + + +dst + + +pointer to the destination device memory area + + + + +count + + +number of elements + + + +a tf::cudaTask handle + +A zero task zeroes the first count elements of type T in a device memory area pointed by dst. + + + + + + + + + typename T + + + std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * + nullptr + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::fill + (T *dst, T value, size_t count) + fill + tf::cudaGraphBase::fill + + T * + dst + + + T + value + + + size_t + count + + +creates a memset task that fills a typed memory block with a value + + + + +T + + +element type (size of T must be either 1, 2, or 4) + + + + + +dst + + +pointer to the destination device memory area + + + + +value + + +value to fill for each element of type T + + + + +count + + +number of elements + + + +a tf::cudaTask handle + +A fill task fills the first count elements of type T with value in a device memory area pointed by dst. The value to fill is interpreted in type T rather than byte. 
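To make the typed memory tasks above concrete, here is a minimal hedged sketch; it assumes tf::cudaGraph is the library's default specialization of tf::cudaGraphBase and omits CUDA error checking:

int* data {nullptr};
cudaMalloc(&data, 1000 * sizeof(int));

tf::cudaGraph cg;
auto zero = cg.zero(data, 1000);    // zero all 1000 ints
auto fill = cg.fill(data, 7, 500);  // then overwrite the first 500 ints with 7
zero.precede(fill);                 // fill must observe the zeroed buffer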
+ + + + + + + + + typename T + + + std::enable_if_t<!std::is_same_v< T, void >, void > * + nullptr + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::copy + (T *tgt, const T *src, size_t num) + copy + tf::cudaGraphBase::copy + + T * + tgt + + + const T * + src + + + size_t + num + + +creates a memcopy task that copies typed data + + + + +T + + +element type (non-void) + + + + + +tgt + + +pointer to the target memory block + + + + +src + + +pointer to the source memory block + + + + +num + + +number of elements to copy + + + +a tf::cudaTask handle + +A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs. + + + + + + + + + typename C + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::single_task + (C c) + single_task + tf::cudaGraphBase::single_task + + C + c + + +runs a callable with only a single kernel thread + + + + +C + + +callable type + + + + + +c + + +callable to run by a single kernel thread + + + +a tf::cudaTask handle + + + + + + + + + + + typename I + + + typename C + + + typename E + cudaDefaultExecutionPolicy + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::for_each + (I first, I last, C callable) + for_each + tf::cudaGraphBase::for_each + + I + first + + + I + last + + + C + callable + + +applies a callable to each dereferenced element of the data array + + + + +I + + +iterator type + + + + +C + + +callable type + + + + +E + + +execution poligy (default tf::cudaDefaultExecutionPolicy) + + + + + +first + + +iterator to the beginning (inclusive) + + + + +last + + +iterator to the end (exclusive) + + + + +callable + + +a callable object to apply to the dereferenced iterator + + + +a tf::cudaTask handle + +This method is equivalent to the parallel execution of the following loop on a GPU: +for(autoitr=first;itr!=last;itr++){ +callable(*itr); +} + + + + + + + + + + typename I + + + typename C + + + typename E + cudaDefaultExecutionPolicy + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::for_each_index + (I first, I last, I step, C callable) + for_each_index + tf::cudaGraphBase::for_each_index + + I + first + + + I + last + + + I + step + + + C + callable + + +applies a callable to each index in the range with the step size + + + + +I + + +index type + + + + +C + + +callable type + + + + +E + + +execution poligy (default tf::cudaDefaultExecutionPolicy) + + + + + +first + + +beginning index + + + + +last + + +last index + + + + +step + + +step size + + + + +callable + + +the callable to apply to each element in the data array + + + +a tf::cudaTask handle + +This method is equivalent to the parallel execution of the following loop on a GPU: +//stepispositive[first,last) +for(autoi=first;i<last;i+=step){ +callable(i); +} + +//stepisnegative[first,last) +for(autoi=first;i>last;i+=step){ +callable(i); +} + + + + + + + + + + typename I + + + typename O + + + typename C + + + typename E + cudaDefaultExecutionPolicy + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::transform + (I first, I last, O output, C op) + transform + tf::cudaGraphBase::transform + + I + first + + + I + last + + + O + output + + + C + op + + +applies a callable to a source range and stores the result in a target range + + + + +I + + +input iterator type + + + + +O + + +output iterator type + + + + +C + + +unary operator type + + + + +E + + +execution poligy (default tf::cudaDefaultExecutionPolicy) + + + + + +first + + +iterator to the beginning of the input 
range + + + + +last + + +iterator to the end of the input range + + + + +output + + +iterator to the beginning of the output range + + + + +op + + +the operator to apply to transform each element in the range + + + +a tf::cudaTask handle + +This method is equivalent to the parallel execution of the following loop on a GPU: +while(first!=last){ +*output++=callable(*first++); +} + + + + + + + + + + typename I1 + + + typename I2 + + + typename O + + + typename C + + + typename E + cudaDefaultExecutionPolicy + + + cudaTask + cudaTask tf::cudaGraphBase< Creator, Deleter >::transform + (I1 first1, I1 last1, I2 first2, O output, C op) + transform + tf::cudaGraphBase::transform + + I1 + first1 + + + I1 + last1 + + + I2 + first2 + + + O + output + + + C + op + + +creates a task to perform parallel transforms over two ranges of items + + + + +I1 + + +first input iterator type + + + + +I2 + + +second input iterator type + + + + +O + + +output iterator type + + + + +C + + +unary operator type + + + + +E + + +execution poligy (default tf::cudaDefaultExecutionPolicy) + + + + + +first1 + + +iterator to the beginning of the input range + + + + +last1 + + +iterator to the end of the input range + + + + +first2 + + +iterato + + + + +output + + +iterator to the beginning of the output range + + + + +op + + +binary operator to apply to transform each pair of items in the two input ranges + + + +cudaTask handle + +This method is equivalent to the parallel execution of the following loop on a GPU: +while(first1!=last1){ +*output++=op(*first1++,*first2++); +} + + + + + + + + + + + tf::cudaGraphBase< Creator, Deleter >::cudaGraphBase + (const cudaGraphBase &)=delete + cudaGraphBase + tf::cudaGraphBase::cudaGraphBase + + const cudaGraphBase & + + + + + + + + + + + cudaGraphBase & + cudaGraphBase & tf::cudaGraphBase< Creator, Deleter >::operator= + (const cudaGraphBase &)=delete + operator= + tf::cudaGraphBase::operator= + + const cudaGraphBase & + + + + + + + + + + + +class to create a CUDA graph with uunique ownership + + + + +Creator + + +functor to create the stream (used in constructor) + + + + +Deleter + + +functor to delete the stream (used in destructor) + + + +This class wraps a cudaGraph_t handle with std::unique_ptr to ensure proper resource management and automatic cleanup. + + + + + + + + + + + + + + + + + + + + + + + + + + tf::cudaGraphBasebase_type + tf::cudaGraphBasecopy + tf::cudaGraphBasecudaGraphBase + tf::cudaGraphBasecudaGraphBase + tf::cudaGraphBasecudaGraphBase + tf::cudaGraphBasedump + tf::cudaGraphBaseempty + tf::cudaGraphBasefill + tf::cudaGraphBasefor_each + tf::cudaGraphBasefor_each_index + tf::cudaGraphBasehost + tf::cudaGraphBasekernel + tf::cudaGraphBasememcpy + tf::cudaGraphBasememset + tf::cudaGraphBasenoop + tf::cudaGraphBasenum_edges + tf::cudaGraphBasenum_nodes + tf::cudaGraphBaseoperator= + tf::cudaGraphBaseoperator= + tf::cudaGraphBasesingle_task + tf::cudaGraphBasetransform + tf::cudaGraphBasetransform + tf::cudaGraphBasezero + + + diff --git a/docs/xml/classtf_1_1cudaGraphCreator.xml b/docs/xml/classtf_1_1cudaGraphCreator.xml new file mode 100644 index 000000000..fe9aa2529 --- /dev/null +++ b/docs/xml/classtf_1_1cudaGraphCreator.xml @@ -0,0 +1,67 @@ + + + + tf::cudaGraphCreator + taskflow/cuda/cuda_graph.hpp + + + cudaGraph_t + cudaGraph_t tf::cudaGraphCreator::operator() + () const + operator() + tf::cudaGraphCreator::operator() + +creates a new CUDA graph + + +Calls cudaGraphCreate to generate a CUDA native graph and returns it. 
If the graph creation fails, an error is reported. +A newly created cudaGraph_t instance. + + + +If + + +CUDA graph creation fails, an error is logged. + + + + + + + + + + + cudaGraph_t + cudaGraph_t tf::cudaGraphCreator::operator() + (cudaGraph_t graph) const + operator() + tf::cudaGraphCreator::operator() + + cudaGraph_t + graph + + +return the given CUDA graph + + + + + + + + + +class to create functors that construct CUDA graphs + + +This class define functors to new CUDA graphs using cudaGraphCreate. + + + + tf::cudaGraphCreatoroperator() + tf::cudaGraphCreatoroperator() + + + diff --git a/docs/xml/classtf_1_1cudaGraphDeleter.xml b/docs/xml/classtf_1_1cudaGraphDeleter.xml new file mode 100644 index 000000000..585232923 --- /dev/null +++ b/docs/xml/classtf_1_1cudaGraphDeleter.xml @@ -0,0 +1,49 @@ + + + + tf::cudaGraphDeleter + taskflow/cuda/cuda_graph.hpp + + + void + void tf::cudaGraphDeleter::operator() + (cudaGraph_t g) const + operator() + tf::cudaGraphDeleter::operator() + + cudaGraph_t + g + + +deletes a CUDA graph + + +Calls cudaGraphDestroy to release the CUDA graph resource if it is valid. + + +g + + +the CUDA graph to be destroyed + + + + + + + + + + + +class to create a functor that deletes a CUDA graph + + +This structure provides an overloaded function call operator to safely destroy a CUDA graph using cudaGraphDestroy. + + + + tf::cudaGraphDeleteroperator() + + + diff --git a/docs/xml/classtf_1_1cudaGraphExecBase.xml b/docs/xml/classtf_1_1cudaGraphExecBase.xml new file mode 100644 index 000000000..ad972692e --- /dev/null +++ b/docs/xml/classtf_1_1cudaGraphExecBase.xml @@ -0,0 +1,704 @@ + + + + tf::cudaGraphExecBase + std::unique_ptr< std::remove_pointer_t< cudaGraphExec_t >, Deleter > + taskflow/cuda/cuda_graph_exec.hpp + + + typename Creator + + + typename Deleter + + + + + std::unique_ptr< std::remove_pointer_t< cudaGraphExec_t >, Deleter > + using tf::cudaGraphExecBase< Creator, Deleter >::base_type = std::unique_ptr<std::remove_pointer_t<cudaGraphExec_t>, Deleter> + + base_type + tf::cudaGraphExecBase::base_type + +base std::unique_ptr type + + + + + + + + + + + + + typename... + ArgsT + ArgsT + + + + tf::cudaGraphExecBase< Creator, Deleter >::cudaGraphExecBase + (ArgsT &&... args) + cudaGraphExecBase + tf::cudaGraphExecBase::cudaGraphExecBase + + ArgsT &&... + args + + +constructs a cudaGraphExec object by passing the given arguments to the executable CUDA graph creator + + +Constructs a cudaGraphExec object by passing the given arguments to the executable CUDA graph creator + + +args + + +arguments to pass to the executable CUDA graph creator + + + + + + + + + + + + tf::cudaGraphExecBase< Creator, Deleter >::cudaGraphExecBase + (cudaGraphExecBase &&)=default + cudaGraphExecBase + tf::cudaGraphExecBase::cudaGraphExecBase + + cudaGraphExecBase && + + +constructs a cudaGraphExec from the given rhs using move semantics + + + + + + + + + cudaGraphExecBase & + cudaGraphExecBase & tf::cudaGraphExecBase< Creator, Deleter >::operator= + (cudaGraphExecBase &&)=default + operator= + tf::cudaGraphExecBase::operator= + + cudaGraphExecBase && + + +assign the rhs to *this using move semantics + + + + + + + + + + + typename C + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::host + (cudaTask task, C &&callable, void *user_data) + host + tf::cudaGraphExecBase::host + + cudaTask + task + + + C && + callable + + + void * + user_data + + +updates parameters of a host task + + +This method updates the parameter of the given host task (similar to tf::cudaFlow::host). 
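A small hedged sketch of such a host-task update; exec and host_task are assumed to come from prior graph construction, and the callable follows the documented void() form:

// rebind the host task to a new callable without rebuilding the graph
// (std::printf needs <cstdio>)
exec.host(host_task, [](){ std::printf("host task updated\n"); }, nullptr);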
+ + + + + + + + + typename F + + + typename... + ArgsT + ArgsT + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::kernel + (cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args) + kernel + tf::cudaGraphExecBase::kernel + + cudaTask + task + + + dim3 + g + + + dim3 + b + + + size_t + shm + + + F + f + + + ArgsT... + args + + +updates parameters of a kernel task + + +The method is similar to tf::cudaFlow::kernel but operates on a task of type tf::cudaTaskType::KERNEL. The kernel function name must NOT change. + + + + + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::memset + (cudaTask task, void *dst, int ch, size_t count) + memset + tf::cudaGraphExecBase::memset + + cudaTask + task + + + void * + dst + + + int + ch + + + size_t + count + + +updates parameters of a memset task + + +The method is similar to tf::cudaFlow::memset but operates on a task of type tf::cudaTaskType::MEMSET. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory. + + + + + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::memcpy + (cudaTask task, void *tgt, const void *src, size_t bytes) + memcpy + tf::cudaGraphExecBase::memcpy + + cudaTask + task + + + void * + tgt + + + const void * + src + + + size_t + bytes + + +updates parameters of a memcpy task + + +The method is similar to tf::cudaFlow::memcpy but operates on a task of type tf::cudaTaskType::MEMCPY. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory. + + + + + + + + + typename T + + + std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * + nullptr + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::zero + (cudaTask task, T *dst, size_t count) + zero + tf::cudaGraphExecBase::zero + + cudaTask + task + + + T * + dst + + + size_t + count + + +updates parameters of a memset task to a zero task + + +The method is similar to tf::cudaFlow::zero but operates on a task of type tf::cudaTaskType::MEMSET. +The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory. + + + + + + + + + typename T + + + std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * + nullptr + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::fill + (cudaTask task, T *dst, T value, size_t count) + fill + tf::cudaGraphExecBase::fill + + cudaTask + task + + + T * + dst + + + T + value + + + size_t + count + + +updates parameters of a memset task to a fill task + + +The method is similar to tf::cudaFlow::fill but operates on a task of type tf::cudaTaskType::MEMSET. +The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory. + + + + + + + + + typename T + + + std::enable_if_t<!std::is_same_v< T, void >, void > * + nullptr + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::copy + (cudaTask task, T *tgt, const T *src, size_t num) + copy + tf::cudaGraphExecBase::copy + + cudaTask + task + + + T * + tgt + + + const T * + src + + + size_t + num + + +updates parameters of a memcpy task to a copy task + + +The method is similar to tf::cudaFlow::copy but operates on a task of type tf::cudaTaskType::MEMCPY. 
The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory. + + + + + + + + + typename C + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::single_task + (cudaTask task, C c) + single_task + tf::cudaGraphExecBase::single_task + + cudaTask + task + + + C + c + + +updates a single-threaded kernel task + + +This method is similar to cudaFlow::single_task but operates on an existing task. + + + + + + + + + typename I + + + typename C + + + typename E + cudaDefaultExecutionPolicy + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::for_each + (cudaTask task, I first, I last, C callable) + for_each + tf::cudaGraphExecBase::for_each + + cudaTask + task + + + I + first + + + I + last + + + C + callable + + +updates parameters of a for_each kernel task created from the CUDA graph of *this + + + + + + + + + + + typename I + + + typename C + + + typename E + cudaDefaultExecutionPolicy + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::for_each_index + (cudaTask task, I first, I last, I step, C callable) + for_each_index + tf::cudaGraphExecBase::for_each_index + + cudaTask + task + + + I + first + + + I + last + + + I + step + + + C + callable + + +updates parameters of a for_each_index kernel task created from the CUDA graph of *this + + + + + + + + + + + typename I + + + typename O + + + typename C + + + typename E + cudaDefaultExecutionPolicy + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::transform + (cudaTask task, I first, I last, O output, C c) + transform + tf::cudaGraphExecBase::transform + + cudaTask + task + + + I + first + + + I + last + + + O + output + + + C + c + + +updates parameters of a transform kernel task created from the CUDA graph of *this + + + + + + + + + + + typename I1 + + + typename I2 + + + typename O + + + typename C + + + typename E + cudaDefaultExecutionPolicy + + + void + void tf::cudaGraphExecBase< Creator, Deleter >::transform + (cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c) + transform + tf::cudaGraphExecBase::transform + + cudaTask + task + + + I1 + first1 + + + I1 + last1 + + + I2 + first2 + + + O + output + + + C + c + + +updates parameters of a transform kernel task created from the CUDA graph of *this + + + + + + + + + + + + tf::cudaGraphExecBase< Creator, Deleter >::cudaGraphExecBase + (const cudaGraphExecBase &)=delete + cudaGraphExecBase + tf::cudaGraphExecBase::cudaGraphExecBase + + const cudaGraphExecBase & + + + + + + + + + + + cudaGraphExecBase & + cudaGraphExecBase & tf::cudaGraphExecBase< Creator, Deleter >::operator= + (const cudaGraphExecBase &)=delete + operator= + tf::cudaGraphExecBase::operator= + + const cudaGraphExecBase & + + + + + + + + + + + +class to create an executable CUDA graph with unique ownership + + + + +Creator + + +functor to create the stream (used in constructor) + + + + +Deleter + + +functor to delete the stream (used in destructor) + + + +This class wraps a cudaGraphExec_t handle with std::unique_ptr to ensure proper resource management and automatic cleanup. 
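Putting graph construction and executable-graph updates together, a hedged end-to-end sketch might look like this (tf::cudaGraph, tf::cudaGraphExec, and tf::cudaStream are assumed to be the default specializations; my_kernel, its launch shape, and the buffers are placeholders):

tf::cudaGraph cg;
auto task = cg.kernel(grid, block, 0, my_kernel, d_in);

tf::cudaGraphExec exec(cg);  // instantiate the executable graph once

// update the kernel arguments in place; the kernel function must stay the same
exec.kernel(task, grid, block, 0, my_kernel, d_out);

tf::cudaStream stream;
stream.run(exec).synchronize();  // launch the updated graph and wait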
+ + + + + + + + + + + + + + + + + + + + + + + + + + tf::cudaGraphExecBasebase_type + tf::cudaGraphExecBasecopy + tf::cudaGraphExecBasecudaGraphExecBase + tf::cudaGraphExecBasecudaGraphExecBase + tf::cudaGraphExecBasecudaGraphExecBase + tf::cudaGraphExecBasefill + tf::cudaGraphExecBasefor_each + tf::cudaGraphExecBasefor_each_index + tf::cudaGraphExecBasehost + tf::cudaGraphExecBasekernel + tf::cudaGraphExecBasememcpy + tf::cudaGraphExecBasememset + tf::cudaGraphExecBaseoperator= + tf::cudaGraphExecBaseoperator= + tf::cudaGraphExecBasesingle_task + tf::cudaGraphExecBasetransform + tf::cudaGraphExecBasetransform + tf::cudaGraphExecBasezero + + + diff --git a/docs/xml/classtf_1_1cudaGraphExecCreator.xml b/docs/xml/classtf_1_1cudaGraphExecCreator.xml new file mode 100644 index 000000000..48459bf87 --- /dev/null +++ b/docs/xml/classtf_1_1cudaGraphExecCreator.xml @@ -0,0 +1,102 @@ + + + + tf::cudaGraphExecCreator + taskflow/cuda/cuda_graph_exec.hpp + + + cudaGraphExec_t + cudaGraphExec_t tf::cudaGraphExecCreator::operator() + () const + operator() + tf::cudaGraphExecCreator::operator() + +returns a null executable CUDA graph + + + + + + + + + cudaGraphExec_t + cudaGraphExec_t tf::cudaGraphExecCreator::operator() + (cudaGraphExec_t exec) const + operator() + tf::cudaGraphExecCreator::operator() + + cudaGraphExec_t + exec + + +returns the given executable graph + + + + + + + + + cudaGraphExec_t + cudaGraphExec_t tf::cudaGraphExecCreator::operator() + (cudaGraph_t graph) const + operator() + tf::cudaGraphExecCreator::operator() + + cudaGraph_t + graph + + +returns a newly instantiated executable graph from the given CUDA graph + + + + + + + + + + + typename C + + + typename D + + + cudaGraphExec_t + cudaGraphExec_t tf::cudaGraphExecCreator::operator() + (const cudaGraphBase< C, D > &graph) const + operator() + tf::cudaGraphExecCreator::operator() + + const cudaGraphBase< C, D > & + graph + + +returns a newly instantiated executable graph from the given CUDA graph + + + + + + + + + +class to create functors for constructing executable CUDA graphs + + +This class provides an overloaded function call operator to create a new executable CUDA graph using cudaGraphCreate. + + + + tf::cudaGraphExecCreatoroperator() + tf::cudaGraphExecCreatoroperator() + tf::cudaGraphExecCreatoroperator() + tf::cudaGraphExecCreatoroperator() + + + diff --git a/docs/xml/classtf_1_1cudaGraphExecDeleter.xml b/docs/xml/classtf_1_1cudaGraphExecDeleter.xml new file mode 100644 index 000000000..003ae60d7 --- /dev/null +++ b/docs/xml/classtf_1_1cudaGraphExecDeleter.xml @@ -0,0 +1,49 @@ + + + + tf::cudaGraphExecDeleter + taskflow/cuda/cuda_graph_exec.hpp + + + void + void tf::cudaGraphExecDeleter::operator() + (cudaGraphExec_t executable) const + operator() + tf::cudaGraphExecDeleter::operator() + + cudaGraphExec_t + executable + + +deletes an executable CUDA graph + + +Calls cudaGraphDestroy to release the CUDA graph resource if it is valid. + + +executable + + +the executable CUDA graph to be destroyed + + + + + + + + + + + +class to create a functor for deleting an executable CUDA graph + + +This class provides an overloaded function call operator to safely destroy a CUDA graph using cudaGraphDestroy. 
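These creator/deleter functors are meant to compose with std::unique_ptr, which is exactly how cudaGraphExecBase manages its handle; a hedged illustration (native_graph is a placeholder cudaGraph_t):

using exec_handle = std::unique_ptr<
  std::remove_pointer_t<cudaGraphExec_t>, tf::cudaGraphExecDeleter>;

// instantiate an executable graph and hand its lifetime to the deleter
exec_handle exec { tf::cudaGraphExecCreator{}(native_graph) };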
+ + + + tf::cudaGraphExecDeleteroperator() + + + diff --git a/docs/xml/classtf_1_1cudaScopedDevice.xml b/docs/xml/classtf_1_1cudaScopedDevice.xml index 9388f2d6c..abe2de9ba 100644 --- a/docs/xml/classtf_1_1cudaScopedDevice.xml +++ b/docs/xml/classtf_1_1cudaScopedDevice.xml @@ -1,29 +1,31 @@ - + tf::cudaScopedDevice - cuda_device.hpp - + taskflow/cuda/cuda_device.hpp + int int tf::cudaScopedDevice::_p _p + tf::cudaScopedDevice::_p - + - - + + tf::cudaScopedDevice::cudaScopedDevice (int device) cudaScopedDevice + tf::cudaScopedDevice::cudaScopedDevice int device @@ -45,13 +47,14 @@ - + tf::cudaScopedDevice::~cudaScopedDevice () ~cudaScopedDevice + tf::cudaScopedDevice::~cudaScopedDevice destructs the guard and switches back to the previous device context @@ -59,28 +62,30 @@ - + - - + + tf::cudaScopedDevice::cudaScopedDevice ()=delete cudaScopedDevice + tf::cudaScopedDevice::cudaScopedDevice - + tf::cudaScopedDevice::cudaScopedDevice (const cudaScopedDevice &)=delete cudaScopedDevice + tf::cudaScopedDevice::cudaScopedDevice const cudaScopedDevice & @@ -90,13 +95,14 @@ - + tf::cudaScopedDevice::cudaScopedDevice (cudaScopedDevice &&)=delete cudaScopedDevice + tf::cudaScopedDevice::cudaScopedDevice cudaScopedDevice && @@ -106,9 +112,9 @@ - + - + class to create an RAII-styled context switch @@ -125,7 +131,7 @@ cudaScopedDevice is neither movable nor copyable. - + tf::cudaScopedDevice_p tf::cudaScopedDevicecudaScopedDevice diff --git a/docs/xml/classtf_1_1cudaStream.xml b/docs/xml/classtf_1_1cudaStream.xml deleted file mode 100644 index b12dcc2f1..000000000 --- a/docs/xml/classtf_1_1cudaStream.xml +++ /dev/null @@ -1,175 +0,0 @@ - - - - tf::cudaStream - cudaObject< cudaStream_t, cudaStreamCreator, cudaStreamDeleter > - cuda_stream.hpp - - - - tf::cudaStream::cudaStream - (cudaStream_t stream) - cudaStream - - cudaStream_t - stream - - -constructs an RAII-styled object from the given CUDA stream - - -Constructs a cudaStream object which owns stream. - - - - - - - - tf::cudaStream::cudaStream - ()=default - cudaStream - -default constructor - - - - - - - - - void - void tf::cudaStream::synchronize - () const - synchronize - -synchronizes the associated stream - - -Equivalently calling cudaStreamSynchronize to block until this stream has completed all operations. - - - - - - - void - void tf::cudaStream::begin_capture - (cudaStreamCaptureMode m=cudaStreamCaptureModeGlobal) const - begin_capture - - cudaStreamCaptureMode - m - cudaStreamCaptureModeGlobal - - -begins graph capturing on the stream - - -When a stream is in capture mode, all operations pushed into the stream will not be executed, but will instead be captured into a graph, which will be returned via cudaStream::end_capture. -A thread's mode can be one of the following: -cudaStreamCaptureModeGlobal: This is the default mode. If the local thread has an ongoing capture sequence that was not initiated with cudaStreamCaptureModeRelaxed at cuStreamBeginCapture, or if any other thread has a concurrent capture sequence initiated with cudaStreamCaptureModeGlobal, this thread is prohibited from potentially unsafe API calls. -cudaStreamCaptureModeThreadLocal: If the local thread has an ongoing capture sequence not initiated with cudaStreamCaptureModeRelaxed, it is prohibited from potentially unsafe API calls. Concurrent capture sequences in other threads are ignored. -cudaStreamCaptureModeRelaxed: The local thread is not prohibited from potentially unsafe API calls. 
Note that the thread is still prohibited from API calls which necessarily conflict with stream capture, for example, attempting cudaEventQuery on an event that was last recorded inside a capture sequence. - - - - - - - - - cudaGraph_t - cudaGraph_t tf::cudaStream::end_capture - () const - end_capture - -ends graph capturing on the stream - - -Equivalently calling cudaStreamEndCapture to end capture on stream and returning the captured graph. Capture must have been initiated on stream via a call to cudaStream::begin_capture. If capture was invalidated, due to a violation of the rules of stream capture, then a NULL graph will be returned. - - - - - - - void - void tf::cudaStream::record - (cudaEvent_t event) const - record - - cudaEvent_t - event - - -records an event on the stream - - -Equivalently calling cudaEventRecord to record an event on this stream, both of which must be on the same CUDA context. - - - - - - - void - void tf::cudaStream::wait - (cudaEvent_t event) const - wait - - cudaEvent_t - event - - -waits on an event - - -Equivalently calling cudaStreamWaitEvent to make all future work submitted to stream wait for all work captured in event. - - - - - - - -class to create an RAII-styled wrapper over a native CUDA stream - - -A cudaStream object is an RAII-styled wrapper over a native CUDA stream (cudaStream_t). A cudaStream object is move-only. - - - - - - - - - - - - - - - - - - - - - - - - - - tf::cudaStreambegin_capture - tf::cudaStreamcudaStream - tf::cudaStreamcudaStream - tf::cudaStreamend_capture - tf::cudaStreamrecord - tf::cudaStreamsynchronize - tf::cudaStreamwait - - - diff --git a/docs/xml/classtf_1_1cudaStreamBase.xml b/docs/xml/classtf_1_1cudaStreamBase.xml new file mode 100644 index 000000000..2acf10d30 --- /dev/null +++ b/docs/xml/classtf_1_1cudaStreamBase.xml @@ -0,0 +1,398 @@ + + + + tf::cudaStreamBase + std::unique_ptr< std::remove_pointer_t< cudaStream_t >, Deleter > + taskflow/cuda/cuda_stream.hpp + + + typename Creator + + + typename Deleter + + + + + std::unique_ptr< std::remove_pointer_t< cudaStream_t >, Deleter > + using tf::cudaStreamBase< Creator, Deleter >::base_type = std::unique_ptr<std::remove_pointer_t<cudaStream_t>, Deleter> + + base_type + tf::cudaStreamBase::base_type + +base type for the underlying unique pointer + + +This alias provides a shorthand for the underlying std::unique_ptr type that manages CUDA stream resources with an associated deleter. + + + + + + + + + + + typename... + ArgsT + ArgsT + + + + tf::cudaStreamBase< Creator, Deleter >::cudaStreamBase + (ArgsT &&... args) + cudaStreamBase + tf::cudaStreamBase::cudaStreamBase + + ArgsT &&... 
+ args + + +constructs a cudaStream object by passing the given arguments to the stream creator + + +Constructs a cudaStream object by passing the given arguments to the stream creator + + +args + + +arguments to pass to the stream creator + + + + + + + + + + + + tf::cudaStreamBase< Creator, Deleter >::cudaStreamBase + (cudaStreamBase &&)=default + cudaStreamBase + tf::cudaStreamBase::cudaStreamBase + + cudaStreamBase && + + +constructs a cudaStream from the given rhs using move semantics + + + + + + + + + cudaStreamBase & + cudaStreamBase & tf::cudaStreamBase< Creator, Deleter >::operator= + (cudaStreamBase &&)=default + operator= + tf::cudaStreamBase::operator= + + cudaStreamBase && + + +assign the rhs to *this using move semantics + + + + + + + + + cudaStreamBase & + cudaStreamBase & tf::cudaStreamBase< Creator, Deleter >::synchronize + () + synchronize + tf::cudaStreamBase::synchronize + +synchronizes the associated stream + + +Equivalently calling cudaStreamSynchronize to block until this stream has completed all operations. + + + + + + + void + void tf::cudaStreamBase< Creator, Deleter >::begin_capture + (cudaStreamCaptureMode m=cudaStreamCaptureModeGlobal) const + begin_capture + tf::cudaStreamBase::begin_capture + + cudaStreamCaptureMode + m + cudaStreamCaptureModeGlobal + + +begins graph capturing on the stream + + +When a stream is in capture mode, all operations pushed into the stream will not be executed, but will instead be captured into a graph, which will be returned via cudaStream::end_capture. +A thread's mode can be one of the following: +cudaStreamCaptureModeGlobal: This is the default mode. If the local thread has an ongoing capture sequence that was not initiated with cudaStreamCaptureModeRelaxed at cuStreamBeginCapture, or if any other thread has a concurrent capture sequence initiated with cudaStreamCaptureModeGlobal, this thread is prohibited from potentially unsafe API calls. +cudaStreamCaptureModeThreadLocal: If the local thread has an ongoing capture sequence not initiated with cudaStreamCaptureModeRelaxed, it is prohibited from potentially unsafe API calls. Concurrent capture sequences in other threads are ignored. +cudaStreamCaptureModeRelaxed: The local thread is not prohibited from potentially unsafe API calls. Note that the thread is still prohibited from API calls which necessarily conflict with stream capture, for example, attempting cudaEventQuery on an event that was last recorded inside a capture sequence. + + + + + + + + + cudaGraph_t + cudaGraph_t tf::cudaStreamBase< Creator, Deleter >::end_capture + () const + end_capture + tf::cudaStreamBase::end_capture + +ends graph capturing on the stream + + +Equivalently calling cudaStreamEndCapture to end capture on stream and returning the captured graph. Capture must have been initiated on stream via a call to cudaStream::begin_capture. If capture was invalidated, due to a violation of the rules of stream capture, then a NULL graph will be returned. + + + + + + + void + void tf::cudaStreamBase< Creator, Deleter >::record + (cudaEvent_t event) const + record + tf::cudaStreamBase::record + + cudaEvent_t + event + + +records an event on the stream + + +Equivalently calling cudaEventRecord to record an event on this stream, both of which must be on the same CUDA context. 
+ + + + + + + void + void tf::cudaStreamBase< Creator, Deleter >::wait + (cudaEvent_t event) const + wait + tf::cudaStreamBase::wait + + cudaEvent_t + event + + +waits on an event + + +Equivalently calling cudaStreamWaitEvent to make all future work submitted to stream wait for all work captured in event. + + + + + + + + + typename C + + + typename D + + + cudaStreamBase & + cudaStreamBase & tf::cudaStreamBase< Creator, Deleter >::run + (const cudaGraphExecBase< C, D > &exec) + run + tf::cudaStreamBase::run + + const cudaGraphExecBase< C, D > & + exec + + +runs the given executable CUDA graph + + + + +exec + + +the given cudaGraphExec + + + + + + + + + + + cudaStreamBase & + cudaStreamBase< SC, SD > & tf::cudaStreamBase< SC, SD >::run + (cudaGraphExec_t exec) + run + tf::cudaStreamBase::run + + cudaGraphExec_t + exec + + +runs the given executable CUDA graph + + + + +exec + + +the given cudaGraphExec_t + + + + + + + + + + + + + + tf::cudaStreamBase< Creator, Deleter >::cudaStreamBase + (const cudaStreamBase &)=delete + cudaStreamBase + tf::cudaStreamBase::cudaStreamBase + + const cudaStreamBase & + + + + + + + + + + + cudaStreamBase & + cudaStreamBase & tf::cudaStreamBase< Creator, Deleter >::operator= + (const cudaStreamBase &)=delete + operator= + tf::cudaStreamBase::operator= + + const cudaStreamBase & + + + + + + + + + + + + + typename EC + + + typename ED + + + cudaStreamBase< SC, SD > & + cudaStreamBase< SC, SD > & tf::cudaStreamBase< Creator, Deleter >::run + (const cudaGraphExecBase< EC, ED > &exec) + run + tf::cudaStreamBase::run + + const cudaGraphExecBase< EC, ED > & + exec + + + + + + + + + + + +class to create a CUDA stream with unique ownership + + + + +Creator + + +functor to create the stream (used in constructor) + + + + +Deleter + + +functor to delete the stream (used in destructor) + + + +The cudaStream class encapsulates a cudaStream_t using std::unique_ptr, ensuring that CUDA events are properly created and destroyed with a unique ownership. 
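A hedged capture-and-replay sketch built only from the members documented above (tf::cudaStream and tf::cudaGraphExec are assumed default specializations; my_kernel and its launch shape are placeholders; ownership of the raw captured graph is glossed over):

tf::cudaStream stream;
stream.begin_capture();                         // work below is captured, not executed
my_kernel<<<grid, block, 0, stream.get()>>>();  // raw handle via the underlying unique_ptr
cudaGraph_t captured = stream.end_capture();

tf::cudaGraphExec exec(captured);  // instantiate the captured graph
stream.run(exec).synchronize();    // replay it and wait for completion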
+ + + + + + + + + + + + + + + + + + + + + + + + + + tf::cudaStreamBasebase_type + tf::cudaStreamBasebegin_capture + tf::cudaStreamBasecudaStreamBase + tf::cudaStreamBasecudaStreamBase + tf::cudaStreamBasecudaStreamBase + tf::cudaStreamBaseend_capture + tf::cudaStreamBaseoperator= + tf::cudaStreamBaseoperator= + tf::cudaStreamBaserecord + tf::cudaStreamBaserun + tf::cudaStreamBaserun + tf::cudaStreamBaserun + tf::cudaStreamBasesynchronize + tf::cudaStreamBasewait + + + diff --git a/docs/xml/classtf_1_1cudaStreamCreator.xml b/docs/xml/classtf_1_1cudaStreamCreator.xml new file mode 100644 index 000000000..784dcf68e --- /dev/null +++ b/docs/xml/classtf_1_1cudaStreamCreator.xml @@ -0,0 +1,53 @@ + + + + tf::cudaStreamCreator + taskflow/cuda/cuda_stream.hpp + + + cudaStream_t + cudaStream_t tf::cudaStreamCreator::operator() + () const + operator() + tf::cudaStreamCreator::operator() + +constructs a new cudaStream_t object using cudaStreamCreate + + + + + + + + + cudaStream_t + cudaStream_t tf::cudaStreamCreator::operator() + (cudaStream_t stream) const + operator() + tf::cudaStreamCreator::operator() + + cudaStream_t + stream + + +returns the given cudaStream_t object + + + + + + + + + +class to create functors that construct CUDA streams + + + + + + tf::cudaStreamCreatoroperator() + tf::cudaStreamCreatoroperator() + + + diff --git a/docs/xml/classtf_1_1cudaStreamDeleter.xml b/docs/xml/classtf_1_1cudaStreamDeleter.xml new file mode 100644 index 000000000..194a91057 --- /dev/null +++ b/docs/xml/classtf_1_1cudaStreamDeleter.xml @@ -0,0 +1,37 @@ + + + + tf::cudaStreamDeleter + taskflow/cuda/cuda_stream.hpp + + + void + void tf::cudaStreamDeleter::operator() + (cudaStream_t stream) const + operator() + tf::cudaStreamDeleter::operator() + + cudaStream_t + stream + + +deletes the given cudaStream_t object + + + + + + + + + +class to create a functor that deletes a CUDA stream + + + + + + tf::cudaStreamDeleteroperator() + + + diff --git a/docs/xml/classtf_1_1cudaTask.xml b/docs/xml/classtf_1_1cudaTask.xml index 6288e857f..5b86c4d21 100644 --- a/docs/xml/classtf_1_1cudaTask.xml +++ b/docs/xml/classtf_1_1cudaTask.xml @@ -1,16 +1,67 @@ - + tf::cudaTask - cuda_task.hpp - + taskflow/cuda/cuda_graph.hpp + + + + + typename Creator + + + typename Deleter + + + class + friend class cudaGraphBase + + cudaGraphBase + tf::cudaTask::cudaGraphBase + + cudaGraphBase + + + + + + + + + + + + + typename Creator + + + typename Deleter + + + class + friend class cudaGraphExecBase + + cudaGraphExecBase + tf::cudaTask::cudaGraphExecBase + + cudaGraphExecBase + + + + + + + + + class friend class cudaFlow cudaFlow + tf::cudaTask::cudaFlow - cudaFlow + cudaFlow @@ -18,15 +69,16 @@ - + class friend class cudaFlowCapturer cudaFlowCapturer + tf::cudaTask::cudaFlowCapturer - cudaFlowCapturer + cudaFlowCapturer @@ -34,13 +86,14 @@ - + class friend class cudaFlowCapturerBase cudaFlowCapturerBase + tf::cudaTask::cudaFlowCapturerBase cudaFlowCapturerBase @@ -50,20 +103,21 @@ - + - - std::ostream & - std::ostream& operator<< - (std::ostream &, const cudaTask &) + + std::ostream & + std::ostream & operator<< + (std::ostream &os, const cudaTask &ct) operator<< + tf::cudaTask::operator<< - std::ostream & - os + std::ostream & + os const cudaTask & - ct + ct overload of ostream inserter operator for cudaTask @@ -72,15 +126,16 @@ - + - - - - cudaFlowNode * - cudaFlowNode* tf::cudaTask::_node + + + + cudaGraph_t + cudaGraph_t tf::cudaTask::_native_graph - _node + _native_graph + tf::cudaTask::_native_graph {nullptr} @@ -88,15 +143,31 @@ - + - 
- + + cudaGraphNode_t + cudaGraphNode_t tf::cudaTask::_native_node + + _native_node + tf::cudaTask::_native_node + {nullptr} + + + + + + + + + + tf::cudaTask::cudaTask ()=default cudaTask + tf::cudaTask::cudaTask constructs an empty cudaTask @@ -104,13 +175,14 @@ - + tf::cudaTask::cudaTask (const cudaTask &)=default cudaTask + tf::cudaTask::cudaTask const cudaTask & @@ -121,13 +193,14 @@ - + - + cudaTask & - cudaTask& tf::cudaTask::operator= + cudaTask & tf::cudaTask::operator= (const cudaTask &)=default operator= + tf::cudaTask::operator= const cudaTask & @@ -138,7 +211,7 @@ - + @@ -152,6 +225,7 @@ cudaTask & tf::cudaTask::precede (Ts &&... tasks) precede + tf::cudaTask::precede Ts &&... tasks @@ -184,7 +258,7 @@ - + @@ -198,6 +272,7 @@ cudaTask & tf::cudaTask::succeed (Ts &&... tasks) succeed + tf::cudaTask::succeed Ts &&... tasks @@ -230,57 +305,14 @@ - - - - cudaTask & - cudaTask & tf::cudaTask::name - (const std::string &name) - name - - const std::string & - name - - -assigns a name to the task - - - - -name - - -a std::string acceptable string - - - -*this - - - - - - - - - const std::string & - const std::string & tf::cudaTask::name - () const - name - -queries the name of the task - - - - - - + size_t size_t tf::cudaTask::num_successors () const num_successors + tf::cudaTask::num_successors queries the number of successors @@ -288,13 +320,14 @@ - + - + size_t - size_t tf::cudaTask::num_dependents + size_t tf::cudaTask::num_predecessors () const - num_dependents + num_predecessors + tf::cudaTask::num_predecessors queries the number of dependents @@ -302,66 +335,40 @@ - + - - bool - bool tf::cudaTask::empty - () const - empty - -queries if the task is associated with a cudaFlowNode - - - - - - - - - cudaTaskType - cudaTaskType tf::cudaTask::type + + auto + auto tf::cudaTask::type () const type + tf::cudaTask::type -queries the task type +queries the type of this task - + - - - - typename T - - + void void tf::cudaTask::dump - (T &ostream) const + (std::ostream &os) const dump + tf::cudaTask::dump - T & - ostream + std::ostream & + os dumps the task through an output stream - - -T - - -output stream type with insertion operator (<<) defined - - - - + -ostream +os an output stream target @@ -372,64 +379,23 @@ - + - - - - typename V - - - void - void tf::cudaTask::for_each_successor - (V &&visitor) const - for_each_successor - - V && - visitor - - -applies an visitor callable to each successor of the task - - - - - - - - - - - typename V - - - void - void tf::cudaTask::for_each_dependent - (V &&visitor) const - for_each_dependent - - V && - visitor - - -applies an visitor callable to each dependents of the task - - - - - - - - - - + + + tf::cudaTask::cudaTask - (cudaFlowNode *) + (cudaGraph_t, cudaGraphNode_t) cudaTask + tf::cudaTask::cudaTask + + cudaGraph_t + native_graph + - cudaFlowNode * - node + cudaGraphNode_t + native_node @@ -437,36 +403,34 @@ - + - + -class to create a task handle over an internal node of a cudaFlow graph +class to create a task handle of a CUDA Graph node - + - tf::cudaTask_node + tf::cudaTask_native_graph + tf::cudaTask_native_node tf::cudaTaskcudaFlow tf::cudaTaskcudaFlowCapturer tf::cudaTaskcudaFlowCapturerBase + tf::cudaTaskcudaGraphBase + tf::cudaTaskcudaGraphExecBase tf::cudaTaskcudaTask tf::cudaTaskcudaTask - tf::cudaTaskcudaTask - tf::cudaTaskdump - tf::cudaTaskempty - tf::cudaTaskfor_each_dependent - tf::cudaTaskfor_each_successor - tf::cudaTaskname - tf::cudaTaskname - tf::cudaTasknum_dependents + tf::cudaTaskcudaTask + tf::cudaTaskdump + 
tf::cudaTasknum_predecessors tf::cudaTasknum_successors - tf::cudaTaskoperator<< - tf::cudaTaskoperator= + tf::cudaTaskoperator<< + tf::cudaTaskoperator= tf::cudaTaskprecede tf::cudaTasksucceed - tf::cudaTasktype + tf::cudaTasktype diff --git a/docs/xml/classtf_1_1cudaUSMAllocator.xml b/docs/xml/classtf_1_1cudaUSMAllocator.xml index 5039cf4ca..12f417a60 100644 --- a/docs/xml/classtf_1_1cudaUSMAllocator.xml +++ b/docs/xml/classtf_1_1cudaUSMAllocator.xml @@ -1,20 +1,20 @@ - - + + tf::cudaUSMAllocator - cuda_memory.hpp tf::cudaUSMAllocator::rebind typename T - + T using tf::cudaUSMAllocator< T >::value_type = T value_type + tf::cudaUSMAllocator::value_type element type @@ -22,13 +22,14 @@ - + T * using tf::cudaUSMAllocator< T >::pointer = T* pointer + tf::cudaUSMAllocator::pointer element pointer type @@ -36,13 +37,14 @@ - + T & using tf::cudaUSMAllocator< T >::reference = T& reference + tf::cudaUSMAllocator::reference element reference type @@ -50,13 +52,14 @@ - + const T * using tf::cudaUSMAllocator< T >::const_pointer = const T* const_pointer + tf::cudaUSMAllocator::const_pointer const element pointer type @@ -64,13 +67,14 @@ - + const T & using tf::cudaUSMAllocator< T >::const_reference = const T& const_reference + tf::cudaUSMAllocator::const_reference constant element reference type @@ -78,13 +82,14 @@ - + - std::size_t + std::size_t using tf::cudaUSMAllocator< T >::size_type = std::size_t size_type + tf::cudaUSMAllocator::size_type size type @@ -92,13 +97,14 @@ - + - std::ptrdiff_t + std::ptrdiff_t using tf::cudaUSMAllocator< T >::difference_type = std::ptrdiff_t difference_type + tf::cudaUSMAllocator::difference_type pointer difference type @@ -106,15 +112,16 @@ - + - - + + tf::cudaUSMAllocator< T >::cudaUSMAllocator () noexcept cudaUSMAllocator + tf::cudaUSMAllocator::cudaUSMAllocator Constructs a device allocator object. @@ -122,15 +129,16 @@ - + tf::cudaUSMAllocator< T >::cudaUSMAllocator (const cudaUSMAllocator &) noexcept cudaUSMAllocator + tf::cudaUSMAllocator::cudaUSMAllocator - const cudaUSMAllocator & + const cudaUSMAllocator & Constructs a device allocator object from another device allocator object. @@ -139,7 +147,7 @@ - + @@ -151,8 +159,9 @@ tf::cudaUSMAllocator< T >::cudaUSMAllocator (const cudaUSMAllocator< U > &) noexcept cudaUSMAllocator + tf::cudaUSMAllocator::cudaUSMAllocator - const cudaUSMAllocator< U > & + const cudaUSMAllocator< U > & Constructs a device allocator object from another device allocator object with a different element type. @@ -161,13 +170,14 @@ - + tf::cudaUSMAllocator< T >::~cudaUSMAllocator () noexcept ~cudaUSMAllocator + tf::cudaUSMAllocator::~cudaUSMAllocator Destructs the device allocator object. @@ -175,15 +185,16 @@ - + - pointer + pointer pointer tf::cudaUSMAllocator< T >::address (reference x) address + tf::cudaUSMAllocator::address - reference + reference x @@ -206,15 +217,16 @@ - + - const_pointer + const_pointer const_pointer tf::cudaUSMAllocator< T >::address (const_reference x) const address + tf::cudaUSMAllocator::address - const_reference + const_reference x @@ -237,15 +249,16 @@ - + - pointer + pointer pointer tf::cudaUSMAllocator< T >::allocate (size_type n, const void *=0) allocate + tf::cudaUSMAllocator::allocate - size_type + size_type n @@ -258,7 +271,7 @@ Attempts to allocate a block of storage with a size large enough to contain n elements of member type, value_type, and returns a pointer to the first element. The storage is aligned appropriately for object of type value_type, but they are not constructed. 
-The block of storage is allocated using cudaMalloc and throws std::bad_alloc if it cannot allocate the total amount of storage requested. +The block of storage is allocated using cudaMalloc and throws std::bad_alloc if it cannot allocate the total amount of storage requested. n @@ -274,19 +287,20 @@ - + void void tf::cudaUSMAllocator< T >::deallocate (pointer ptr, size_type) deallocate + tf::cudaUSMAllocator::deallocate - pointer + pointer ptr - size_type + size_type Releases a block of storage previously allocated with member allocate and not yet released. @@ -306,37 +320,39 @@ - + - size_type + size_type size_type tf::cudaUSMAllocator< T >::max_size () const noexcept max_size + tf::cudaUSMAllocator::max_size returns the maximum number of elements that could potentially be allocated by this allocator A call to member allocate with the value returned by this function can still fail to allocate the requested storage. -the nubmer of elements that might be allcoated as maximum by a call to member allocate +the number of elements that might be allocated as maximum by a call to member allocate - + void void tf::cudaUSMAllocator< T >::construct (pointer ptr, const_reference val) construct + tf::cudaUSMAllocator::construct - pointer + pointer ptr - const_reference + const_reference val @@ -364,15 +380,16 @@ - + void void tf::cudaUSMAllocator< T >::destroy (pointer ptr) destroy + tf::cudaUSMAllocator::destroy - pointer + pointer ptr @@ -393,7 +410,7 @@ - + @@ -405,8 +422,9 @@ bool tf::cudaUSMAllocator< T >::operator== (const cudaUSMAllocator< U > &) const noexcept operator== + tf::cudaUSMAllocator::operator== - const cudaUSMAllocator< U > & + const cudaUSMAllocator< U > & compares two allocator of different types using == @@ -416,7 +434,7 @@ - + @@ -428,8 +446,9 @@ bool tf::cudaUSMAllocator< T >::operator!= (const cudaUSMAllocator< U > &) const noexcept operator!= + tf::cudaUSMAllocator::operator!= - const cudaUSMAllocator< U > & + const cudaUSMAllocator< U > & compares two allocator of different types using != @@ -439,25 +458,14 @@ - + - + -class to create a unified shared memory (USM) allocator - - -T - - -element type - - - -A cudaUSMAllocator enables using unified shared memory (USM) allocation for standard library containers. It is typically passed as template parameter when declaring standard library containers (e.g. std::vector). - + tf::cudaUSMAllocatoraddress tf::cudaUSMAllocatoraddress diff --git a/docs/xml/codeofconduct.xml b/docs/xml/codeofconduct.xml index e531afb17..a21e03ee5 100644 --- a/docs/xml/codeofconduct.xml +++ b/docs/xml/codeofconduct.xml @@ -1,5 +1,5 @@ - + codeofconduct Codestin Search App @@ -7,11 +7,11 @@ Taskflow Community Code of Conduct codeofconduct_1TaskflowCodeOfConduct - + Report Violations codeofconduct_1ReportViolations - + @@ -20,8 +20,7 @@ -Codestin Search App -The Taskflow community is made up of members from around the globe with a diverse set of skills, personalities, and experiences. It is through these differences that our community experiences success and continued growth. We expect everyone in our community to follow these guidelines when interacting with others both inside and outside of our community. Our goal is to keep ours a positive, inclusive, successful, and growing community. +Codestin Search AppThe Taskflow community is made up of members from around the globe with a diverse set of skills, personalities, and experiences. It is through these differences that our community experiences success and continued growth. 
We expect everyone in our community to follow these guidelines when interacting with others both inside and outside of our community. Our goal is to keep ours a positive, inclusive, successful, and growing community. As members of the community, We pledge to treat all people with respect and provide a harassment- and bullying-free environment, regardless of sex, sexual orientation and/or gender identity, disability, physical appearance, body size, race, nationality, ethnicity, and religion. In particular, sexual language and imagery, sexist, racist, or otherwise exclusionary jokes are not appropriate. @@ -41,10 +40,9 @@ This code of conduct applies to all community situations online and offline, inc This code of conduct has been adapted from the Astropy Code of Conduct, which in turn uses parts of the PSF code of conduct. -Codestin Search App -To report any violations of the code of conduct, please contact the Taskflow team. We will treat reports confidentially. +Codestin Search AppTo report any violations of the code of conduct, please contact the Taskflow team. We will treat reports confidentially. - + diff --git a/docs/xml/codeofconduct_8dox.xml b/docs/xml/codeofconduct_8dox.xml index 4e467a37b..801a3479b 100644 --- a/docs/xml/codeofconduct_8dox.xml +++ b/docs/xml/codeofconduct_8dox.xml @@ -1,5 +1,5 @@ - + codeofconduct.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/composable__tasking_8dox.xml b/docs/xml/composable__tasking_8dox.xml index 90bee0c8d..1acaf1d5d 100644 --- a/docs/xml/composable__tasking_8dox.xml +++ b/docs/xml/composable__tasking_8dox.xml @@ -1,5 +1,5 @@ - + composable_tasking.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/compound.xsd b/docs/xml/compound.xsd index 6a8a83463..65728bc89 100644 --- a/docs/xml/compound.xsd +++ b/docs/xml/compound.xsd @@ -24,17 +24,23 @@ + + + + + + @@ -59,8 +65,8 @@ - - + + @@ -97,12 +103,26 @@ - + + + + + + + + + + + + + + + @@ -124,11 +144,38 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + - + + @@ -137,16 +184,19 @@ - - - - - - + + + + + + + + + @@ -160,6 +210,7 @@ + @@ -169,7 +220,11 @@ + + + + @@ -218,9 +273,9 @@ - + - + @@ -237,11 +292,11 @@ - + - - - + + + @@ -262,7 +317,7 @@ - + @@ -271,7 +326,7 @@ - + @@ -332,7 +387,7 @@ - + @@ -344,7 +399,7 @@ - + @@ -356,7 +411,7 @@ - + @@ -368,15 +423,39 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + @@ -401,16 +480,30 @@ - + + + + + + + + + + + + + + + - + @@ -424,6 +517,7 @@ + @@ -433,9 +527,9 @@ - - - + + + @@ -697,13 +791,300 @@ + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -712,14 +1093,15 @@ - - - + + + + @@ -757,12 +1139,16 @@ + + + + @@ -833,11 +1219,19 @@ + + + + + + + + - + @@ -851,6 +1245,34 @@ + + + + + + + + + + + + + The mentioned file will be located in the directory as specified by XML_OUTPUT + + + + + + + + + + + + + + + @@ -920,6 +1342,13 @@ + + + + + + + @@ -936,7 +1365,10 @@ - + + + + @@ -1033,6 +1465,8 @@ + + @@ -1063,6 +1497,7 @@ + @@ -1116,6 +1551,7 @@ + @@ -1141,11 +1577,19 @@ + + + + + + + + @@ -1158,6 +1602,33 @@ + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + @@ -1211,5 +1682,15 @@ + + + + + + + + + + diff --git a/docs/xml/conditional__tasking_8dox.xml b/docs/xml/conditional__tasking_8dox.xml index 559650f15..7a352912d 100644 --- a/docs/xml/conditional__tasking_8dox.xml +++ b/docs/xml/conditional__tasking_8dox.xml @@ -1,5 +1,5 @@ - + conditional_tasking.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/contributing_8dox.xml b/docs/xml/contributing_8dox.xml index 91e19288b..c63f1e676 100644 --- a/docs/xml/contributing_8dox.xml +++ b/docs/xml/contributing_8dox.xml @@ -1,5 +1,5 @@ - + contributing.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/contributors.xml b/docs/xml/contributors.xml index 1ac7da3b1..0810db4b4 100644 --- a/docs/xml/contributors.xml +++ b/docs/xml/contributors.xml @@ -1,5 +1,5 @@ - + contributors Codestin Search App @@ -7,28 +7,34 @@ Thank You for Developing Taskflow contributors_1ThankYouForDevelopingTaskflow - + Thank You for Using Taskflow contributors_1ThankYouForUsingTaskflow - + -Codestin Search App -We are grateful for the following contributors (alphabetic order) to the Taskflow project: +Codestin Search AppWe are grateful for the following contributors (alphabetic order) to the Taskflow project: Alexander Neumann: made Taskflow importable from external CMake projects +Andatr: improved the hashing performance in freelist + +Anesthesia4: added unit tests for parallel-transform algorithms + + Antony Chan: added unit tests for parallel-transform algorithms Andreas Olofsson: supported the Taskflow project through the DARPA IDEA program Aaron Boxer: fixed compiler warning caused by unsigned-signed conversion -Benson Muite: fixed compilation errors of the wavefront benchmark +Wolfgang Bangerth: fixed the redundant nullptr check + +Benson Muite: fixed compilation errors of the BFS benchmark Cheng-Hsiang Chiu: improved the documentation, fixes typos, and test code examples @@ -36,14 +42,18 @@ Chun-Xun Lin: co-created the Taskflow project and designed the core functionalities +Conrad Jones: added cancellation query support from the runtime task + Craffael: improved the CMake to allow relocatable installation -Dan Kersten: designed an interface to allow customizing worker behaviors upon their creation in an executor +Dan Kersten: designed an interface to allow customizing worker behaviors -Daniel Jour: improved cmake through out-of-tree builds and designed the semaphore interface +Daniel Jour: improved cmake via out-of-tree builds and designed the semaphore interface Dian-Lun Lin: applied Taskflow to win the champion award of the IEEE HPEC 2020 Graph Challenge +Evgeny Gorodetskiy: fixed task queue compilation error due to wrong macro locations + Filip Strugar: fixed the bugs in fire-and-get taskflow execution and parallel algorithms Foge Mistress: helped design the executor interface to avoid over-subscribed threads @@ -60,10 +70,14 @@ Hoildkv: fixed documentation errors in explaining the observer interface of executor -Jean Michael: integrated Taskflow to the OSSIA project and reported feedback in comparison to TBB +Isaac Yousuf: fixed the bug in exception handling for worker loop + +Jean Michael: integrated Taskflow to the OSSIA project Jiawei Liu: fixed typos in the documentation +Junlian Gilbey: added the explicit link to libatomic on some architectures + Junlin Huang: fixed the erroneous template argument in serializer and deserializer KingDuckZ: helped discover memory leak in the object pool @@ -76,6 +90,8 @@ Lukas Burgholzer: improved the MAC OS compatibility with the standard variant 
library +Lukasz Wojakowski: identified delayed execution bug in module task + Luke Majors: implemented a sanitizer algorithm to sanitize deadlock control-flow tasks McKay Mower: implemented a sanitizer algorithm to sanitize non-reachable control-flow tasks @@ -92,9 +108,11 @@ Nate: fixed the compilation error of priority task queue on MS platforms +Nan Xiao: fixed compilation error of unit tests on the Arch platform + Netcan: designed a domain-specific graph language to simplify the creation of taskflows -Nan Xiao: fixed compilation error of unit tests on the Arch platform +Nevin: fixed the macro crash in windows Ojas Mithbavkar: implemented cancellation of submitted taskflows @@ -114,7 +132,9 @@ Remi Bedard-Couture: added big object compilation support on MSVC -Robin Soderholm: fixed the runtime error of cudaEvent destructor +Robin Soderholm: fixed the runtime error of cudaEvent destructor + +Ruixin Huang: fixed bugs in conditional tasking documentation Soonho Kong: fixed the compilation warning of unused lambda variables @@ -132,16 +152,20 @@ Vedran Miletic: patched the OS detection utility to include Solaris and illumos -Vladimir Von­drus: helped modernize Taskflow handbook using m.css and make pages mobile-friendly +Vladimir Von­drus: helped modernize Taskflow handbook using m.css Vladyslav: fixed comment errors in README.md and examples +WiCyn: identified a bug in scheduling condition tasks during run-n + Yasin Zamani: benchmarked the parallel sort with the TBB baseline Yibo Lin: helped design the interface of conditional tasking Yilin Qiu: helped implement the dependency removal methods in Taskflow +Yumeno Yan: fixed the C++ macro error in the MSVC environment + Weile: helped added Taskflow to the compiler explorer interface Zizheng Guo: applied Taskflow to speed up VLSI timing analysis and shared his feedback @@ -150,8 +174,7 @@ Please contact us if we forgot your name! -Codestin Search App -We are grateful for the following organizations and projects that are using Taskflow: +Codestin Search AppWe are grateful for the following organizations and projects that are using Taskflow: OpenTimer: A high-performance timing analysis tool for VLSI designs @@ -189,7 +212,7 @@ Please contact us i RPGMPacker: CLI program for packaging RPG Maker games in an automated build/deploy pipeline. -Leanify: A lightweight lossless file minifier and optimizer +Leanify: A lightweight lossless file compressor Xanadu AI: Accelerate simulation using quantum computing @@ -213,7 +236,7 @@ Please contact us i OOX: Out-of-order task execution library in modern C++ -ReAgent: An open end-to-end platform for applied reinforcement learning developed and used at Facebook +ReAgent: An open-source platform for applied reinforcement learning developed by Meta Beast-Build: A build system built for speed and power @@ -229,13 +252,15 @@ Please contact us i AMD Vivao: AMD's software synthesis suite for hardware designs -ModuleWorks: Industry-proven ModuleWorks CAD/CAM technology into software solutions +ModuleWorks: Industry-proven ModuleWorks CAD/CAM technology into software solutions + +Nvidia std::exec: Nvidia's implementation for C++26 Standard executor libraries -... more at GitHub. +... more at GitHub. Please contact us if we forgot your name! 
- + diff --git a/docs/xml/contributors_8dox.xml b/docs/xml/contributors_8dox.xml index 75bb73e7e..87182429e 100644 --- a/docs/xml/contributors_8dox.xml +++ b/docs/xml/contributors_8dox.xml @@ -1,5 +1,5 @@ - + contributors.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/core_2taskflow_8hpp.xml b/docs/xml/core_2taskflow_8hpp.xml index 4ab834c74..8f21e7108 100644 --- a/docs/xml/core_2taskflow_8hpp.xml +++ b/docs/xml/core_2taskflow_8hpp.xml @@ -1,7 +1,269 @@ - + - core/taskflow.hpp + taskflow.hpp + flow_builder.hpp + taskflow/core/executor.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::Taskflow tf::Taskflow::Dumper tf::Future @@ -11,6 +273,6 @@ - + diff --git a/docs/xml/critical_8hpp.xml b/docs/xml/critical_8hpp.xml deleted file mode 100644 index 47089b101..000000000 --- a/docs/xml/critical_8hpp.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - critical.hpp - tf::CriticalSection - tf - -critical include file - - - - - - diff --git a/docs/xml/cudaFlowAlgorithms.xml b/docs/xml/cudaFlowAlgorithms.xml deleted file mode 100644 index adcaac1f0..000000000 --- a/docs/xml/cudaFlowAlgorithms.xml +++ /dev/null @@ -1,22 +0,0 @@ - - - - cudaFlowAlgorithms - Codestin Search App - Single %Task - Parallel Iterations - Parallel Transforms - - - -cudaFlow provides template methods for expressing standard parallel algorithms in a GPU task graph. - -Single Task -Parallel Iterations -Parallel Transforms - - - - - - diff --git a/docs/xml/cudaStandardAlgorithms.xml b/docs/xml/cudaStandardAlgorithms.xml deleted file mode 100644 index 3347117f8..000000000 --- a/docs/xml/cudaStandardAlgorithms.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - - cudaStandardAlgorithms - Codestin Search App - Execution Policy - Single %Task - Parallel Iterations - Parallel Transforms - Parallel Reduction - Parallel Scan - Parallel Merge - Parallel Find - - - -Taskflow provides template methods for expressing standard parallel algorithms on a CUDA GPU. 
- -Execution Policy -Single Task -Parallel Iterations -Parallel Transforms -Parallel Reduction -Parallel Scan -Parallel Merge -Parallel Find - - - - - - diff --git a/docs/xml/cuda__capturer_8hpp.xml b/docs/xml/cuda__capturer_8hpp.xml deleted file mode 100644 index 7798a07af..000000000 --- a/docs/xml/cuda__capturer_8hpp.xml +++ /dev/null @@ -1,16 +0,0 @@ - - - - cuda_capturer.hpp - tf::cudaFlowCapturer - tf::cudaFlowCapturer::External - tf::cudaFlowCapturer::Internal - tf - -cudaFlow capturer include file - - - - - - diff --git a/docs/xml/cuda__compile_8dox.xml b/docs/xml/cuda__compile_8dox.xml index d713e7de7..dc68ec1bf 100644 --- a/docs/xml/cuda__compile_8dox.xml +++ b/docs/xml/cuda__compile_8dox.xml @@ -1,5 +1,5 @@ - + cuda_compile.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/cuda__device_8hpp.xml b/docs/xml/cuda__device_8hpp.xml index 37677ac3a..1ad21c6a9 100644 --- a/docs/xml/cuda__device_8hpp.xml +++ b/docs/xml/cuda__device_8hpp.xml @@ -1,7 +1,64 @@ - + cuda_device.hpp + cuda_error.hpp + taskflow/cuda/cuda_memory.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::cudaScopedDevice tf @@ -9,6 +66,6 @@ - + diff --git a/docs/xml/cuda__execution__policy_8hpp.xml b/docs/xml/cuda__execution__policy_8hpp.xml deleted file mode 100644 index d7be9bda8..000000000 --- a/docs/xml/cuda__execution__policy_8hpp.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - cuda_execution_policy.hpp - tf::cudaExecutionPolicy - tf - -CUDA execution policy include file. - - - - - - diff --git a/docs/xml/cuda__graph_8hpp.xml b/docs/xml/cuda__graph_8hpp.xml new file mode 100644 index 000000000..d7c0f2f57 --- /dev/null +++ b/docs/xml/cuda__graph_8hpp.xml @@ -0,0 +1,101 @@ + + + + cuda_graph.hpp + filesystem + cuda_memory.hpp + cuda_stream.hpp + cuda_meta.hpp + ../utility/traits.hpp + taskflow/cuda/cuda_graph_exec.hpp + taskflow/cuda/cudaflow.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::cudaTask + tf::cudaGraphCreator + tf::cudaGraphDeleter + tf::cudaGraphBase + tf + + + + + + + diff --git a/docs/xml/cuda__graph__exec_8hpp.xml b/docs/xml/cuda__graph__exec_8hpp.xml new file mode 100644 index 000000000..53f4efdbf --- /dev/null +++ b/docs/xml/cuda__graph__exec_8hpp.xml @@ -0,0 +1,93 @@ + + + + cuda_graph_exec.hpp + cuda_graph.hpp + taskflow/cuda/cudaflow.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::cudaGraphExecCreator + tf::cudaGraphExecDeleter + tf::cudaGraphExecBase + tf + + + + + + + diff --git a/docs/xml/cuda__memory_8hpp.xml b/docs/xml/cuda__memory_8hpp.xml index 9aa9b0cec..60acc8973 100644 --- a/docs/xml/cuda__memory_8hpp.xml +++ b/docs/xml/cuda__memory_8hpp.xml @@ -1,7 +1,64 @@ - + cuda_memory.hpp + cuda_device.hpp + taskflow/cuda/cuda_graph.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::cudaSharedMemory tf::cudaSharedMemory< int > tf::cudaSharedMemory< unsigned int > @@ -14,9 +71,9 @@ tf::cudaSharedMemory< bool > tf::cudaSharedMemory< float > tf::cudaSharedMemory< double > - tf::cudaDeviceAllocator + tf::cudaDeviceAllocator tf::cudaDeviceAllocator::rebind - tf::cudaUSMAllocator + tf::cudaUSMAllocator tf::cudaUSMAllocator::rebind tf::cudaDeviceVector tf @@ -25,6 +82,6 @@ - + diff --git a/docs/xml/cuda__optimizer_8hpp.xml 
b/docs/xml/cuda__optimizer_8hpp.xml deleted file mode 100644 index 64a5dc2e7..000000000 --- a/docs/xml/cuda__optimizer_8hpp.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - cuda_optimizer.hpp - tf::cudaFlowOptimizerBase - tf::cudaFlowSequentialOptimizer - tf::cudaFlowLinearOptimizer - tf::cudaFlowRoundRobinOptimizer - tf - -cudaFlow capturing algorithms include file - - - - - - diff --git a/docs/xml/cuda__std__algorithms_8dox.xml b/docs/xml/cuda__std__algorithms_8dox.xml deleted file mode 100644 index 31f689d2e..000000000 --- a/docs/xml/cuda__std__algorithms_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - cuda_std_algorithms.dox - tf - - - - - - - diff --git a/docs/xml/cuda__std__execution__policy_8dox.xml b/docs/xml/cuda__std__execution__policy_8dox.xml deleted file mode 100644 index 966c88ed7..000000000 --- a/docs/xml/cuda__std__execution__policy_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - cuda_std_execution_policy.dox - tf - - - - - - - diff --git a/docs/xml/cuda__std__for__each_8dox.xml b/docs/xml/cuda__std__for__each_8dox.xml deleted file mode 100644 index e674ea72d..000000000 --- a/docs/xml/cuda__std__for__each_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - cuda_std_for_each.dox - tf - - - - - - - diff --git a/docs/xml/cuda__std__reduce_8dox.xml b/docs/xml/cuda__std__reduce_8dox.xml deleted file mode 100644 index 6cf40de03..000000000 --- a/docs/xml/cuda__std__reduce_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - cuda_std_reduce.dox - tf - - - - - - - diff --git a/docs/xml/cuda__std__single__task_8dox.xml b/docs/xml/cuda__std__single__task_8dox.xml deleted file mode 100644 index 481cc3b2c..000000000 --- a/docs/xml/cuda__std__single__task_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - cuda_std_single_task.dox - tf - - - - - - - diff --git a/docs/xml/cuda__std__transform_8dox.xml b/docs/xml/cuda__std__transform_8dox.xml deleted file mode 100644 index 3c33ba3af..000000000 --- a/docs/xml/cuda__std__transform_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - cuda_std_transform.dox - tf - - - - - - - diff --git a/docs/xml/cuda__stream_8hpp.xml b/docs/xml/cuda__stream_8hpp.xml index 27abf0fa1..fe9ec9047 100644 --- a/docs/xml/cuda__stream_8hpp.xml +++ b/docs/xml/cuda__stream_8hpp.xml @@ -1,19 +1,70 @@ - + cuda_stream.hpp - tf::cudaStreamCreator - tf::cudaStreamDeleter - tf::cudaStream - tf::cudaEventCreator - tf::cudaEventDeleter - tf::cudaEvent + cuda_error.hpp + taskflow/cuda/cuda_graph.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::cudaEventCreator + tf::cudaEventDeleter + tf::cudaEventBase + tf::cudaStreamCreator + tf::cudaStreamDeleter + tf::cudaStreamBase tf CUDA stream utilities include file. 
- + diff --git a/docs/xml/cuda__task_8hpp.xml b/docs/xml/cuda__task_8hpp.xml deleted file mode 100644 index 637a85ac6..000000000 --- a/docs/xml/cuda__task_8hpp.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - cuda_task.hpp - tf::cudaTask - tf - -cudaTask include file - - - - - - diff --git a/docs/xml/cudaflow_8hpp.xml b/docs/xml/cudaflow_8hpp.xml index e92e8be81..1ad79bd10 100644 --- a/docs/xml/cudaflow_8hpp.xml +++ b/docs/xml/cudaflow_8hpp.xml @@ -1,14 +1,368 @@ - + cudaflow.hpp - tf::cudaFlow + ../taskflow.hpp + cuda_graph.hpp + cuda_graph_exec.hpp + algorithm/single_task.hpp + taskflow/cuda/algorithm/for_each.hpp + taskflow/cuda/algorithm/transform.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf cudaFlow include file - + diff --git a/docs/xml/cudaflow__algorithms_8dox.xml b/docs/xml/cudaflow__algorithms_8dox.xml deleted file mode 100644 index 014408aa0..000000000 --- a/docs/xml/cudaflow__algorithms_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - cudaflow_algorithms.dox - tf - - - - - - - diff --git a/docs/xml/cudaflow__for__each_8dox.xml b/docs/xml/cudaflow__for__each_8dox.xml deleted file mode 100644 index 37bc3dfb1..000000000 --- a/docs/xml/cudaflow__for__each_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - cudaflow_for_each.dox - tf - - - - - - - diff --git a/docs/xml/cudaflow__single__task_8dox.xml b/docs/xml/cudaflow__single__task_8dox.xml deleted file mode 100644 index 769676a8c..000000000 --- a/docs/xml/cudaflow__single__task_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - cudaflow_single_task.dox - tf - - - - - - - diff --git a/docs/xml/cudaflow__transform_8dox.xml b/docs/xml/cudaflow__transform_8dox.xml deleted file mode 100644 index f58ff8d0e..000000000 --- a/docs/xml/cudaflow__transform_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - cudaflow_transform.dox - tf - - - - - - - diff --git a/docs/xml/cudaflow_capturer_1.dot b/docs/xml/cudaflow_capturer_1.dot deleted file mode 100644 index 44bea6e54..000000000 --- a/docs/xml/cudaflow_capturer_1.dot +++ /dev/null @@ -1,7 +0,0 @@ -digraph cudaFlowCapturer { - rankdir="LR"; - subgraph cluster_capturer{ - label="cudaFlow: capturer" - my_kernel_1 -> my_kernel_2; - } -} diff --git a/docs/xml/cudaflow_capturer_2.dot b/docs/xml/cudaflow_capturer_2.dot deleted file mode 100644 index 0fc20f0c2..000000000 --- a/docs/xml/cudaflow_capturer_2.dot +++ /dev/null @@ -1,8 +0,0 @@ -digraph cudaFlowCapturer { - rankdir="LR"; - subgraph cluster_capturer{ - label="cudaFlow: capturer" - h2d -> my_kernel; - my_kernel -> dh2; - } -} diff --git a/docs/xml/cudaflow_capturer_3.dot b/docs/xml/cudaflow_capturer_3.dot deleted file mode 100644 index 3e5875796..000000000 --- a/docs/xml/cudaflow_capturer_3.dot +++ /dev/null @@ -1,14 +0,0 @@ -digraph cudaFlow { -rankdir="LR"; -p0x28fcca0[label="kernel" style="filled" color="white" fillcolor="black" fontcolor="white" shape="box3d"]; -p0x28fcca0 -> p0x28fd510; 
-p0x28fd510[label="capturer" style="filled" color="black" fillcolor="purple" fontcolor="white" shape="folder"]; -subgraph cluster_p0x28fd510 { -label="cudaSubflow: capturer"; -color="purple" -p0x28fd5e0[label="kernel_1"]; -p0x28fd5e0 -> p0x28fd6b0; -p0x28fd6b0[label="kernel_2"]; -p0x28fd6b0 -> p0x28fd510; -} -} diff --git a/docs/xml/data__pipeline_8dox.xml b/docs/xml/data__pipeline_8dox.xml index ebcdea791..4657bfd7a 100644 --- a/docs/xml/data__pipeline_8dox.xml +++ b/docs/xml/data__pipeline_8dox.xml @@ -1,5 +1,5 @@ - + data_pipeline.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/data__pipeline_8hpp.xml b/docs/xml/data__pipeline_8hpp.xml index 44d518736..706b6ff5a 100644 --- a/docs/xml/data__pipeline_8hpp.xml +++ b/docs/xml/data__pipeline_8hpp.xml @@ -1,7 +1,289 @@ - + data_pipeline.hpp + pipeline.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::DataPipe tf::DataPipeline tf::DataPipeline::Line @@ -11,6 +293,6 @@ - + diff --git a/docs/xml/dependent__async__tasking_8dox.xml b/docs/xml/dependent__async__tasking_8dox.xml index 137a254d2..d3ea9256c 100644 --- a/docs/xml/dependent__async__tasking_8dox.xml +++ b/docs/xml/dependent__async__tasking_8dox.xml @@ -1,5 +1,5 @@ - + dependent_async_tasking.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/dir_04c130fdbeeccfa0338db9f77a5dc2c3.xml b/docs/xml/dir_04c130fdbeeccfa0338db9f77a5dc2c3.xml index d43bcc56c..b73ab22c0 100644 --- a/docs/xml/dir_04c130fdbeeccfa0338db9f77a5dc2c3.xml +++ b/docs/xml/dir_04c130fdbeeccfa0338db9f77a5dc2c3.xml @@ -1,9 +1,9 @@ - + - algorithm - critical.hpp + taskflow/algorithm data_pipeline.hpp + module.hpp partitioner.hpp pipeline.hpp @@ -11,6 +11,6 @@ - + diff --git a/docs/xml/dir_05586da0f4e90fa96d454e8d75d56e9a.xml b/docs/xml/dir_05586da0f4e90fa96d454e8d75d56e9a.xml deleted file mode 100644 index c464be95f..000000000 --- a/docs/xml/dir_05586da0f4e90fa96d454e8d75d56e9a.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - cudaflow_algorithms - - - - - - - diff --git a/docs/xml/dir_0c6655e7a474ec7aa2f43d8d56b9e1c1.xml b/docs/xml/dir_0c6655e7a474ec7aa2f43d8d56b9e1c1.xml index ef8563cf1..9cfbc75db 100644 --- a/docs/xml/dir_0c6655e7a474ec7aa2f43d8d56b9e1c1.xml +++ b/docs/xml/dir_0c6655e7a474ec7aa2f43d8d56b9e1c1.xml @@ -1,11 +1,11 @@ - + - examples + doxygen/examples - + diff --git a/docs/xml/dir_13901390c6d5ee592c18c2f167e01d4f.xml b/docs/xml/dir_13901390c6d5ee592c18c2f167e01d4f.xml index 5dc144afc..6c2a50c93 100644 --- a/docs/xml/dir_13901390c6d5ee592c18c2f167e01d4f.xml +++ b/docs/xml/dir_13901390c6d5ee592c18c2f167e01d4f.xml @@ -1,11 +1,11 @@ - + - install + doxygen/install - + diff --git a/docs/xml/dir_183ade9c70bd4384e3037d383160f942.xml b/docs/xml/dir_183ade9c70bd4384e3037d383160f942.xml index a50a75e47..c9bd79da8 100644 --- a/docs/xml/dir_183ade9c70bd4384e3037d383160f942.xml +++ b/docs/xml/dir_183ade9c70bd4384e3037d383160f942.xml @@ -1,11 +1,11 @@ - + - usecases + doxygen/usecases - + diff --git a/docs/xml/dir_220cd4d9b8cb38c840b455d5d75c25bb.xml b/docs/xml/dir_220cd4d9b8cb38c840b455d5d75c25bb.xml index 
6f1acbe66..0553702ed 100644 --- a/docs/xml/dir_220cd4d9b8cb38c840b455d5d75c25bb.xml +++ b/docs/xml/dir_220cd4d9b8cb38c840b455d5d75c25bb.xml @@ -1,15 +1,16 @@ - + - core + taskflow/core async_task.hpp executor.hpp flow_builder.hpp graph.hpp observer.hpp + runtime.hpp semaphore.hpp task.hpp - core/taskflow.hpp + taskflow.hpp tsq.hpp worker.hpp @@ -17,6 +18,6 @@ - + diff --git a/docs/xml/dir_4e8d938e9ddb5a617c200d5739d1f41a.xml b/docs/xml/dir_4e8d938e9ddb5a617c200d5739d1f41a.xml new file mode 100644 index 000000000..18984e202 --- /dev/null +++ b/docs/xml/dir_4e8d938e9ddb5a617c200d5739d1f41a.xml @@ -0,0 +1,21 @@ + + + + doxygen + doxygen/algorithms + doxygen/contributing + doxygen/cookbook + doxygen/examples + doxygen/governance + doxygen/install + doxygen/references + doxygen/releases + doxygen/usecases + header.html + + + + + + + diff --git a/docs/xml/dir_61bd9e18b52c497a2e6d3af3a72c0d02.xml b/docs/xml/dir_61bd9e18b52c497a2e6d3af3a72c0d02.xml index 926e32ab8..5f146110b 100644 --- a/docs/xml/dir_61bd9e18b52c497a2e6d3af3a72c0d02.xml +++ b/docs/xml/dir_61bd9e18b52c497a2e6d3af3a72c0d02.xml @@ -1,11 +1,11 @@ - + - cookbook + doxygen/cookbook - + diff --git a/docs/xml/dir_638d51f8e6f20ea8c720cc8c006296ba.xml b/docs/xml/dir_638d51f8e6f20ea8c720cc8c006296ba.xml index 677f177b0..a597b1433 100644 --- a/docs/xml/dir_638d51f8e6f20ea8c720cc8c006296ba.xml +++ b/docs/xml/dir_638d51f8e6f20ea8c720cc8c006296ba.xml @@ -1,21 +1,19 @@ - + - cuda - algorithm - cuda_capturer.hpp + taskflow/cuda + taskflow/cuda/algorithm cuda_device.hpp - cuda_execution_policy.hpp + cuda_graph.hpp + cuda_graph_exec.hpp cuda_memory.hpp - cuda_optimizer.hpp cuda_stream.hpp - cuda_task.hpp cudaflow.hpp taskflow CUDA include dir - + diff --git a/docs/xml/dir_73635165b734e23094c358e517ec45fc.xml b/docs/xml/dir_73635165b734e23094c358e517ec45fc.xml index 441725485..af42cacec 100644 --- a/docs/xml/dir_73635165b734e23094c358e517ec45fc.xml +++ b/docs/xml/dir_73635165b734e23094c358e517ec45fc.xml @@ -1,11 +1,11 @@ - + - algorithms + doxygen/algorithms - + diff --git a/docs/xml/dir_7c512093e4879e21c0dd502d7d593a16.xml b/docs/xml/dir_7c512093e4879e21c0dd502d7d593a16.xml index 9a4bd5754..c3c7c3946 100644 --- a/docs/xml/dir_7c512093e4879e21c0dd502d7d593a16.xml +++ b/docs/xml/dir_7c512093e4879e21c0dd502d7d593a16.xml @@ -1,11 +1,11 @@ - + - releases + doxygen/releases - + diff --git a/docs/xml/dir_7d8f2e56a3b68fb88e627c2a1db4941a.xml b/docs/xml/dir_7d8f2e56a3b68fb88e627c2a1db4941a.xml index 2aee9e6bb..ff30f3a67 100644 --- a/docs/xml/dir_7d8f2e56a3b68fb88e627c2a1db4941a.xml +++ b/docs/xml/dir_7d8f2e56a3b68fb88e627c2a1db4941a.xml @@ -1,18 +1,13 @@ - + - algorithm - find.hpp + taskflow/cuda/algorithm for_each.hpp - merge.hpp - reduce.hpp - scan.hpp - sort.hpp transform.hpp - + diff --git a/docs/xml/dir_87abf3142b2bf0ff331672dc90c991b0.xml b/docs/xml/dir_87abf3142b2bf0ff331672dc90c991b0.xml index add265766..76ef0b176 100644 --- a/docs/xml/dir_87abf3142b2bf0ff331672dc90c991b0.xml +++ b/docs/xml/dir_87abf3142b2bf0ff331672dc90c991b0.xml @@ -1,11 +1,11 @@ - + - governance + doxygen/governance - + diff --git a/docs/xml/dir_88dad41ea55ca2177e141d32a93e931c.xml b/docs/xml/dir_88dad41ea55ca2177e141d32a93e931c.xml index 2df89a48c..b3595fe57 100644 --- a/docs/xml/dir_88dad41ea55ca2177e141d32a93e931c.xml +++ b/docs/xml/dir_88dad41ea55ca2177e141d32a93e931c.xml @@ -1,17 +1,17 @@ - + taskflow - algorithm - core - cuda - utility + taskflow/algorithm + taskflow/core + taskflow/cuda + taskflow/utility taskflow.hpp root taskflow include dir - + diff --git 
a/docs/xml/dir_b300e8dd3979c341db683b8f1cb76e6e.xml b/docs/xml/dir_b300e8dd3979c341db683b8f1cb76e6e.xml deleted file mode 100644 index e255a5916..000000000 --- a/docs/xml/dir_b300e8dd3979c341db683b8f1cb76e6e.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - cuda_std_algorithms - - - - - - - diff --git a/docs/xml/dir_ce5b1d0a1b287ae7223729d7a3a091a8.xml b/docs/xml/dir_ce5b1d0a1b287ae7223729d7a3a091a8.xml index 099b69ea7..7cf92a155 100644 --- a/docs/xml/dir_ce5b1d0a1b287ae7223729d7a3a091a8.xml +++ b/docs/xml/dir_ce5b1d0a1b287ae7223729d7a3a091a8.xml @@ -1,12 +1,15 @@ - + - utility + taskflow/utility + iterator.hpp + math.hpp + os.hpp small_vector.hpp - + diff --git a/docs/xml/dir_d7a9e4fcc659571fb4c113eec28c5eeb.xml b/docs/xml/dir_d7a9e4fcc659571fb4c113eec28c5eeb.xml index c2ce98489..b2233b9eb 100644 --- a/docs/xml/dir_d7a9e4fcc659571fb4c113eec28c5eeb.xml +++ b/docs/xml/dir_d7a9e4fcc659571fb4c113eec28c5eeb.xml @@ -1,11 +1,11 @@ - + - references + doxygen/references - + diff --git a/docs/xml/dir_ecfa7d70310a08b350e190615cc70712.xml b/docs/xml/dir_ecfa7d70310a08b350e190615cc70712.xml index 768b300e6..2121d9b79 100644 --- a/docs/xml/dir_ecfa7d70310a08b350e190615cc70712.xml +++ b/docs/xml/dir_ecfa7d70310a08b350e190615cc70712.xml @@ -1,11 +1,11 @@ - + - contributing + doxygen/contributing - + diff --git a/docs/xml/doxyfile.xsd b/docs/xml/doxyfile.xsd index fbfc2c13d..e0da4781a 100644 --- a/docs/xml/doxyfile.xsd +++ b/docs/xml/doxyfile.xsd @@ -16,7 +16,7 @@ - + @@ -26,6 +26,311 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/xml/dreamplace.xml b/docs/xml/dreamplace.xml index 15c1e6d2d..b1a8bf88d 100644 --- a/docs/xml/dreamplace.xml +++ b/docs/xml/dreamplace.xml @@ -1,5 +1,5 @@ - + dreamplace Codestin Search App @@ -7,31 +7,30 @@ DreamPlace: GPU-accelerated Placement Engine dreamplace_1UseCasesDreamPlace - + Programming Effort dreamplace_1UseCasesDreamPlaceProgrammingEffort - + Performance dreamplace_1UseCasesDreamPlacePerformance - + Conclusion dreamplace_1UseCasesDreamPlaceConclusion - + References dreamplace_1UseCasesDreamPlaceReferences - + We applied Taskflow to solve a VLSI placement problem. The goal is to determine the physical locations of cells (logic gates) in a fixed layout region using minimal interconnect wirelength. -Codestin Search App -Placement is an important step in the layout generation stage of a circuit design. It places each cell of synthesized netlists in a region and optimizes their interconnect. The following figure shows a placement layout of an industrial design, adaptec1. +Codestin Search AppPlacement is an important step in the layout generation stage of a circuit design. It places each cell of synthesized netlists in a region and optimizes their interconnect. The following figure shows a placement layout of an industrial design, adaptec1. Modern placement typically incorporates hundreds of millions of cells and takes several hours to finish. 
To reduce the long runtime, recent work started investigating new CPU-GPU algorithms. We consider a matching-based hybrid CPU-GPU placement refinement algorithm developed by DREAMPlace. The algorithm iterates the following:
@@ -48,9 +47,8 @@ Each iteration contains overlapped CPU and GPU tasks with nested conditions to d
-Codestin Search App
-We implemented the hybrid CPU-GPU placement algorithm using Taskflow, Intel TBB, and StarPU. The algorithm is crafted on one GPU and many CPUs. Since TBB and StarPU have no support for nested conditions, we unroll their task graphs across fixed-length iterations found in hindsight. The figure below shows a partial taskflow of 4 cudaFlows, 1 conditioned cycle, and 12 static tasks for one placement iteration.
-
+Codestin Search AppWe implemented the hybrid CPU-GPU placement algorithm using Taskflow, Intel TBB, and StarPU. The algorithm is crafted on one GPU and many CPUs. Since TBB and StarPU have no support for nested conditions, we unroll their task graphs across fixed-length iterations found in hindsight. The figure below shows a partial taskflow of 4 cudaFlows, 1 conditioned cycle, and 12 static tasks for one placement iteration.
+
    @@ -85,8 +83,7 @@ Each iteration contains overlapped CPU and GPU tasks with nested conditions to d -Codestin Search App -Using 8 CPUs and 1 GPU, Taskflow is consistently faster than others across all problem sizes (placement iterations). The gap becomes clear at large problem size; at 100 iterations, Taskflow is 17% faster than TBB and StarPU. We observed similar results using other CPU numbers. Performance saturates at about 16 CPUs, primarily due to the inherent irregularity of the placement algorithm. +Codestin Search AppUsing 8 CPUs and 1 GPU, Taskflow is consistently faster than others across all problem sizes (placement iterations). The gap becomes clear at large problem size; at 100 iterations, Taskflow is 17% faster than TBB and StarPU. We observed similar results using other CPU numbers. Performance saturates at about 16 CPUs, primarily due to the inherent irregularity of the placement algorithm. The memory footprint shows the benefit of our conditional tasking. We keep nearly no growth of memory when the problem size increases, whereas StarPU and TBB grow linearly due to unrolled task graphs. At a vertical scale, increasing the number of CPUs bumps up the memory usage of all methods, but the growth rate of Taskflow is much slower than the others. @@ -100,12 +97,10 @@ Each iteration contains overlapped CPU and GPU tasks with nested conditions to d -Codestin Search App -We have observed two significant benefits of Taskflow over existing programming systems. The first benefit is our conditional tasking. Condition tasks encode control-flow decisions directly in a cyclic task graph rather than unrolling it statically across iterations, saving a lot of memory usage. The second benefit is our runtime scheduler. Our scheduler is able to adapt the number of worker threads to available task parallelism at any time during the graph execution, providing improved performance, power efficiency, and system throughput. +Codestin Search AppWe have observed two significant benefits of Taskflow over existing programming systems. The first benefit is our conditional tasking. Condition tasks encode control-flow decisions directly in a cyclic task graph rather than unrolling it statically across iterations, saving a lot of memory usage. The second benefit is our runtime scheduler. Our scheduler is able to adapt the number of worker threads to available task parallelism at any time during the graph execution, providing improved performance, power efficiency, and system throughput. -Codestin Search App - +Codestin Search App Yibo Lin, Wuxi Li, Jiaqi Gu, Haoxing Ren, Brucek Khailany and David Z. Pan, "ABCDPlace: Accelerated Batch-based Concurrent Detailed Placement on Multi-threaded CPUs and GPUs," IEEE Transactions on Computer-aided Design of Integrated Circuits and Systems (TCAD), vol. 39, no. 12, pp. 5083-5096, Dec. 2020 Yibo Lin, Shounak Dhar, Wuxi Li, Haoxing Ren, Brucek Khailany and David Z. 
Pan, "DREAMPlace: Deep Learning Toolkit-Enabled GPU Acceleration for Modern VLSI Placement", ACM/IEEE Design Automation Conference (DAC), Las Vegas, NV, Jun 2-6, 2019 @@ -114,6 +109,6 @@ Each iteration contains overlapped CPU and GPU tasks with nested conditions to d - + diff --git a/docs/xml/dreamplace_8dox.xml b/docs/xml/dreamplace_8dox.xml index f7887ae35..5b8acad05 100644 --- a/docs/xml/dreamplace_8dox.xml +++ b/docs/xml/dreamplace_8dox.xml @@ -1,5 +1,5 @@ - + dreamplace.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/examples_8dox.xml b/docs/xml/examples_8dox.xml index 36aec409d..b2670efc5 100644 --- a/docs/xml/examples_8dox.xml +++ b/docs/xml/examples_8dox.xml @@ -1,5 +1,5 @@ - + examples.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/exception_8dox.xml b/docs/xml/exception_8dox.xml index 35e18a41f..28bee7db0 100644 --- a/docs/xml/exception_8dox.xml +++ b/docs/xml/exception_8dox.xml @@ -1,5 +1,5 @@ - + exception.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/executor_8dox.xml b/docs/xml/executor_8dox.xml index 825914a04..c11d98747 100644 --- a/docs/xml/executor_8dox.xml +++ b/docs/xml/executor_8dox.xml @@ -1,5 +1,5 @@ - + executor.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/executor_8hpp.xml b/docs/xml/executor_8hpp.xml index 9773213e8..0269d8373 100644 --- a/docs/xml/executor_8hpp.xml +++ b/docs/xml/executor_8hpp.xml @@ -1,7 +1,314 @@ - + executor.hpp + observer.hpp + taskflow.hpp + async_task.hpp + freelist.hpp + taskflow/core/runtime.hpp + taskflow/taskflow.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::Executor tf @@ -9,6 +316,6 @@ - + diff --git a/docs/xml/fibonacci.xml b/docs/xml/fibonacci.xml index 1c8e7726a..604b704fd 100644 --- a/docs/xml/fibonacci.xml +++ b/docs/xml/fibonacci.xml @@ -1,5 +1,5 @@ - + fibonacci Codestin Search App @@ -7,19 +7,26 @@ Problem Formulation fibonacci_1FibonacciNumberProblem - + Recursive Fibonacci Parallelism fibonacci_1RecursiveFibonacciParallelism - + + + Tail Recursion Optimization + fibonacci_1TailRecursionOptimization + + + Benchmarking + fibonacci_1FibonacciNumberBenchmarking + We study the classic problem, Fibonacci Number, to demonstrate the use of recursive task parallelism. -Codestin Search App -In mathematics, the Fibonacci numbers, commonly denoted F(n), form a sequence such that each number is the sum of the two preceding ones, starting from 0 and 1. +Codestin Search AppIn mathematics, the Fibonacci numbers, commonly denoted F(n), form a sequence such that each number is the sum of the two preceding ones, starting from 0 and 1. 0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, ... A common solution for computing fibonacci numbers is recursion. intfib(intn){ @@ -29,47 +36,101 @@ -Codestin Search App -We use tf::Subflow to recursively compute fibonacci numbers in parallel. +Codestin Search AppWe use Runtime Tasking and Asynchronous Tasking to recursively compute Fibonacci numbers in parallel. 
A runtime task tasks a reference to tf::Runtime as its argument, allowing users to interact with the executor and spawn tasks dynamically. The example below demonstrates a parallel recursive implementation of Fibonacci numbers using tf::Runtime: #include<taskflow/taskflow.hpp> -intspawn(intn,tf::Subflow&sbf){ -if(n<2)returnn; -intres1,res2; -sbf.emplace([&res1,n](tf::Subflow&sbf){res1=spawn(n-1,sbf);}) -.name(std::to_string(n-1)); -sbf.emplace([&res2,n](tf::Subflow&sbf){res2=spawn(n-2,sbf);}) -.name(std::to_string(n-2)); -sbf.join(); +size_tfibonacci(size_tN,tf::Runtime&rt){ + +if(N<2)returnN; + +size_tres1,res2; +rt.silent_async([N,&res1](tf::Runtime&rt1){res1=fibonacci(N-1,rt1);}); +rt.silent_async([N,&res2](tf::Runtime&rt2){res2=fibonacci(N-2,rt2);}); + +//usecoruntoavoidblockingtheworkerfromwaitingthetwochildrentasks +//tofinish +rt.corun(); + returnres1+res2; } -intmain(intargc,char*argv[]){ - -intN=5; -intres; +intmain(){ tf::Executorexecutor; -tf::Taskflowtaskflow("fibonacci"); + +size_tN=5,res; +executor.silent_async([N,&res](tf::Runtime&rt){res=fibonacci(N,rt);}); +executor.wait_for_all(); -taskflow.emplace([&res,N](tf::Subflow&sbf){res=spawn(N,sbf);}) -.name(std::to_string(N)); +std::cout<<N<<"-thFibonaccinumberis"<<res<<'\n'; -executor.run(taskflow).wait(); +return0; +} + +The fibonacci function recursively spawns two asynchronous tasks to compute fibonacci(N-1) and fibonacci(N-2) in parallel using tf::Runtime::silent_async. After spawning the two tasks, the function invokes tf::Runtime::corun() to wait until all tasks spawned by rt complete, without blocking the caller worker. In the main function, the executor creates an async task from the top Fibonacci number and waits for completion using tf::Executor::wait_for_all. Once finished, the result is printed. The figure below shows the execution diagram, where the suffixes *_1 and *_2 represent the left and right children spawned by their parent runtime: + + + + +Codestin Search AppIn recursive parallelism, especially for problems like Fibonacci computation, spawning both recursive branches as asynchronous tasks can lead to excessive task creation and stack growth, which may degrade performance and overwhelm the runtime scheduler. Additionally, when both child tasks are launched asynchronously, the parent task must wait for both to finish, potentially blocking a worker thread and reducing parallel throughput. To address these issues, we apply tail recursion optimization to one branch of the Fibonacci call. This allows one of the recursive calls to proceed immediately in the current execution context, reducing both scheduling overhead and stack usage. +size_tfibonacci(size_tN,tf::Runtime&rt){ -taskflow.dump(std::cout); +if(N<2)returnN; -std::cout<<"Fib["<<N<<"]:"<<res<<std::endl; +size_tres1,res2; +rt.silent_async([N,&res1](tf::Runtime&rt1){res1=fibonacci(N-1,rt1);}); + +//tailoptimizationfortherightchild +res2=fibonacci(N-2,rt); -return0; +//usecoruntoavoidblockingtheworkerfromwaitingthetwochildrentasks +//tofinish +rt.corun(); + +returnres1+res2; } -The spawned taskflow graph for computing up to the fifth fibonacci number is shown below: - +The figure below shows the execution diagram, where the suffix *_1 represent the left child spawned by its parent runtime. As we can see, the right child is optimized out through tail recursion optimization. 
+ + + + +Codestin Search AppBased on the discussion above, we compare the runtime of recursive Fibonacci parallelism (1) with tail recursion optimization and (2) without it, across different Fibonacci numbers. +
    +N +w/ tail recursion optimization +w/o tail recursion optimization + + +20 +0.23 ms +0.31 ms + + +25 +2 ms +4 ms + + +30 +23 ms +42 ms + + +35 +269 ms +483 ms + + +40 +3003 ms +5124 ms + +
    -Even if recursive dynamic tasking or subflows are possible, the recursion depth may not be too deep or it can cause stack overflow. +As N increases, the performance gap between the two versions widens significantly. With tail recursion optimization, the program avoids spawning another async task, thereby reducing scheduling overhead and stack pressure. This leads to better CPU utilization and lower task management cost. For example, at N = 40, tail recursion optimization reduces the runtime by over 40%.
    - +
    diff --git a/docs/xml/fibonacci_4.dot b/docs/xml/fibonacci_4.dot new file mode 100644 index 000000000..b9f9df7c6 --- /dev/null +++ b/docs/xml/fibonacci_4.dot @@ -0,0 +1,26 @@ +digraph Fibonacci { + rankdir=TB; + node [shape=box]; + + F4 [label="fibonacci(4)\n[rt]"]; + F3_1 [label="fibonacci(3)\n[rt1]"]; + F2_1 [label="fibonacci(2)\n[rt1_1]"]; + F1_1 [label="fibonacci(1)\n[rt1_1_1]"]; + F0_1 [label="fibonacci(0)\n[rt1_1_2]"]; + F1_2 [label="fibonacci(1)\n[rt1_2]"]; + F2_2 [label="fibonacci(2)\n[rt2]"]; + F1_3 [label="fibonacci(1)\n[rt2_1]"]; + F0_2 [label="fibonacci(0)\n[rt2_2]"]; + + F4 -> F3_1; + F4 -> F2_2; + + F3_1 -> F2_1; + F3_1 -> F1_2; + + F2_1 -> F1_1; + F2_1 -> F0_1; + + F2_2 -> F1_3; + F2_2 -> F0_2; +} diff --git a/docs/xml/fibonacci_4_tail_optimized.dot b/docs/xml/fibonacci_4_tail_optimized.dot new file mode 100644 index 000000000..dfa3224dc --- /dev/null +++ b/docs/xml/fibonacci_4_tail_optimized.dot @@ -0,0 +1,26 @@ +digraph Fibonacci { + rankdir=TB; + node [shape=box]; + + F4 [label="fibonacci(4)\n[rt]"]; + F3_1 [label="fibonacci(3)\n[rt1]"]; + F2_1 [label="fibonacci(2)\n[rt1_1]"]; + F1_1 [label="fibonacci(1)\n[rt1_1_1]"]; + F0_1 [label="fibonacci(0)\n[rt1_1]"]; + F1_2 [label="fibonacci(1)\n[rt1]"]; + F2_2 [label="fibonacci(2)\n[rt]"]; + F1_3 [label="fibonacci(1)\n[rt1]"]; + F0_2 [label="fibonacci(0)\n[rt]"]; + + F4 -> F3_1; + F4 -> F2_2; + + F3_1 -> F2_1; + F3_1 -> F1_2; + + F2_1 -> F1_1; + F2_1 -> F0_1; + + F2_2 -> F1_3; + F2_2 -> F0_2; +} diff --git a/docs/xml/fibonacci_8dox.xml b/docs/xml/fibonacci_8dox.xml index 48a932beb..748f0ca00 100644 --- a/docs/xml/fibonacci_8dox.xml +++ b/docs/xml/fibonacci_8dox.xml @@ -1,5 +1,5 @@ - + fibonacci.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/find_8dox.xml b/docs/xml/find_8dox.xml index b4144bef1..3d4eb3fcd 100644 --- a/docs/xml/find_8dox.xml +++ b/docs/xml/find_8dox.xml @@ -1,5 +1,5 @@ - + find.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/find_8hpp.xml b/docs/xml/find_8hpp.xml deleted file mode 100644 index b6779b193..000000000 --- a/docs/xml/find_8hpp.xml +++ /dev/null @@ -1,15 +0,0 @@ - - - - find.hpp - tf::detail::cudaFindPair - tf - tf::detail - -cuda find algorithms include file - - - - - - diff --git a/docs/xml/flipcoins.xml b/docs/xml/flipcoins.xml index abfe0d812..9a1fe6123 100644 --- a/docs/xml/flipcoins.xml +++ b/docs/xml/flipcoins.xml @@ -1,5 +1,5 @@ - + flipcoins Codestin Search App @@ -7,29 +7,27 @@ Problem Formulation flipcoins_1FlipCoinsProblemFormulation - + Probabilistic Conditions flipcoins_1FlipCoinsProbabilistic - + Ternary Coins flipcoins_1FlipCoinsTernaryCoins - + We study dynamic control flow of non-determinism using conditional tasking. Non-deterministic control flow is a fundamental building block in many optimization and simulation algorithms that rely on stochastic convergence rules or probabilistic pruning. -Codestin Search App -We have a fair binary coin and want to simulate its tosses. We flip the coin for five times. Apparently, the probability for the result to be all heads is 1/32. It is equivalently to say the expected number we need to toss for obtaining five heads is 32. - +Codestin Search AppWe have a fair binary coin and want to simulate its tosses. We flip the coin for five times. Apparently, the probability for the result to be all heads is 1/32. It is equivalently to say the expected number we need to toss for obtaining five heads is 32. + -Codestin Search App -We use condition tasks to simulate the five coin tosses. 
We create five condition tasks each returning a random binary number. If the return is zero (head toss), the execution moves to the next condition task; or it (tail toss) goes back to the first condition task to start over the simulation. +Codestin Search AppWe use condition tasks to simulate the five coin tosses. We create five condition tasks each returning a random binary number. If the return is zero (head toss), the execution moves to the next condition task; or it (tail toss) goes back to the first condition task to start over the simulation. #include<taskflow/taskflow.hpp> intmain(){ @@ -45,15 +43,15 @@ tf::TaskA=taskflow.emplace([&](){tosses=0;}) .name("init"); -tf::TaskB=taskflow.emplace([&](){++tosses;returnstd::rand()%2;}) +tf::TaskB=taskflow.emplace([&](){++tosses;returnstd::rand()%2;}) .name("flip-coin-1"); -tf::TaskC=taskflow.emplace([&](){returnstd::rand()%2;}) +tf::TaskC=taskflow.emplace([&](){returnstd::rand()%2;}) .name("flip-coin-2"); -tf::TaskD=taskflow.emplace([&](){returnstd::rand()%2;}) +tf::TaskD=taskflow.emplace([&](){returnstd::rand()%2;}) .name("flip-coin-3"); -tf::TaskE=taskflow.emplace([&](){returnstd::rand()%2;}) +tf::TaskE=taskflow.emplace([&](){returnstd::rand()%2;}) .name("flip-coin-4"); -tf::TaskF=taskflow.emplace([&](){returnstd::rand()%2;}) +tf::TaskF=taskflow.emplace([&](){returnstd::rand()%2;}) .name("flip-coin-5"); //reachthetarget;recordthenumberoftosses @@ -75,32 +73,31 @@ //calculatetheexpectednumberoftosses average_tosses=total_tosses/(double)rounds; -assert(std::fabs(average_tosses-32.0)<1.0); +assert(std::fabs(average_tosses-32.0)<1.0); return0; } Running the taskflow by a fair number of times, the average tosses we have is close to 32. The taskflow diagram is depicted below. - + Although the execution of this taskflow is non-deterministic, its control flow can expand to a tree of tasks based on our scheduling rule for conditional tasking (see Conditional Tasking). Each path from the root to a leaf represents a result of five heads, and none of them can overlap at the same time (no task race). You must follow the same rule when creating a probabilistic framework using conditional tasking. -Codestin Search App -We can extend the binary coin example to a ternary case. Each condition task has one successor going back to the beginning and two successors moving to the next task. The expected number of tosses to reach five identical results is 3*3*3*3*3 = 243. +Codestin Search AppWe can extend the binary coin example to a ternary case. Each condition task has one successor going back to the beginning and two successors moving to the next task. The expected number of tosses to reach five identical results is 3*3*3*3*3 = 243. 
tf::TaskA=taskflow.emplace([&](){tosses=0;}) .name("init"); //startovertheflipagain -tf::TaskB=taskflow.emplace([&](){++tosses;returnstd::rand()%3;}) +tf::TaskB=taskflow.emplace([&](){++tosses;returnstd::rand()%3;}) .name("flip-coin-1"); -tf::TaskC=taskflow.emplace([&](){returnstd::rand()%3;}) +tf::TaskC=taskflow.emplace([&](){returnstd::rand()%3;}) .name("flip-coin-2"); -tf::TaskD=taskflow.emplace([&](){returnstd::rand()%3;}) +tf::TaskD=taskflow.emplace([&](){returnstd::rand()%3;}) .name("flip-coin-3"); -tf::TaskE=taskflow.emplace([&](){returnstd::rand()%3;}) +tf::TaskE=taskflow.emplace([&](){returnstd::rand()%3;}) .name("flip-coin-4"); -tf::TaskF=taskflow.emplace([&](){returnstd::rand()%3;}) +tf::TaskF=taskflow.emplace([&](){returnstd::rand()%3;}) .name("flip-coin-5"); //reachthetarget;recordthenumberoftosses @@ -122,13 +119,13 @@ //calculatetheexpectednumberoftosses average_tosses=total_tosses/(double)rounds; -assert(std::fabs(average_tosses-243.0)<1.0); +assert(std::fabs(average_tosses-243.0)<1.0); - + Similarly, we can extend the probabilistic condition to any degree. - + diff --git a/docs/xml/flipcoins_8dox.xml b/docs/xml/flipcoins_8dox.xml index f41e8e09d..eebdb4525 100644 --- a/docs/xml/flipcoins_8dox.xml +++ b/docs/xml/flipcoins_8dox.xml @@ -1,5 +1,5 @@ - + flipcoins.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/flow__builder_8hpp.xml b/docs/xml/flow__builder_8hpp.xml index 4faa2898e..464b174eb 100644 --- a/docs/xml/flow__builder_8hpp.xml +++ b/docs/xml/flow__builder_8hpp.xml @@ -1,7 +1,270 @@ - + flow_builder.hpp + task.hpp + ../algorithm/partitioner.hpp + taskflow/core/taskflow.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::FlowBuilder tf::Subflow tf @@ -10,6 +273,6 @@ - + diff --git a/docs/xml/for__each_8dox.xml b/docs/xml/for__each_8dox.xml index 16e6139d7..3d276b1d2 100644 --- a/docs/xml/for__each_8dox.xml +++ b/docs/xml/for__each_8dox.xml @@ -1,5 +1,5 @@ - + for_each.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/for__each_8hpp.xml b/docs/xml/for__each_8hpp.xml index c23c66add..ea8c93c38 100644 --- a/docs/xml/for__each_8hpp.xml +++ b/docs/xml/for__each_8hpp.xml @@ -1,7 +1,345 @@ - + for_each.hpp + ../cudaflow.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf tf::detail @@ -9,6 +347,6 @@ - + diff --git a/docs/xml/governance_8dox.xml b/docs/xml/governance_8dox.xml index 92fe85204..783661c5e 100644 --- a/docs/xml/governance_8dox.xml +++ b/docs/xml/governance_8dox.xml @@ -1,5 +1,5 @@ 
- + governance.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/gpu__tasking_8dox.xml b/docs/xml/gpu__tasking_8dox.xml new file mode 100644 index 000000000..ce944d55c --- /dev/null +++ b/docs/xml/gpu__tasking_8dox.xml @@ -0,0 +1,12 @@ + + + + gpu_tasking.dox + tf + + + + + + + diff --git a/docs/xml/gpu__tasking__cudaflow_8dox.xml b/docs/xml/gpu__tasking__cudaflow_8dox.xml deleted file mode 100644 index ed562e125..000000000 --- a/docs/xml/gpu__tasking__cudaflow_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - gpu_tasking_cudaflow.dox - tf - - - - - - - diff --git a/docs/xml/gpu__tasking__cudaflow__capturer_8dox.xml b/docs/xml/gpu__tasking__cudaflow__capturer_8dox.xml deleted file mode 100644 index a8a86b070..000000000 --- a/docs/xml/gpu__tasking__cudaflow__capturer_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - gpu_tasking_cudaflow_capturer.dox - tf - - - - - - - diff --git a/docs/xml/graph_8hpp.xml b/docs/xml/graph_8hpp.xml index a2bd4c27f..09ab51a5f 100644 --- a/docs/xml/graph_8hpp.xml +++ b/docs/xml/graph_8hpp.xml @@ -1,13 +1,299 @@ - + graph.hpp + ../utility/macros.hpp + ../utility/traits.hpp + ../utility/iterator.hpp + ../utility/os.hpp + ../utility/math.hpp + ../utility/small_vector.hpp + ../utility/serializer.hpp + ../utility/lazy_string.hpp + error.hpp + declarations.hpp + semaphore.hpp + environment.hpp + topology.hpp + tsq.hpp + taskflow/core/async_task.hpp + taskflow/core/task.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::Graph - tf::Runtime - tf::TaskParams - tf::DefaultTaskParams + tf::TaskParams + tf::DefaultTaskParams tf::Node tf::Node::Static + tf::Node::Runtime tf::Node::Subflow tf::Node::Condition tf::Node::MultiCondition @@ -15,13 +301,15 @@ tf::Node::Async tf::Node::DependentAsync tf::Node::Semaphores - tf::NodeDeleter + tf::AnchorGuard + tf::has_graph tf + tf::detail graph include file - + diff --git a/docs/xml/graph__pipeline_8dox.xml b/docs/xml/graph__pipeline_8dox.xml index e3cfc4636..827164d85 100644 --- a/docs/xml/graph__pipeline_8dox.xml +++ b/docs/xml/graph__pipeline_8dox.xml @@ -1,5 +1,5 @@ - + graph_pipeline.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/graph__traversal_8dox.xml b/docs/xml/graph__traversal_8dox.xml index cf7562a63..1e57beb6f 100644 --- a/docs/xml/graph__traversal_8dox.xml +++ b/docs/xml/graph__traversal_8dox.xml @@ -1,5 +1,5 @@ - + graph_traversal.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/graphtraversal.xml b/docs/xml/graphtraversal.xml index bdbf73562..3f57b2808 100644 --- a/docs/xml/graphtraversal.xml +++ b/docs/xml/graphtraversal.xml @@ -1,5 +1,5 @@ - + graphtraversal Codestin Search App @@ -7,41 +7,39 @@ Problem Formulation graphtraversal_1GraphTraversalProblemFormulation - + Graph Representation graphtraversal_1GraphTraversalGraphRepresentation - + Static Traversal graphtraversal_1GraphTraversalStaticTraversal - + Dynamic Traversal graphtraversal_1GraphTraversalDynamicTraversal - + We study the graph traversal problem by visiting each vertex in parallel following their edge dependencies. 
Traversing a graph is a fundamental building block of many graph applications, especially for large-scale graph analytics.
-Problem Formulation
-Given a directed acyclic graph (DAG), i.e., a graph that has no cycles, we would like to traverse each vertex in order without breaking dependency constraints defined by edges. The following figure shows a graph of six vertices and seven edges. Each vertex represents a particular task and each edge represents a task dependency between two tasks.
- 
+Problem Formulation Given a directed acyclic graph (DAG), i.e., a graph that has no cycles, we would like to traverse each vertex in order without breaking dependency constraints defined by edges. The following figure shows a graph of six vertices and seven edges. Each vertex represents a particular task and each edge represents a task dependency between two tasks.
+ 
Traversing the above graph in parallel, the maximum parallelism we can acquire is three. When Task1 finishes, we can run Task2, Task3, and Task4 in parallel.
-Graph Representation
-We define the data structure of our graph. The graph is represented by an array of nodes of the following structure:
+Graph Representation We define the data structure of our graph. The graph is represented by an array of nodes of the following structure:
structNode{
-std::stringname;
+std::stringname;
size_tidx;//indexofthenodeinanarray
boolvisited{false};
-std::atomic<size_t>dependents{0};//numberofincomingedges
-std::vector<Node*>successors;//numberofoutgoingedges
+std::atomic<size_t>dependents{0};//numberofincomingedges
+std::vector<Node*>successors;//outgoingedges

voidprecede(Node&n){
successors.emplace_back(&n);
@@ -50,21 +48,21 @@
};

Based on the data structure, we randomly generate a DAG using ordered edges.
-std::unique_ptr<Node[]>make_dag(size_tnum_nodes,size_tmax_degree){
+std::unique_ptr<Node[]>make_dag(size_tnum_nodes,size_tmax_degree){

-std::unique_ptr<Node[]>nodes(newNode[num_nodes]);
+std::unique_ptr<Node[]>nodes(newNode[num_nodes]);

//Makesurenodesareincleanstate
for(size_ti=0;i<num_nodes;i++){
nodes[i].idx=i;
-nodes[i].name=std::to_string(i);
+nodes[i].name=std::to_string(i);
}

//CreateaDAGbyrandomlyinsertingorderededges
for(size_ti=0;i<num_nodes;i++){
size_tdegree{0};
for(size_tj=i+1;j<num_nodes&&degree<max_degree;j++){
-if(std::rand()%2==1){
+if(std::rand()%2==1){
nodes[i].precede(nodes[j]);
degree++;
}
@@ -77,13 +75,12 @@

The function, make_dag, accepts two arguments, num_nodes and max_degree, to restrict the number of nodes in the graph and the maximum number of outgoing edges of every node.
-Static Traversal
-We create a taskflow to traverse the graph using static tasks (see Static Tasking). Each task does nothing but marks visited to true and subtracts dependents from one, both of which are used for validation after the graph is traversed. In practice, this computation may be replaced with a heavy function.
+Static Traversal We create a taskflow to traverse the graph using static tasks (see Static Tasking). Each task does nothing but mark visited true and decrement dependents by one, both of which are used for validation after the graph is traversed. In practice, this computation may be replaced with a heavy function.
tf::Taskflowtaskflow;
tf::Executorexecutor;

-std::unique_ptr<Node[]>nodes=make_dag(100000,4);
-std::vector<tf::Task>tasks;
+std::unique_ptr<Node[]>nodes=make_dag(100000,4);
+std::vector<tf::Task>tasks;

//createthetraversaltaskforeachnode
for(size_ti=0;i<num_nodes;++i){
@@ -113,18 +110,17 @@
}

The code above has two parts to construct the parallel graph traversal. First, it iterates over each node and constructs a traversal task for that node. Second, it iterates over each outgoing edge of a node and creates a dependency between the node and the other end (successor) of that edge. The resulting taskflow structure is topologically equivalent to the given graph.
- 
+ 
With task parallelism, we flow computation naturally with the graph structure. The runtime autonomously distributes tasks across processor cores to obtain maximum task parallelism. You do not need to worry about the details of scheduling.
-Dynamic Traversal
-We can traverse the graph dynamically using tf::Subflow (see Subflow Tasking). We start from the source nodes of zero incoming edges and recursively spawn subflows whenever the dependency of a node is meet. Since we are creating tasks from the execution context of another task, we need to store the task callable in advance.
+Dynamic Traversal We can traverse the graph dynamically using tf::Subflow (see Subflow Tasking). We start from the source nodes with zero incoming edges and recursively spawn subflows whenever the dependencies of a node are met. Since we are creating tasks from the execution context of another task, we need to store the task callable in advance.
tf::Taskflowtaskflow;
tf::Executorexecutor;

//taskcallableoftraversinganodeusingsubflow
-std::function<void(Node*,tf::Subflow&)>traverse;
+std::function<void(Node*,tf::Subflow&)>traverse;

traverse=[&](Node*n,tf::Subflow&subflow){
assert(!n->visited);
@@ -139,10 +135,10 @@
};

//createagraph
-std::unique_ptr<Node[]>nodes=make_dag(100000,4);
+std::unique_ptr<Node[]>nodes=make_dag(100000,4);

//findthesourcenodes(noincomingedges)
-std::vector<Node*>src;
+std::vector<Node*>src;
for(size_ti=0;i<num_nodes;i++){
if(nodes[i].dependents==0){
src.emplace_back(&(nodes[i]));
@@ -165,11 +161,11 @@
}

A partial graph is shown as follows:
- 
+ 
In general, the dynamic version of graph traversal is slower than the static version due to the overhead incurred by spawning subflows. However, it may be useful in situations where the graph structure is not fully known in advance but is explored incrementally during the traversal.
- 
+ 

diff --git a/docs/xml/guidelines.xml b/docs/xml/guidelines.xml
index e7dbe0483..6bfbff25f 100644
--- a/docs/xml/guidelines.xml
+++ b/docs/xml/guidelines.xml
@@ -1,5 +1,5 @@
- 
+ 
guidelines
Codestin Search App

How Can I Contribute?
guidelines_1HowCanIContribute
- 
+ 

How Can I Get Credit?
guidelines_1HowCanIGetCredit
- 
+ 

How Can I Get Started?
guidelines_1HowCanIGetStarted
- 
- 
- Step 1: Look around
- guidelines_1Step1LookAround
- 
- 
- Step 2: Write a Taskflow program
- guidelines_1Step2WriteATaskflowProgram
- 
- 
- Step 3: Dive in
- guidelines_1Step3WriteATaskflowProgram
- 
- 
- 
+ 
+ 
+ Step 1: Look around
+ guidelines_1Step1LookAround
+ 
+ 
+ Step 2: Write a Taskflow program
+ guidelines_1Step2WriteATaskflowProgram
+ 
+ 
+ Step 3: Dive in
+ guidelines_1Step3WriteATaskflowProgram
+ 
+ 
+ 

How Can I Report Issues?
guidelines_1HowCanIReportAnIssue
- 
+ 

How Can I Edit the Documentation?
guidelines_1HowCanIEditTheDocumentation
- 
+ 

How Can I Submit a Patch?
guidelines_1HowCanISubmitAPatch
- 
+ 

How Can I Lead a Project?
guidelines_1HowCanILeadAProject
- 
+ 

Your Voice Matters!
guidelines_1YourVoiceMatters
- 
+ 


This page outlines the process that you will need to follow to get a patch merged.
-How Can I Contribute?
-There are multiple ways in which you can contribute to Taskflow:
+How Can I Contribute? There are multiple ways in which you can contribute to Taskflow:
 
 
Use it! Let us know what you think and how it helps your work!
@@ -73,8 +72,7 @@
 
Your contributions are always welcome. Every contribution, regardless of its size, is significant to keep Taskflow thriving.
-How Can I Get Credit?
-Your contribution is an undeniably important piece of the Taskflow project, and we want to make sure you always get credit for your work. Depending on the technical innovation and engineering effort, we credit your contributions as follows:
+How Can I Get Credit? Your contribution is an undeniably important piece of the Taskflow project, and we want to make sure you always get credit for your work. Depending on the technical innovation and engineering effort, we credit your contributions as follows:
 
 
We document your commit or pull request at the Contributors page
@@ -88,15 +86,12 @@ Your contributions are always welcome. Every contribution regardless of its size
 
Your effort really matters to us and we are eager to acknowledge your contributions! As such, we would welcome any advice and recommendations that can improve our credit system. Please contact us.
-How Can I Get Started?
-There are no better ways other than trying out Taskflow before you want to contribute. We summarize a few steps below for you to follow.
+How Can I Get Started? There is no better way to prepare for contributing than trying out Taskflow yourself. We summarize a few steps below for you to follow.
 
-Step 1: Look around
-Visit the Project Website and get an 1000-feet overview of Taskflow, in which you shall find recent news, releases, use cases, and other useful information of Taskflow. We also provided a showcase presentation for you to quickly understand the technical work of Taskflow. Then, check out our Real Use Cases and get a sense about the problems Taskflow is good at.
+Step 1: Look around Visit the Project Website and get a 1000-foot overview of Taskflow, where you will find recent news, releases, use cases, and other useful information about Taskflow. We also provide a showcase presentation to help you quickly understand the technical work behind Taskflow. Then, check out our Real Use Cases and get a sense of the problems Taskflow is good at.
 
-Step 2: Write a Taskflow program
-Taskflow is a programming system. We believe it is impossible to understand what Taskflow is doing without writing real code. Visit the quick-start page and program your first hello-world with Taskflow!
+Step 2: Write a Taskflow program Taskflow is a programming system. We believe it is impossible to understand what Taskflow is doing without writing real code. Visit the quick-start page and program your first hello-world with Taskflow!
#include<taskflow/taskflow.hpp>//Taskflowisheader-only

intmain(){
@@ -105,10 +100,10 @@ Your effort really matters to us and we are eater to acknowledge your contributi
 
tf::Taskflowtaskflow;

auto[A,B,C,D]=taskflow.emplace(
-[](){std::cout<<"TaskA\n";},//taskdependencygraph
-[](){std::cout<<"TaskB\n";},//
-[](){std::cout<<"TaskC\n";},//+---+
-[](){std::cout<<"TaskD\n";}//+---->|B|-----+
+[](){std::cout<<"TaskA\n";},//taskdependencygraph
+[](){std::cout<<"TaskB\n";},//
+[](){std::cout<<"TaskC\n";},//+---+
+[](){std::cout<<"TaskD\n";}//+---->|B|-----+
);//|+---+|
//+---++-v-+
A.precede(B);//ArunsbeforeB//|A||D|
@@ -124,23 +119,19 @@ Your effort really matters to us and we are eater to acknowledge your contributi
 
The hello-world program creates four tasks, A, B, C, and D, where A runs before B and C, and D runs after B and C. When A finishes, B and C can run in parallel, and then D.
-Step 3: Dive in
-After you successfully finish the hello-world example, give a deep dive-in to the technical details by visiting Cookbook, Taskflow Algorithms, and Learning from Examples. These pages provides you step-by-step tutorials about the fundamental syntaxes and tasking models in Taskflow that you need to fully take advantage of task graph parallelism to boost your application performance.
+Step 3: Dive in After you successfully finish the hello-world example, take a deep dive into the technical details by visiting Cookbook, Taskflow Algorithms, and Learning from Examples. These pages provide step-by-step tutorials on the fundamental syntax and tasking models in Taskflow that you need to fully take advantage of task graph parallelism and boost your application performance.
At this stage, you may encounter issues, feature requests, and questions. Start your first contribution by posting them in our issue tracker!
-How Can I Report Issues?
-Taskflow is in active development. We are not surprised that you encounter something that needs improvement or fixes to work for your use cases. Or you want to suggest something that can improve Taskflow's functionality. Please do not hesitate to share any of these issues with by by opening an post at our issue tracker!
+How Can I Report Issues? Taskflow is in active development. We would not be surprised if you encounter something that needs improvement or fixes for your use cases, or if you want to suggest something that can improve Taskflow's functionality. Please do not hesitate to share any of these issues with us by opening a post at our issue tracker!
Please make sure that you provide all the necessary information in the issue body to communicate your problem clearly so we can work on it efficiently.
-How Can I Edit the Documentation?
-Documentation is just as important as the codebase! There is always a scope of improvement in documentation to add some missing information or to make it easier to read. We use the famous Doxygen to compile our documentation. You can edit the documentation source which is stored as a text file in the doxygen directory of Taskflow. After editing the file locally, you can submit your changes to us by making a patch.
+How Can I Edit the Documentation? Documentation is just as important as the codebase! There is always scope for improvement in the documentation, whether to add missing information or to make it easier to read. We use the famous Doxygen to compile our documentation. You can edit the documentation source, which is stored as a text file in the doxygen directory of Taskflow.
After editing the file locally, you can submit your changes to us by making a patch.
-How Can I Submit a Patch?
-To contribute your code to Taskflow, you need to make a pull request from your fork of Taskflow. GitHub makes the development flow of submitting pull requests extremely handy as long as you follow the standard fork process.
+How Can I Submit a Patch? To contribute your code to Taskflow, you need to make a pull request from your fork of Taskflow. GitHub makes the development flow of submitting pull requests extremely handy as long as you follow the standard fork process (a command-line sketch follows at the end of this page).
When you make a pull request, please provide all the necessary information requested by the prompts in the pull request body. In addition, make sure the code you are submitting always accounts for the following three guidelines:
 
 
Run the tests: You must pass our unit tests (see Building and Installing) before submitting the pull request. Our unit tests have accumulated many corner cases over the years that can detect defects in newly developed features or bugs introduced when changing existing functionality.
@@ -157,8 +148,7 @@ Your effort really matters to us and we are eater to acknowledge your contributi
 
Please let us know all people who are involved in the pull request so that we can appropriately acknowledge everyone's effort at the Contributors page. If there are any issues that you would like to communicate offline, please contact us.
-How Can I Lead a Project?
-There are many on-going and future projects that interest us and the Taskflow community. Given the tremendous amount of work, we welcome organizations or individuals to take lead on these projects. The table below summarizes a list of projects that need you to either take lead or contribute:
+How Can I Lead a Project? There are many ongoing and future projects that interest us and the Taskflow community. Given the tremendous amount of work, we welcome organizations or individuals to take the lead on these projects. The table below summarizes a list of projects that need you to either take the lead on or contribute to:
 
 
Item
Status
@@ -192,7 +182,7 @@ Please let us know all people who are involved in the pull request so that we ca
 
Integrating OpenCL
need leaders
-design another task type, clFlow, to support OpenCL in a task-graph fasion and schedule OpenCL tasks using graph parallelism
+design another task type, clFlow, to support OpenCL in a task-graph fashion and schedule OpenCL tasks using graph parallelism
 
Supporting pipeline
@@ -209,11 +199,10 @@ Please let us know all people who are involved in the pull request so that we ca
 
If you have identified any other projects that can be included in the list, please make a post at our issue tracker or contact us.
-Your Voice Matters!
-If you find Taskflow helpful, please share it with your peers, colleagues, and anyone who can benefit from Taskflow. By telling other people about how Taskflow helped you, you will help us in turn and broaden our impact.
+Your Voice Matters! If you find Taskflow helpful, please share it with your peers, colleagues, and anyone who can benefit from Taskflow. By telling other people about how Taskflow helped you, you will help us in turn and broaden our impact.
Thank you very much for contributing!
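As a quick reference, the fork-test-submit cycle described above might look like the following on the command line. This is a minimal sketch under stated assumptions: the fork URL and the branch name my-fix are placeholders for your own, and the cmake and ctest invocations shown here are one common way to build and run the unit tests; see Building and Installing for the exact options.
~$ git clone https://github.com/<your-username>/taskflow.git   # clone your fork (placeholder URL)
~$ cd taskflow
~$ git checkout -b my-fix                # develop your patch on a dedicated branch
~$ cmake -S . -B build                   # configure the build
~$ cmake --build build --parallel 10     # build the library and its unit tests
~$ cd build ; ctest --output-on-failure  # run the unit tests before submitting
~$ cd .. ; git push origin my-fix        # then open a pull request from your fork on GitHub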
- + diff --git a/docs/xml/guidelines_8dox.xml b/docs/xml/guidelines_8dox.xml index bc05b4141..5a2bc479d 100644 --- a/docs/xml/guidelines_8dox.xml +++ b/docs/xml/guidelines_8dox.xml @@ -1,5 +1,5 @@ - + guidelines.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/header_8html.xml b/docs/xml/header_8html.xml index b8eab5f9f..5495ca342 100644 --- a/docs/xml/header_8html.xml +++ b/docs/xml/header_8html.xml @@ -1,11 +1,11 @@ - + header.html - + diff --git a/docs/xml/index.xml b/docs/xml/index.xml index 7336f6ca0..4002be7de 100644 --- a/docs/xml/index.xml +++ b/docs/xml/index.xml @@ -1,21 +1,27 @@ - + tf::SmallVectorTemplateCommon::AlignedUnionType - buff - - tf::TaskQueue::Array - C - M - S - Array - ~Array - capacity - push - pop - resize + max_size + buff + + tf::AnchorGuard + _node + AnchorGuard + ~AnchorGuard + + tf::UnboundedTaskQueue::Array + C + M + S + Array + ~Array + capacity + push + pop + resize tf::Node::Async - work + work Async Async @@ -37,6 +43,28 @@ _incref _decref + tf::BoundedTaskQueue + BufferSize + BufferMask + _top + _bottom + _buffer + BoundedTaskQueue + ~BoundedTaskQueue + empty + size + capacity + try_push + push + pop + steal + steal_with_hint + + tf::CachelineAligned + data + get + get + tf::ChromeObserver Executor _timeline @@ -49,32 +77,9 @@ on_exit tf::Node::Condition - work + work Condition - tf::CriticalSection - CriticalSection - add - - tf::detail::cudaBlockReduce - group_size - num_passes - num_items - operator() - - tf::detail::cudaBlockScan - num_warps - num_passes - capacity - operator() - operator() - - tf::detail::cudaBlockSort - has_values - num_passes - merge_pass - block_sort - tf::cudaDeviceAllocator value_type pointer @@ -104,185 +109,89 @@ cudaDeviceVector cudaDeviceVector ~cudaDeviceVector - operator= + operator= size - data - data + data + data cudaDeviceVector - operator= - - tf::cudaEvent - cudaEvent - cudaEvent - cudaEvent - - tf::cudaEventCreator - operator() - operator() - - tf::cudaEventDeleter - operator() - - tf::cudaExecutionPolicy - nt - vt - nv - _stream - cudaExecutionPolicy - cudaExecutionPolicy - stream - stream - num_blocks - reduce_bufsz - min_element_bufsz - max_element_bufsz - scan_bufsz - merge_bufsz - - tf::detail::cudaFindPair - key - index - operator unsigned - - tf::cudaFlow - _cfg - _exe - cudaFlow - ~cudaFlow - cudaFlow - operator= - empty - num_tasks - clear - dump - dump_native_graph - noop - host - host - kernel - kernel - memset - memset - memcpy - memcpy - zero - zero - fill - fill - copy - copy - run - native_graph - native_executable - single_task - single_task - for_each - for_each - for_each_index - for_each_index - transform - transform - transform - transform - capture - capture - - tf::cudaFlowCapturer - handle_t - Optimizer - cudaFlow - Executor - _cfg - _optimizer - _exe - cudaFlowCapturer - ~cudaFlowCapturer - cudaFlowCapturer - operator= - empty - num_tasks - clear - dump - dump_native_graph - on - on - noop - noop - memcpy - memcpy - copy - copy - memset - memset - kernel - kernel - single_task - single_task - for_each - for_each - for_each_index - for_each_index - transform - transform - transform - transform - make_optimizer - capture - run - native_graph - native_executable - - tf::cudaFlowLinearOptimizer - cudaFlowCapturer - cudaFlowLinearOptimizer - _optimize - - tf::cudaFlowOptimizerBase - _toposort - _levelize - - tf::cudaFlowRoundRobinOptimizer - cudaFlowCapturer - _num_streams - cudaFlowRoundRobinOptimizer - cudaFlowRoundRobinOptimizer - num_streams - num_streams - _optimize - _reset - - 
tf::cudaFlowSequentialOptimizer - cudaFlowCapturer - cudaFlowSequentialOptimizer - _optimize - - tf::detail::cudaMergePair - keys - indices - - tf::detail::cudaMergeRange - a_begin - a_end - b_begin - b_end - a_count - b_count - total - a_range - b_range - to_local - partition - partition - a_valid - b_valid - - tf::detail::cudaScanResult - scan - reduction - - tf::detail::cudaScanResult< T, vt, true > - scan - reduction + operator= + + tf::cudaEventBase + base_type + cudaEventBase + cudaEventBase + operator= + cudaEventBase + operator= + + tf::cudaEventCreator + operator() + operator() + operator() + + tf::cudaEventDeleter + operator() + + tf::cudaGraphBase + base_type + cudaGraphBase + cudaGraphBase + operator= + num_nodes + num_edges + empty + dump + noop + host + kernel + memset + memcpy + zero + fill + copy + single_task + for_each + for_each_index + transform + transform + cudaGraphBase + operator= + + tf::cudaGraphCreator + operator() + operator() + + tf::cudaGraphDeleter + operator() + + tf::cudaGraphExecBase + base_type + cudaGraphExecBase + cudaGraphExecBase + operator= + host + kernel + memset + memcpy + zero + fill + copy + single_task + for_each + for_each_index + transform + transform + cudaGraphExecBase + operator= + + tf::cudaGraphExecCreator + operator() + operator() + operator() + operator() + + tf::cudaGraphExecDeleter + operator() tf::cudaScopedDevice _p @@ -293,77 +202,83 @@ cudaScopedDevice tf::cudaSharedMemory - get + get tf::cudaSharedMemory< bool > - get + get tf::cudaSharedMemory< char > - get + get tf::cudaSharedMemory< double > - get + get tf::cudaSharedMemory< float > - get + get tf::cudaSharedMemory< int > - get + get tf::cudaSharedMemory< long > - get + get tf::cudaSharedMemory< short > - get + get tf::cudaSharedMemory< unsigned char > - get + get tf::cudaSharedMemory< unsigned int > - get + get tf::cudaSharedMemory< unsigned long > - get + get tf::cudaSharedMemory< unsigned short > - get - - tf::cudaStream - cudaStream - cudaStream - synchronize - begin_capture - end_capture - record - wait - - tf::cudaStreamCreator - operator() - - tf::cudaStreamDeleter - operator() + get + + tf::cudaStreamBase + base_type + cudaStreamBase + cudaStreamBase + operator= + synchronize + begin_capture + end_capture + record + wait + run + run + cudaStreamBase + operator= + run + + tf::cudaStreamCreator + operator() + operator() + + tf::cudaStreamDeleter + operator() tf::cudaTask + cudaGraphBase + cudaGraphExecBase cudaFlow cudaFlowCapturer cudaFlowCapturerBase - operator<< - _node + operator<< + _native_graph + _native_node cudaTask cudaTask - operator= + operator= precede succeed - name - name num_successors - num_dependents - empty - type - dump - for_each_successor - for_each_dependent - cudaTask + num_predecessors + type + dump + cudaTask tf::cudaUSMAllocator value_type @@ -401,7 +316,7 @@ callable tf::DataPipeline - data_t + data_t _graph _num_tokens _pipes @@ -421,9 +336,9 @@ _on_pipe _build - tf::DefaultClosureWrapper + tf::DefaultClosureWrapper - tf::DefaultTaskParams + tf::DefaultTaskParams tf::DeferredPipeflow Pipeline @@ -435,13 +350,13 @@ DeferredPipeflow DeferredPipeflow DeferredPipeflow - operator= - operator= + operator= + operator= tf::Node::DependentAsync - work + work use_count - state + state DependentAsync tf::Taskflow::Dumper @@ -461,23 +376,18 @@ FlowBuilder Subflow Runtime - _MAX_STEALS - _wsq_mutex + Algorithm _taskflows_mutex - _num_topologies - _all_spawned + _workers + _notifier _topology_cv _topology_mutex _num_topologies - _wids - _threads - 
_workers _taskflows - _notifier - _wsq - _done + _buffers + _worker_interface _observers - Executor + Executor ~Executor run run @@ -495,6 +405,8 @@ corun_until wait_for_all num_workers + num_waiters + num_queues num_topologies num_taskflows this_worker_id @@ -513,43 +425,46 @@ dependent_async dependent_async dependent_async - _this_worker - _wait_for_task - _invoke_module_task_internal + _shutdown _observer_prologue _observer_epilogue _spawn _exploit_task - _explore_task + _explore_task _schedule _schedule - _schedule - _schedule _set_up_topology - _set_up_graph _tear_down_topology - _tear_down_async - _tear_down_dependent_async - _tear_down_invoke + _tear_down_async + _tear_down_dependent_async + _tear_down_invoke _increment_topology _decrement_topology _invoke _invoke_static_task - _invoke_subflow_task - _detach_subflow_task _invoke_condition_task _invoke_multi_condition_task - _invoke_module_task - _invoke_async_task - _invoke_dependent_async_task - _process_async_dependent + _process_dependent_async _process_exception _schedule_async_task - _corun_graph + _update_cache + _wait_for_task + _invoke_subflow_task + _invoke_module_task + _invoke_module_task_impl + _invoke_async_task + _invoke_dependent_async_task + _invoke_runtime_task + _invoke_runtime_task_impl + _invoke_runtime_task_impl + _set_up_graph _corun_until - - tf::cudaFlowCapturer::External - graph + _corun_graph + _schedule + _schedule + _schedule_graph_with_parent + _async + _silent_async tf::FlowBuilder Executor @@ -559,6 +474,7 @@ emplace emplace emplace + emplace emplace erase composed_of @@ -567,17 +483,19 @@ linearize for_each for_each_index + for_each_by_index transform transform reduce + reduce_by_index transform_reduce transform_reduce - inclusive_scan - inclusive_scan - exclusive_scan - transform_inclusive_scan - transform_inclusive_scan - transform_exclusive_scan + inclusive_scan + inclusive_scan + exclusive_scan + transform_inclusive_scan + transform_inclusive_scan + transform_exclusive_scan find_if find_if_not min_element @@ -594,8 +512,8 @@ Future Future Future - operator= - operator= + operator= + operator= cancel Future @@ -605,22 +523,14 @@ Subflow Taskflow Executor - _nodes Graph Graph - Graph - ~Graph - operator= - operator= - empty - size - clear - _clear - _clear_detached - _merge + Graph + operator= + operator= _erase - _emplace_back - _emplace_back + _emplace_back + _emplace_back tf::GuidedPartitioner type @@ -630,7 +540,30 @@ loop loop_until - tf::cudaFlowCapturer::Internal + tf::has_graph + + tf::IndexRange + index_type + _beg + _end + _step_size + IndexRange + IndexRange + begin + end + step_size + reset + begin + end + step_size + size + discrete_domain + + tf::is_runtime_task + + tf::is_static_task + + tf::is_subflow_task tf::IsPartitioner @@ -650,16 +583,12 @@ Module tf::Node::MultiCondition - work + work MultiCondition tf::Node - AsyncState - UNFINISHED - LOCKED - FINISHED Placeholder - handle_t + handle_t Graph Task AsyncTask @@ -669,57 +598,49 @@ FlowBuilder Subflow Runtime - TF_ENABLE_POOLABLE_ON_THIS + AnchorGuard + PreemptionGuard + PLACEHOLDER + STATIC + RUNTIME + SUBFLOW + CONDITION + MULTI_CONDITION + MODULE + ASYNC + DEPENDENT_ASYNC + _nstate + _estate _name - _priority _data _topology _parent - _successors - _dependents - _state + _num_successors + _edges _join_counter + _handle _semaphores _exception_ptr - _handle - CONDITIONED - DETACHED - ACQUIRED - READY - EXCEPTION - PLACEHOLDER - STATIC - SUBFLOW - CONDITION - MULTI_CONDITION - MODULE - ASYNC - DEPENDENT_ASYNC Node - Node - Node - 
Node - Node - ~Node + Node + Node num_successors - num_dependents - num_strong_dependents - num_weak_dependents + num_predecessors + num_strong_dependencies + num_weak_dependencies name - Node - Node - Node - Node - _precede - _set_up_join_counter - _process_exception + Node + Node _is_cancelled _is_conditioner + _is_preempted _acquire_all - _release_all - - tf::NodeDeleter - operator() + _release_all + _precede + _set_up_join_counter + _rethrow_exception + _remove_successors + _remove_predecessors tf::ObserverInterface ~ObserverInterface @@ -729,6 +650,7 @@ tf::PartitionerBase closure_wrapper_type + is_default_wrapper_v _chunk_size _closure_wrapper PartitionerBase @@ -736,8 +658,10 @@ PartitionerBase chunk_size chunk_size - closure_wrapper + closure_wrapper + closure_wrapper closure_wrapper + operator() tf::Pipe callable_t @@ -801,13 +725,22 @@ tf::Pipeline::PipeMeta type + tf::PreemptionGuard + _runtime + PreemptionGuard + ~PreemptionGuard + PreemptionGuard + PreemptionGuard + operator= + operator= + tf::ProfileData timelines ProfileData ProfileData ProfileData - operator= - operator= + operator= + operator= save load @@ -822,7 +755,7 @@ RandomPartitioner alpha beta - chunk_size_range + chunk_size_range loop loop_until @@ -832,28 +765,31 @@ tf::cudaUSMAllocator::rebind other + tf::Node::Runtime + work + Runtime + tf::Runtime Executor FlowBuilder + PreemptionGuard + Algorithm _executor _worker _parent - ~Runtime + _preempted executor + worker schedule async async silent_async silent_async - silent_async_unchecked - silent_async_unchecked corun - corun_until + corun corun_all - worker + is_cancelled Runtime - _async - _silent_async tf::ScalablePipeline pipe_t @@ -872,7 +808,7 @@ ScalablePipeline ScalablePipeline ScalablePipeline - operator= + operator= operator= num_lines num_pipes @@ -907,13 +843,19 @@ tf::Semaphore Node + Executor _mtx - _counter - _waiters - Semaphore - count + _max_value + _cur_value + _waiters + Semaphore + Semaphore + value + max_value + reset + reset _try_acquire_or_wait - _release + _release tf::Node::Semaphores to_acquire @@ -927,11 +869,11 @@ SmallVector SmallVector SmallVector - operator= - operator= + operator= + operator= SmallVector - operator= - operator= + operator= + operator= tf::SmallVectorBase BeginX @@ -944,10 +886,10 @@ empty tf::SmallVectorImpl - SuperClass - iterator - const_iterator - size_type + SuperClass + iterator + const_iterator + size_type SmallVectorImpl SmallVectorImpl ~SmallVectorImpl @@ -1005,18 +947,18 @@ pop_back tf::SmallVectorTemplateCommon - U - size_type - difference_type - value_type - iterator - const_iterator - const_reverse_iterator - reverse_iterator - reference - const_reference - pointer - const_pointer + U + size_type + difference_type + value_type + iterator + const_iterator + const_reverse_iterator + reverse_iterator + reference + const_reference + pointer + const_pointer SmallVectorStorage FirstEl SmallVectorTemplateCommon @@ -1047,7 +989,7 @@ back tf::Node::Static - work + work Static tf::StaticPartitioner @@ -1059,19 +1001,6 @@ loop loop_until - tf::detail::cudaBlockReduce::Storage - data - - tf::detail::cudaBlockSort::Storage - keys - vals - - tf::detail::cudaBlockScan::storage_t - data - threads - warps - @1 - tf::Node::Subflow work subgraph @@ -1080,13 +1009,19 @@ tf::Subflow Executor FlowBuilder - Runtime - _joinable + _executor + _worker + _parent join - detach - reset joinable + executor + graph + retain + retain Subflow + Subflow + Subflow + Subflow tf::TFProfObserver::Summary tsum @@ -1104,31 +1039,34 @@ _node 
Task Task - operator= + operator= operator= operator== operator!= name num_successors - num_dependents - num_strong_dependents - num_weak_dependents + num_predecessors + num_strong_dependencies + num_weak_dependencies name work composed_of precede succeed + remove_predecessors + remove_successors release + release acquire + acquire data - priority - priority reset reset_work empty has_work for_each_successor - for_each_dependent + for_each_predecessor + for_each_subflow_task hash_value type dump @@ -1139,6 +1077,7 @@ Topology Executor FlowBuilder + Subflow _mutex _name _graph @@ -1163,30 +1102,9 @@ _dump _dump - tf::TaskParams - name - priority - data - - tf::TaskQueue - _top - _bottom - _array - _garbage - TaskQueue - ~TaskQueue - empty - empty - size - size - capacity - capacity - push - pop - pop - steal - steal - resize_array + tf::TaskParams + name + data tf::TFProfObserver::TaskSummary count @@ -1200,11 +1118,11 @@ _node name num_successors - num_dependents - num_strong_dependents - num_weak_dependents + num_predecessors + num_strong_dependencies + num_weak_dependencies for_each_successor - for_each_dependent + for_each_predecessor type hash_value TaskView @@ -1217,7 +1135,7 @@ _observers ~TFProfManager TFProfManager - operator= + operator= dump get TFProfManager @@ -1251,26 +1169,49 @@ Timeline Timeline Timeline - operator= - operator= + operator= + operator= save load + tf::UnboundedTaskQueue + _top + _bottom + _array + _garbage + UnboundedTaskQueue + ~UnboundedTaskQueue + empty + size + capacity + push + pop + steal + steal_with_hint + resize_array + tf::Worker Executor + Runtime WorkerView + _done _id _vtm _executor - _thread - _waiter + _waiter + _thread _rdgen - _wsq - _cache + _wsq id - thread queue_size queue_capacity + executor + thread + + tf::WorkerInterface + ~WorkerInterface + scheduler_prologue + scheduler_epilogue tf::TFProfObserver::WorkerSummary id @@ -1291,542 +1232,11 @@ WorkerView WorkerView - std - atomic_fetch_and_explicit - atomic_fetch_xor_explicit - set_unexpected - fputs - modf - not2 - strlen - exp2 - setiosflags - adjacent_difference - cos - fwscanf - atomic_init - forward_as_tuple - abort - wcsncmp - set_intersection - atomic_signal_fence - llabs - make_move_iterator - scanf - nextafter - stol - strcspn - ungetwc - transform - putc - iswdigit - rint - memset - isgraph - replace_copy_if - scalbn - partial_sort_copy - make_exception_ptr - frexp - isxdigit - atomic_exchange_explicit - wprintf - fdim - wctype - mbrtoc32 - setw - get_temporary_buffer - fmax - atomic_thread_fence - atomic_exchange - fgetwc - swprintf - prev_permutation - max_element - set_symmetric_difference - wcscpy - const_pointer_cast - minmax_element - wcstok - ref - feupdateenv - endl - end - wmemmove - fmin - uninitialized_fill_n - nouppercase - noshowpos - ctime - wmemset - iswpunct - pop_heap - sprintf - fixed - make_shared - make_heap - fmod - atol - uninitialized_copy - dynamic_pointer_cast - set_union - hexfloat - vswprintf - asctime - iswspace - nan - sort - quick_exit - log10 - mbstowcs - isspace - strncat - isinf - atof - erf - is_sorted_until - cbrt - log1p - return_temporary_buffer - mbsrtowcs - feraiseexcept - fseek - atomic_fetch_or_explicit - log - putchar - make_tuple - expm1 - fma - remove_copy_if - showpoint - fscanf - stable_partition - fill_n - remove_copy - atomic_compare_exchange_strong_explicit - wctomb - fgets - remainder - allocate_shared - unique - includes - iswalnum - exit - put_time - to_string - is_heap_until - wcstold - stold - ftell - copy_backward - wcstoll 
- perror - vwscanf - stable_sort - generic_category - abs(int) - fgetws - showpos - exp - fill - isalpha - lgamma - feclearexcept - wcsncpy - undeclare_reachable - oct - strspn - realloc - copy - binary_search - system_category - mbrtowc - strtof - mem_fn - distance - lock - strcmp - tmpfile - hypot - getenv - strrchr - count - tan - strftime - stod - towupper - atoll - atomic_store - stoi - rethrow_exception - sin - atomic_fetch_sub_explicit - unexpected - mbtowc - get_time - partition - next - isfinite - boolalpha - fetestexcept - mbrlen - iswgraph - time - atomic_compare_exchange_strong - wcschr - uppercase - lower_bound - copy_if - isnan - has_facet - kill_dependency - uninitialized_copy_n - feholdexcept - div - at_quick_exit - wcspbrk - search - find_first_of - iota - declare_reachable - atomic_compare_exchange_weak - strtod - accumulate - wcsrchr - min_element - clearerr - random_shuffle - iswalpha - atomic_fetch_and - wmemchr - bsearch - ilogb - unique_copy - _Exit - move - find_end - fesetexceptflag - nth_element - gets - lexicographical_compare - nearbyint - memcpy - fwrite - unitbuf - iswlower - mblen - swscanf - wcstoimax - fprintf - find_if - strtoimax - isalnum - atomic_fetch_add_explicit - push_heap - min - fwprintf - uncaught_exception - strtoll - throw_with_nested - shuffle - isprint - get_new_handler - call_once - trunc - wcscspn - mbrtoc16 - lround - pow - tgamma - erfc - llround - abs(float) - asinh - feof - noskipws - find - atoi - not1 - vfscanf - stof - regex_search - rotate_copy - set_new_handler - undeclare_no_pointers - async - partition_point - vsscanf - fesetround - atomic_is_lock_free - tanh - ldiv - setbase - remove - strtol - strpbrk - signbit - wcsncat - get_money - set_difference - cref - getline - to_wstring - system - static_pointer_cast - wcstoumax - memmove - getwchar - scientific - wcsftime - begin - ceil - sinh - is_permutation - generate_n - acosh - advance - flush - atomic_fetch_xor - ws - signal - noshowbase - generate - ldexp - vsnprintf - remove_if - stoull - fegetexceptflag - find_if_not - merge - free - count_if - clock - mktime - inserter - puts - asin - iscntrl - difftime - terminate - memcmp - uninitialized_fill - hex - tie - back_inserter - upper_bound - adjacent_find - use_facet - vfwprintf - atomic_fetch_add - fsetpos - malloc - localtime - wcscmp - c32rtomb - isupper - wcstod - tolower - sort_heap - isdigit - wcslen - wmemcmp - move_if_noexcept - declval - fpclassify - iswupper - rand - atomic_compare_exchange_weak_explicit - partial_sort - llrint - fclose - reverse - partial_sum - showbase - vswscanf - atan - atanh - iter_swap - scalbln - reverse_copy - forward - getc - equal_range - atomic_fetch_sub - is_partitioned - next_permutation - isblank - noshowpoint - atan2 - nanf - towctrans - right - fputwc - strtoul - is_heap - fflush - strtoumax - nexttoward - nounitbuf - ispunct - noboolalpha - make_pair - iswctype - srand - replace_copy - future_category - resetiosflags - vprintf - gmtime - align - tuple_cat - ends - set_terminate - lrint - none_of - wscanf - fputc - dec - strcat - raise - wcsspn - fabs - wmemcpy - copy_n - rethrow_if_nested - setlocale - addressof - calloc - strerror - strcpy - wcstoull - c16rtomb - generate_canonical - vfprintf - notify_all_at_thread_exit - rotate - current_exception - strtok - wcscat - strncpy - towlower - floor - left - ferror - atomic_load_explicit - swap - acos - wcscoll - sqrt - mbsinit - qsort - stoll - put_money - wcstoul - wcstol - atexit - atomic_fetch_or - rewind - wcsxfrm - round - vwprintf - 
all_of - replace - remquo - setbuf - strncmp - localeconv - wctrans - any_of - equal - max - strxfrm - iswxdigit - labs - regex_match - fputws - wcrtomb - setprecision - setvbuf - regex_replace - freopen - logb - wctob - atomic_load - search_n - toupper - move_backward - is_sorted - strtoull - iswblank - get_pointer_safety - get_unexpected - sscanf - fesetenv - atomic_store_explicit - strtold - fread - memchr - btowc - replace_if - strcoll - vsprintf - mismatch - getchar - islower - tmpnam - nanl - fopen - for_each - fegetround - ungetc - internal - vfwscanf - fgetc - wcstof - bind - skipws - iswprint - wcstombs - inplace_merge - copysign - putwchar - wcsstr - fegetenv - longjmp - iswcntrl - declare_no_pointers - isnormal - swap_ranges - minmax - defaultfloat - rename - snprintf - try_lock - stoul - fgetpos - partition_copy - vscanf - front_inserter - get_terminate - cosh - prev - strchr - strstr - printf - setfill - inner_product - swap - swap - tf - TaskPriority - HIGH - NORMAL - LOW - MAX TaskType PLACEHOLDER STATIC + RUNTIME SUBFLOW CONDITION MODULE @@ -1842,31 +1252,53 @@ PipeType PARALLEL SERIAL - cudaTaskType - EMPTY - HOST - MEMSET - MEMCPY - KERNEL - SUBFLOW - CAPTURE - UNDEFINED - observer_stamp_t - DefaultPartitioner - cudaDefaultExecutionPolicy + DefaultNotifier + observer_stamp_t + DefaultPartitioner + cudaEvent + cudaStream + cudaGraph + cudaGraphExec is_task_params_v - node_pool - TASK_TYPES + has_graph_v + TASK_TYPES + is_static_task_v is_subflow_task_v + is_runtime_task_v is_condition_task_v is_multi_condition_task_v - is_static_task_v is_partitioner_v capacity_in_bytes - to_string - operator<< - to_string + next_pow2 + is_pow2 + floor_log2 + static_floor_log2 + median_of_three + pseudo_median_of_nine + sort2 + sort3 + unique_id + atomic_max + atomic_min + seed + ctz + coprime + make_coprime_lut + get_env + has_env + pause + pause + spin_until + is_index_range_invalid + distance + animate + recycle + make_worker_interface + to_string + operator<< + to_string make_data_pipe + make_module_task cuda_get_num_devices cuda_get_device cuda_set_device @@ -1889,239 +1321,145 @@ cuda_get_runtime_version cuda_get_free_mem cuda_get_total_mem - cuda_malloc_device - cuda_malloc_device - cuda_malloc_shared + cuda_malloc_device + cuda_malloc_device + cuda_malloc_shared cuda_free cuda_free cuda_memcpy_async cuda_memset_async - to_string - operator<< - cuda_single_task - cuda_for_each - cuda_for_each_index - cuda_single_task - cuda_transform - cuda_transform - cuda_reduce - cuda_uninitialized_reduce - cuda_transform_reduce - cuda_uninitialized_transform_reduce - cuda_inclusive_scan - cuda_transform_inclusive_scan - cuda_exclusive_scan - cuda_transform_exclusive_scan - cuda_merge_by_key - cuda_merge - cuda_sort_buffer_size - cuda_sort_by_key - cuda_sort - cuda_find_if - cuda_min_element - cuda_max_element - version + cuda_get_copy_parms + cuda_get_memcpy_parms + cuda_get_memset_parms + cuda_get_fill_parms + cuda_get_zero_parms + cuda_graph_get_num_root_nodes + cuda_graph_get_num_nodes + cuda_graph_get_num_edges + cuda_graph_get_nodes + cuda_graph_get_root_nodes + cuda_graph_get_edges + cuda_get_graph_node_type + to_string + operator<< + version tf::detail - cudaScanType - EXCLUSIVE - INCLUSIVE - cudaMergeBoundType - LOWER - UPPER - cudaScanRecursionThreshold NextCapacity - cuda_for_each_kernel - cuda_for_each_index_kernel - cuda_transform_kernel - cuda_transform_kernel - cuda_reduce_kernel - cuda_reduce_loop - cuda_uninitialized_reduce_kernel - cuda_uninitialized_reduce_loop - 
cuda_single_pass_scan - cuda_scan_loop - cuda_merge_path - cuda_merge_path - cuda_merge_predicate - cuda_compute_merge_range - cuda_load_two_streams_reg - load_two_streams_reg - cuda_load_two_streams_shared - cuda_gather_two_streams_strided - cuda_gather_two_streams_strided - cuda_transfer_two_streams_strided - cuda_serial_merge - block_merge_from_mem - cuda_merge_path_partitions - cuda_merge_loop - cuda_clz - cuda_find_log2 - cuda_odd_even_sort - cuda_odd_even_sort - cuda_out_of_range_flags - cuda_compute_merge_sort_frame - cuda_compute_merge_sort_range - cuda_compute_merge_sort_range - cuda_merge_sort_partitions - merge_sort_loop - cuda_find_if_loop - cuda_min_element_loop - cuda_max_element_loop + get_node_ptr + cuda_for_each_kernel + cuda_for_each_index_kernel + cuda_transform_kernel + cuda_transform_kernel - algorithms.dox - - async_task.hpp - - async_tasking.dox - - benchmark_taskflow.dox + tf::pt + this_worker - cancellation.dox - - codeofconduct.dox - - composable_tasking.dox - - conditional_tasking.dox - - contributing.dox - - contributors.dox - - Cookbook.dox - - critical.hpp - - cuda_capturer.hpp - - cuda_compile.dox - - cuda_device.hpp - - cuda_execution_policy.hpp + algorithms.dox - cuda_memory.hpp + data_pipeline.dox - cuda_optimizer.hpp + find.dox - cuda_std_algorithms.dox + for_each.dox - cuda_std_execution_policy.dox + module.dox - cuda_std_find.dox + partitioner.dox - cuda_std_for_each.dox + pipeline.dox - cuda_std_merge.dox + pipeline_with_token_dependencies.dox - cuda_std_reduce.dox + reduce.dox - cuda_std_scan.dox + scalable_pipeline.dox - cuda_std_single_task.dox + scan.dox - cuda_std_transform.dox + sort.dox - cuda_stream.hpp + transform.dox - cuda_task.hpp + contributing.dox - cudaflow.hpp + contributors.dox - cudaflow_algorithms.dox + guidelines.dox - cudaflow_for_each.dox + async_tasking.dox - cudaflow_single_task.dox + cancellation.dox - cudaflow_transform.dox + composable_tasking.dox - data_pipeline.dox + conditional_tasking.dox - data_pipeline.hpp + Cookbook.dox dependent_async_tasking.dox - dreamplace.dox - - examples.dox - exception.dox executor.dox - executor.hpp - - FAQ.dox - - fibonacci.dox - - find.dox + gpu_tasking.dox - find.hpp + motivation.dox - flipcoins.dox + profiler.dox - flow_builder.hpp + runtime_tasking.dox - for_each.dox + semaphore.dox - for_each.hpp + static_tasking.dox - governance.dox + subflow_tasking.dox - gpu_tasking_cudaflow.dox + examples.dox - gpu_tasking_cudaflow_capturer.dox + fibonacci.dox - graph.hpp + flipcoins.dox graph_pipeline.dox graph_traversal.dox - guidelines.dox - - header.html - - install.dox - kmeans.dox - kmeans_cudaflow.dox + kmeans_cuda.dox - matrix_multiplication.dox + matmul.dox - matrix_multiplication_cudaflow.dox + matmul_cuda.dox - merge.hpp - - motivation.dox + taskflow_pipeline.dox - observer.hpp + text_pipeline.dox - opentimer.dox + wavefront.dox - partitioner.dox + FAQ.dox - partitioner.hpp + codeofconduct.dox - pipeline.dox + governance.dox - pipeline.hpp + rules.dox - pipeline_with_token_dependencies.dox + team.dox - prioritized_tasking.dox + header.html - profiler.dox + benchmark_taskflow.dox - QuickStart.dox + cuda_compile.dox - reduce.dox + install.dox - reduce.hpp + QuickStart.dox references.dox @@ -2149,6 +1487,10 @@ release-3.1.0.dox + release-3.10.0.dox + + release-3.11.0.dox + release-3.2.0.dox release-3.3.0.dox @@ -2161,62 +1503,108 @@ release-3.7.0.dox + release-3.8.0.dox + + release-3.9.0.dox + release-roadmap.dox releases.dox - rules.dox + dreamplace.dox - runtime_tasking.dox + opentimer.dox - 
scalable_pipeline.dox + usecases.dox - scan.dox + data_pipeline.hpp - scan.hpp + module.hpp - semaphore.dox + partitioner.hpp - semaphore.hpp + pipeline.hpp - small_vector.hpp + async_task.hpp - sort.dox + executor.hpp - sort.hpp + flow_builder.hpp - static_tasking.dox + graph.hpp - subflow_tasking.dox + observer.hpp + + runtime.hpp + TF_RUNTIME_CHECK_CALLER + + semaphore.hpp task.hpp - core/taskflow.hpp + taskflow.hpp taskflow.hpp + TF_VERSION + TF_MAJOR_VERSION + TF_MINOR_VERSION + TF_PATCH_VERSION - taskflow_pipeline.dox - - team.dox + tsq.hpp + TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE + TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE - text_pipeline.dox + worker.hpp - transform.dox + for_each.hpp transform.hpp - tsq.hpp + cuda_device.hpp - usecases.dox + cuda_graph.hpp - wavefront.dox + cuda_graph_exec.hpp - worker.hpp + cuda_memory.hpp + + cuda_stream.hpp + + cudaflow.hpp + + iterator.hpp + + math.hpp + + os.hpp + TF_OS_LINUX + TF_OS_DRAGONFLY + TF_OS_FREEBSD + TF_OS_NETBSD + TF_OS_OPENBSD + TF_OS_DARWIN + TF_OS_WINDOWS + TF_OS_CNK + TF_OS_HURD + TF_OS_SOLARIS + TF_OS_UNIX + TF_OS_UNKNOWN + TF_CACHELINE_SIZE + + small_vector.hpp Releases release-roadmap + release-3-11-0 + + release-3-10-0 + + release-3-9-0 + + release-3-8-0 + release-3-7-0 release-3-6-0 @@ -2275,8 +1663,6 @@ RuntimeTasking - PrioritizedTasking - LimitTheMaximumConcurrency AsyncTasking @@ -2285,9 +1671,7 @@ ExceptionHandling - GPUTaskingcudaFlow - - GPUTaskingcudaFlowCapturer + GPUTasking RequestCancellation @@ -2309,6 +1693,8 @@ ParallelFind + ModuleAlgorithm + TaskParallelPipeline TaskParallelScalablePipeline @@ -2317,43 +1703,17 @@ TaskParallelPipelineWithTokenDependencies - cudaFlowAlgorithms - - SingleTaskCUDA - - ForEachCUDA - - ParallelTransformsCUDA - - cudaStandardAlgorithms - - CUDASTDExecutionPolicy - - CUDASTDSingleTask - - CUDASTDForEach - - CUDASTDTransform - - CUDASTDReduce - - CUDASTDScan - - CUDASTDMerge - - CUDASTDFind - Examples wavefront matrix_multiplication - matrix_multiplication_cudaflow + MatrixMultiplicationWithCUDAGPU kmeans - kmeans_cudaflow + KMeansWithCUDAGPU fibonacci @@ -2391,39 +1751,37 @@ References - algorithm - - algorithm + taskflow/algorithm - algorithms + taskflow/cuda/algorithm - contributing + doxygen/algorithms - cookbook + doxygen/contributing - core + doxygen/cookbook - cuda + taskflow/core - cuda_std_algorithms + taskflow/cuda - cudaflow_algorithms + doxygen - examples + doxygen/examples - governance + doxygen/governance - install + doxygen/install - references + doxygen/references - releases + doxygen/releases taskflow - usecases + doxygen/usecases - utility + taskflow/utility index diff --git a/docs/xml/index.xsd b/docs/xml/index.xsd index edb1d347d..6c847cc36 100644 --- a/docs/xml/index.xsd +++ b/docs/xml/index.xsd @@ -45,6 +45,8 @@ + + diff --git a/docs/xml/indexpage.xml b/docs/xml/indexpage.xml index 7951bd370..4164a4296 100644 --- a/docs/xml/indexpage.xml +++ b/docs/xml/indexpage.xml @@ -1,67 +1,64 @@ - + index - Codestin Search App + Codestin Search App Start Your First Taskflow Program indexpage_1ASimpleFirstProgram - + Create a Subflow Graph indexpage_1QuickStartCreateASubflowGraph - + Integrate Control Flow into a Task Graph indexpage_1QuickStartIntegrateControlFlowIntoATaskGraph - - - Offload Tasks to a GPU - indexpage_1QuickStartOffloadTasksToGPU - + Compose Task Graphs indexpage_1QuickStartComposeTaskGraphs - + Launch Asynchronous Tasks indexpage_1QuickStartLaunchAsyncTasks - + + + Leverage Standard Parallel Algorithms + 
indexpage_1QuickStartLeverageStandardParallelAlgorithms
+ 
 
Run a Taskflow through an Executor
indexpage_1QuickStartRunATaskflowThroughAnExecution
- 
+ 
 
- Leverage Standard Parallel Algorithms
- indexpage_1QuickStartLeverageStandardParallelAlgorithms
- 
+ Offload Tasks to a GPU
+ indexpage_1QuickStartOffloadTasksToGPU
+ 
 
Visualize Taskflow Graphs
indexpage_1QuickStartVisualizeATaskflow
- 
+ 
 
Supported Compilers
indexpage_1SupportedCompilers
- 
+ 
 
Get Involved
indexpage_1QuickStartGetInvolved
- 
+ 
 
License
indexpage_1License
- 
+ 
 
 
Taskflow helps you quickly write parallel and heterogeneous task programs with high performance and simultaneously high productivity. It is faster, more expressive, requires fewer lines of code, and is easier to drop into an existing project than many existing task programming libraries. The source code is available in our Project GitHub.
-Start Your First Taskflow Program
-The following program (simple.cpp) creates four tasks A, B, C, and D, where A runs before B and C, and D runs after B and C. When A finishes, B and C can run in parallel.
- 
- 
+Start Your First Taskflow Program The following program (simple.cpp) creates a taskflow of four tasks A, B, C, and D, where A runs before B and C, and D runs after B and C. When A finishes, B and C can run in parallel.
#include<taskflow/taskflow.hpp>//Taskflowisheader-only

intmain(){
@@ -70,10 +67,10 @@

tf::Taskflowtaskflow;

auto[A,B,C,D]=taskflow.emplace(//createfourtasks
-[](){std::cout<<"TaskA\n";},
-[](){std::cout<<"TaskB\n";},
-[](){std::cout<<"TaskC\n";},
-[](){std::cout<<"TaskD\n";}
+[](){std::cout<<"TaskA\n";},
+[](){std::cout<<"TaskB\n";},
+[](){std::cout<<"TaskC\n";},
+[](){std::cout<<"TaskD\n";}
);

A.precede(B,C);//ArunsbeforeBandC
@@ -84,9 +81,11 @@

return0;
}

+ 
+ 
Taskflow is header-only and there is no installation to wrangle with. To compile the program, clone the Taskflow project and tell the compiler to include the headers under taskflow/.
-~$gitclonehttps://github.com/taskflow/taskflow.git#cloneitonlyonce
-~$g++-std=c++17simple.cpp-Itaskflow/-O2-pthread-osimple
+~$gitclonehttps://github.com/taskflow/taskflow.git#cloneitonlyonce
+~$g++-std=c++20simple.cpp-Itaskflow/-O2-pthread-osimple
~$./simple
TaskA
TaskC
@@ -96,7 +95,7 @@

Taskflow comes with a built-in profiler, Taskflow Profiler, for you to profile and visualize taskflow programs in an easy-to-use web-based interface.
-#runtheprogramwiththeenvironmentvariableTF_ENABLE_PROFILERenabled
+#runtheprogramwiththeenvironmentvariableTF_ENABLE_PROFILERenabled
~$TF_ENABLE_PROFILER=simple.json./simple
~$catsimple.json
[
@@ -106,8 +105,7 @@

-Create a Subflow Graph
-Taskflow supports recursive tasking for you to create a subflow graph from the execution of a task to perform recursive parallelism. The following program spawns a task dependency graph parented at task B.
+Create a Subflow Graph Taskflow supports recursive tasking for you to create a subflow graph from the execution of a task to perform recursive parallelism. The following program spawns a task dependency graph parented at task B.
tf::TaskA=taskflow.emplace([](){}).name("A");
tf::TaskC=taskflow.emplace([](){}).name("C");
tf::TaskD=taskflow.emplace([](){}).name("D");
@@ -122,112 +120,66 @@

A.precede(B,C);//ArunsbeforeBandC
D.succeed(B,C);//DrunsafterBandC

- 
+ 

-Integrate Control Flow into a Task Graph
-Taskflow supports conditional tasking for you to make rapid control-flow decisions across dependent tasks to implement cycles and conditions in an end-to-end task graph.
+Integrate Control Flow into a Task Graph Taskflow supports conditional tasking for you to make rapid control-flow decisions across dependent tasks to implement cycles and conditions in an end-to-end task graph.
tf::Taskinit=taskflow.emplace([](){}).name("init");
tf::Taskstop=taskflow.emplace([](){}).name("stop");

//createsaconditiontaskthatreturnsarandombinary
-tf::Taskcond=taskflow.emplace([](){returnstd::rand()%2;}).name("cond");
+tf::Taskcond=taskflow.emplace([](){returnstd::rand()%2;}).name("cond");

//createsafeedbackloop{0:cond,1:stop}
init.precede(cond);
cond.precede(cond,stop);//movesonto'cond'onreturning0,or'stop'on1

- 
- 
- 
- 
-Offload Tasks to a GPU
-Taskflow supports GPU tasking for you to accelerate a wide range of scientific computing applications by harnessing the power of CPU-GPU collaborative computing using CUDA.
-__global__voidsaxpy(intn,floata,float*x,float*y){
-inti=blockIdx.x*blockDim.x+threadIdx.x;
-if(i<n){
-y[i]=a*x[i]+y[i];
-}
-}
-tf::Taskcudaflow=taskflow.emplace([&](tf::cudaFlow&cf){
-tf::cudaTaskh2d_x=cf.copy(dx,hx.data(),N).name("h2d_x");
-tf::cudaTaskh2d_y=cf.copy(dy,hy.data(),N).name("h2d_y");
-tf::cudaTaskd2h_x=cf.copy(hx.data(),dx,N).name("d2h_x");
-tf::cudaTaskd2h_y=cf.copy(hy.data(),dy,N).name("d2h_y");
-tf::cudaTasksaxpy=cf.kernel((N+255)/256,256,0,saxpy,N,2.0f,dx,dy)
-.name("saxpy");//parameterstothesaxpykernel
-saxpy.succeed(h2d_x,h2d_y)
-.precede(d2h_x,d2h_y);
-}).name("cudaFlow");
- 
- 
+ 

-Compose Task Graphs
-Taskflow is composable. You can create large parallel graphs through composition of modular and reusable blocks that are easier to optimize at an individual scope.
+Compose Task Graphs Taskflow is composable. You can create large parallel graphs through composition of modular and reusable blocks that are easier to optimize at an individual scope.
tf::Taskflowf1,f2;

//createtaskflowf1oftwotasks
-tf::Taskf1A=f1.emplace([](){std::cout<<"Taskf1A\n";}).name("f1A");
-tf::Taskf1B=f1.emplace([](){std::cout<<"Taskf1B\n";}).name("f1B");
+tf::Taskf1A=f1.emplace([](){std::cout<<"Taskf1A\n";}).name("f1A");
+tf::Taskf1B=f1.emplace([](){std::cout<<"Taskf1B\n";}).name("f1B");

//createtaskflowf2withonemoduletaskcomposedoff1
-tf::Taskf2A=f2.emplace([](){std::cout<<"Taskf2A\n";}).name("f2A");
-tf::Taskf2B=f2.emplace([](){std::cout<<"Taskf2B\n";}).name("f2B");
-tf::Taskf2C=f2.emplace([](){std::cout<<"Taskf2C\n";}).name("f2C");
+tf::Taskf2A=f2.emplace([](){std::cout<<"Taskf2A\n";}).name("f2A");
+tf::Taskf2B=f2.emplace([](){std::cout<<"Taskf2B\n";}).name("f2B");
+tf::Taskf2C=f2.emplace([](){std::cout<<"Taskf2C\n";}).name("f2C");

tf::Taskf1_module_task=f2.composed_of(f1).name("module");
f1_module_task.succeed(f2A,f2B)
.precede(f2C);

- 
+ 

-Launch Asynchronous Tasks
-Taskflow supports asynchronous tasking. You can launch tasks asynchronously to dynamically explore task graph parallelism.
+Launch Asynchronous Tasks Taskflow supports asynchronous tasking. You can launch tasks asynchronously to dynamically explore task graph parallelism.
tf::Executorexecutor;
//createasynchronoustasksdirectlyfromanexecutor
-std::future<int>future=executor.async([](){
-std::cout<<"asynctaskreturns1\n";
+std::future<int>future=executor.async([](){
+std::cout<<"asynctaskreturns1\n";
return1;
});
-executor.silent_async([](){std::cout<<"asynctaskdoesnotreturn\n";});
+executor.silent_async([](){std::cout<<"asynctaskdoesnotreturn\n";});
//createasynchronoustaskswithdynamicdependencies
-tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");});
-tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");},A);
-tf::AsyncTaskC=executor.silent_dependent_async([](){printf("C\n");},A);
-tf::AsyncTaskD=executor.silent_dependent_async([](){printf("D\n");},B,C);
-
-executor.wait_for_all();
-
-
-
-Codestin Search App
-The executor provides several thread-safe methods to run a taskflow. You can run a taskflow once, multiple times, or until a stopping criteria is met. These methods are non-blocking with a tf::Future<void> return to let you query the execution status.
-//runsthetaskflowonce
-tf::Future<void>run_once=executor.run(taskflow);
-
-//waitonthisruntofinish
-run_once.get();
-
-//runthetaskflowfourtimes
-executor.run_n(taskflow,4);
+tf::AsyncTaskA=executor.silent_dependent_async([](){printf("A\n");});
+tf::AsyncTaskB=executor.silent_dependent_async([](){printf("B\n");},A);
+tf::AsyncTaskC=executor.silent_dependent_async([](){printf("C\n");},A);
+tf::AsyncTaskD=executor.silent_dependent_async([](){printf("D\n");},B,C);
-//runsthetaskflowfivetimes
-executor.run_until(taskflow,[counter=5](){return--counter==0;});
-
-//blockstheexecutoruntilallsubmittedtaskflowscomplete
executor.wait_for_all();
-Codestin Search App
-Taskflow defines algorithms for you to quickly express common parallel patterns using standard C++ syntaxes, such as parallel iterations, parallel reductions, and parallel sort.
+Codestin Search AppTaskflow defines algorithms for you to quickly express common parallel patterns using standard C++ syntax, such as parallel iterations, parallel reductions, and parallel sort.
//standardparallelCPUalgorithms
tf::Tasktask1=taskflow.for_each(//assigneachelementto100inparallel
first,last,[](auto&i){i=100;}
@@ -248,47 +200,91 @@
}
}},
tf::Pipe{tf::PipeType::SERIAL,[](tf::Pipeflow&pf){
-printf("stage2:inputbuffer[%zu]=%d\n",pf.line(),buffer[pf.line()]);
+printf("stage2:inputbuffer[%zu]=%d\n",pf.line(),buffer[pf.line()]);
}},
tf::Pipe{tf::PipeType::SERIAL,[](tf::Pipeflow&pf){
-printf("stage3:inputbuffer[%zu]=%d\n",pf.line(),buffer[pf.line()]);
+printf("stage3:inputbuffer[%zu]=%d\n",pf.line(),buffer[pf.line()]);
}}
);
taskflow.composed_of(pl);
executor.run(taskflow).wait();
+
+Codestin Search AppThe executor provides several thread-safe methods to run a taskflow. You can run a taskflow once, multiple times, or until a stopping criterion is met. These methods are non-blocking and return a tf::Future<void> that lets you query the execution status.
+//runsthetaskflowonce
+tf::Future<void>run_once=executor.run(taskflow);
+
+//waitonthisruntofinish
+run_once.get();
+
+//runthetaskflowfourtimes
+executor.run_n(taskflow,4);
+
+//runsthetaskflowfivetimes
+executor.run_until(taskflow,[counter=5](){return--counter==0;});
+
+//blockstheexecutoruntilallsubmittedtaskflowscomplete
+executor.wait_for_all();
+
+
+
+Codestin Search AppTaskflow supports GPU tasking for you to accelerate a wide range of scientific computing applications by harnessing the power of CPU-GPU collaborative computing using Nvidia CUDA Graph.
+__global__voidsaxpy(intn,floata,float*x,float*y){
+inti=blockIdx.x*blockDim.x+threadIdx.x;
+if(i<n){
+y[i]=a*x[i]+y[i];
+}
+}
+//createaCUDAGraphtask
+tf::Taskcudaflow=taskflow.emplace([&](){
+tf::cudaGraphcg;
+tf::cudaTaskh2d_x=cg.copy(dx,hx.data(),N);
+tf::cudaTaskh2d_y=cg.copy(dy,hy.data(),N);
+tf::cudaTaskd2h_x=cg.copy(hx.data(),dx,N);
+tf::cudaTaskd2h_y=cg.copy(hy.data(),dy,N);
+tf::cudaTasksaxpy=cg.kernel((N+255)/256,256,0,saxpy,N,2.0f,dx,dy);
+saxpy.succeed(h2d_x,h2d_y)
+.precede(d2h_x,d2h_y);
+
+//instantiateanexecutableCUDAgraphandrunitthroughastream
+tf::cudaGraphExecexec(cg);
+tf::cudaStreamstream;
+stream.run(exec).synchronize();
+}).name("CUDAGraphTask");
+
+
+
+
-Codestin Search App
-You can dump a taskflow graph to a DOT format and visualize it using a number of free GraphViz tools such as GraphViz Online.
+Codestin Search AppYou can dump a taskflow graph in DOT format and visualize it using a number of free GraphViz tools such as GraphViz Online.
tf::Taskflowtaskflow;
-tf::TaskA=taskflow.emplace([](){}).name("A");
-tf::TaskB=taskflow.emplace([](){}).name("B");
-tf::TaskC=taskflow.emplace([](){}).name("C");
-tf::TaskD=taskflow.emplace([](){}).name("D");
-tf::TaskE=taskflow.emplace([](){}).name("E");
+tf::TaskA=taskflow.emplace([](){}).name("A");
+tf::TaskB=taskflow.emplace([](){}).name("B");
+tf::TaskC=taskflow.emplace([](){}).name("C");
+tf::TaskD=taskflow.emplace([](){}).name("D");
+tf::TaskE=taskflow.emplace([](){}).name("E");
A.precede(B,C,E);
C.precede(D);
B.precede(D,E);
//dumpthegraphtoaDOTfilethroughstd::cout
-taskflow.dump(std::cout);
+taskflow.dump(std::cout);
- +
-Codestin Search App
-To use Taskflow, you only need a compiler that supports C++17:
+Codestin Search AppTo use Taskflow, you only need a compiler that supports C++17:
GNU C++ Compiler at least v8.4 with -std=c++17
Clang C++ Compiler at least v6.0 with -std=c++17
-Microsoft Visual Studio at least v19.27 with /std:c++17
+Microsoft Visual Studio at least v19.14 with /std:c++17
-AppleClang Xcode Version at least v12.0 with -std=c++17
+Apple Clang Xcode Version at least v12.0 with -std=c++17
Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
@@ -298,10 +294,12 @@ Taskflow works on Linux, Windows, and Mac OS X.
+Although Taskflow primarily supports C++17, you can enable C++20 compilation through -std=c++20 (or /std:c++20 for MSVC) to benefit from new C++20 features and achieve better performance.
+
+
-Codestin Search App
-Visit our Project Website and showcase presentation to learn more about Taskflow. To get involved:
+Codestin Search AppVisit our Project Website and showcase presentation to learn more about Taskflow. To get involved:
See release notes at Release Notes
Read the step-by-step tutorial at Cookbook
@@ -310,7 +308,7 @@ Taskflow works on Linux, Windows, and Mac OS X.
Watch our 2020 CppCon Taskflow Talk and 2020 MUC++ Taskflow Talk
-We are committed to support trustworthy developments for both academic and industrial research projects in parallel and heterogeneous computing. If you are using Taskflow, please cite the following paper we publised at 2022 IEEE TPDS:
+We are committed to supporting trustworthy development for both academic and industrial research projects in parallel and heterogeneous computing.
If you are using Taskflow, please cite the following paper we published at 2022 IEEE TPDS:
Tsung-Wei Huang, Dian-Lun Lin, Chun-Xun Lin, and Yibo Lin, "Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System," IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 6, pp. 1303-1320, June 2022
@@ -335,7 +333,8 @@ Taskflow works on Linux, Windows, and Mac OS X.
- + +
@@ -343,10 +342,9 @@ Taskflow works on Linux, Windows, and Mac OS X.
-Codestin Search App
-Taskflow is open-source under permissive MIT license. You are completely free to use, modify, and redistribute any work on top of Taskflow. The source code is available in Project GitHub and is actively maintained by Dr. Tsung-Wei Huang and his research group at the University of Wisconsin at Madison.
+Codestin Search AppTaskflow is open-source under the permissive MIT license. You are completely free to use, modify, and redistribute any work on top of Taskflow. The source code is available in Project GitHub and is actively maintained by Dr. Tsung-Wei Huang and his research group at the University of Wisconsin at Madison.
- +
diff --git a/docs/xml/install.xml b/docs/xml/install.xml
index b46477670..191b82135 100644
--- a/docs/xml/install.xml
+++ b/docs/xml/install.xml
@@ -1,5 +1,5 @@
- +
install
Codestin Search App
@@ -9,39 +9,38 @@
Supported Compilers
install_1BAISupportedCompilers
- +
Integrate Taskflow to Your Project
install_1BAIIntegrateTaskflowToYourProject
- +
Build Examples and Unit Tests
install_1BAIBuildExamplesAndUnitTests
- +
Build CUDA Examples and Unit Tests
install_1BAIBuildCUDACode
- +
Build Sanitizers
install_1BAIBuildSanitizers
- +
Build Benchmarks
install_1BAIBuildBenchmarks
- +
Build Documentation
install_1BAIBuildDocumentation
- +
This page describes how to set up Taskflow in your project. We will also go through the build process of unit tests and examples.
-Codestin Search App
-To use Taskflow, you only need a compiler that supports C++17:
+Codestin Search AppTo use Taskflow, you only need a compiler that supports C++17:
GNU C++ Compiler at least v8.4 with -std=c++17
@@ -53,7 +52,7 @@
Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
-Intel C++ Compiler (nvcc) at least v19.0.1 with -std=c++17
+Intel C++ Compiler (icpc) at least v19.0.1 with -std=c++17
Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 and SYCL20
@@ -61,20 +60,18 @@ Taskflow works on Linux, Windows, and Mac OS X.
-Codestin Search App
-Taskflow is header-only and there is no need for installation. Simply download the source and copy the headers under the directory taskflow/ to your project.
-~$gitclonehttps://github.com/taskflow/taskflow.git
+Codestin Search AppTaskflow is header-only and there is no need for installation. Simply download the source and copy the headers under the directory taskflow/ to your project.
+~$gitclonehttps://github.com/taskflow/taskflow.git
~$cdtaskflow/
~$cp-rtaskflowmyproject/include/
Taskflow is written in C++17 and is built on top of C++ standardized threading libraries to improve portability. To compile a Taskflow program, say simple.cpp, you need to tell the compiler where to find the Taskflow header files and link it against the system thread library (usually POSIX threads on Linux-like systems). Take gcc as an example:
-~$g++simple.cpp-std=c++17-Imyproject/include/-O2-pthread-osimple
+~$g++simple.cpp-std=c++17-Imyproject/include/-O2-pthread-osimple
-Codestin Search App
-Taskflow uses CMake to build examples and unit tests.
We recommend using out-of-source build.
-~$cdpath/to/taskflow
+Codestin Search AppTaskflow uses CMake to build examples and unit tests. We recommend using an out-of-source build.
+~$cdpath/to/taskflow
~$mkdirbuild
~$cdbuild
~$cmake../
@@ -97,7 +94,7 @@ Taskflow works on Linux, Windows, and Mac OS X.
TotalTesttime(real)=29.67sec
When the build completes, you can find the executables for examples and tests under the two folders, examples/ and unittests/. You can list the set of available options in cmake.
-~$cmake-LA
+~$cmake-LA
...
TF_BUILD_EXAMPLES:BOOL=ON#bydefault,wecompileexamples
TF_BUILD_TESTS:BOOL=ON#bydefault,wecompiletests
@@ -135,22 +132,20 @@ Taskflow works on Linux, Windows, and Mac OS X.
    To enable or disable a specific option, use -D in the CMake build. For example: -~$cmake../-DTF_BUILD_EXAMPLES=OFF +~$cmake../-DTF_BUILD_EXAMPLES=OFF The above command turns off building Taskflow examples.
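The options listed in the cmake -LA output above can also be combined in a single configuration step. As a minimal sketch, the following command configures a release build that compiles neither the examples nor the unit tests:

~$ cmake ../ -DCMAKE_BUILD_TYPE=Release -DTF_BUILD_EXAMPLES=OFF -DTF_BUILD_TESTS=OFF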
-Codestin Search App
-To build CUDA code, including unit tests and examples, enable the CMake option TF_BUILD_CUDA to ON. Cmake will automatically detect the existence of nvcc and use it to compile and link .cu code.
-~$cmake../-DTF_BUILD_CUDA=ON
+Codestin Search AppTo build CUDA code, including unit tests and examples, enable the CMake option TF_BUILD_CUDA to ON. CMake will automatically detect the existence of nvcc and use it to compile and link .cu code.
+~$cmake../-DTF_BUILD_CUDA=ON
~$make
Please visit the page Compile Taskflow with CUDA for details.
-Codestin Search App
-You can build Taskflow with sanitizers to detect a variety of errors, such as data race, memory leak, undefined behavior, and others. To enable a sanitizer, add the sanitizer flag to the CMake variable CMAKE_CXX_FLAGS. The following example enables thread sanitizer in building Taskflow code to detect data race:
-#buildTaskflowcodewiththreadsanitizertodetectdatarace
+Codestin Search AppYou can build Taskflow with sanitizers to detect a variety of errors, such as data races, memory leaks, undefined behavior, and others. To enable a sanitizer, add the sanitizer flag to the CMake variable CMAKE_CXX_FLAGS. The following example enables the thread sanitizer when building Taskflow code to detect data races:
+#buildTaskflowcodewiththreadsanitizertodetectdatarace
~$cmake../-DCMAKE_CXX_FLAGS="-fsanitize=thread-g"
#buildTaskflowcodewithaddresssanitizertodetectillegalmemoryaccess
@@ -160,39 +155,37 @@ Taskflow works on Linux, Windows, and Mac OS X.
~$cmake../-DCMAKE_CXX_FLAGS="-fsanitize=undefined-g"
Our continuous integration workflows incorporate thread sanitizer (-fsanitize=thread), address sanitizer (-fsanitize=address), and leak sanitizer (-fsanitize=leak) to detect data races, illegal memory accesses, and memory leaks. To the best of our knowledge, Taskflow is one of the very few parallel programming libraries that are free from data races.
-Some sanitizers are supported by certain computing architectures. You can find the information about architecture support of each sanitizer at Clang Documentation and GCC Instrumentation Options.
+Some sanitizers are supported only on certain computing architectures. You can find information about the architecture support of each sanitizer at Clang Documentation and GCC Instrumentation Options.
-Codestin Search App
-The Taskflow project contains a set of benchmarks to evaluate and compare the performance of Taskflow with existing parallel programming libraries. To build the benchmark code, enable the CMake option TF_BUILD_BENCHMARKS to ON as follows:
-~$cmake../-DTF_BUILD_BENCHMARKS=ON
+Codestin Search AppThe Taskflow project contains a set of benchmarks to evaluate and compare the performance of Taskflow with existing parallel programming libraries. To build the benchmark code, enable the CMake option TF_BUILD_BENCHMARKS to ON as follows:
+~$cmake../-DTF_BUILD_BENCHMARKS=ON
~$make
Please visit the page Benchmark Taskflow for details.
-Codestin Search App
-Taskflow uses Doxygen and m.css to generate this documentation. The source of documentation is located in the folder taskflow/doxygen and the generated html is output to the folder taskflow/docs.
To generate the documentation, you need to first install doxygen:
+#ubuntuasanexample
~$sudoapt-getinstalldoxygengraphviz
Once you have doxygen and the dot graph generator installed, clone the m.css project and enter the m.css/documentation directory:
-~$gitclonehttps://github.com/mosra/m.css.git
+~$gitclonehttps://github.com/mosra/m.css.git
~$cdm.css/documentation
The script doxygen.py requires Python 3.6 and depends on Jinja2 for templating and Pygments for code block highlighting. You can install the dependencies via pip or your distribution package manager:
-#Youmayneedsudohere
+#Youmayneedsudohere
#Moredetailsareavailableathttps://mcss.mosra.cz/documentation/doxygen/
~$pip3installjinja2Pygments
Next, invoke doxygen.py and point it to taskflow/doxygen/conf.py:
-~$./doxygen.pypath/to/taskflow/doxygen/conf.py
+~$./doxygen.pypath/to/taskflow/doxygen/conf.py
You can find the documentation output in taskflow/docs.
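Putting the steps above together, a minimal end-to-end session might look like the following sketch (it assumes doxygen, graphviz, and the pip dependencies are already installed, and that invoking doxygen.py by path works in your setup):

~$ git clone https://github.com/mosra/m.css.git
~$ ./m.css/documentation/doxygen.py path/to/taskflow/doxygen/conf.py
~$ ls path/to/taskflow/docs    # the generated html lands here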
    - +
    diff --git a/docs/xml/install_8dox.xml b/docs/xml/install_8dox.xml index 3f63d1721..54af17f51 100644 --- a/docs/xml/install_8dox.xml +++ b/docs/xml/install_8dox.xml @@ -1,5 +1,5 @@ - + install.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/iterator_8hpp.xml b/docs/xml/iterator_8hpp.xml new file mode 100644 index 000000000..6f81b6536 --- /dev/null +++ b/docs/xml/iterator_8hpp.xml @@ -0,0 +1,134 @@ + + + + iterator.hpp + cstddef + type_traits + taskflow/core/graph.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::IndexRange + tf + + + + + + + diff --git a/docs/xml/kmeans.xml b/docs/xml/kmeans.xml index 9f9b07086..695d14ac3 100644 --- a/docs/xml/kmeans.xml +++ b/docs/xml/kmeans.xml @@ -1,5 +1,5 @@ - + kmeans Codestin Search App @@ -7,23 +7,22 @@ Problem Formulation kmeans_1KMeansProblemFormulation - + Parallel k-means using CPUs kmeans_1ParallelKMeansUsingCPUs - + Benchmarking kmeans_1KMeansBenchmarking - + We study a fundamental clustering problem in unsupervised learning, k-means clustering. We will begin by discussing the problem formulation and then learn how to write a parallel k-means algorithm. -Codestin Search App -k-means clustering uses centroids, k different randomly-initiated points in the data, and assigns every data point to the nearest centroid. After every point has been assigned, the centroid is moved to the average of all of the points assigned to it. We describe the k-means algorithm in the following steps: +Codestin Search Appk-means clustering uses centroids, k different randomly-initiated points in the data, and assigns every data point to the nearest centroid. After every point has been assigned, the centroid is moved to the average of all of the points assigned to it. 
We describe the k-means algorithm in the following steps: Step 1: initialize k random centroids @@ -49,29 +48,29 @@ //M:numberofiterations //px/py:2Dpointvector voidkmeans_seq( -intN,intK,intM,conststd::vector<float>&px,conststd::vector<float>&py +intN,intK,intM,conststd::vector<float>&px,conststd::vector<float>&py ){ -std::vector<int>c(K); -std::vector<float>sx(K),sy(K),mx(K),my(K); +std::vector<int>c(K); +std::vector<float>sx(K),sy(K),mx(K),my(K); //initialcentroids -std::copy_n(px.begin(),K,mx.begin()); -std::copy_n(py.begin(),K,my.begin()); +std::copy_n(px.begin(),K,mx.begin()); +std::copy_n(py.begin(),K,my.begin()); //k-meansiteration for(intm=0;m<M;m++){ //clearthestorage -std::fill_n(sx.begin(),K,0.0f); -std::fill_n(sy.begin(),K,0.0f); -std::fill_n(c.begin(),K,0); +std::fill_n(sx.begin(),K,0.0f); +std::fill_n(sy.begin(),K,0.0f); +std::fill_n(c.begin(),K,0); //findthebestk(clusterid)foreachpoint for(inti=0;i<N;++i){ floatx=px[i]; floaty=py[i]; -floatbest_d=std::numeric_limits<float>::max(); +floatbest_d=std::numeric_limits<float>::max(); intbest_k=0; for(intk=0;k<K;++k){ constfloatd=L2(x,y,mx[k],my[k]); @@ -87,24 +86,23 @@ //updatethecentroid for(intk=0;k<K;k++){ -constintcount=max(1,c[k]);//turn0/0to0/1 -mx[k]=sx[k]/count; -my[k]=sy[k]/count; +constintcount=max(1,c[k]);//turn0/0to0/1 +mx[k]=sx[k]/count; +my[k]=sy[k]/count; } } //printthekcentroidsfound for(intk=0;k<K;++k){ -std::cout<<"centroid"<<k<<":"<<std::setw(10)<<mx[k]<<'' -<<std::setw(10)<<my[k]<<'\n'; +std::cout<<"centroid"<<k<<":"<<std::setw(10)<<mx[k]<<'' +<<std::setw(10)<<my[k]<<'\n'; } } -Codestin Search App -The second step of k-means algorithm, assigning every point to the nearest centroid, is highly parallelizable across individual points. We can create a parallel-for task to run parallel iterations. -std::vector<int>best_ks(N);//nearestcentroidofeachpoint +Codestin Search AppThe second step of k-means algorithm, assigning every point to the nearest centroid, is highly parallelizable across individual points. We can create a parallel-for task to run parallel iterations. 
+std::vector<int>best_ks(N);//nearestcentroidofeachpoint
unsignedP=12;//12partitionedtasks
@@ -112,7 +110,7 @@
taskflow.for_each_index(0,N,1,[&](inti){
floatx=px[i];
floaty=py[i];
-floatbest_d=std::numeric_limits<float>::max();
+floatbest_d=std::numeric_limits<float>::max();
intbest_k=0;
for(intk=0;k<K;++k){
constfloatd=L2(x,y,mx[k],my[k]);
@@ -135,9 +133,9 @@
//averageofpoints
for(intk=0;k<K;++k){
-autocount=max(1,c[k]);//turn0/0to0/1
-mx[k]=sx[k]/count;
-my[k]=sy[k]/count;
+autocount=max(1,c[k]);//turn0/0to0/1
+mx[k]=sx[k]/count;
+my[k]=sy[k]/count;
}
});
@@ -152,7 +150,7 @@
//M:numberofiterations
//px/py:2Dpointvector
voidkmeans_par(
-intN,intK,intM,cconststd::vector<float>&px,conststd::vector<float>&py
+intN,intK,intM,conststd::vector<float>&px,conststd::vector<float>&py
){
unsignedP=12;//12partitionsoftheparallel-forgraph
@@ -160,8 +158,8 @@
tf::Executorexecutor;
tf::Taskflowtaskflow("K-Means");
-std::vector<int>c(K),best_ks(N);
-std::vector<float>sx(K),sy(K),mx(K),my(K);
+std::vector<int>c(K),best_ks(N);
+std::vector<float>sx(K),sy(K),mx(K),my(K);
//initialcentroids
tf::Taskinit=taskflow.emplace([&](){
@@ -184,7 +182,7 @@
tf::Taskpf=taskflow.for_each_index(0,N,1,[&](inti){
floatx=px[i];
floaty=py[i];
-floatbest_d=std::numeric_limits<float>::max();
+floatbest_d=std::numeric_limits<float>::max();
intbest_k=0;
for(intk=0;k<K;++k){
constfloatd=L2(x,y,mx[k],my[k]);
@@ -204,9 +202,9 @@
}
for(intk=0;k<K;++k){
-autocount=max(1,c[k]);//turn0/0to0/1
-mx[k]=sx[k]/count;
-my[k]=sy[k]/count;
+autocount=max(1,c[k]);//turn0/0to0/1
+mx[k]=sx[k]/count;
+my[k]=sy[k]/count;
}
}).name("update_cluster");
@@ -227,13 +225,12 @@
}
The taskflow consists of two parts, a clean_up task and a parallel-for graph. The former cleans up the storage sx, sy, and c that are used to average points for new centroids, and the latter parallelizes the search for the nearest centroids across individual points using 12 tasks (this may vary depending on the machine). If the iteration count is smaller than M, the condition task returns 0 to let the execution path go back to clean_up. Otherwise, it returns 1 to stop (i.e., no successor tasks at index 1). The taskflow graph is illustrated below:
- +
-The scheduler starts with init, moves on to clean_up, and then enters the parallel-for task paralle-for that spawns a subflow of 12 workers to perform parallel iterations. When parallel-for completes, it updates the cluster centroids and checks if they have converged through a condition task. If not, the condition task informs the scheduler to go back to clean_up and then parallel-for; otherwise, it returns a nominal index to stop the scheduler.
+The scheduler starts with init, moves on to clean_up, and then enters the parallel-for task parallel-for that spawns a subflow of 12 workers to perform parallel iterations. When parallel-for completes, it updates the cluster centroids and checks if they have converged through a condition task. If not, the condition task informs the scheduler to go back to clean_up and then parallel-for; otherwise, it returns a nominal index to stop the scheduler.
-Codestin Search App
-Based on the discussion above, we compare the runtime of computing various k-means problem sizes between a sequential CPU and parallel CPUs on a machine of 12 Intel i7-8700 CPUs at 3.2 GHz.
+Codestin Search AppBased on the discussion above, we compare the runtime of computing various k-means problem sizes between a sequential CPU and parallel CPUs on a machine with 12 Intel i7-8700 CPUs at 3.2 GHz.
N K @@ -281,6 +278,6 @@ When the number of points is larger than 10K, the parallel CPU implementation starts to outperform the sequential CPU implementation. - + diff --git a/docs/xml/kmeans_8dox.xml b/docs/xml/kmeans_8dox.xml index 5b8261142..2da139e9d 100644 --- a/docs/xml/kmeans_8dox.xml +++ b/docs/xml/kmeans_8dox.xml @@ -1,5 +1,5 @@ - + kmeans.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/kmeans__cuda_8dox.xml b/docs/xml/kmeans__cuda_8dox.xml new file mode 100644 index 000000000..5ce0a574b --- /dev/null +++ b/docs/xml/kmeans__cuda_8dox.xml @@ -0,0 +1,12 @@ + + + + kmeans_cuda.dox + tf + + + + + + + diff --git a/docs/xml/math_8hpp.xml b/docs/xml/math_8hpp.xml new file mode 100644 index 000000000..7195192b4 --- /dev/null +++ b/docs/xml/math_8hpp.xml @@ -0,0 +1,133 @@ + + + + math.hpp + atomic + chrono + taskflow/core/graph.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf + + + + + + + diff --git a/docs/xml/matmul_8dox.xml b/docs/xml/matmul_8dox.xml new file mode 100644 index 000000000..47b826f7d --- /dev/null +++ b/docs/xml/matmul_8dox.xml @@ -0,0 +1,12 @@ + + + + matmul.dox + tf + + + + + + + diff --git a/docs/xml/matmul__cuda_8dox.xml b/docs/xml/matmul__cuda_8dox.xml new file mode 100644 index 000000000..05a0c0004 --- /dev/null +++ b/docs/xml/matmul__cuda_8dox.xml @@ -0,0 +1,12 @@ + + + + matmul_cuda.dox + tf + + + + + + + diff --git a/docs/xml/matrix__multiplication_8dox.xml b/docs/xml/matrix__multiplication_8dox.xml deleted file mode 100644 index 41116f158..000000000 --- a/docs/xml/matrix__multiplication_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - matrix_multiplication.dox - tf - - - - - - - diff --git a/docs/xml/matrix__multiplication__cudaflow_8dox.xml b/docs/xml/matrix__multiplication__cudaflow_8dox.xml deleted file mode 100644 index e75819a5d..000000000 --- a/docs/xml/matrix__multiplication__cudaflow_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - matrix_multiplication_cudaflow.dox - tf - - - - - - - diff --git a/docs/xml/matrix_multiplication.xml b/docs/xml/matrix_multiplication.xml index a5516d645..6fd51ab2a 100644 --- a/docs/xml/matrix_multiplication.xml +++ b/docs/xml/matrix_multiplication.xml @@ -1,5 +1,5 @@ - + matrix_multiplication Codestin Search App @@ -7,23 +7,22 @@ Problem Formulation matrix_multiplication_1MatrixMultiplicationProblem - + Parallel Patterns matrix_multiplication_1MatrixMultiplicationParallelPattern - + Benchmarking matrix_multiplication_1MatrixMultiplicationBenchmarking - + We study the classic problem, 2D matrix multiplication. We will start with a short introduction about the problem and then discuss how to solve it parallel CPUs. -Codestin Search App -We are multiplying two matrices, A (MxK) and B (KxN). The numbers of columns of A must match the number of rows of B. The output matrix C has the shape of (MxN) where M is the rows of A and N the columns of B. The following example multiplies a 3x3 matrix with a 3x2 matrix to derive a 3x2 matrix. +Codestin Search AppWe are multiplying two matrices, A (MxK) and B (KxN). The numbers of columns of A must match the number of rows of B. The output matrix C has the shape of (MxN) where M is the rows of A and N the columns of B. The following example multiplies a 3x3 matrix with a 3x2 matrix to derive a 3x2 matrix. 
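In index form, each element of C is the dot product of a row of A and a column of B; this is exactly the kernel that the loop nest shown next parallelizes:

C[i][j] = A[i][0]*B[0][j] + A[i][1]*B[1][j] + ... + A[i][K-1]*B[K-1][j],  for 0 <= i < M and 0 <= j < N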
As a general view, for each element of C, we iterate over a complete row of A and a complete column of B, multiplying the corresponding elements and summing the products.
@@ -41,8 +40,7 @@
-Codestin Search App
-At a fine-grained level, computing each element of C is independent of each other. Similarly, computing each row of C or each column of C is also independent of one another. With task parallelism, we prefer coarse-grained model to have each task perform rather large computation to amortize the overhead of creating and scheduling tasks. In this case, we avoid intensive tasks each working on only a single element. by creating a task per row of C to multiply a row of A by every column of B.
+Codestin Search AppAt a fine-grained level, computing each element of C is independent of the others. Similarly, computing each row or each column of C is also independent of one another. With task parallelism, we prefer a coarse-grained model in which each task performs a rather large computation to amortize the overhead of creating and scheduling tasks. In this case, we avoid overly fine-grained tasks, each working on only a single element, by creating a task per row of C to multiply a row of A by every column of B.
//C=A*B
//AisaMxKmatrix,BisaKxNmatrix,andCisaMxNmatrix
voidmatrix_multiplication(int**A,int**B,int**C,intM,intK,intN){
@@ -73,8 +71,7 @@
Please visit Parallel Iterations for more details.
-Codestin Search App
-Based on the discussion above, we compare the runtime of computing various matrix sizes of A, B, and C between a sequential CPU and parallel CPUs on a machine of 12 Intel i7-8700 CPUs at 3.2 GHz.
+Codestin Search AppBased on the discussion above, we compare the runtime of computing various matrix sizes of A, B, and C between a sequential CPU and parallel CPUs on a machine with 12 Intel i7-8700 CPUs at 3.2 GHz.
    A B @@ -129,6 +126,6 @@ The speed-up of parallel execution becomes clean as we increase the problem size. For example, at 4000x4000, the parallel runtime is 6.3 times faster than the sequential runtime. - + diff --git a/docs/xml/merge_8hpp.xml b/docs/xml/merge_8hpp.xml deleted file mode 100644 index ec7ad8f3a..000000000 --- a/docs/xml/merge_8hpp.xml +++ /dev/null @@ -1,16 +0,0 @@ - - - - merge.hpp - tf::detail::cudaMergePair - tf::detail::cudaMergeRange - tf - tf::detail - -CUDA merge algorithm include file. - - - - - - diff --git a/docs/xml/module_8dox.xml b/docs/xml/module_8dox.xml new file mode 100644 index 000000000..386b128eb --- /dev/null +++ b/docs/xml/module_8dox.xml @@ -0,0 +1,12 @@ + + + + module.dox + tf + + + + + + + diff --git a/docs/xml/module_8hpp.xml b/docs/xml/module_8hpp.xml new file mode 100644 index 000000000..f3225842f --- /dev/null +++ b/docs/xml/module_8hpp.xml @@ -0,0 +1,288 @@ + + + + module.hpp + ../taskflow.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf + + + + + + + diff --git a/docs/xml/module_task_1.dot b/docs/xml/module_task_1.dot new file mode 100644 index 000000000..814e8fc8e --- /dev/null +++ b/docs/xml/module_task_1.dot @@ -0,0 +1,6 @@ +digraph Taskflow { +A; +B; +C; +D; +} diff --git a/docs/xml/module_task_2.dot b/docs/xml/module_task_2.dot new file mode 100644 index 000000000..3d64d2928 --- /dev/null +++ b/docs/xml/module_task_2.dot @@ -0,0 +1,6 @@ +digraph Taskflow { +rankdir="LR"; +A->B; +B->C; +C->D; +} diff --git a/docs/xml/motivation_8dox.xml b/docs/xml/motivation_8dox.xml index 17e6c5001..63d8e683b 100644 --- a/docs/xml/motivation_8dox.xml +++ b/docs/xml/motivation_8dox.xml @@ -1,5 +1,5 @@ - + motivation.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/multi-condition-task-2.dot b/docs/xml/multi-condition-task-2.dot deleted file mode 100644 index 62d2e908e..000000000 --- a/docs/xml/multi-condition-task-2.dot +++ /dev/null @@ -1,17 +0,0 @@ -digraph Taskflow { -rankdir="LR"; -p0x7bc400014030[label="init" ]; -p0x7bc400014030 -> p0x7bc400014118; -p0x7bc400014118[label="A" shape=diamond color=black fillcolor=aquamarine style=filled]; -p0x7bc400014118 -> p0x7bc400014118 [style=dashed label="0"]; -p0x7bc400014118 -> p0x7bc400014200 [style=dashed label="1"]; -p0x7bc400014200[label="B" shape=diamond color=black fillcolor=aquamarine style=filled]; -p0x7bc400014200 -> p0x7bc400014200 [style=dashed label="0"]; -p0x7bc400014200 -> p0x7bc4000142e8 [style=dashed label="1"]; -p0x7bc4000142e8[label="C" shape=diamond color=black fillcolor=aquamarine style=filled]; -p0x7bc4000142e8 -> p0x7bc4000142e8 [style=dashed label="0"]; -p0x7bc4000142e8 -> p0x7bc4000143d0 [style=dashed label="1"]; -p0x7bc4000143d0[label="D" shape=diamond color=black fillcolor=aquamarine style=filled]; -p0x7bc4000143d0 -> p0x7bc4000143d0 [style=dashed label="0"]; -} - diff --git a/docs/xml/namespacestd.xml b/docs/xml/namespacestd.xml index bbe7732b7..9da88f288 100644 --- a/docs/xml/namespacestd.xml +++ b/docs/xml/namespacestd.xml @@ -8919,7 +8919,7 @@ - + @@ -8951,13 +8951,13 @@ - + - + 
diff --git a/docs/xml/namespacetf.xml b/docs/xml/namespacetf.xml index 2b4a3269a..f5eb47dcc 100644 --- a/docs/xml/namespacetf.xml +++ b/docs/xml/namespacetf.xml @@ -1,143 +1,101 @@ - + tf + tf::AnchorGuard + tf::AsyncTask + tf::BoundedTaskQueue + tf::CachelineAligned + tf::ChromeObserver + tf::cudaDeviceAllocator + tf::cudaDeviceVector + tf::cudaEventBase + tf::cudaEventCreator + tf::cudaEventDeleter + tf::cudaGraphBase + tf::cudaGraphCreator + tf::cudaGraphDeleter + tf::cudaGraphExecBase + tf::cudaGraphExecCreator + tf::cudaGraphExecDeleter + tf::cudaScopedDevice + tf::cudaSharedMemory + tf::cudaSharedMemory< bool > + tf::cudaSharedMemory< char > + tf::cudaSharedMemory< double > + tf::cudaSharedMemory< float > + tf::cudaSharedMemory< int > + tf::cudaSharedMemory< long > + tf::cudaSharedMemory< short > + tf::cudaSharedMemory< unsigned char > + tf::cudaSharedMemory< unsigned int > + tf::cudaSharedMemory< unsigned long > + tf::cudaSharedMemory< unsigned short > + tf::cudaStreamBase + tf::cudaStreamCreator + tf::cudaStreamDeleter + tf::cudaTask + tf::cudaUSMAllocator + tf::DataPipe + tf::DataPipeline + tf::DefaultClosureWrapper + tf::DefaultTaskParams + tf::DeferredPipeflow + tf::DynamicPartitioner + tf::Executor + tf::FlowBuilder + tf::Future + tf::Graph + tf::GuidedPartitioner + tf::has_graph + tf::IndexRange + tf::is_runtime_task + tf::is_static_task + tf::is_subflow_task + tf::IsPartitioner tf::IsPod + tf::Node + tf::ObserverInterface + tf::PartitionerBase + tf::Pipe + tf::Pipeflow + tf::Pipeline + tf::PreemptionGuard + tf::ProfileData + tf::RandomPartitioner + tf::Runtime + tf::ScalablePipeline + tf::Segment + tf::Semaphore + tf::SmallVector tf::SmallVectorBase + tf::SmallVectorImpl tf::SmallVectorStorage - tf::SmallVectorTemplateCommon + tf::SmallVectorStorage< T, 0 > + tf::SmallVectorStorage< T, 1 > tf::SmallVectorTemplateBase tf::SmallVectorTemplateBase< T, true > - tf::SmallVectorImpl - tf::SmallVectorStorage< T, 1 > - tf::SmallVectorStorage< T, 0 > - tf::SmallVector - tf::Graph - tf::Runtime - tf::TaskParams - tf::DefaultTaskParams - tf::Node - tf::NodeDeleter - tf::TaskQueue - tf::FlowBuilder + tf::SmallVectorTemplateCommon + tf::StaticPartitioner tf::Subflow - tf::Worker - tf::WorkerView - tf::Executor tf::Task - tf::TaskView - tf::AsyncTask - tf::Semaphore tf::Taskflow - tf::Future - tf::Segment - tf::Timeline - tf::ProfileData - tf::ObserverInterface - tf::ChromeObserver - tf::TFProfObserver + tf::TaskParams + tf::TaskView tf::TFProfManager - tf::DefaultClosureWrapper - tf::IsPartitioner - tf::PartitionerBase - tf::GuidedPartitioner - tf::DynamicPartitioner - tf::StaticPartitioner - tf::RandomPartitioner - tf::CriticalSection - tf::DeferredPipeflow - tf::Pipeflow - tf::Pipe - tf::Pipeline - tf::ScalablePipeline - tf::DataPipe - tf::DataPipeline - tf::cudaScopedDevice - tf::cudaSharedMemory - tf::cudaSharedMemory< int > - tf::cudaSharedMemory< unsigned int > - tf::cudaSharedMemory< char > - tf::cudaSharedMemory< unsigned char > - tf::cudaSharedMemory< short > - tf::cudaSharedMemory< unsigned short > - tf::cudaSharedMemory< long > - tf::cudaSharedMemory< unsigned long > - tf::cudaSharedMemory< bool > - tf::cudaSharedMemory< float > - tf::cudaSharedMemory< double > - tf::cudaDeviceAllocator - tf::cudaUSMAllocator - tf::cudaDeviceVector - tf::cudaStreamCreator - tf::cudaStreamDeleter - tf::cudaStream - tf::cudaEventCreator - tf::cudaEventDeleter - tf::cudaEvent - tf::cudaTask - tf::cudaFlow - tf::cudaFlowOptimizerBase - tf::cudaFlowSequentialOptimizer - tf::cudaFlowLinearOptimizer - 
tf::cudaFlowRoundRobinOptimizer - tf::cudaFlowCapturer - tf::cudaExecutionPolicy + tf::TFProfObserver + tf::Timeline + tf::UnboundedTaskQueue + tf::Worker + tf::WorkerInterface + tf::WorkerView tf::detail - - - unsigned - TaskPriority - - HIGH - = 0 - -value of the highest priority (i.e., 0) - - - - - - - NORMAL - = 1 - -value of the normal priority (i.e., 1) - - - - - - - LOW - = 2 - -value of the lowest priority (i.e., 2) - - - - - - MAX - = 3 - -conventional value for iterating priority values - - - - - -enumeration of all task priority values - - -A priority is an enumerated value of type unsigned. Currently, Taskflow defines three priority levels, HIGH, NORMAL, and LOW, starting from 0, 1, to 2. That is, the lower the value, the higher the priority. - - - - - + tf::pt + int TaskType + tf::TaskType PLACEHOLDER = 0 @@ -155,6 +113,14 @@ + + RUNTIME + +runtime task type + + + + SUBFLOW @@ -202,11 +168,12 @@ - + int ObserverType + tf::ObserverType TFPROF = 0 @@ -236,11 +203,12 @@ - + int PartitionerType + tf::PartitionerType STATIC @@ -264,11 +232,12 @@ - + int PipeType + tf::PipeType PARALLEL = 1 @@ -294,92 +263,30 @@ - + - - int - cudaTaskType - - EMPTY - = 0 - -empty task type - - - - - - HOST - -host task type - - - - - - MEMSET - -memory set task type - - - - - - MEMCPY - -memory copy task type - - - - - - KERNEL - -memory copy task type - - - - - - SUBFLOW - -subflow (child graph) task type - - - - - - CAPTURE - -capture task type - - - - - - UNDEFINED - -undefined task type - - - - + + + + NonblockingNotifierV2 + using tf::DefaultNotifier = NonblockingNotifierV2 + + DefaultNotifier + tf::DefaultNotifier -enumeration of all cudaTask types - + - - - - std::chrono::time_point< std::chrono::steady_clock > - using tf::observer_stamp_t = typedef std::chrono::time_point<std::chrono::steady_clock> + + std::chrono::time_point< std::chrono::steady_clock > + using tf::observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock> observer_stamp_t + tf::observer_stamp_t default time point type of observers @@ -387,89 +294,186 @@ - + - + GuidedPartitioner<> - using tf::DefaultPartitioner = typedef GuidedPartitioner<> + using tf::DefaultPartitioner = GuidedPartitioner<> DefaultPartitioner + tf::DefaultPartitioner default partitioner set to tf::GuidedPartitioner -Guided partitioner can achieve decent performance for most parallel algorithms, especially for those with irregular and unbalanced workload per iteration. +Guided partitioning algorithm can achieve stable and decent performance for most parallel algorithms. 
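As a sketch of overriding the default partitioner, recent Taskflow algorithm overloads accept a partitioner object as the trailing argument (treat this exact overload as an assumption to verify against your Taskflow version):

tf::Executor executor;
tf::Taskflow taskflow;
std::vector<int> data(1024);

// request equal-size chunks (tf::StaticPartitioner) instead of the default guided chunks
taskflow.for_each(data.begin(), data.end(),
  [](int& v){ v = 0; },
  tf::StaticPartitioner()
);

executor.run(taskflow).wait();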
+ + + + + + + cudaEventBase< cudaEventCreator, cudaEventDeleter > + using tf::cudaEvent = cudaEventBase<cudaEventCreator, cudaEventDeleter> + + cudaEvent + tf::cudaEvent + +default smart pointer type to manage a cudaEvent_t object with unique ownership + + + + + + + + + cudaStreamBase< cudaStreamCreator, cudaStreamDeleter > + using tf::cudaStream = cudaStreamBase<cudaStreamCreator, cudaStreamDeleter> + + cudaStream + tf::cudaStream + +default smart pointer type to manage a cudaStream_t object with unique ownership + + + + + + + + + cudaGraphBase< cudaGraphCreator, cudaGraphDeleter > + using tf::cudaGraph = cudaGraphBase<cudaGraphCreator, cudaGraphDeleter> + + cudaGraph + tf::cudaGraph + +default smart pointer type to manage a cudaGraph_t object with unique ownership + + - + - - cudaExecutionPolicy< 512, 7 > - using tf::cudaDefaultExecutionPolicy = typedef cudaExecutionPolicy<512, 7> + + cudaGraphExecBase< cudaGraphExecCreator, cudaGraphExecDeleter > + using tf::cudaGraphExec = cudaGraphExecBase<cudaGraphExecCreator, cudaGraphExecDeleter> - cudaDefaultExecutionPolicy + cudaGraphExec + tf::cudaGraphExec -default execution policy +default smart pointer type to manage a cudaGraphExec_t object with unique ownership - + - - + + typename P - constexpr bool - constexpr bool tf::is_task_params_v + bool + bool tf::is_task_params_v is_task_params_v + tf::is_task_params_v = - std::is_same_v<std::decay_t<P>, TaskParams> || - std::is_same_v<std::decay_t<P>, DefaultTaskParams> || - std::is_constructible_v<std::string, P> + std::is_same_v<std::decay_t<P>, TaskParams> || + std::is_same_v<std::decay_t<P>, DefaultTaskParams> || + std::is_constructible_v<std::string, P> determines if the given type is a task parameter type Task parameters can be specified in one of the following types: -tf::TaskParams: assign the struct of defined parameters -tf::DefaultTaskParams: assign nothing -std::string: assign a name to the task +tf::TaskParams +tf::DefaultTaskParams +std::string - + - - ObjectPool< Node > - ObjectPool<Node> tf::node_pool + + + + typename T + + + bool + bool tf::has_graph_v - node_pool + has_graph_v + tf::has_graph_v + = has_graph<T>::value +determines if the given type has a member function Graph& graph() +This trait determines if the provided type T contains a member function with the exact signature tf::Graph& graph(). It uses SFINAE and std::void_t to detect the presence of the member function and its return type. + + +T + + +The type to inspect. + + + + + +true + + +If the type T has a member function tf::Graph& graph(). + + + + +false + + +Otherwise. 
+ + + +Example usage: structA{ +tf::Graph&graph(){returnmy_graph;}; +tf::Graphmy_graph; + +//othercustommemberstoaltermy_graph +}; + +structC{};//Nographfunction + +static_assert(has_graph_v<A>,"Ahasgraph()"); +static_assert(!has_graph_v<C>,"Cdoesnothavegraph()"); + - + - - constexpr std::array< TaskType, 6 > - constexpr std::array<TaskType, 6> tf::TASK_TYPES + + std::array< TaskType, 7 > + std::array<TaskType, 7> tf::TASK_TYPES TASK_TYPES + tf::TASK_TYPES = { TaskType::PLACEHOLDER, TaskType::STATIC, + TaskType::RUNTIME, TaskType::SUBFLOW, TaskType::CONDITION, TaskType::MODULE, @@ -482,102 +486,117 @@ - + - + typename C - constexpr bool - constexpr bool tf::is_subflow_task_v + bool + bool tf::is_static_task_v - is_subflow_task_v - = - std::is_invocable_r_v<void, C, Subflow&> && - !std::is_invocable_r_v<void, C, Runtime&> + is_static_task_v + tf::is_static_task_v + = is_static_task<C>::value -determines if a callable is a dynamic task +determines if a callable is a static task -A dynamic task is a callable object constructible from std::function<void(Subflow&)>. +A static task is a callable object constructible from std::function<void()>. - + - + typename C - constexpr bool - constexpr bool tf::is_condition_task_v + bool + bool tf::is_subflow_task_v - is_condition_task_v - = - (std::is_invocable_r_v<int, C> || std::is_invocable_r_v<int, C, Runtime&>) && - !is_subflow_task_v<C> + is_subflow_task_v + tf::is_subflow_task_v + = is_subflow_task<C>::value -determines if a callable is a condition task +determines if a callable is a subflow task -A condition task is a callable object constructible from std::function<int()> or std::function<int(tf::Runtime&)>. +A subflow task is a callable object constructible from std::function<void(Subflow&)>. - + - + typename C - constexpr bool - constexpr bool tf::is_multi_condition_task_v + bool + bool tf::is_runtime_task_v - is_multi_condition_task_v - = - (std::is_invocable_r_v<SmallVector<int>, C> || - std::is_invocable_r_v<SmallVector<int>, C, Runtime&>) && - !is_subflow_task_v<C> + is_runtime_task_v + tf::is_runtime_task_v + = is_runtime_task<C>::value -determines if a callable is a multi-condition task +determines if a callable is a runtime task -A multi-condition task is a callable object constructible from std::function<tf::SmallVector<int>()> or std::function<tf::SmallVector<int>(tf::Runtime&)>. +A runtime task is a callable object constructible from std::function<void(Runtime&)>. - + - + typename C - constexpr bool - constexpr bool tf::is_static_task_v + bool + bool tf::is_condition_task_v - is_static_task_v - = - (std::is_invocable_r_v<void, C> || std::is_invocable_r_v<void, C, Runtime&>) && - !is_condition_task_v<C> && - !is_multi_condition_task_v<C> && - !is_subflow_task_v<C> + is_condition_task_v + tf::is_condition_task_v + = std::is_invocable_r_v<int, C> -determines if a callable is a static task +determines if a callable is a condition task + + +A condition task is a callable object constructible from std::function<int()>. + + + + + + + + + typename C + + + bool + bool tf::is_multi_condition_task_v + + is_multi_condition_task_v + tf::is_multi_condition_task_v + = std::is_invocable_r_v<SmallVector<int>, C> + +determines if a callable is a multi-condition task -A static task is a callable object constructible from std::function<void()> or std::function<void(tf::Runtime&)>. +A multi-condition task is a callable object constructible from std::function<tf::SmallVector<int>()>. 
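Like has_graph_v above, these task traits can be checked at compile time. A small sketch with hypothetical lambdas, assuming taskflow/taskflow.hpp is included:

auto s = [](){};               // constructible from std::function<void()>
auto c = [](){ return 0; };    // constructible from std::function<int()>
auto f = [](tf::Subflow&){};   // constructible from std::function<void(tf::Subflow&)>

static_assert(tf::is_static_task_v<decltype(s)>);
static_assert(tf::is_condition_task_v<decltype(c)>);
static_assert(tf::is_subflow_task_v<decltype(f)>);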
- + @@ -585,11 +604,12 @@ typename P - constexpr bool - constexpr bool tf::is_partitioner_v + bool + bool tf::is_partitioner_v is_partitioner_v - = std::is_base_of<IsPartitioner, P>::value + tf::is_partitioner_v + = std::is_base_of<IsPartitioner, P>::value determines if a type is a partitioner @@ -598,10 +618,10 @@ - + - - + + @@ -617,6 +637,7 @@ static size_t tf::capacity_in_bytes (const SmallVector< T, N > &X) capacity_in_bytes + tf::capacity_in_bytes const SmallVector< T, N > & X @@ -627,683 +648,1435 @@ - + - - const char * - const char* tf::to_string - (TaskType type) - to_string + + + + typename T + + + std::enable_if_t<(std::is_unsigned_v< std::decay_t< T > > &&sizeof(T)==8), void > * + nullptr + + + T + T tf::next_pow2 + (T x) + next_pow2 + tf::next_pow2 - TaskType - type + T + x -convert a task type to a human-readable string +rounds the given 64-bit unsigned integer to the nearest power of 2 -The name of each task type is the litte-case string of its characters. -TaskType::PLACEHOLDER->"placeholder" -TaskType::STATIC->"static" -TaskType::SUBFLOW->"subflow" -TaskType::CONDITION->"condition" -TaskType::MODULE->"module" -TaskType::ASYNC->"async" - +rounds the given 32-bit unsigned integer to the nearest power of 2 - + - - std::ostream & - std::ostream& tf::operator<< - (std::ostream &os, const Task &task) - operator<< - - std::ostream & - os - + + + + typename T + + + std::enable_if_t< std::is_integral_v< std::decay_t< T > >, void > * + nullptr + + + bool + bool tf::is_pow2 + (const T &x) + is_pow2 + tf::is_pow2 - const Task & - task + const T & + x -overload of ostream inserter operator for Task +checks if the given number is a power of 2 +This function determines if the given integer is a power of 2. + + +T + + +The type of the input. Must be an integral type. + + + + + +x + + +The integer to check. + + + +true if x is a power of 2, otherwise false. + +This function is constexpr and can be evaluated at compile time. + + - + - - const char * - const char* tf::to_string - (ObserverType type) - to_string + + + + typename T + + + size_t + size_t tf::floor_log2 + (T n) + floor_log2 + tf::floor_log2 - ObserverType - type + T + n -convert an observer type to a human-readable string +computes the floor of the base-2 logarithm of a number using count-leading-zeros (CTL). +This function efficiently calculates the floor of log2(n) for both 32-bit and 64-bit integers. + + +T + + +integer type (uint32_t or uint64_t). + + + + + +n + + +input number. + + + +floor of log2(n) + + - + - + - typename Input + size_t + N + N + + size_t + size_t tf::static_floor_log2 + () + static_floor_log2 + tf::static_floor_log2 + +returns the floor of log2(N) at compile time + + + + + + + + + - typename Output + typename RandItr typename C - auto - auto tf::make_data_pipe - (PipeType d, C &&callable) - make_data_pipe + RandItr + RandItr tf::median_of_three + (RandItr l, RandItr m, RandItr r, C cmp) + median_of_three + tf::median_of_three - PipeType - d + RandItr + l - C && - callable + RandItr + m + + + RandItr + r + + + C + cmp -function to construct a data pipe (tf::DataPipe) +finds the median of three numbers pointed to by iterators using the given comparator +This function determines the median value of the elements pointed to by three random-access iterators using the provided comparator. -Input +RandItr -input data type +The type of the random-access iterator. -Output +C -output data type +The type of the comparator. + + + + + +l + + +Iterator to the first element. 
-C +m -callable type +Iterator to the second element. + + + + +r + + +Iterator to the third element. + + + + +cmp + + +The comparator used to compare the dereferenced iterator values. -tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe) in a data-parallel pipeline (tf::DataPipeline). The first argument specifies the direction of the data pipe, either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, and the second argument is a callable to invoke by the pipeline scheduler. Input and output data types are specified via template parameters, which will always be decayed by the library to its original form for storage purpose. The callable must take the input data type in its first argument and returns a value of the output data type. -tf::make_data_pipe<int,std::string>( -tf::PipeType::SERIAL, -[](int&input){ -returnstd::to_string(input+100); -} -); - -The callable can additionally take a reference of tf::Pipeflow, which allows you to query the runtime information of a stage task, such as its line number and token number. -tf::make_data_pipe<int,std::string>( -tf::PipeType::SERIAL, -[](int&input,tf::Pipeflow&pf){ -printf("token=%lu,line=%lu\n",pf.token(),pf.line()); -returnstd::to_string(input+100); -} -); - - - - - - - - size_t - size_t tf::cuda_get_num_devices - () - cuda_get_num_devices - -queries the number of available devices - - +The iterator pointing to the median value among the three elements. + + - + - - int - int tf::cuda_get_device - () - cuda_get_device + + + + typename RandItr + + + typename C + + + RandItr + RandItr tf::pseudo_median_of_nine + (RandItr beg, RandItr end, C cmp) + pseudo_median_of_nine + tf::pseudo_median_of_nine + + RandItr + beg + + + RandItr + end + + + C + cmp + -gets the current device associated with the caller thread +finds the pseudo median of a range of items using a spread of nine numbers +This function computes an approximate median of a range of items by sampling nine values spread across the range and finding their median. It uses a combination of the median_of_three function to determine the pseudo median. + + +RandItr + + +The type of the random-access iterator. + + + + +C + + +The type of the comparator. + + + + + +beg + + +Iterator to the beginning of the range. + + + + +end + + +Iterator to the end of the range. + + + + +cmp + + +The comparator used to compare the dereferenced iterator values. + + + +The iterator pointing to the pseudo median of the range. + +The pseudo median is an approximation of the true median and may not be the exact middle value of the range. + + - + - + + + + typename Iter + + + typename Compare + + void - void tf::cuda_set_device - (int id) - cuda_set_device + void tf::sort2 + (Iter a, Iter b, Compare comp) + sort2 + tf::sort2 - int - id + Iter + a + + + Iter + b + + + Compare + comp -switches to a given device context +sorts two elements of dereferenced iterators using the given comparison function - - - - - - - void - void tf::cuda_get_device_property - (int i, cudaDeviceProp &p) - cuda_get_device_property +This function compares two elements pointed to by iterators and swaps them if they are out of order according to the provided comparator. + + +Iter + + +The type of the iterator. + + + + +Compare + + +The type of the comparator. + + + + + +a + + +Iterator to the first element. + + + + +b + + +Iterator to the second element. + + + + +comp + + +The comparator used to compare the dereferenced iterator values. 
+ + + + + + + + + + + + + typename Iter + + + typename Compare + + + void + void tf::sort3 + (Iter a, Iter b, Iter c, Compare comp) + sort3 + tf::sort3 - int - i + Iter + a - cudaDeviceProp & - p + Iter + b + + + Iter + c + + + Compare + comp -obtains the device property +Sorts three elements of dereferenced iterators using the given comparison function. +This function sorts three elements pointed to by iterators in ascending order according to the provided comparator. The sorting is performed using a sequence of calls to the sort2 function to ensure the correct order of elements. + + +Iter + + +The type of the iterator. + + + + +Compare + + +The type of the comparator. + + + + + +a + + +Iterator to the first element. + + + + +b + + +Iterator to the second element. + + + + +c + + +Iterator to the third element. + + + + +comp + + +The comparator used to compare the dereferenced iterator values. + + + + - + - - cudaDeviceProp - cudaDeviceProp tf::cuda_get_device_property - (int i) - cuda_get_device_property - - int - i - + + + + typename T + + + std::enable_if_t< std::is_integral_v< T >, void > * + nullptr + + + T + T tf::unique_id + () + unique_id + tf::unique_id -obtains the device property +generates a program-wide unique ID of the given type in a thread-safe manner +This function provides a globally unique identifier of the specified integral type. It uses a static std::atomic counter to ensure thread safety and increments the counter in a relaxed memory ordering for efficiency. + + +T + + +The type of the ID to generate. Must be an integral type. + + + +A unique ID of type T. + +The uniqueness of the ID is guaranteed only within the program's lifetime. + +The function does not throw exceptions. + + - + - + + + + typename T + + void - void tf::cuda_dump_device_property - (std::ostream &os, const cudaDeviceProp &p) - cuda_dump_device_property + void tf::atomic_max + (std::atomic< T > &v, const T &max_v) noexcept + atomic_max + tf::atomic_max - std::ostream & - os + std::atomic< T > & + v - const cudaDeviceProp & - p + const T & + max_v -dumps the device property +updates an atomic variable with the maximum value +This function atomically updates the provided atomic variable v to hold the maximum of its current value and max_v. The update is performed using a relaxed memory ordering for efficiency in non-synchronizing contexts. + + +T + + +The type of the atomic variable. Must be trivially copyable and comparable. + + + + + +v + + +The atomic variable to update. + + + + +max_v + + +The value to compare with the current value of v. + + + +If multiple threads call this function concurrently, the value of v will be the maximum value seen across all threads. + + - + - - size_t - size_t tf::cuda_get_device_max_threads_per_block - (int d) - cuda_get_device_max_threads_per_block + + + + typename T + + + void + void tf::atomic_min + (std::atomic< T > &v, const T &min_v) noexcept + atomic_min + tf::atomic_min - int - d + std::atomic< T > & + v + + + const T & + min_v -queries the maximum threads per block on a device +updates an atomic variable with the minimum value +This function atomically updates the provided atomic variable v to hold the minimum of its current value and min_v. The update is performed using a relaxed memory ordering for efficiency in non-synchronizing contexts. + + +T + + +The type of the atomic variable. Must be trivially copyable and comparable. + + + + + +v + + +The atomic variable to update. + + + + +min_v + + +The value to compare with the current value of v. 
+ + + +If multiple threads call this function concurrently, the value of v will be the minimum value seen across all threads. + + - + - - size_t - size_t tf::cuda_get_device_max_x_dim_per_block - (int d) - cuda_get_device_max_x_dim_per_block - - int - d - + + + + typename T + + + T + T tf::seed + () noexcept + seed + tf::seed -queries the maximum x-dimension per block on a device +generates a random seed based on the current system clock +This function returns a seed value derived from the number of clock ticks since the epoch as measured by the system clock. The seed can be used to initialize random number generators. + + +T + + +The type of the returned seed. Must be an integral type. + + + +A seed value based on the system clock. + + - + - - size_t - size_t tf::cuda_get_device_max_y_dim_per_block - (int d) - cuda_get_device_max_y_dim_per_block + + + + typename T + + + typename + std::enable_if_t<std::is_unsigned_v<T>> + + + auto + auto tf::ctz + (T x) + ctz + tf::ctz - int - d + T + x -queries the maximum y-dimension per block on a device +counts the number of trailing zeros in an integer. +This function provides a portable implementation for counting the number of trailing zeros across different platforms and integer sizes (32-bit and 64-bit). + + +T + + +integer type (32-bit or 64-bit). + + + + + +x + + +non-zero integer to count trailing zeros from + + + +the number of trailing zeros in x + +The behavior is undefined when x is 0. + + - + - + size_t - size_t tf::cuda_get_device_max_z_dim_per_block - (int d) - cuda_get_device_max_z_dim_per_block + size_t tf::coprime + (size_t N) + coprime + tf::coprime - int - d + size_t + N -queries the maximum z-dimension per block on a device +computes a coprime of a given number +This function finds the largest number less than N that is coprime (i.e., has a greatest common divisor of 1) with N. If N is less than 3, it returns 1 as a default coprime. + + +N + + +input number for which a coprime is to be found. + + + +the largest number < N that is coprime to N + + - + - - size_t - size_t tf::cuda_get_device_max_x_dim_per_grid - (int d) - cuda_get_device_max_x_dim_per_grid - - int - d - + + + + size_t + N + N + + + std::array< size_t, N > + std::array< size_t, N > tf::make_coprime_lut + () + make_coprime_lut + tf::make_coprime_lut -queries the maximum x-dimension per grid on a device +generates a compile-time array of coprimes for numbers from 0 to N-1 +This function constructs a constexpr array where each element at index i contains a coprime of i (the largest number less than i that is coprime to it). + + +N + + +the size of the array to generate (should be greater than 0). + + + +a constexpr array of size N where each index holds a coprime of its value. + + - + - - size_t - size_t tf::cuda_get_device_max_y_dim_per_grid - (int d) - cuda_get_device_max_y_dim_per_grid + + std::string + std::string tf::get_env + (const std::string &str) + get_env + tf::get_env - int - d + const std::string & + str -queries the maximum y-dimension per grid on a device +retrieves the value of an environment variable +This function fetches the value of an environment variable by name. If the variable is not found, it returns an empty string. + + +str + + +The name of the environment variable to retrieve. + + + +The value of the environment variable as a string, or an empty string if not found. + +The implementation differs between Windows and POSIX platforms: +On Windows, it uses _dupenv_s to fetch the value. +On POSIX, it uses std::getenv. 
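A short usage sketch of tf::get_env, with a hypothetical variable name:

std::string value = tf::get_env("TF_NUM_WORKERS");  // hypothetical variable
if(value.empty()) {
  // the variable is not defined; fall back to a default
  value = "4";
}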
+ + + + - + - - size_t - size_t tf::cuda_get_device_max_z_dim_per_grid - (int d) - cuda_get_device_max_z_dim_per_grid + + bool + bool tf::has_env + (const std::string &str) + has_env + tf::has_env - int - d + const std::string & + str -queries the maximum z-dimension per grid on a device +checks whether an environment variable is defined +This function determines if a specific environment variable exists in the current environment. + + +str + + +The name of the environment variable to check. + + + +true if the environment variable exists, false otherwise. + +The implementation differs between Windows and POSIX platforms: +On Windows, it uses _dupenv_s to check for the variable's presence. +On POSIX, it uses std::getenv to check for the variable's presence. + + + + - + - - size_t - size_t tf::cuda_get_device_max_shm_per_block - (int d) - cuda_get_device_max_shm_per_block - - int - d - + + void + void tf::pause + () + pause + tf::pause -queries the maximum shared memory size in bytes per block on a device +This function is used in spin-wait loops to hint the CPU that the current thread is in a busy-wait state. It helps reduce power consumption and improves performance on hyper-threaded processors by preventing the CPU from consuming unnecessary cycles while waiting. It is particularly useful in low-contention scenarios, where the thread is likely to quickly acquire the lock or condition it's waiting for, avoiding an expensive context switch. On modern x86 processors, this instruction can be invoked using __builtin_ia32_pause() in GCC/Clang or _mm_pause() in MSVC. In non-x86 architectures, alternative mechanisms such as yielding the CPU may be used instead. - + - - size_t - size_t tf::cuda_get_device_warp_size - (int d) - cuda_get_device_warp_size + + void + void tf::pause + (size_t count) + pause + tf::pause - int - d + size_t + count -queries the warp size on a device +pause CPU for a specified number of iterations - + - - int - int tf::cuda_get_device_compute_capability_major - (int d) - cuda_get_device_compute_capability_major + + + + typename P + + + void + void tf::spin_until + (P &&predicate) + spin_until + tf::spin_until - int - d + P && + predicate -queries the major number of compute capability of a device +spins until the given predicate becomes true + + +P + + +the type of the predicate function or callable. + + + + + +predicate + + +the callable that returns a boolean value, which is checked in the loop. + + + +This function repeatedly checks the provided predicate in a spin-wait loop and uses a backoff strategy to minimize CPU waste during the wait. Initially, it uses the pause() instruction for the first 100 iterations to hint to the CPU that the thread is waiting, thus reducing power consumption and avoiding unnecessary cycles. After 100 iterations, it switches to yielding the CPU using std::this_thread::yield() to allow other threads to run and improve system responsiveness. +The function operates as follows: +For the first 100 iterations, it invokes pause() to reduce power consumption during the spin-wait. +After 100 iterations, it uses std::this_thread::yield() to relinquish the CPU, allowing other threads to execute. + + +This function is useful when you need to wait for a condition to be true, but want to optimize CPU usage during the wait by using a busy-wait approach. 
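+A minimal sketch of the intended usage, assuming one thread publishes a flag that another thread spins on (the flag name and memory orderings are illustrative):
+#include <atomic>
+#include <thread>
+
+std::atomic<bool> ready{false};
+std::thread producer([&](){
+  // ... produce shared data here ...
+  ready.store(true, std::memory_order_release);
+});
+// pauses for the first 100 iterations, then yields until the predicate holds
+tf::spin_until([&](){ return ready.load(std::memory_order_acquire); });
+producer.join();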
+
+
+
+typename B
+
+typename E
+
+typename S
+
+std::enable_if_t< std::is_integral_v< std::decay_t< B > > &&std::is_integral_v< std::decay_t< E > > &&std::is_integral_v< std::decay_t< S > >, bool >
+std::enable_if_t< std::is_integral_v< std::decay_t< B > > && std::is_integral_v< std::decay_t< E > > && std::is_integral_v< std::decay_t< S > >, bool > tf::is_index_range_invalid
+(B beg, E end, S step)
+is_index_range_invalid
+tf::is_index_range_invalid
+
+B
+beg
+
+E
+end
+
+S
+step
+
+checks if the given index range is invalid
+
+
+B
+
+type of the beginning index
+
+
+E
+
+type of the ending index
+
+
+S
+
+type of the step size
+
+
+
+beg
+
+starting index of the range
+
+
+end
+
+ending index of the range
+
+
+step
+
+step size to traverse the range
+
+
+returns true if the range is invalid; false otherwise.
+
+A range is considered invalid under the following conditions:
+The step is zero and the begin and end values are not equal.
+A positive range (begin < end) with a non-positive step.
+A negative range (begin > end) with a non-negative step.
+
+
+
+
+typename B
+
+typename E
+
+typename S
+
+std::enable_if_t< std::is_integral_v< std::decay_t< B > > &&std::is_integral_v< std::decay_t< E > > &&std::is_integral_v< std::decay_t< S > >, size_t >
+std::enable_if_t< std::is_integral_v< std::decay_t< B > > && std::is_integral_v< std::decay_t< E > > && std::is_integral_v< std::decay_t< S > >, size_t > tf::distance
+(B beg, E end, S step)
+distance
+tf::distance
+
+B
+beg
+
+E
+end
+
+S
+step
+
+calculates the number of iterations in the given index range
+
+
+B
+
+type of the beginning index
+
+
+E
+
+type of the ending index
+
+
+S
+
+type of the step size
+
+
+
+beg
+
+starting index of the range
+
+
+end
+
+ending index of the range
+
+
+step
+
+step size to traverse the range
+
+
+returns the number of required iterations to traverse the range
+
+The distance of a range represents the number of required iterations to traverse the range from the beginning index to the ending index (exclusive) with the given step size.
+Example 1: // Range: 0 to 10 with step size 2
+size_t dist = distance(0, 10, 2);  // Returns 5, the sequence is [0, 2, 4, 6, 8]
+
+Example 2: // Range: 10 to 0 with step size -2
+size_t dist = distance(10, 0, -2);  // Returns 5, the sequence is [10, 8, 6, 4, 2]
+
+Example 3: // Range: 5 to 20 with step size 5
+size_t dist = distance(5, 20, 5);  // Returns 3, the sequence is [5, 10, 15]
+
+It is the user's responsibility to ensure the given index range is valid.
+
+
+
+
+typename...
+
+ArgsT
+ArgsT
+
+TF_FORCE_INLINE Node *
+TF_FORCE_INLINE Node * tf::animate
+(ArgsT &&... args)
+animate
+tf::animate
+
+ArgsT &&...
+args
+
+
+
+
+TF_FORCE_INLINE void
+TF_FORCE_INLINE void tf::recycle
+(Node *ptr)
+recycle
+tf::recycle
+
+Node *
+ptr
+
+
+
+
+typename T
+
+typename...
+ArgsT
+ArgsT
+
+std::unique_ptr< T >
+std::unique_ptr< T > tf::make_worker_interface
+(ArgsT &&... args)
+make_worker_interface
+tf::make_worker_interface
+
+ArgsT &&...
+args
+
+helper function to create an instance derived from tf::WorkerInterface
+
+
+T
+
+type derived from tf::WorkerInterface
+
+
+ArgsT
+
+argument types to construct T
+
+
+
+args
+
+arguments to forward to the constructor of T
+
+
+
+
+const char *
+const char * tf::to_string
+(TaskType type)
+to_string
+tf::to_string
+
+TaskType
+type
+
+convert a task type to a human-readable string
+
+The name of each task type is the lowercase string of its characters.
+TaskType::PLACEHOLDER is of string placeholder
+TaskType::STATIC is of string static
+TaskType::RUNTIME is of string runtime
+TaskType::SUBFLOW is of string subflow
+TaskType::CONDITION is of string condition
+TaskType::MODULE is of string module
+TaskType::ASYNC is of string async
+
+
+
+
+std::ostream &
+std::ostream & tf::operator<<
+(std::ostream &os, const Task &task)
+operator<<
+tf::operator<<
+
+std::ostream &
+os
+
+const Task &
+task
+
+overload of ostream inserter operator for Task
+
+
+
+
+const char *
+const char * tf::to_string
+(ObserverType type)
+to_string
+tf::to_string
+
+ObserverType
+type
+
+convert an observer type to a human-readable string
+
+
+
+
+typename Input
+
+typename Output
+
+typename C
+
+auto
+auto tf::make_data_pipe
+(PipeType d, C &&callable)
+make_data_pipe
+tf::make_data_pipe
+
+PipeType
+d
+
+C &&
+callable
+
+function to construct a data pipe (tf::DataPipe)
+
+
+Input
+
+input data type
+
+
+Output
+
+output data type
+
+
+C
+
+callable type
+
+
+tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe) in a data-parallel pipeline (tf::DataPipeline).
The first argument specifies the direction of the data pipe, either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, and the second argument is a callable invoked by the pipeline scheduler. Input and output data types are specified via template parameters, which are always decayed by the library to their original forms for storage purposes. The callable must take the input data type in its first argument and return a value of the output data type.
+tf::make_data_pipe<int, std::string>(
+  tf::PipeType::SERIAL,
+  [](int& input){
+    return std::to_string(input + 100);
+  }
+);
+
+The callable can additionally take a reference of tf::Pipeflow, which allows you to query the runtime information of a stage task, such as its line number and token number.
+tf::make_data_pipe<int, std::string>(
+  tf::PipeType::SERIAL,
+  [](int& input, tf::Pipeflow& pf){
+    printf("token=%lu, line=%lu\n", pf.token(), pf.line());
+    return std::to_string(input + 100);
+  }
+);
+
+
+
+
+typename T
+
+auto
+auto tf::make_module_task
+(T &&target)
+make_module_task
+tf::make_module_task
+
+T &&
+target
+
+creates a module task using the given target
+
+
+
@@ -1311,3112 +2084,1123 @@ This methods call cudaFree to free the memory s

T

-pointer type
+Type of the target object, which must define the method tf::Graph& graph().



-ptr
+target


-device pointer to memory to free
+The target object used to create the module task.


-This methods call cudaFree to free the memory space pointed to by ptr using the current device context of the caller.
+module task that can be used by Taskflow or asynchronous tasking.
+
+This example demonstrates how to create and launch multiple taskflows in parallel using asynchronous tasking:
+tf::Executor executor;
+
+tf::Taskflow A;
+tf::Taskflow B;
+tf::Taskflow C;
+tf::Taskflow D;
+
+A.emplace([](){ printf("Taskflow A\n"); });
+B.emplace([](){ printf("Taskflow B\n"); });
+C.emplace([](){ printf("Taskflow C\n"); });
+D.emplace([](){ printf("Taskflow D\n"); });
+
+// launch the four taskflows using asynchronous tasking
+executor.async(tf::make_module_task(A));
+executor.async(tf::make_module_task(B));
+executor.async(tf::make_module_task(C));
+executor.async(tf::make_module_task(D));
+executor.wait_for_all();
+
+The module task maker, tf::make_module_task, is basically the same as tf::Taskflow::composed_of but provides a more generic interface that can be used beyond Taskflow. For instance, the following two approaches achieve the same functionality.
+// approach 1: composition using composed_of
+tf::Task m1 = taskflow1.composed_of(taskflow2);
+
+// approach 2: composition using make_module_task
+tf::Task m1 = taskflow1.emplace(tf::make_module_task(taskflow2));
+
+Users are responsible for ensuring that the given target remains valid throughout its execution. The executor does not assume ownership of the target object.
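+Module tasks can also be composed with explicit dependencies inside a parent taskflow. A short sketch reusing the taskflows A and B and the executor declared in the example above:
+tf::Taskflow top;
+tf::Task m1 = top.emplace(tf::make_module_task(A));
+tf::Task m2 = top.emplace(tf::make_module_task(B));
+m1.precede(m2);  // complete all of A before starting B
+executor.run(top).wait();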
+ + - + - - void - void tf::cuda_memcpy_async - (cudaStream_t stream, void *dst, const void *src, size_t count) - cuda_memcpy_async - - cudaStream_t - stream - - - void * - dst - - - const void * - src - - - size_t - count - + + size_t + size_t tf::cuda_get_num_devices + () + cuda_get_num_devices + tf::cuda_get_num_devices -copies data between host and device asynchronously through a stream +queries the number of available devices - - -stream - - -stream identifier - - - - -dst - - -destination memory address - - - - -src - - -source memory address - - - - -count - - -size in bytes to copy - - - -The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap. - + - + + int + int tf::cuda_get_device + () + cuda_get_device + tf::cuda_get_device + +gets the current device associated with the caller thread + + + + + + + + void - void tf::cuda_memset_async - (cudaStream_t stream, void *devPtr, int value, size_t count) - cuda_memset_async - - cudaStream_t - stream - + void tf::cuda_set_device + (int id) + cuda_set_device + tf::cuda_set_device - void * - devPtr + int + id + +switches to a given device context + + + + + + + + + void + void tf::cuda_get_device_property + (int i, cudaDeviceProp &p) + cuda_get_device_property + tf::cuda_get_device_property int - value + i - size_t - count + cudaDeviceProp & + p -initializes or sets GPU memory to the given value byte by byte +obtains the device property - - -stream - - -stream identifier - - - - -devPtr - - -pointer to GPU mempry - - - - -value - - -value to set for each byte of the specified memory - - - - -count - - -size in bytes to set - - - -The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value. - + - - constexpr const char * - constexpr const char* tf::to_string - (cudaTaskType type) - to_string + + cudaDeviceProp + cudaDeviceProp tf::cuda_get_device_property + (int i) + cuda_get_device_property + tf::cuda_get_device_property - cudaTaskType - type + int + i -convert a cuda_task type to a human-readable string +obtains the device property - + - - std::ostream & - std::ostream& tf::operator<< - (std::ostream &os, const cudaTask &ct) - operator<< + + void + void tf::cuda_dump_device_property + (std::ostream &os, const cudaDeviceProp &p) + cuda_dump_device_property + tf::cuda_dump_device_property - std::ostream & + std::ostream & os - const cudaTask & - ct + const cudaDeviceProp & + p -overload of ostream inserter operator for cudaTask +dumps the device property - + - - - - typename P - - - typename C - - - void - void tf::cuda_single_task - (P &&p, C c) - cuda_single_task - - P && - p - + + size_t + size_t tf::cuda_get_device_max_threads_per_block + (int d) + cuda_get_device_max_threads_per_block + tf::cuda_get_device_max_threads_per_block - C - c + int + d -runs a callable asynchronously using one kernel thread +queries the maximum threads per block on a device - - -P - - -execution policy type - - - - -C - - -closure type - - - - - -p - - -execution policy - - - - -c - - -closure to run by one kernel thread - - - -The function launches a single kernel thread to run the given callable through the stream in the execution policy object. 
- + - - - - typename P - - - typename I - - - typename C - - - void - void tf::cuda_for_each - (P &&p, I first, I last, C c) - cuda_for_each - - P && - p - + + size_t + size_t tf::cuda_get_device_max_x_dim_per_block + (int d) + cuda_get_device_max_x_dim_per_block + tf::cuda_get_device_max_x_dim_per_block - I - first + int + d + +queries the maximum x-dimension per block on a device + + + + + + + + + size_t + size_t tf::cuda_get_device_max_y_dim_per_block + (int d) + cuda_get_device_max_y_dim_per_block + tf::cuda_get_device_max_y_dim_per_block - I - last + int + d + +queries the maximum y-dimension per block on a device + + + + + + + + + size_t + size_t tf::cuda_get_device_max_z_dim_per_block + (int d) + cuda_get_device_max_z_dim_per_block + tf::cuda_get_device_max_z_dim_per_block - C - c + int + d -performs asynchronous parallel iterations over a range of items +queries the maximum z-dimension per block on a device - - -P - - -execution policy type - - - - -I - - -input iterator type - - - - -C - - -unary operator type - - - - - -p - - -execution policy object - - - - -first - - -iterator to the beginning of the range - - - - -last - - -iterator to the end of the range - - - - -c - - -unary operator to apply to each dereferenced iterator - - - -This function is equivalent to a parallel execution of the following loop on a GPU: -for(autoitr=first;itr!=last;itr++){ -c(*itr); -} - - + - - - - typename P - - - typename I - - - typename C - - - void - void tf::cuda_for_each_index - (P &&p, I first, I last, I inc, C c) - cuda_for_each_index - - P && - p - - - I - first - - - I - last - + + size_t + size_t tf::cuda_get_device_max_x_dim_per_grid + (int d) + cuda_get_device_max_x_dim_per_grid + tf::cuda_get_device_max_x_dim_per_grid - I - inc + int + d + +queries the maximum x-dimension per grid on a device + + + + + + + + + size_t + size_t tf::cuda_get_device_max_y_dim_per_grid + (int d) + cuda_get_device_max_y_dim_per_grid + tf::cuda_get_device_max_y_dim_per_grid - C - c + int + d -performs asynchronous parallel iterations over an index-based range of items +queries the maximum y-dimension per grid on a device - - -P - - -execution policy type - - - - -I - - -input index type - - - - -C - - -unary operator type - - - - - -p - - -execution policy object - - - - -first - - -index to the beginning of the range - - - - -last - - -index to the end of the range - - - - -inc - - -step size between successive iterations - - - - -c - - -unary operator to apply to each index - - - -This function is equivalent to a parallel execution of the following loop on a GPU: -//stepispositive[first,last) -for(autoi=first;i<last;i+=step){ -c(i); -} - -//stepisnegative[first,last) -for(autoi=first;i>last;i+=step){ -c(i); -} - - + - - - - typename C - - - __global__ void - __global__ void tf::cuda_single_task - (C callable) - cuda_single_task + + size_t + size_t tf::cuda_get_device_max_z_dim_per_grid + (int d) + cuda_get_device_max_z_dim_per_grid + tf::cuda_get_device_max_z_dim_per_grid - C - callable + int + d +queries the maximum z-dimension per grid on a device - + - - - - typename P - - - typename I - - - typename O - - - typename C - - - void - void tf::cuda_transform - (P &&p, I first, I last, O output, C op) - cuda_transform + + size_t + size_t tf::cuda_get_device_max_shm_per_block + (int d) + cuda_get_device_max_shm_per_block + tf::cuda_get_device_max_shm_per_block - P && - p + int + d + +queries the maximum shared memory size in bytes per block on a device + + + + + + + + + size_t + size_t 
tf::cuda_get_device_warp_size + (int d) + cuda_get_device_warp_size + tf::cuda_get_device_warp_size - I - first + int + d + +queries the warp size on a device + + + + + + + + + int + int tf::cuda_get_device_compute_capability_major + (int d) + cuda_get_device_compute_capability_major + tf::cuda_get_device_compute_capability_major - I - last + int + d + +queries the major number of compute capability of a device + + + + + + + + + int + int tf::cuda_get_device_compute_capability_minor + (int d) + cuda_get_device_compute_capability_minor + tf::cuda_get_device_compute_capability_minor - O - output + int + d + +queries the minor number of compute capability of a device + + + + + + + + + bool + bool tf::cuda_get_device_unified_addressing + (int d) + cuda_get_device_unified_addressing + tf::cuda_get_device_unified_addressing - C - op + int + d -performs asynchronous parallel transforms over a range of items +queries if the device supports unified addressing - - -P - - -execution policy type - - - - -I - - -input iterator type - - - - -O - - -output iterator type - - - - -C - - -unary operator type - - - - - -p - - -execution policy - - - - -first - - -iterator to the beginning of the range - - - - -last - - -iterator to the end of the range - - - - -output - - -iterator to the beginning of the output range - - - - -op - - -unary operator to apply to transform each item - - - -This method is equivalent to the parallel execution of the following loop on a GPU: -while(first!=last){ -*output++=op(*first++); -} - - - - - - - - - - typename P - - - typename I1 - - - typename I2 - - - typename O - - - typename C - - - void - void tf::cuda_transform - (P &&p, I1 first1, I1 last1, I2 first2, O output, C op) - cuda_transform - - P && - p - - - I1 - first1 - - - I1 - last1 - - - I2 - first2 - - - O - output - - - C - op - - -performs asynchronous parallel transforms over two ranges of items - - - - -P - - -execution policy type - - - - -I1 - - -first input iterator type - - - - -I2 - - -second input iterator type - - - - -O - - -output iterator type - - - - -C - - -binary operator type - - - - - -p - - -execution policy - - - - -first1 - - -iterator to the beginning of the first range - - - - -last1 - - -iterator to the end of the first range - - - - -first2 - - -iterator to the beginning of the second range - - - - -output - - -iterator to the beginning of the output range - - - - -op - - -binary operator to apply to transform each pair of items - - - -This method is equivalent to the parallel execution of the following loop on a GPU: -while(first1!=last1){ -*output++=op(*first1++,*first2++); -} - - - - - - - - - - typename P - - - typename I - - - typename T - - - typename O - - - void - void tf::cuda_reduce - (P &&p, I first, I last, T *res, O op, void *buf) - cuda_reduce - - P && - p - - - I - first - - - I - last - - - T * - res - - - O - op - - - void * - buf - - -performs asynchronous parallel reduction over a range of items - - - - -P - - -execution policy type - - - - -I - - -input iterator type - - - - -T - - -value type - - - - -O - - -binary operator type - - - - - -p - - -execution policy - - - - -first - - -iterator to the beginning of the range - - - - -last - - -iterator to the end of the range - - - - -res - - -pointer to the result - - - - -op - - -binary operator to apply to reduce elements - - - - -buf - - -pointer to the temporary buffer - - - -This method is equivalent to the parallel execution of the following loop on a GPU: -while(first!=last){ -*result=op(*result,*first++); -} - - - 
- - - - - - - typename P - - - typename I - - - typename T - - - typename O - - - void - void tf::cuda_uninitialized_reduce - (P &&p, I first, I last, T *res, O op, void *buf) - cuda_uninitialized_reduce - - P && - p - - - I - first - - - I - last - - - T * - res - - - O - op - - - void * - buf - - -performs asynchronous parallel reduction over a range of items without an initial value - - - - -P - - -execution policy type - - - - -I - - -input iterator type - - - - -T - - -value type - - - - -O - - -binary operator type - - - - - -p - - -execution policy - - - - -first - - -iterator to the beginning of the range - - - - -last - - -iterator to the end of the range - - - - -res - - -pointer to the result - - - - -op - - -binary operator to apply to reduce elements - - - - -buf - - -pointer to the temporary buffer - - - -This method is equivalent to the parallel execution of the following loop on a GPU: -*result=*first++;//noinitialvaluespartitipcateintheloop -while(first!=last){ -*result=op(*result,*first++); -} - - - - - - - - - - typename P - - - typename I - - - typename T - - - typename O - - - typename U - - - void - void tf::cuda_transform_reduce - (P &&p, I first, I last, T *res, O bop, U uop, void *buf) - cuda_transform_reduce - - P && - p - - - I - first - - - I - last - - - T * - res - - - O - bop - - - U - uop - - - void * - buf - - -performs asynchronous parallel reduction over a range of transformed items without an initial value - - - - -P - - -execution policy type - - - - -I - - -input iterator type - - - - -T - - -value type - - - - -O - - -binary operator type - - - - -U - - -unary operator type - - - - - -p - - -execution policy - - - - -first - - -iterator to the beginning of the range - - - - -last - - -iterator to the end of the range - - - - -res - - -pointer to the result - - - - -bop - - -binary operator to apply to reduce elements - - - - -uop - - -unary operator to apply to transform elements - - - - -buf - - -pointer to the temporary buffer - - - -This method is equivalent to the parallel execution of the following loop on a GPU: -while(first!=last){ -*result=bop(*result,uop(*first++)); -} - - - - - - - - - - typename P - - - typename I - - - typename T - - - typename O - - - typename U - - - void - void tf::cuda_uninitialized_transform_reduce - (P &&p, I first, I last, T *res, O bop, U uop, void *buf) - cuda_uninitialized_transform_reduce - - P && - p - - - I - first - - - I - last - - - T * - res - - - O - bop - - - U - uop - - - void * - buf - - -performs asynchronous parallel reduction over a range of transformed items with an initial value - - - - -P - - -execution policy type - - - - -I - - -input iterator type - - - - -T - - -value type - - - - -O - - -binary operator type - - - - -U - - -unary operator type - - - - - -p - - -execution policy - - - - -first - - -iterator to the beginning of the range - - - - -last - - -iterator to the end of the range - - - - -res - - -pointer to the result - - - - -bop - - -binary operator to apply to reduce elements - - - - -uop - - -unary operator to apply to transform elements - - - - -buf - - -pointer to the temporary buffer - - - -This method is equivalent to the parallel execution of the following loop on a GPU: -*result=uop(*first++);//noinitialvaluespartitipcateintheloop -while(first!=last){ -*result=bop(*result,uop(*first++)); -} - - - - - - - - - - typename P - - - typename I - - - typename O - - - typename C - - - void - void tf::cuda_inclusive_scan - (P &&p, I first, I last, O output, C op, void *buf) - 
cuda_inclusive_scan - - P && - p - - - I - first - - - I - last - - - O - output - - - C - op - - - void * - buf - - -performs asynchronous inclusive scan over a range of items - - - - -P - - -execution policy type - - - - -I - - -input iterator - - - - -O - - -output iterator - - - - -C - - -binary operator type - - - - - -p - - -execution policy - - - - -first - - -iterator to the beginning of the input range - - - - -last - - -iterator to the end of the input range - - - - -output - - -iterator to the beginning of the output range - - - - -op - - -binary operator to apply to scan - - - - -buf - - -pointer to the temporary buffer - - - - - - - - - - - - - typename P - - - typename I - - - typename O - - - typename C - - - typename U - - - void - void tf::cuda_transform_inclusive_scan - (P &&p, I first, I last, O output, C bop, U uop, void *buf) - cuda_transform_inclusive_scan - - P && - p - - - I - first - - - I - last - - - O - output - - - C - bop - - - U - uop - - - void * - buf - - -performs asynchronous inclusive scan over a range of transformed items - - - - -P - - -execution policy type - - - - -I - - -input iterator - - - - -O - - -output iterator - - - - -C - - -binary operator type - - - - -U - - -unary operator type - - - - - -p - - -execution policy - - - - -first - - -iterator to the beginning of the input range - - - - -last - - -iterator to the end of the input range - - - - -output - - -iterator to the beginning of the output range - - - - -bop - - -binary operator to apply to scan - - - - -uop - - -unary operator to apply to transform each item before scan - - - - -buf - - -pointer to the temporary buffer - - - - - - - - - - - - - typename P - - - typename I - - - typename O - - - typename C - - - void - void tf::cuda_exclusive_scan - (P &&p, I first, I last, O output, C op, void *buf) - cuda_exclusive_scan - - P && - p - - - I - first - - - I - last - - - O - output - - - C - op - - - void * - buf - - -performs asynchronous exclusive scan over a range of items - - - - -P - - -execution policy type - - - - -I - - -input iterator - - - - -O - - -output iterator - - - - -C - - -binary operator type - - - - - -p - - -execution policy - - - - -first - - -iterator to the beginning of the input range - - - - -last - - -iterator to the end of the input range - - - - -output - - -iterator to the beginning of the output range - - - - -op - - -binary operator to apply to scan - - - - -buf - - -pointer to the temporary buffer - - - - - - - - - - - - - typename P - - - typename I - - - typename O - - - typename C - - - typename U - - - void - void tf::cuda_transform_exclusive_scan - (P &&p, I first, I last, O output, C bop, U uop, void *buf) - cuda_transform_exclusive_scan - - P && - p - - - I - first - - - I - last - - - O - output - - - C - bop - - - U - uop - - - void * - buf - - -performs asynchronous exclusive scan over a range of items - - - - -P - - -execution policy type - - - - -I - - -input iterator - - - - -O - - -output iterator - - - - -C - - -binary operator type - - - - -U - - -unary operator type - - - - - -p - - -execution policy - - - - -first - - -iterator to the beginning of the input range - - - - -last - - -iterator to the end of the input range - - - - -output - - -iterator to the beginning of the output range - - - - -bop - - -binary operator to apply to scan - - - - -uop - - -unary operator to apply to transform each item before scan - - - - -buf - - -pointer to the temporary buffer - - - - - + - - - - typename P - - - typename a_keys_it - - - typename 
a_vals_it - - - typename b_keys_it - - - typename b_vals_it - - - typename c_keys_it - - - typename c_vals_it - - - typename C - - - void - void tf::cuda_merge_by_key - (P &&p, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp, void *buf) - cuda_merge_by_key - - P && - p - - - a_keys_it - a_keys_first - - - a_keys_it - a_keys_last - - - a_vals_it - a_vals_first - - - b_keys_it - b_keys_first - - - b_keys_it - b_keys_last - - - b_vals_it - b_vals_first - - - c_keys_it - c_keys_first - - - c_vals_it - c_vals_first - - - C - comp - - - void * - buf - + + int + int tf::cuda_get_driver_version + () + cuda_get_driver_version + tf::cuda_get_driver_version -performs asynchronous key-value merge over a range of keys and values +queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver - - -P - - -execution policy type - - - - -a_keys_it - - -first key iterator type - - - - -a_vals_it - - -first value iterator type - - - - -b_keys_it - - -second key iterator type - - - - -b_vals_it - - -second value iterator type - - - - -c_keys_it - - -output key iterator type - - - - -c_vals_it - - -output value iterator type - - - - -C - - -comparator type - - - - - -p - - -execution policy - - - - -a_keys_first - - -iterator to the beginning of the first key range - - - - -a_keys_last - - -iterator to the end of the first key range - - - - -a_vals_first - - -iterator to the beginning of the first value range - - - - -b_keys_first - - -iterator to the beginning of the second key range - - - - -b_keys_last - - -iterator to the end of the second key range - - - - -b_vals_first - - -iterator to the beginning of the second value range - - - - -c_keys_first - - -iterator to the beginning of the output key range - - - - -c_vals_first - - -iterator to the beginning of the output value range - - - - -comp - - -comparator - - - - -buf - - -pointer to the temporary buffer - - - -Performs a key-value merge that copies elements from [a_keys_first, a_keys_last) and [b_keys_first, b_keys_last) into a single range, [c_keys_first, c_keys_last + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending key order. -At the same time, the merge copies elements from the two associated ranges [a_vals_first + (a_keys_last - a_keys_first)) and [b_vals_first + (b_keys_last - b_keys_first)) into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending order implied by each input element's associated key. 
-For example, assume: -a_keys = {1, 8}; -a_vals = {2, 1}; -b_keys = {3, 7}; -b_vals = {3, 4}; - - -After the merge, we have: -c_keys = {1, 3, 7, 8} -c_vals = {2, 3, 4, 1} - - - - - - - - - - typename P - - - typename a_keys_it - - - typename b_keys_it - - - typename c_keys_it - - - typename C - - - void - void tf::cuda_merge - (P &&p, a_keys_it a_keys_first, a_keys_it a_keys_last, b_keys_it b_keys_first, b_keys_it b_keys_last, c_keys_it c_keys_first, C comp, void *buf) - cuda_merge - - P && - p - - - a_keys_it - a_keys_first - - - a_keys_it - a_keys_last - - - b_keys_it - b_keys_first - - - b_keys_it - b_keys_last - - - c_keys_it - c_keys_first - - - C - comp - + + + + + + int + int tf::cuda_get_runtime_version + () + cuda_get_runtime_version + tf::cuda_get_runtime_version + +queries the CUDA Runtime version (1000 * major + 10 * minor) + + + + + + + + + size_t + size_t tf::cuda_get_free_mem + (int d) + cuda_get_free_mem + tf::cuda_get_free_mem - void * - buf + int + d -performs asynchronous key-only merge over a range of keys +queries the free memory (expensive call) - - -P - - -execution policy type - - - - -a_keys_it - - -first key iterator type - - - - -b_keys_it - - -second key iterator type - - - - -c_keys_it - - -output key iterator type - - - - -C - - -comparator type - - - - - -p - - -execution policy - - - - -a_keys_first - - -iterator to the beginning of the first key range - - - - -a_keys_last - - -iterator to the end of the first key range - - - - -b_keys_first - - -iterator to the beginning of the second key range - - - - -b_keys_last - - -iterator to the end of the second key range - - - - -c_keys_first - - -iterator to the beginning of the output key range - - - - -comp - - -comparator - - - - -buf - - -pointer to the temporary buffer - - - -This function is equivalent to tf::cuda_merge_by_key without values. - + - - - - typename P - - - typename K - - - typename V - cudaEmpty - - - unsigned - unsigned tf::cuda_sort_buffer_size - (unsigned count) - cuda_sort_buffer_size + + size_t + size_t tf::cuda_get_total_mem + (int d) + cuda_get_total_mem + tf::cuda_get_total_mem - unsigned - count + int + d -queries the buffer size in bytes needed to call sort kernels for the given number of elements +queries the total available memory (expensive call) - - -P - - -execution policy type - - - - -K - - -key type - - - - -V - - -value type (default tf::cudaEmpty) - - - - - -count - - -number of keys/values to sort - - - -The function is used to allocate a buffer for calling tf::cuda_sort. - + - + - typename P - - - typename K_it - - - typename V_it - - - typename C + typename T - void - void tf::cuda_sort_by_key - (P &&p, K_it k_first, K_it k_last, V_it v_first, C comp, void *buf) - cuda_sort_by_key - - P && - p - - - K_it - k_first - - - K_it - k_last - - - V_it - v_first - - - C - comp - - - void * - buf - - -performs asynchronous key-value sort on a range of items - - - - -P - - -execution policy type - - - - -K_it - - -key iterator type - - - - -V_it - - -value iterator type - - - - -C - - -comparator type - - - - - -p - - -execution policy - - - - -k_first - - -iterator to the beginning of the key range - - - - -k_last - - -iterator to the end of the key range - - - - -v_first - - -iterator to the beginning of the value range - - - - -comp - - -binary comparator - - - - -buf - - -pointer to the temporary buffer - - - -Sorts key-value elements in [k_first, k_last) and [v_first, v_first + (k_last - k_first)) into ascending key order using the given comparator comp. 
If i and j are any two valid iterators in [k_first, k_last) such that i precedes j, and p and q are iterators in [v_first, v_first + (k_last - k_first)) corresponding to i and j respectively, then comp(*j, *i) evaluates to false. -For example, assume: -keys are {1, 4, 2, 8, 5, 7} -values are {'a', 'b', 'c', 'd', 'e', 'f'} - - -After sort: -keys are {1, 2, 4, 5, 7, 8} -values are {'a', 'c', 'b', 'e', 'f', 'd'} - - + T * + T * tf::cuda_malloc_device + (size_t N, int d) + cuda_malloc_device + tf::cuda_malloc_device + + size_t + N + + + int + d + + +allocates memory on the given device for holding N elements of type T + + +The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory. - + - + - typename P + typename T + + T * + T * tf::cuda_malloc_device + (size_t N) + cuda_malloc_device + tf::cuda_malloc_device + + size_t + N + + +allocates memory on the current device associated with the caller + + +The function calls malloc_device from the current device associated with the caller. + + + + + + + - typename K_it + typename T + + T * + T * tf::cuda_malloc_shared + (size_t N) + cuda_malloc_shared + tf::cuda_malloc_shared + + size_t + N + + +allocates shared memory for holding N elements of type T + + +The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory. + + + + + + + - typename C + typename T void - void tf::cuda_sort - (P &&p, K_it k_first, K_it k_last, C comp, void *buf) - cuda_sort - - P && - p - - - K_it - k_first - - - K_it - k_last - + void tf::cuda_free + (T *ptr, int d) + cuda_free + tf::cuda_free - C - comp + T * + ptr - void * - buf + int + d -performs asynchronous key-only sort on a range of items +frees memory on the GPU device -P - - -execution policy type - - - - -K_it - - -key iterator type - - - - -C +T -comparator type +pointer type -p - - -execution policy - - - - -k_first - - -iterator to the beginning of the key range - - - - -k_last - - -iterator to the end of the key range - - - - -comp +ptr -binary comparator +device pointer to memory to free -buf +d -pointer to the temporary buffer +device context identifier -This method is equivalent to tf::cuda_sort_by_key without values. +This methods call cudaFree to free the memory space pointed to by ptr using the given device context. - + - + - typename P - - - typename I - - - typename U + typename T void - void tf::cuda_find_if - (P &&p, I first, I last, unsigned *idx, U op) - cuda_find_if - - P && - p - - - I - first - - - I - last - - - unsigned * - idx - + void tf::cuda_free + (T *ptr) + cuda_free + tf::cuda_free - U - op + T * + ptr -finds the index of the first element that satisfies the given criteria +frees memory on the GPU device -P - - -execution policy type - - - - -I - - -input iterator type - - - - -U +T -unary operator type +pointer type -p - - -execution policy - - - - -first - - -iterator to the beginning of the range - - - - -last - - -iterator to the end of the range - - - - -idx - - -pointer to the index of the found element - - - - -op +ptr -unary operator which returns true for the required element +device pointer to memory to free -The function launches kernels asynchronously to find the index idx of the first element in the range [first, last) such that op(*(first+idx)) is true. 
This is equivalent to the parallel execution of the following loop: -unsignedidx=0; -for(;first!=last;++first,++idx){ -if(p(*first)){ -returnidx; -} -} -returnidx; - +This methods call cudaFree to free the memory space pointed to by ptr using the current device context of the caller. - + - - - - typename P - - - typename I - - - typename O - - + void - void tf::cuda_min_element - (P &&p, I first, I last, unsigned *idx, O op, void *buf) - cuda_min_element - - P && - p - - - I - first - + void tf::cuda_memcpy_async + (cudaStream_t stream, void *dst, const void *src, size_t count) + cuda_memcpy_async + tf::cuda_memcpy_async - I - last + cudaStream_t + stream - unsigned * - idx + void * + dst - O - op + const void * + src - void * - buf + size_t + count -finds the index of the minimum element in a range +copies data between host and device asynchronously through a stream - + -P +stream -execution policy type +stream identifier -I +dst -input iterator type +destination memory address -O - - -comparator type - - - - - -p +src -execution policy object +source memory address -first +count -iterator to the beginning of the range +size in bytes to copy - + +The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap. + + + + + + + void + void tf::cuda_memset_async + (cudaStream_t stream, void *devPtr, int value, size_t count) + cuda_memset_async + tf::cuda_memset_async + + cudaStream_t + stream + + + void * + devPtr + + + int + value + + + size_t + count + + +initializes or sets GPU memory to the given value byte by byte + + + -last +stream -iterator to the end of the range +stream identifier -idx +devPtr -solution index of the minimum element +pointer to GPU memory -op +value -comparison function object +value to set for each byte of the specified memory -buf +count -pointer to the buffer +size in bytes to set -The function launches kernels asynchronously to find the smallest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_min_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop: -if(first==last){ -return0; -} -autosmallest=first; -for(++first;first!=last;++first){ -if(op(*first,*smallest)){ -smallest=first; -} -} -returnstd::distance(first,smallest); - +The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value. 
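+A minimal host-side sketch that chains the calls above on one user-managed stream (the element count and initial host values are illustrative):
+#include <vector>
+
+cudaStream_t stream;
+cudaStreamCreate(&stream);
+std::vector<int> host(1024, 1);
+int* gpu = tf::cuda_malloc_device<int>(host.size());
+tf::cuda_memcpy_async(stream, gpu, host.data(), host.size()*sizeof(int));  // host-to-device copy
+tf::cuda_memset_async(stream, gpu, 0, host.size()*sizeof(int));            // zero the device buffer
+cudaStreamSynchronize(stream);  // both calls are asynchronous with respect to the host
+tf::cuda_free(gpu);
+cudaStreamDestroy(stream);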
- + - + - typename P + typename T + + + std::enable_if_t<!std::is_same_v< T, void >, void > * + nullptr + + cudaMemcpy3DParms + cudaMemcpy3DParms tf::cuda_get_copy_parms + (T *tgt, const T *src, size_t num) + cuda_get_copy_parms + tf::cuda_get_copy_parms + + T * + tgt + + + const T * + src + + + size_t + num + + +gets the memcpy node parameter of a copy task + + + + + + + + + cudaMemcpy3DParms + cudaMemcpy3DParms tf::cuda_get_memcpy_parms + (void *tgt, const void *src, size_t bytes) + cuda_get_memcpy_parms + tf::cuda_get_memcpy_parms + + void * + tgt + + + const void * + src + + + size_t + bytes + + +gets the memcpy node parameter of a memcpy task (untyped) + + + + + + + + + cudaMemsetParams + cudaMemsetParams tf::cuda_get_memset_parms + (void *dst, int ch, size_t count) + cuda_get_memset_parms + tf::cuda_get_memset_parms + + void * + dst + + + int + ch + + + size_t + count + + +gets the memset node parameter of a memcpy task (untyped) + + + + + + + + + - typename I + typename T - typename O + std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * + nullptr - void - void tf::cuda_max_element - (P &&p, I first, I last, unsigned *idx, O op, void *buf) - cuda_max_element + cudaMemsetParams + cudaMemsetParams tf::cuda_get_fill_parms + (T *dst, T value, size_t count) + cuda_get_fill_parms + tf::cuda_get_fill_parms - P && - p + T * + dst - I - first + T + value - I - last + size_t + count + +gets the memset node parameter of a fill task (typed) + + + + + + + + + + + typename T + + + std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * + nullptr + + + cudaMemsetParams + cudaMemsetParams tf::cuda_get_zero_parms + (T *dst, size_t count) + cuda_get_zero_parms + tf::cuda_get_zero_parms - unsigned * - idx + T * + dst - O - op + size_t + count + +gets the memset node parameter of a zero task (typed) + + + + + + + + + size_t + size_t tf::cuda_graph_get_num_root_nodes + (cudaGraph_t graph) + cuda_graph_get_num_root_nodes + tf::cuda_graph_get_num_root_nodes - void * - buf + cudaGraph_t + graph -finds the index of the maximum element in a range +queries the number of root nodes in a native CUDA graph + + + + + + + + + size_t + size_t tf::cuda_graph_get_num_nodes + (cudaGraph_t graph) + cuda_graph_get_num_nodes + tf::cuda_graph_get_num_nodes + + cudaGraph_t + graph + + +queries the number of nodes in a native CUDA graph + + + + + + + + + size_t + size_t tf::cuda_graph_get_num_edges + (cudaGraph_t graph) + cuda_graph_get_num_edges + tf::cuda_graph_get_num_edges + + cudaGraph_t + graph + + +queries the number of edges in a native CUDA graph + + + + + + + + + std::vector< cudaGraphNode_t > + std::vector< cudaGraphNode_t > tf::cuda_graph_get_nodes + (cudaGraph_t graph) + cuda_graph_get_nodes + tf::cuda_graph_get_nodes + + cudaGraph_t + graph + + +acquires the nodes in a native CUDA graph + + + + + + + + + std::vector< cudaGraphNode_t > + std::vector< cudaGraphNode_t > tf::cuda_graph_get_root_nodes + (cudaGraph_t graph) + cuda_graph_get_root_nodes + tf::cuda_graph_get_root_nodes + + cudaGraph_t + graph + + +acquires the root nodes in a native CUDA graph + + + + + + + + + std::vector< std::pair< cudaGraphNode_t, cudaGraphNode_t > > + std::vector< std::pair< cudaGraphNode_t, cudaGraphNode_t > > tf::cuda_graph_get_edges + (cudaGraph_t graph) + cuda_graph_get_edges + tf::cuda_graph_get_edges + + cudaGraph_t + graph + + +acquires the edges in a native CUDA graph + + + + + + + + + cudaGraphNodeType + cudaGraphNodeType tf::cuda_get_graph_node_type + 
(cudaGraphNode_t node) + cuda_get_graph_node_type + tf::cuda_get_graph_node_type + + cudaGraphNode_t + node + + +queries the type of a native CUDA graph node + + +valid type values are: +cudaGraphNodeTypeKernel = 0x00 +cudaGraphNodeTypeMemcpy = 0x01 +cudaGraphNodeTypeMemset = 0x02 +cudaGraphNodeTypeHost = 0x03 +cudaGraphNodeTypeGraph = 0x04 +cudaGraphNodeTypeEmpty = 0x05 +cudaGraphNodeTypeWaitEvent = 0x06 +cudaGraphNodeTypeEventRecord = 0x07 + + + + + + + + + const char * + const char * tf::to_string + (cudaGraphNodeType type) + to_string + tf::to_string + + cudaGraphNodeType + type + + +convert a cuda_task type to a human-readable string + + + + + + + + + std::ostream & + std::ostream & tf::operator<< + (std::ostream &os, const cudaTask &ct) + operator<< + tf::operator<< + + std::ostream & + os + + + const cudaTask & + ct + + +overload of ostream inserter operator for cudaTask - - -P - - -execution policy type - - - - -I - - -input iterator type - - - - -O - - -comparator type - - - - - -p - - -execution policy object - - - - -first - - -iterator to the beginning of the range - - - - -last - - -iterator to the end of the range - - - - -idx - - -solution index of the maximum element - - - - -op - - -comparison function object - - - - -buf - - -pointer to the buffer - - - -The function launches kernels asynchronously to find the largest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_max_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop: -if(first==last){ -return0; -} -autolargest=first; -for(++first;first!=last;++first){ -if(op(*largest,*first)){ -largest=first; -} -} -returnstd::distance(first,largest); - - + - - constexpr const char * - constexpr const char* tf::version + + const char * + const char * tf::version () version + tf::version queries the version information in a string format major.minor.patch @@ -4425,14 +3209,14 @@ The function launches kernels asynchronously to find the largest element in the - + - + taskflow namespace - + diff --git a/docs/xml/namespacetf_1_1detail.xml b/docs/xml/namespacetf_1_1detail.xml index 6242de886..68e5eb1c5 100644 --- a/docs/xml/namespacetf_1_1detail.xml +++ b/docs/xml/namespacetf_1_1detail.xml @@ -1,91 +1,14 @@ - + tf::detail - tf::detail::cudaBlockReduce - tf::detail::cudaScanResult - tf::detail::cudaScanResult< T, vt, true > - tf::detail::cudaBlockScan - tf::detail::cudaMergePair - tf::detail::cudaMergeRange - tf::detail::cudaBlockSort - tf::detail::cudaFindPair - - - int - cudaScanType - - EXCLUSIVE - = 1 - - - - - - - INCLUSIVE - - - - - - - - - - - - - - - - cudaMergeBoundType - - LOWER - - - - - - - UPPER - - - - - - -merge bound type - - - - - - - - - - - constexpr unsigned - constexpr unsigned tf::detail::cudaScanRecursionThreshold - - cudaScanRecursionThreshold - = 8 - - - - - - - - - - + uint64_t uint64_t tf::detail::NextCapacity (uint64_t A) NextCapacity + tf::detail::NextCapacity uint64_t A @@ -97,31 +20,48 @@ - + - + - size_t - nt - nt - - - size_t - vt - vt + typename T + + TF_FORCE_INLINE Node * + TF_FORCE_INLINE Node * tf::detail::get_node_ptr + (T &node) + get_node_ptr + tf::detail::get_node_ptr + + T & + node + + + + + + + + + + + typename I typename C + + typename E + __global__ void __global__ void tf::detail::cuda_for_each_kernel (I first, unsigned count, C c) cuda_for_each_kernel + tf::detail::cuda_for_each_kernel I first @@ -140,31 +80,25 @@ - + - + - - size_t - nt - nt - - - size_t - vt - 
vt - typename I typename C + + typename E + __global__ void __global__ void tf::detail::cuda_for_each_index_kernel (I first, I inc, unsigned count, C c) cuda_for_each_index_kernel + tf::detail::cuda_for_each_index_kernel I first @@ -187,20 +121,10 @@ - + - + - - size_t - nt - nt - - - size_t - vt - vt - typename I @@ -210,11 +134,15 @@ typename C + + typename E + __global__ void __global__ void tf::detail::cuda_transform_kernel (I first, unsigned count, O output, C op) cuda_transform_kernel + tf::detail::cuda_transform_kernel I first @@ -237,20 +165,10 @@ - + - + - - size_t - nt - nt - - - size_t - vt - vt - typename I1 @@ -263,11 +181,15 @@ typename C + + typename E + __global__ void __global__ void tf::detail::cuda_transform_kernel (I1 first1, I2 first2, unsigned count, O output, C op) cuda_transform_kernel + tf::detail::cuda_transform_kernel I1 first1 @@ -294,1605 +216,13 @@ - - - - - - size_t - nt - nt - - - size_t - vt - vt - - - typename I - - - typename T - - - typename O - - - __global__ void - __global__ void tf::detail::cuda_reduce_kernel - (I input, unsigned count, T *res, O op, void *ptr) - cuda_reduce_kernel - - I - input - - - unsigned - count - - - T * - res - - - O - op - - - void * - ptr - - - - - - - - - - - - - typename P - - - typename I - - - typename T - - - typename O - - - void - void tf::detail::cuda_reduce_loop - (P &&p, I input, unsigned count, T *res, O op, void *ptr) - cuda_reduce_loop - - P && - p - - - I - input - - - unsigned - count - - - T * - res - - - O - op - - - void * - ptr - - - - - - - - - - - - - size_t - nt - nt - - - size_t - vt - vt - - - typename I - - - typename T - - - typename O - - - __global__ void - __global__ void tf::detail::cuda_uninitialized_reduce_kernel - (I input, unsigned count, T *res, O op, void *ptr) - cuda_uninitialized_reduce_kernel - - I - input - - - unsigned - count - - - T * - res - - - O - op - - - void * - ptr - - - - - - - - - - - - - typename P - - - typename I - - - typename T - - - typename O - - - void - void tf::detail::cuda_uninitialized_reduce_loop - (P &&p, I input, unsigned count, T *res, O op, void *ptr) - cuda_uninitialized_reduce_loop - - P && - p - - - I - input - - - unsigned - count - - - T * - res - - - O - op - - - void * - ptr - - - - - - - - - - - - - typename P - - - typename I - - - typename O - - - typename C - - - void - void tf::detail::cuda_single_pass_scan - (P &&p, cudaScanType scan_type, I input, unsigned count, O output, C op) - cuda_single_pass_scan - - P && - p - - - cudaScanType - scan_type - - - I - input - - - unsigned - count - - - O - output - - - C - op - - -single-pass scan for small input - - - - - - - - - - - typename P - - - typename I - - - typename O - - - typename C - - - void - void tf::detail::cuda_scan_loop - (P &&p, cudaScanType scan_type, I input, unsigned count, O output, C op, void *ptr) - cuda_scan_loop - - P && - p - - - cudaScanType - scan_type - - - I - input - - - unsigned - count - - - O - output - - - C - op - - - void * - ptr - - -main scan loop - - - - - - - - - - - cudaMergeBoundType - bounds - bounds - cudaMergeBoundType::LOWER - - - typename a_keys_it - - - typename b_keys_it - - - typename comp_t - - - __device__ auto - __device__ auto tf::detail::cuda_merge_path - (a_keys_it a_keys, unsigned a_count, b_keys_it b_keys, unsigned b_count, unsigned diag, comp_t comp) - cuda_merge_path - - a_keys_it - a_keys - - - unsigned - a_count - - - b_keys_it - b_keys - - - unsigned - b_count - - - unsigned - diag - - - comp_t - comp - - - - - - - - - - - - - 
The following tf::detail device functions implement the CUDA merge primitives:

template<cudaMergeBoundType bounds, typename keys_it, typename comp_t>
__device__ auto tf::detail::cuda_merge_path(keys_it keys, cudaMergeRange range, unsigned diag, comp_t comp)

template<cudaMergeBoundType bounds, bool range_check, typename T, typename comp_t>
__device__ bool tf::detail::cuda_merge_predicate(T a_key, T b_key, cudaMergeRange range, comp_t comp)

__device__ auto tf::detail::cuda_compute_merge_range(unsigned a_count, unsigned b_count, unsigned partition, unsigned spacing, unsigned mp0, unsigned mp1)

template<unsigned nt, unsigned vt, typename T>
__device__ auto tf::detail::cuda_load_two_streams_reg(const T* a, unsigned a_count, const T* b, unsigned b_count, unsigned tid)
Specialization that emits just one LD instruction. Can only be reliably used with raw pointer types. Fixed not to use pointer arithmetic so that we do not get undefined behavior with unaligned types.

template<unsigned nt, unsigned vt, typename T, typename a_it, typename b_it>
__device__ std::enable_if_t<!(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value), cudaArray<T, vt>> tf::detail::load_two_streams_reg(a_it a, unsigned a_count, b_it b, unsigned b_count, unsigned tid)

template<unsigned nt, unsigned vt, typename A, typename B, typename T, unsigned S>
__device__ void tf::detail::cuda_load_two_streams_shared(A a, unsigned a_count, B b, unsigned b_count, unsigned tid, T (&shared)[S], bool sync = true)

template<unsigned nt, unsigned vt, typename T>
__device__ auto tf::detail::cuda_gather_two_streams_strided(const T* a, unsigned a_count, const T* b, unsigned b_count, cudaArray<unsigned, vt> indices, unsigned tid)

template<unsigned nt, unsigned vt, typename T, typename a_it, typename b_it>
__device__ std::enable_if_t<!(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value), cudaArray<T, vt>> tf::detail::cuda_gather_two_streams_strided(a_it a, unsigned a_count, b_it b, unsigned b_count, cudaArray<unsigned, vt> indices, unsigned tid)

template<unsigned nt, unsigned vt, typename a_it, typename b_it, typename c_it>
__device__ void tf::detail::cuda_transfer_two_streams_strided(a_it a, unsigned a_count, b_it b, unsigned b_count, cudaArray<unsigned, vt> indices, unsigned tid, c_it c)

template<cudaMergeBoundType bounds, unsigned vt, typename T, typename comp_t>
__device__ auto tf::detail::cuda_serial_merge(const T* keys_shared, cudaMergeRange range, comp_t comp, bool sync = true)
This function must be able to dereference keys[a_begin] and keys[b_begin], no matter the indices for each. The caller should allocate at least nt * vt + 1 elements for keys_shared.

template<cudaMergeBoundType bounds, unsigned nt, unsigned vt, typename a_it, typename b_it, typename T, typename comp_t, unsigned S>
__device__ auto tf::detail::block_merge_from_mem(a_it a, b_it b, cudaMergeRange range_mem, unsigned tid, comp_t comp, T (&keys_shared)[S])
Loads arrays a and b from global memory and merges into register.
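To give these signatures some intuition, here is a minimal sketch of the diagonal binary search that cuda_merge_path performs, following the well-known merge-path formulation rather than quoting Taskflow's exact implementation; it is written host-side for clarity and assumes the lower-bound policy (the bounds template parameter selects between lower- and upper-bound behavior).

// A minimal sketch of the merge-path diagonal search behind cuda_merge_path,
// assuming a lower-bound policy; host-side for clarity (the real one is __device__).
template <typename key_t, typename comp_t>
unsigned merge_path(const key_t* a, unsigned a_count,
                    const key_t* b, unsigned b_count,
                    unsigned diag, comp_t comp) {
  // clamp the search window so a[mid] and b[diag - 1 - mid] stay in bounds
  unsigned begin = diag > b_count ? diag - b_count : 0;
  unsigned end   = diag < a_count ? diag : a_count;
  while (begin < end) {
    unsigned mid = (begin + end) / 2;
    // lower bound: keep consuming from a while a[mid] <= b[diag - 1 - mid]
    if (!comp(b[diag - 1 - mid], a[mid])) {
      begin = mid + 1;
    } else {
      end = mid;
    }
  }
  return begin;  // number of elements taken from a along this diagonal
}

Each thread searches its own diagonal to obtain a disjoint slice of the merge; the resulting split points (mp0, mp1) are what cuda_compute_merge_range packages into a cudaMergeRange.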
template<cudaMergeBoundType bounds, typename P, typename a_keys_it, typename b_keys_it, typename comp_t>
void tf::detail::cuda_merge_path_partitions(P&& p, a_keys_it a, unsigned a_count, b_keys_it b, unsigned b_count, unsigned spacing, comp_t comp, unsigned* buf)

template<typename P, typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename comp_t>
void tf::detail::cuda_merge_loop(P&& p, a_keys_it a_keys, a_vals_it a_vals, unsigned a_count, b_keys_it b_keys, b_vals_it b_vals, unsigned b_count, c_keys_it c_keys, c_vals_it c_vals, comp_t comp, void* ptr)

constexpr int tf::detail::cuda_clz(int x)
Counts the number of leading zeros starting from the most significant bit.

constexpr int tf::detail::cuda_find_log2(int x, bool round_up = false)
Finds log2(x) and optionally rounds up to the next integer logarithm.

template<typename T, unsigned vt, typename C>
__device__ auto tf::detail::cuda_odd_even_sort(cudaArray<T, vt> x, C comp, int flags = 0)

template<typename K, typename V, unsigned vt, typename C>
__device__ auto tf::detail::cuda_odd_even_sort(cudaKVArray<K, V, vt> x, C comp, int flags = 0)

__device__ int tf::detail::cuda_out_of_range_flags(int first, int vt, int count)

__device__ auto tf::detail::cuda_compute_merge_sort_frame(unsigned partition, unsigned coop, unsigned spacing)

__device__ auto tf::detail::cuda_compute_merge_sort_range(unsigned count, unsigned partition, unsigned coop, unsigned spacing)

__device__ auto tf::detail::cuda_compute_merge_sort_range(unsigned count, unsigned partition, unsigned coop, unsigned spacing, unsigned mp0, unsigned mp1)

template<typename P, typename K, typename C>
void tf::detail::cuda_merge_sort_partitions(P&& p, K keys, unsigned count, unsigned coop, unsigned spacing, C comp, unsigned* buf)

template<typename P, typename K_it, typename V_it, typename C>
void tf::detail::merge_sort_loop(P&& p, K_it keys_input, V_it vals_input, unsigned count, C comp, void* buf)

template<typename P, typename I, typename U>
void tf::detail::cuda_find_if_loop(P&& p, I input, unsigned count, unsigned* idx, U pred)

template<typename P, typename I, typename O>
void tf::detail::cuda_min_element_loop(P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr)

template<typename P, typename I, typename O>
void tf::detail::cuda_max_element_loop(P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr)

diff --git a/docs/xml/namespacetf_1_1pt.xml b/docs/xml/namespacetf_1_1pt.xml new file mode 100644 index 000000000..7e41f8ee8 --- /dev/null +++ b/docs/xml/namespacetf_1_1pt.xml @@ -0,0 +1,28 @@
+tf::pt
+thread_local Worker* tf::pt::this_worker {nullptr}

diff --git a/docs/xml/observer_8hpp.xml b/docs/xml/observer_8hpp.xml index 6402ecec0..710fdb33f 100644 --- a/docs/xml/observer_8hpp.xml +++ b/docs/xml/observer_8hpp.xml @@ -1,7 +1,278 @@
observer.hpp
+task.hpp
+worker.hpp
+taskflow/core/executor.hpp
tf::Segment tf::Timeline tf::ProfileData @@ -20,6 +291,6 @@

diff --git a/docs/xml/opentimer.xml b/docs/xml/opentimer.xml index 18ec8628d..4cc2cdbbf 100644 --- a/docs/xml/opentimer.xml +++ b/docs/xml/opentimer.xml @@ -1,5 +1,5 @@
opentimer @@ -7,38 +7,36 @@
Contents: OpenTimer: A High-performance Timing Analysis Tool (opentimer_1UseCasesOpenTimer), Programming Effort (opentimer_1UseCaseOpenTimerProgrammingEffort), Performance Improvement (opentimer_1UseCaseOpenTimerPerformanceImprovement), Conclusion (opentimer_1UseCaseOpenTimerConclusion), References (opentimer_1UseCaseOpenTimerReferences)
We have applied Taskflow to solve a real-world VLSI static timing analysis problem that incorporates hundreds of millions of tasks and dependencies. The goal is to analyze the timing behavior of a design.
OpenTimer: A High-performance Timing Analysis Tool
Static timing analysis (STA) is an important step in the overall chip design flow. It verifies the static behavior of a circuit design and ensures its correct functionality under the given clock speed. However, efficient parallel timing analysis is extremely challenging to design and implement, due to large irregularity and graph-oriented computing. The following figure shows an extracted timing graph from an industrial design.
We consider our research project OpenTimer, an open-source static timing analyzer that has been used in many industrial and academic projects. The first release v1 in 2015 implemented the pipeline-based levelization algorithm using the OpenMP 4.5 task dependency clause. To overcome the performance bottleneck caused by the pipeline, we rewrote the core incremental timing engine using Taskflow in the second release v2.
Programming Effort
The table below measures the software costs of the two OpenTimer versions using the Linux tool SLOCCount. In OpenTimer v2, the large number of exhaustive OpenMP dependency clauses that were used to carry out task dependencies is replaced with only a few lines of flexible Taskflow code (9123 vs 4482). The maximum cyclomatic complexity in a single function is reduced from 58 to 20, due to Taskflow's programmability.
    Tool Task Model @@ -63,12 +61,11 @@
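To make the contrast concrete, here is a minimal sketch, not OpenTimer's actual code, of how a timing-graph dependency reads in Taskflow; the pin names and propagation bodies are hypothetical placeholders.

// A minimal sketch (not OpenTimer's actual code): pins A and B feed gate Y,
// so the task graph mirrors the timing graph directly. The propagation
// bodies below are hypothetical placeholders.
#include <cstdio>
#include <taskflow/taskflow.hpp>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  tf::Task A = taskflow.emplace([](){ std::printf("fprop pin A\n"); });
  tf::Task B = taskflow.emplace([](){ std::printf("fprop pin B\n"); });
  tf::Task Y = taskflow.emplace([](){ std::printf("fprop gate Y\n"); });

  // Y waits for A and B; no OpenMP depend clauses or pipeline levelization needed
  Y.succeed(A, B);

  executor.run(taskflow).wait();
  return 0;
}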
OpenTimer v1 relied on a pipeline data structure to adopt loop parallelism with OpenMP. We found it very difficult to go beyond this paradigm because of the insufficient support for dynamic dependencies in OpenMP. With Taskflow in place, we can break this bottleneck and easily model both static and dynamic task dependencies at programming time and runtime. The task dependency graph flows computations naturally with the timing graph, providing improved asynchrony and performance. The following figure shows a task graph that carries out one iteration of the timing update.
Performance Improvement
We compare the performance between OpenTimer v1 and v2. We evaluated the runtime versus incremental iterations under 16 CPUs on two industrial circuit designs, tv80 (5.3K gates and 5.3K nets) and vga_lcd (139.5K gates and 139.6K nets), with 45nm NanGate cell libraries. Each incremental iteration refers to a design modification followed by a timing query to trigger a timing update. In v1, this includes the time to reconstruct the data structure required by OpenMP to alter the task dependencies. In v2, this includes the time to create and launch a new task dependency graph.
The scalability of Taskflow is shown in the figure below. We used two million-scale designs, netcard (1.4M gates) and leon3mp (1.2M gates), to evaluate the runtime of v1 and v2 across different numbers of CPUs. There are two important observations. First, v2 is slightly slower than v1 at one CPU (3-4%), where all OpenMP's constructs are literally disabled. This shows the graph overhead of Taskflow; yet it is negligible. Second, v2 is consistently faster than v1 regardless of the number of CPUs, except at one CPU. This highlights that Taskflow's programming model largely improves the design of a parallel VLSI timing engine that would otherwise not be possible with OpenMP. @@ -76,12 +73,10 @@
Conclusion
Programming models matter. Different models give different implementations. The parallel code sections may run fast, yet the data structures to support a parallel decomposition strategy may outweigh its parallelism benefits. In OpenTimer v1, the loop-based OpenMP code is very fast. But it is too costly to maintain the pipeline data structure over iterations.
References
Tsung-Wei Huang, Guannan Guo, Chun-Xun Lin, and Martin Wong, "OpenTimer v2: A New Parallel Incremental Timing Analysis Engine," IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems (TCAD), vol. 40, no. 4, pp. 776-786, April 2021.
Tsung-Wei Huang, Chun-Xun Lin, Guannan Guo, and Martin Wong, "Cpp-Taskflow: Fast Task-based Parallel Programming using Modern C++," IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 974-983, Rio de Janeiro, Brazil, 2019. @@ -92,6 +87,6 @@
    - +
    diff --git a/docs/xml/opentimer_8dox.xml b/docs/xml/opentimer_8dox.xml index 02435f2ad..2a274d8d2 100644 --- a/docs/xml/opentimer_8dox.xml +++ b/docs/xml/opentimer_8dox.xml @@ -1,5 +1,5 @@ - + opentimer.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/os_8hpp.xml b/docs/xml/os_8hpp.xml new file mode 100644 index 000000000..3f6044241 --- /dev/null +++ b/docs/xml/os_8hpp.xml @@ -0,0 +1,291 @@ + + + + os.hpp + cstdlib + cstdio + string + thread + taskflow/core/graph.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::CachelineAligned + tf + + + TF_OS_LINUX + 0 + + + + + + + + + + TF_OS_DRAGONFLY + 0 + + + + + + + + + + TF_OS_FREEBSD + 0 + + + + + + + + + + TF_OS_NETBSD + 0 + + + + + + + + + + TF_OS_OPENBSD + 0 + + + + + + + + + + TF_OS_DARWIN + 0 + + + + + + + + + + TF_OS_WINDOWS + 0 + + + + + + + + + + TF_OS_CNK + 0 + + + + + + + + + + TF_OS_HURD + 0 + + + + + + + + + + TF_OS_SOLARIS + 0 + + + + + + + + + + TF_OS_UNIX + 0 + + + + + + + + + + TF_OS_UNKNOWN + 1 + + + + + + + + + + TF_CACHELINE_SIZE + 64 + + + + + + + + + + + + + + + + diff --git a/docs/xml/partitioner_8dox.xml b/docs/xml/partitioner_8dox.xml index 018d2ccf0..57e061197 100644 --- a/docs/xml/partitioner_8dox.xml +++ b/docs/xml/partitioner_8dox.xml @@ -1,5 +1,5 @@ - + partitioner.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/partitioner_8hpp.xml b/docs/xml/partitioner_8hpp.xml index 6858d5a52..a24b94d9e 100644 --- a/docs/xml/partitioner_8hpp.xml +++ b/docs/xml/partitioner_8hpp.xml @@ -1,8 +1,83 @@ - + partitioner.hpp - tf::DefaultClosureWrapper + taskflow/core/flow_builder.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::DefaultClosureWrapper tf::IsPartitioner tf::PartitionerBase tf::GuidedPartitioner @@ -15,6 +90,6 @@ - + diff --git a/docs/xml/pipeline_8dox.xml b/docs/xml/pipeline_8dox.xml index 25b06781f..22391a2be 100644 --- a/docs/xml/pipeline_8dox.xml +++ b/docs/xml/pipeline_8dox.xml @@ -1,5 +1,5 @@ - + pipeline.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/pipeline_8hpp.xml b/docs/xml/pipeline_8hpp.xml index f7c332a36..1c36de0e8 100644 --- a/docs/xml/pipeline_8hpp.xml +++ b/docs/xml/pipeline_8hpp.xml @@ -1,7 +1,296 @@ - + pipeline.hpp + ../taskflow.hpp + taskflow/algorithm/data_pipeline.hpp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + tf::DeferredPipeflow tf::Pipeflow tf::Pipe @@ -16,6 +305,6 @@ - + diff --git a/docs/xml/pipeline__with__token__dependencies_8dox.xml b/docs/xml/pipeline__with__token__dependencies_8dox.xml index 7ef7362d5..5e32b3c2d 100644 --- a/docs/xml/pipeline__with__token__dependencies_8dox.xml +++ b/docs/xml/pipeline__with__token__dependencies_8dox.xml @@ -1,5 +1,5 @@ - + pipeline_with_token_dependencies.dox tf @@ -7,6 +7,6 @@ - + diff --git 
a/docs/xml/prioritized__tasking_8dox.xml b/docs/xml/prioritized__tasking_8dox.xml deleted file mode 100644 index 7144f8997..000000000 --- a/docs/xml/prioritized__tasking_8dox.xml +++ /dev/null @@ -1,12 +0,0 @@
-prioritized_tasking.dox
-tf

diff --git a/docs/xml/profiler_8dox.xml b/docs/xml/profiler_8dox.xml index c39d46c2e..abac3f559 100644 --- a/docs/xml/profiler_8dox.xml +++ b/docs/xml/profiler_8dox.xml @@ -1,5 +1,5 @@
profiler.dox
tf @@ -7,6 +7,6 @@

diff --git a/docs/xml/reduce_8dox.xml b/docs/xml/reduce_8dox.xml index e73df4678..e41e84fc8 100644 --- a/docs/xml/reduce_8dox.xml +++ b/docs/xml/reduce_8dox.xml @@ -1,5 +1,5 @@
reduce.dox
tf @@ -7,6 +7,6 @@

diff --git a/docs/xml/reduce_8hpp.xml b/docs/xml/reduce_8hpp.xml deleted file mode 100644 index 4b51d9c9f..000000000 --- a/docs/xml/reduce_8hpp.xml +++ /dev/null @@ -1,16 +0,0 @@
-reduce.hpp
-tf::detail::cudaBlockReduce
-tf::detail::cudaBlockReduce::Storage
-tf
-tf::detail
-CUDA reduce algorithms include file

diff --git a/docs/xml/references_8dox.xml b/docs/xml/references_8dox.xml index b8ed0cd0e..dc852024d 100644 --- a/docs/xml/references_8dox.xml +++ b/docs/xml/references_8dox.xml @@ -1,5 +1,5 @@
references.dox
tf @@ -7,6 +7,6 @@

diff --git a/docs/xml/release-1-x-x.xml b/docs/xml/release-1-x-x.xml index 81c7a3a73..d0fae98d9 100644 --- a/docs/xml/release-1-x-x.xml +++ b/docs/xml/release-1-x-x.xml @@ -1,5 +1,5 @@
release-1-x-x @@ -9,6 +9,6 @@
Prior to being open-sourced in 2018, Cpp-Taskflow was internal to the OpenTimer project sponsored by the NSF and DARPA. Later, we decided to share our knowledge of parallelizing large-scale applications by making Cpp-Taskflow a standalone open-source project that can benefit generic C++ developers. Due to the different license agreement imposed by OpenTimer at an earlier stage (i.e., user and funding-agency requirements), we are unable to open the source of Cpp-Taskflow in the 1.x line. Starting from 2.x, we have switched to the MIT license and made the source completely open and transparent to the community.

diff --git a/docs/xml/release-1_8x_8x_8dox.xml b/docs/xml/release-1_8x_8x_8dox.xml index 054690ee7..9b61c4bbf 100644 --- a/docs/xml/release-1_8x_8x_8dox.xml +++ b/docs/xml/release-1_8x_8x_8dox.xml @@ -1,5 +1,5 @@
release-1.x.x.dox
tf @@ -7,6 +7,6 @@

diff --git a/docs/xml/release-2-0-0.xml b/docs/xml/release-2-0-0.xml index 4c27434e8..fd4ba51c3 100644 --- a/docs/xml/release-2-0-0.xml +++ b/docs/xml/release-2-0-0.xml @@ -1,5 +1,5 @@
release-2-0-0 @@ -7,35 +7,33 @@
Contents: Download (release-2-0-0_1release-2-0-0_download), New Features (release-2-0-0_1release-2-0-0_new_features), Improvements and Enhancements (release-2-0-0_1release-2-0-0_improvements_enhancements), Breaks and Deprecated Features (release-2-0-0_1release-2-0-0_breaks_and_deprecated_features), Bug Fixes (release-2-0-0_1release-2-0-0_bug_fixes)
Cpp-Taskflow 2.0.0 is the first release in the 2.x line! This release includes several new changes such as dynamic tasking, an executor, a thread pool, etc. In addition, this release improved usability, stability, and performance.
Download
Cpp-Taskflow 2.0.0 can be downloaded from here.
New Features
New dynamic tasking capability through tf::SubflowBuilder to spawn tasks at runtime.
New algorithm collections (tf::FlowBuilder::reduce, tf::FlowBuilder::transform_reduce) added to tf::FlowBuilder. @@ -46,8 +44,7 @@
Improvements and Enhancements
Improved the performance of speculative
Removed the target node from a topology. @@ -62,14 +59,12 @@
Breaks and Deprecated Features
There are no breaks and deprecated features in this release.
Bug Fixes
There are no major bug fixes in this release.

diff --git a/docs/xml/release-2-1-0.xml b/docs/xml/release-2-1-0.xml index 37fcf48bc..35ddaac00 100644 --- a/docs/xml/release-2-1-0.xml +++ b/docs/xml/release-2-1-0.xml @@ -1,5 +1,5 @@
release-2-1-0 @@ -7,35 +7,33 @@
Contents: Download (release-2-1-0_1release-2-1-0_download), New Features (release-2-1-0_1release-2-1-0_new_features), Improvements and Enhancements (release-2-1-0_1release-2-1-0_improvements_enhancements), Breaks and Deprecated Features (release-2-1-0_1release-2-1-0_breaks_and_deprecated_features), Bug Fixes (release-2-1-0_1release-2-1-0_bug_fixes)
Cpp-Taskflow 2.1.0 is the second release in the 2.x line! This release includes several new changes such as tf::Framework, tf::WorkStealingThreadpool, tf::SpeculativeThreadpool, allocators, benchmarks, and so forth. In addition, this release improved usability, stability, and performance.
Download
Cpp-Taskflow 2.1.0 can be downloaded from here.
New Features
A new reusable task dependency graph, tf::Framework
New API (tf::Taskflow::run, tf::Taskflow::run_n, tf::Taskflow::run_until) to execute a framework @@ -44,11 +42,10 @@
Improvements and Enhancements
Improved the performance of tf::WorkStealingThreadpool (non-blocking notifier, work-stealing strategy, etc.)
Changed the data structure that stores nodes and topologies to std::list
Added a memory pool and allocator to manage the memory allocation of nodes and topologies @@ -66,18 +63,16 @@
Breaks and Deprecated Features
tf::Taskflow::emplace is now merged with tf::Taskflow::silent_emplace (both are the same) and no longer returns std::future, in order to support tf::Framework
Bug Fixes
There are no major bug fixes in this release.

diff --git a/docs/xml/release-2-2-0.xml b/docs/xml/release-2-2-0.xml index 00477ee4f..082fd412b 100644 --- a/docs/xml/release-2-2-0.xml +++ b/docs/xml/release-2-2-0.xml @@ -1,5 +1,5 @@
release-2-2-0 @@ -7,27 +7,25 @@
Contents: Download (release-2-2-0_1release-2-2-0_download), New Features (release-2-2-0_1release-2-2-0_new_features), Breaks and Deprecated Features (release-2-2-0_1release-2-2-0_breaks_and_deprecated_features)
Cpp-Taskflow 2.2.0 is the 3rd release in the 2.x line! This release includes several new changes such as tf::ExecutorObserverInterface, tf::Executor, isolation of the taskflow graph from the executor, benchmarks, and so forth. In particular, this release improves the performance of the work-stealing scheduler.
Download
Cpp-Taskflow 2.2.0 can be downloaded from here.
New Features
A new executor class to isolate the execution module from a taskflow
A new observer interface to inspect the activities of an executor @@ -40,24 +38,23 @@
Breaks and Deprecated Features
In this release, we isolated the executor interface from tf::Taskflow and merged tf::Framework with tf::Taskflow. This change largely improved the modularity and composability of Cpp-Taskflow in creating clean task dependency graphs and execution flows. Performance is also better. While this introduced some breaks in tf::Taskflow, we have managed to make the change as painless as possible for users to adapt to.
Previously, tf::Taskflow was a hero class that managed both a task dependency graph and the execution of all graphs, including frameworks. For example:

// before v2.2.0, tf::Taskflow manages both graph and execution
tf::Taskflow taskflow(4);  // create a taskflow object with 4 threads
taskflow.emplace([](){ std::cout << "task A\n"; });
taskflow.wait_for_all();   // dispatch the present graph

tf::Framework framework;   // create a framework object
framework.emplace([](){ std::cout << "task B\n"; });
taskflow.run(framework);   // run the framework once
taskflow.wait_for_all();   // wait until the framework finishes

However, this design is awkward in many aspects. For instance, calling wait_for_all dispatches the present graph, and the graph vanishes when the execution completes. To reuse a graph, users have to create another special graph called a framework and mix its execution with the one in a taskflow object. Given the user feedback and lessons we have learned so far, we decided to isolate the executor interface out of tf::Taskflow and merge tf::Framework with tf::Taskflow. All execution methods such as dispatch and wait_for_all have been moved from tf::Taskflow to tf::Executor.

// starting from v2.2.0, tf::Executor manages the execution of graphs
tf::Taskflow taskflow;     // create a taskflow to build dependent tasks
tf::Task A = taskflow.emplace([](){ std::cout << "task A\n"; });
tf::Task B = taskflow.emplace([](){ std::cout << "task B\n"; });
A.precede(B);

tf::Executor executor(4);  // create an executor of 4 threads
@@ -69,6 +66,6 @@
Again, we apologize for this breaking change! We hope you understand that what we did makes Cpp-Taskflow provide better performance scaling and user experience.

diff --git a/docs/xml/release-2-3-0.xml b/docs/xml/release-2-3-0.xml index a57c6744d..d61aca249 100644 --- a/docs/xml/release-2-3-0.xml +++ b/docs/xml/release-2-3-0.xml @@ -1,5 +1,5 @@
release-2-3-0 @@ -7,31 +7,29 @@
Contents: Download (release-2-3-0_1release-2-3-0_download), New Features (release-2-3-0_1release-2-3-0_new_features), Bug Fixes (release-2-3-0_1release-2-3-0_bug_fixes), Deprecated Items (release-2-3-0_1release-2-3-0_deprecated_items)
Cpp-Taskflow 2.3.0 is the 4th release in the 2.x line!
This release includes several new changes such as conditional tasking, modified scheduling flows, benchmarks, documentation, and so forth.
Download
Cpp-Taskflow 2.3.0 can be downloaded from here.
New Features
Added full C++14/17/20 support
Added a thread-safe object pool motivated by the Hoard memory allocator @@ -40,7 +38,7 @@
Added tf::Task::has_work to detect if a task is a placeholder
Added tf::Task::for_each_successor, tf::Task::for_each_dependent, tf::Taskflow::for_each_task, tf::TaskView::for_each_successor, and tf::TaskView::for_each_dependent to support graph traversal
Modified the task scheduling flow @@ -56,8 +54,7 @@
Bug Fixes
Fixed the stack-overflow problem in zero-worker execution
Fixed the missing comma in output execution timelines from an executor @@ -68,13 +65,12 @@
Deprecated Items
Removed zero-worker-thread support in execution
Removed the gather method in the task handle
Removed std::vector and std::initializer_list support in tasks' precede/succeed methods
Removed the taskflow::silent_emplace method @@ -82,6 +78,6 @@

diff --git a/docs/xml/release-2-3-1.xml b/docs/xml/release-2-3-1.xml index ea88e8a6d..286daba25 100644 --- a/docs/xml/release-2-3-1.xml +++ b/docs/xml/release-2-3-1.xml @@ -1,5 +1,5 @@
release-2-3-1 @@ -8,18 +8,16 @@
Cpp-Taskflow 2.3.1 is the 5th release in the 2.x line!
Download
Cpp-Taskflow 2.3.1 can be downloaded from here.
Bug Fixes
Fixed the memory error in the object pool

diff --git a/docs/xml/release-2-4-0.xml b/docs/xml/release-2-4-0.xml index 49e75accc..d04261ce1 100644 --- a/docs/xml/release-2-4-0.xml +++ b/docs/xml/release-2-4-0.xml @@ -1,5 +1,5 @@
release-2-4-0 @@ -7,36 +7,34 @@
Contents: Download (release-2-4-0_1release-2-4-0_download), New Features (release-2-4-0_1release-2-4-0_new_features), Bug Fixes (release-2-4-0_1release-2-4-0_bug_fixes), Miscellaneous Items (release-2-4-0_1release-2-4-0_miscellaneous_items)
Cpp-Taskflow 2.4.0 is the 6th release in the 2.x line! This release includes several new changes such as CPU-GPU tasking, an improved scheduling flow, documentation, and unit tests.
Download
-Codestin Search App - -added tf::cudaFlow for concurrent CPU-GPU tasking (see GPU Tasking (cudaFlow)) +Codestin Search App +added tf::cudaFlow for concurrent CPU-GPU tasking added a new method tf::Executor::num_topologies to query the number of running taskflows in an executor -added std::hash support for tf::Task +added std::hash support for tf::Task added a new work-stealing algorithm capable of general heterogeneous domains @@ -46,8 +44,7 @@ -Codestin Search App - +Codestin Search App fixed the bug in nested execution (#152) fixed the nameless union/struct extension warning in MS environment (#153) @@ -58,14 +55,13 @@ -Codestin Search App - +Codestin Search App reflected the showcase presentation on CPU-GPU tasking - + diff --git a/docs/xml/release-2-5-0.xml b/docs/xml/release-2-5-0.xml index f0c2b2e92..647f80d2f 100644 --- a/docs/xml/release-2-5-0.xml +++ b/docs/xml/release-2-5-0.xml @@ -1,5 +1,5 @@ - + release-2-5-0 Codestin Search App @@ -7,19 +7,19 @@ Download release-2-5-0_1release-2-5-0_download - + New Features release-2-5-0_1release-2-5-0_new_features - + Bug Fixes release-2-5-0_1release-2-5-0_bug_fixes - + Miscellaneous Items release-2-5-0_1release-2-5-0_miscellaneous_items - + @@ -27,18 +27,16 @@ Starting from v2.5.0, we have renamed Cpp-Taskflow to Taskflow to broaden its impact and support. Taskflow will explore multiple scopes of applications and language bindings, rather than just C++. This also made Taskflow naming more succinct and concise. Taskflow 2.5.0 is the 7th release in the 2.x line! This release includes several new changes such as CPU-GPU tasking, web-based profiler, documentation, and unit tests. -Codestin Search App -Taskflow 2.5.0 can be downloaded from here. +Codestin Search AppTaskflow 2.5.0 can be downloaded from here. To download the newest version of Taskflow, please clone from Taskflow's GitHub. 
New Features
enhanced the performance of the work-stealing algorithm
enhanced the interface of concurrent CPU-GPU tasking (added tf::cudaFlow::zero, tf::cudaFlow::memset, tf::cudaFlow::memcpy, tf::cudaFlow::fill)
enhanced unit tests for tf::cudaFlow
added a per-thread stream to avoid synchronizing with the default stream in running a cudaFlow @@ -46,7 +44,7 @@
added Learning from Examples pages
made observer a std::shared_ptr object
enabled multiple observers to coexist in an executor @@ -57,8 +55,7 @@
Bug Fixes
fixed the bug in assigning the block pointer before the constructor of an object in the object pool
fixed the namespace conflict in using MPark.Variant from upstream code @@ -67,8 +64,7 @@
Miscellaneous Items
fixed the unsigned/size_t conversion warning in tf::Executor
submitted the technical paper to arXiv @@ -77,6 +73,6 @@

diff --git a/docs/xml/release-2-6-0.xml b/docs/xml/release-2-6-0.xml index 03fd01457..029b50d8a 100644 --- a/docs/xml/release-2-6-0.xml +++ b/docs/xml/release-2-6-0.xml @@ -1,5 +1,5 @@
release-2-6-0 @@ -7,23 +7,23 @@
Contents: Download (release-2-6-0_1release-2-6-0_download), New Features (release-2-6-0_1release-2-6-0_new_features), Bug Fixes (release-2-6-0_1release-2-6-0_bug_fixes), Deprecated Items (release-2-6-0_1release-2-6-0_deprecated_items), Miscellaneous Items (release-2-6-0_1release-2-6-0_miscellaneous_items) @@ -31,21 +31,19 @@
Taskflow 2.6.0 is the 8th release in the 2.x line! This release includes several new changes such as CPU-GPU tasking, an algorithm collection, an enhanced web-based profiler, documentation, and unit tests. We have a new webpage for Taskflow!
Download
Taskflow 2.6.0 can be downloaded from here.
New Features
added explicit join behavior of tf::Subflow (see Join a Subflow Explicitly and Fibonacci Number)
added version macros (TF_VERSION, TF_MAJOR_VERSION, TF_MINOR_VERSION, TF_PATCH_VERSION) to retrieve version info programmatically (tf::version)
added TF_BUILD_TESTS and TF_BUILD_EXAMPLES (default on) to let users disable the build of tests and examples (see Building and Installing)
renamed tf::Taskflow::parallel_for to tf::Taskflow::for_each to follow the STL convention
redesigned tf::Taskflow::for_each and tf::Taskflow::for_each_index using OpenMP-styled scheduling algorithms; this redesign largely improved the performance of parallel-for using a single dynamic task return, but it breaks the previous API that returned a std::pair of tasks to synchronize on a set of static parallel-for tasks. Yet, we believe adopting this change is not difficult (see Parallel Iterations).
+redesigned tf::Taskflow::for_each and tf::Taskflow::for_each_index using OpenMP-styled scheduling algorithms; this redesign largely improved the performance of parallel-for using a single dynamic task return, but it breaks the previous API that returned a std::pair of tasks to synchronize on a set of static parallel-for tasks. Yet, we believe adopting this change is not difficult (see Parallel Iterations). added multiple unit tests for tf::Taskflow::for_each and tf::Taskflow::for_each_index at different partition algorithms; we have implemented our partition algorithms based on the OpenMP library implementation of LLVM and GCC. @@ -57,9 +55,8 @@ -Codestin Search App - -fixed the bug of iteratively detaching a subflow from a run loop or a condition loop (see Detach a Subflow) +Codestin Search App +fixed the bug of iteratively detaching a subflow from a run loop or a condition loop fixed the bug of conflict macro with boost (#184) @@ -67,22 +64,20 @@ -Codestin Search App - +Codestin Search App removed two methods, tf::detached and tf::joined, due to the new join/detach behavior -Codestin Search App - +Codestin Search App improved the section Observe Thread Activities - + diff --git a/docs/xml/release-2-7-0.xml b/docs/xml/release-2-7-0.xml index 4f3e1733f..7da0a1a6e 100644 --- a/docs/xml/release-2-7-0.xml +++ b/docs/xml/release-2-7-0.xml @@ -1,5 +1,5 @@ - + release-2-7-0 Codestin Search App @@ -7,40 +7,38 @@ Download release-2-7-0_1release-2-7-0_download - + New Features release-2-7-0_1release-2-7-0_new_features - + Bug Fixes release-2-7-0_1release-2-7-0_bug_fixes - + Deprecated Items release-2-7-0_1release-2-7-0_deprecated_items - + Miscellaneous Items release-2-7-0_1release-2-7-0_miscellaneous_items - + Taskflow 2.7.0 is the 9th release in the 2.x line! This release includes several new changes such as CPU-GPU tasking, algorithm collection, enhanced web-based profiler, documentation, and unit tests. -Codestin Search App -Taskflow 2.7.0 can be downloaded from here. +Codestin Search AppTaskflow 2.7.0 can be downloaded from here. -Codestin Search App - +Codestin Search App added tf::Executor::async to support asynchronously calling a function (see Asynchronous Tasking) -added kernel algorithm, tf::cudaFlow::for_each +added kernel algorithm, tf::cudaFlow::for_each -added kernel algorithm, tf::cudaFlow::for_each_index +added kernel algorithm, tf::cudaFlow::for_each_index added explicit join method at tf::cudaFlow::join, tf::cudaFlow::join_n, tf::cudaFlow::join_until @@ -48,12 +46,10 @@ -Codestin Search App -There are no bug fixes in this release. +Codestin Search AppThere are no bug fixes in this release. 
-Codestin Search App - +Codestin Search App removed redundant methods, tf::Taskflow::broadcast, tf::Taskflow::precede, tf::Taskflow::succeed removed tf::cudaFlow::predicate (replaced with tf::cudaFlow::join_until) @@ -64,8 +60,7 @@ -Codestin Search App - +Codestin Search App added Contributing added Governance @@ -80,6 +75,6 @@ - + diff --git a/docs/xml/release-2_80_80_8dox.xml b/docs/xml/release-2_80_80_8dox.xml index a0bb7b95c..026d1127a 100644 --- a/docs/xml/release-2_80_80_8dox.xml +++ b/docs/xml/release-2_80_80_8dox.xml @@ -1,5 +1,5 @@ - + release-2.0.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-2_81_80_8dox.xml b/docs/xml/release-2_81_80_8dox.xml index 4c1c1390a..a692db7b8 100644 --- a/docs/xml/release-2_81_80_8dox.xml +++ b/docs/xml/release-2_81_80_8dox.xml @@ -1,5 +1,5 @@ - + release-2.1.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-2_82_80_8dox.xml b/docs/xml/release-2_82_80_8dox.xml index ed1fb328d..97720fc0b 100644 --- a/docs/xml/release-2_82_80_8dox.xml +++ b/docs/xml/release-2_82_80_8dox.xml @@ -1,5 +1,5 @@ - + release-2.2.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-2_83_80_8dox.xml b/docs/xml/release-2_83_80_8dox.xml index d5101e832..895011621 100644 --- a/docs/xml/release-2_83_80_8dox.xml +++ b/docs/xml/release-2_83_80_8dox.xml @@ -1,5 +1,5 @@ - + release-2.3.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-2_83_81_8dox.xml b/docs/xml/release-2_83_81_8dox.xml index fb068dcb0..c7e3dd67c 100644 --- a/docs/xml/release-2_83_81_8dox.xml +++ b/docs/xml/release-2_83_81_8dox.xml @@ -1,5 +1,5 @@ - + release-2.3.1.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-2_84_80_8dox.xml b/docs/xml/release-2_84_80_8dox.xml index 9f1ac6d9d..f59443446 100644 --- a/docs/xml/release-2_84_80_8dox.xml +++ b/docs/xml/release-2_84_80_8dox.xml @@ -1,5 +1,5 @@ - + release-2.4.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-2_85_80_8dox.xml b/docs/xml/release-2_85_80_8dox.xml index b5315ca75..9c5f15877 100644 --- a/docs/xml/release-2_85_80_8dox.xml +++ b/docs/xml/release-2_85_80_8dox.xml @@ -1,5 +1,5 @@ - + release-2.5.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-2_86_80_8dox.xml b/docs/xml/release-2_86_80_8dox.xml index 4769c3be9..7e2022462 100644 --- a/docs/xml/release-2_86_80_8dox.xml +++ b/docs/xml/release-2_86_80_8dox.xml @@ -1,5 +1,5 @@ - + release-2.6.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-2_87_80_8dox.xml b/docs/xml/release-2_87_80_8dox.xml index 3cd1bb759..1c4a42502 100644 --- a/docs/xml/release-2_87_80_8dox.xml +++ b/docs/xml/release-2_87_80_8dox.xml @@ -1,5 +1,5 @@ - + release-2.7.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-3-0-0.xml b/docs/xml/release-3-0-0.xml index 6ffb00333..223b22069 100644 --- a/docs/xml/release-3-0-0.xml +++ b/docs/xml/release-3-0-0.xml @@ -1,5 +1,5 @@ - + release-3-0-0 Codestin Search App @@ -7,86 +7,84 @@ Download release-3-0-0_1release-3-0-0_download - + System Requirements release-3-0-0_1release-3-0-0_system_requirements - + Working Items release-3-0-0_1release-3-0-0_working_items - + New Features release-3-0-0_1release-3-0-0_new_features - - - Taskflow Core - release-3-0-0_1release-3-0-0_taskflow_core - - - cudaFlow - release-3-0-0_1release-3-0-0_cudaflow - - - Utilities - release-3-0-0_1release-3-0-0_utilities - - - Taskflow Profiler (TFProf) - release-3-0-0_1release-3-0-0_profiler - - - + + + Taskflow Core + release-3-0-0_1release-3-0-0_taskflow_core + + + cudaFlow + release-3-0-0_1release-3-0-0_cudaflow + + + Utilities + 
release-3-0-0_1release-3-0-0_utilities + + + Taskflow Profiler (TFProf) + release-3-0-0_1release-3-0-0_profiler + + + New Algorithms release-3-0-0_1release-3-0-0_new_algorithms - - - CPU Algorithms - release-3-0-0_1release-3-0-0_cpu_algorithms - - - GPU Algorithms - release-3-0-0_1release-3-0-0_gpu_algorithms - - - + + + CPU Algorithms + release-3-0-0_1release-3-0-0_cpu_algorithms + + + GPU Algorithms + release-3-0-0_1release-3-0-0_gpu_algorithms + + + Bug Fixes release-3-0-0_1release-3-0-0_bug_fixes - + Breaking Changes release-3-0-0_1release-3-0-0_breaking_changes - + Deprecated and Removed Items release-3-0-0_1release-3-0-0_deprecated_items - + Documentation release-3-0-0_1release-3-0-0_documentation - + Miscellaneous Items release-3-0-0_1release-3-0-0_miscellaneous_items - + Taskflow 3.0.0 is the 1st release in the 3.x line! This release includes several new changes such as CPU-GPU tasking, algorithm collection, enhanced web-based profiler, documentation, and unit tests. -Starting from v3, we have migrated the codebase to the C++17 standard to largely improve the expressivity and efficiency of the codebase. +Starting from v3, we have migrated the codebase to the C++17 standard to largely improve the expressivity and efficiency of the codebase. -Codestin Search App -Taskflow 3.0.0 can be downloaded from here. +Codestin Search AppTaskflow 3.0.0 can be downloaded from here. -Codestin Search App -To use Taskflow v3.0.0, you need a compiler that supports C++17: +Codestin Search AppTo use Taskflow v3.0.0, you need a compiler that supports C++17: GNU C++ Compiler at least v7.0 with -std=c++17 @@ -104,11 +102,10 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - +Codestin Search App enhancing the taskflow profiler (TFProf) -adding methods for updating tf::cudaFlow (with unit tests) +adding methods for updating tf::cudaFlow (with unit tests) adding support for cuBLAS @@ -120,10 +117,8 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - -Codestin Search App - +Codestin Search App +Codestin Search App replaced all non-standard libraries with C++17 STL (e.g., std::optional, std::variant) added tf::WorkerView for users to observe the running works of tasks @@ -132,11 +127,11 @@ Taskflow works on Linux, Windows, and Mac OS X. modified tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit to take tf::WorkerView -added a custom graph interface to support dynamic polymorphism for tf::cudaGraph +added a custom graph interface to support dynamic polymorphism for tf::cudaGraph supported separate compilations between Taskflow and CUDA (see Compile Taskflow with CUDA) -added tf::Semaphore and tf::CriticalSection to limit the maximum concurrency +added tf::Semaphore and tf::CriticalSection to limit the maximum concurrency added tf::Future to support cancellation of submitted tasks (see Request Cancellation) @@ -144,17 +139,16 @@ Taskflow works on Linux, Windows, and Mac OS X. 
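The tf::Semaphore item above is easiest to see in a small sketch; the following serializes five tasks through a semaphore of value 1, using the acquire/release task interface that this release documents.

#include <cstdio>
#include <taskflow/taskflow.hpp>

int main() {
  tf::Executor executor(4);
  tf::Taskflow taskflow;

  tf::Semaphore semaphore(1);  // value 1: at most one holder at a time

  for (int i = 0; i < 5; ++i) {
    tf::Task task = taskflow.emplace([i](){ std::printf("task %d\n", i); });
    task.acquire(semaphore);   // acquire the semaphore before the task runs
    task.release(semaphore);   // release the semaphore after the task finishes
  }

  executor.run(taskflow).wait();  // the five tasks run one at a time
  return 0;
}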
-Codestin Search App - -added tf::cudaFlowCapturer for building a cudaFlow through stream capture (see GPU Tasking (cudaFlowCapturer)) +Codestin Search App +added tf::cudaFlowCapturer for building a cudaFlow through stream capture added tf::cudaFlowCapturerBase for creating custom capturers -added tf::cudaFlow::capture for capturing a cudaFlow within a parent cudaFlow +added tf::cudaFlow::capture for capturing a cudaFlow within a parent cudaFlow added tf::Taskflow::emplace_on to place a cudaFlow on a GPU -added tf::cudaFlow::dump and tf::cudaFlowCapturer::dump to visualize cudaFlow +added tf::cudaFlow::dump and tf::cudaFlowCapturer::dump to visualize cudaFlow added tf::cudaFlow::offload and update methods to run and update a cudaFlow explicitly @@ -168,8 +162,7 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - +Codestin Search App added utility functions to grab the cuda device properties (see cuda_device.hpp) added utility functions to control cuda memory (see cuda_memory.hpp) @@ -184,8 +177,7 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - +Codestin Search App added visualization for asynchronous tasks added server-based profiler to support large profiling data (see Profile Taskflow Programs) @@ -195,21 +187,18 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - -Codestin Search App - +Codestin Search App +Codestin Search App added parallel sort (see Parallel Sort) -Codestin Search App - -added single task (see Single Task) +Codestin Search App +added single task -added parallel iterations (see Parallel Iterations) +added parallel iterations added parallel transforms @@ -220,8 +209,7 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - +Codestin Search App fixed the bug in stream capturing (need to use ThreadLocal mode) fixed the bug in reporting wrong worker ids when compiling a shared library due to the use of thread_local (now with C++17 inline variable) @@ -230,16 +218,14 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - +Codestin Search App changed the returned values of asynchronous tasks to be std::optional in order to support cancellation (see Asynchronous Tasking and Request Cancellation) -Codestin Search App - +Codestin Search App removed tf::cudaFlow::device; users may call tf::Taskflow::emplace_on to associate a cudaflow with a GPU device removed tf::cudaFlow::join, use tf::cudaFlow::offload instead @@ -252,8 +238,7 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - +Codestin Search App added Compile Taskflow with CUDA added Benchmark Taskflow @@ -262,19 +247,12 @@ Taskflow works on Linux, Windows, and Mac OS X. added Asynchronous Tasking -added GPU Tasking (cudaFlowCapturer) +added GPU Tasking added Request Cancellation added Profile Taskflow Programs -added cudaFlow Algorithms -Single Task to run a kernel function in just a single thread -Parallel Iterations to perform parallel iterations over a range of items -Parallel Transforms to perform parallel transforms over a range of items - - - added Governance Rules Team @@ -296,8 +274,7 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App -We have presented Taskflow in the following C++ venues with recorded videos: +Codestin Search AppWe have presented Taskflow in the following C++ venues with recorded videos: 2020 CppCon Taskflow Talk 2020 MUC++ Taskflow Talk @@ -310,6 +287,6 @@ Taskflow works on Linux, Windows, and Mac OS X. 
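A minimal sketch of the GPU algorithms listed under GPU Algorithms above (single task, parallel iterations) inside a cudaFlow; the include path, the unified-memory setup, and the need for nvcc's --extended-lambda flag are assumptions of this sketch, not documented requirements.

#include <cuda_runtime.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cudaflow.hpp>  // assumed include path for the cudaFlow header

int main() {
  const unsigned N = 1000;
  int* data {nullptr};
  cudaMallocManaged(&data, N * sizeof(int));  // unified memory visible to the GPU

  tf::Executor executor;
  tf::Taskflow taskflow;

  taskflow.emplace([=](tf::cudaFlow& cf) {
    // single_task: run a callable with exactly one kernel thread
    tf::cudaTask head = cf.single_task([=] __device__ () { data[0] = 0; });
    // for_each: parallel iteration over [data, data + N)
    tf::cudaTask iter = cf.for_each(data, data + N,
                                    [] __device__ (int& x) { x = 1; });
    head.precede(iter);
  });

  executor.run(taskflow).wait();
  cudaFree(data);
  return 0;
}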
- + diff --git a/docs/xml/release-3-1-0.xml b/docs/xml/release-3-1-0.xml index 1e900c1f3..23bb10df4 100644 --- a/docs/xml/release-3-1-0.xml +++ b/docs/xml/release-3-1-0.xml @@ -1,5 +1,5 @@ - + release-3-1-0 Codestin Search App @@ -7,65 +7,63 @@ Download release-3-1-0_1release-3-1-0_download - + System Requirements release-3-1-0_1release-3-1-0_system_requirements - + New Features release-3-1-0_1release-3-1-0_new_features - - - Taskflow Core - release-3-1-0_1release-3-1-0_taskflow_core - - - cudaFlow - release-3-1-0_1release-3-1-0_cudaflow - - - Utilities - release-3-1-0_1release-3-1-0_utilities - - - Taskflow Profiler (TFProf) - release-3-1-0_1release-3-1-0_profiler - - - + + + Taskflow Core + release-3-1-0_1release-3-1-0_taskflow_core + + + cudaFlow + release-3-1-0_1release-3-1-0_cudaflow + + + Utilities + release-3-1-0_1release-3-1-0_utilities + + + Taskflow Profiler (TFProf) + release-3-1-0_1release-3-1-0_profiler + + + Bug Fixes release-3-1-0_1release-3-1-0_bug_fixes - + Breaking Changes release-3-1-0_1release-3-1-0_breaking_changes - + Deprecated and Removed Items release-3-1-0_1release-3-1-0_deprecated_items - + Documentation release-3-1-0_1release-3-1-0_documentation - + Miscellaneous Items release-3-1-0_1release-3-1-0_miscellaneous_items - + Taskflow 3.1.0 is the 2nd release in the 3.x line! This release includes several new changes such as CPU-GPU tasking, algorithm collection, enhanced web-based profiler, documentation, and unit tests. -Codestin Search App -Taskflow 3.1.0 can be downloaded from here. +Codestin Search AppTaskflow 3.1.0 can be downloaded from here. -Codestin Search App -To use Taskflow v3.1.0, you need a compiler that supports C++17: +Codestin Search AppTo use Taskflow v3.1.0, you need a compiler that supports C++17: GNU C++ Compiler at least v8.4 with -std=c++17 @@ -85,11 +83,9 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - -Codestin Search App - -optimized task node storage by using std::unique_ptr for semaphores +Codestin Search App +Codestin Search App +optimized task node storage by using std::unique_ptr for semaphores merged the execution flow of cudaFlow and cudaFlow capturer @@ -97,23 +93,22 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - +Codestin Search App optimized tf::cudaRoundRobinCapturing through an event-pruning heuristic optimized the default block size used in cudaFlow algorithms -added tf::cudaFlow::clear() to clean up a cudaFlow +added tf::cudaFlow::clear() to clean up a cudaFlow -added tf::cudaFlow::num_tasks() to query the task count in a cudaFlow +added tf::cudaFlow::num_tasks() to query the task count in a cudaFlow -added tf::cudaTask::num_dependents() to query the dependent count in a cudaTask +added tf::cudaTask::num_dependents() to query the dependent count in a cudaTask -added tf::cudaFlowCapturer::clear() to clean up a cudaFlow capturer +added tf::cudaFlowCapturer::clear() to clean up a cudaFlow capturer -added tf::cudaFlowCapturer::num_tasks() to query the task count in a cudaFlow capturer +added tf::cudaFlowCapturer::num_tasks() to query the task count in a cudaFlow capturer -added tf::cudaFlowCapturer rebind methods: +added tf::cudaFlowCapturer rebind methods: tf::cudaFlowCapturer::rebind_single_task tf::cudaFlowCapturer::rebind_for_each tf::cudaFlowCapturer::rebind_for_each_index @@ -123,7 +118,7 @@ Taskflow works on Linux, Windows, and Mac OS X. 
added tf::cudaFlow update methods: tf::cudaFlow::update_for_each, tf::cudaFlow::update_for_each_index, tf::cudaFlow::update_transform @@ -143,8 +138,7 @@
Utilities
resolved the compiler warning in the serializer caused by constexpr if
resolved the compiler error of nvcc when parsing a variadic namespace @@ -153,39 +147,31 @@
Taskflow Profiler (TFProf)
No update for TFProf in this release.
Bug Fixes
fixed the macro expansion issue with MSVC on TF_CUDA_CHECK
fixed the serializer compile error (#288)
fixed the tf::cudaTask::type bug in mixing host and empty task types
Breaking Changes
There are no breaking changes in this release.
Deprecated and Removed Items
There are no deprecated or removed items in this release.
Documentation
added Query the Worker ID to the cookbook page Executor
revised update methods in GPU Tasking (cudaFlow)
revised rebind methods in GPU Tasking (cudaFlowCapturer)
Miscellaneous Items
removed Circle-CI from the continuous integration
updated grok to the user list
updated RavEngine to the user list @@ -195,6 +181,6 @@

diff --git a/docs/xml/release-3-10-0.xml b/docs/xml/release-3-10-0.xml new file mode 100644 index 000000000..b41f07547 --- /dev/null +++ b/docs/xml/release-3-10-0.xml @@ -0,0 +1,244 @@
+release-3-10-0
+Contents: Release Summary (release-3-10-0_1release-3-10-0_summary), Download (release-3-10-0_1release-3-10-0_download), System Requirements (release-3-10-0_1release-3-10-0_system_requirements), New Features (release-3-10-0_1release-3-10-0_new_features: Taskflow Core, Utilities), Bug Fixes (release-3-10-0_1release-3-10-0_bug_fixes), Breaking Changes (release-3-10-0_1release-3-10-0_breaking_changes), Documentation (release-3-10-0_1release-3-10-0_documentation), Miscellaneous Items (release-3-10-0_1release-3-10-0_miscellaneous_items)
+Release Summary
+This release improves scheduling performance through optimized work-stealing threshold tuning and a constrained decentralized buffer. It also introduces index-range-based parallel-for and parallel-reduction algorithms and modifies subflow tasking behavior to significantly enhance the performance of recursive parallelism.
+Download
+Taskflow 3.10.0 can be downloaded from here.
+System Requirements
+To use Taskflow v3.10.0, you need a compiler that supports C++17:
+GNU C++ Compiler at least v8.4 with -std=c++17
+Clang C++ Compiler at least v6.0 with -std=c++17
+Microsoft Visual Studio at least v19.27 with /std:c++17
+Apple Clang Xcode Version at least v12.0 with -std=c++17
+Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
+Intel C++ Compiler at least v19.0.1 with -std=c++17
+Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17
+Taskflow works on Linux, Windows, and Mac OS X.
+Although Taskflow primarily supports C++17, you can enable C++20 compilation through -std=c++20 to achieve better performance due to new C++20 features.
+New Features
+Taskflow Core
+optimized the work-stealing loop with an adaptive breaking strategy
+optimized shut-down signal detection using decentralized variables
+optimized the memory layout of a node by combining successors and predecessors together
+changed the default notifier to use the atomic notification algorithm under C++20
+added a debug mode for the Windows CI in GitHub Actions
+added an index-range-based parallel-for algorithm (#551)
+
+// initialize data1 and data2 to 10 using two different approaches
+std::vector<int> data1(100), data2(100);
+
+// Approach 1: initialize data1 using an explicit index range
+taskflow.for_each_index(0, 100, 1, [&](int i){ data1[i] = 10; });
+
+// Approach 2: initialize data2 using tf::IndexRange
+tf::IndexRange<int> range(0, 100, 1);
+taskflow.for_each_by_index(range, [&](tf::IndexRange<int>& subrange){
+  for(int i = subrange.begin(); i < subrange.end(); i += subrange.step_size()) {
+    data2[i] = 10;
+  }
+});
+
+added an index-range-based parallel-reduction algorithm (#654)
+
+std::vector<double> data(100000);
+double res = 1.0;
+taskflow.reduce_by_index(
+  // index range
+  tf::IndexRange<size_t>(0, N, 1),
+  // final result
+  res,
+  // local reducer
+  [&](tf::IndexRange<size_t> subrange, std::optional<double> running_total){
+    double residual = running_total ? *running_total : 0.0;
+    for(size_t i = subrange.begin(); i < subrange.end(); i += subrange.step_size()) {
+      data[i] = 1.0;
+      residual += data[i];
+    }
+    printf("partial sum = %lf\n", residual);
+    return residual;
+  },
+  // global reducer
+  std::plus<double>()
+);
+
+added the static keyword to the executor creation in the taskflow benchmarks
+added a waiter test to detect over-subscription issues
+added tf::Executor::num_waiters (C++20 only) for querying the number of non-stealing workers
+added tf::make_module_task to the algorithm collection (see Module Algorithm)
+added tf::Runtime::is_cancelled to query if the parent taskflow is cancelled
+added tf::Runtime to async tasking to simplify designs of recursive parallelism (see Runtime Tasking)
+Utilities
+added tf::IndexRange for the index-range-based parallel-for algorithm
+added tf::distance to calculate the number of iterations in an index range
+added tf::is_index_range_invalid to check if a given index range is valid
+Bug Fixes
+fixed the compilation error of CLI11 due to version incompatibility (#672)
+fixed the compilation error of template deduction on packaged_task (#657)
+fixed the MSVC compilation error due to a macro clash with std::min and std::max (#670)
+fixed the runtime error due to the use of latch in tf::Executor::Executor (#667)
+fixed the compilation error due to an incorrect const qualifier used in algorithms (#673)
+fixed the TSAN error when using find-if algorithm tasks with a closure wrapper (#675)
+fixed the task trait bug of incorrect detection for subflow and runtime tasks (#679)
+fixed the infinite steal caused by incorrect num_empty_steals (#681)
+Breaking Changes
+corrected the terminology by replacing 'dependents' with 'predecessors':
+tf::Task::num_predecessors (previously tf::Task::num_dependents)
+tf::Task::for_each_predecessor (previously tf::Task::for_each_dependent)
+tf::Task::num_strong_dependencies (previously tf::Task::num_strong_dependents)
+tf::Task::num_weak_dependencies (previously tf::Task::num_weak_dependents)
+disabled the support for tf::Subflow::detach due to multiple intricate and unresolved issues:
+the execution logic of detached subflows is inherently difficult to reason about
+detached subflows can incur excessive memory consumption, especially in recursive workloads
+detached subflows lack a safe manner of life-cycle control and graph cleanup
+detached subflows have limited practical benefits for most use cases
+detached subflows can be re-implemented using taskflow composition
+
+
+changed the default behavior of tf::Subflow to no longer retain its task graph after join
+default retention can incur a significant memory consumption problem (#674)
+users must explicitly call tf::Subflow::retain to retain a subflow after join
+
+
+
+
+tf::Taskflow taskflow;
+tf::Executor executor;
+
+taskflow.emplace([&](tf::Subflow& sf){
+  sf.retain(true);  // retain the subflow after join for visualization
+  auto A = sf.emplace([](){ std::cout << "A\n"; });
+  auto B = sf.emplace([](){ std::cout << "B\n"; });
+  auto C = sf.emplace([](){ std::cout << "C\n"; });
+  A.precede(B, C);  // A runs before B and C
+});  // subflow implicitly joins here
+
+executor.run(taskflow).wait();
+
+// the subflow graph is now retained and can be visualized using taskflow.dump(...)
+taskflow.dump(std::cout);
+
+
+disabled the support for tf::cudaFlow and tf::cudaFlowCapturer
+introduced a cleaner interface tf::cudaGraph directly atop CUDA Graph (see GPU Tasking)
+tf::cudaGraph has a similar interface to tf::cudaFlow; existing code can be changed as follows:
+
+
+
+
+// programming tf::cudaGraph is consistent with Nvidia CUDA Graph but offers a simpler
+// and more intuitive interface by abstracting away low-level CUDA Graph boilerplate.
+tf::cudaGraph cg;
+cg.kernel(...);  // same as cudaFlow/cudaFlowCapturer
+
+// unlike cudaFlow/cudaFlowCapturer, you need to explicitly instantiate an executable
+// CUDA graph now and submit it to a stream for execution
+tf::cudaGraphExec exec(cg);
+tf::cudaStream stream;
+stream.run(exec).synchronize();
+
+
+
+Codestin Search App
+added Module Algorithm
+revised Subflow Tasking
+revised Asynchronous Tasking
+revised Runtime Tasking
+revised Executor
+revised Parallel Iterations
+revised Parallel Reduction
+revised Parallel Find
+revised Fibonacci Number
+
+
+
+
+Codestin Search AppIf you are interested in collaborating with us on applying Taskflow to your projects, please feel free to reach out to Dr. Tsung-Wei Huang!
+
+
+
+
+
diff --git a/docs/xml/release-3-11-0.xml b/docs/xml/release-3-11-0.xml
new file mode 100644
index 000000000..873b21a56
--- /dev/null
+++ b/docs/xml/release-3-11-0.xml
@@ -0,0 +1,127 @@
+
+
+
+ release-3-11-0
+ Codestin Search App
+
+
+ Download
+ release-3-11-0_1release-3-11-0_download
+
+
+ System Requirements
+ release-3-11-0_1release-3-11-0_system_requirements
+
+
+ Release Summary
+ release-3-11-0_1release-3-11-0_summary
+
+
+ New Features
+ release-3-11-0_1release-3-11-0_new_features
+
+
+ Taskflow Core
+ release-3-11-0_1release-3-11-0_taskflow_core
+
+
+ Utilities
+ release-3-11-0_1release-3-11-0_utilities
+
+
+
+
+ Bug Fixes
+ release-3-11-0_1release-3-11-0_bug_fixes
+
+
+ Breaking Changes
+ release-3-11-0_1release-3-11-0_breaking_changes
+
+
+ Documentation
+ release-3-11-0_1release-3-11-0_documentation
+
+
+ Miscellaneous Items
+ release-3-11-0_1release-3-11-0_miscellaneous_items
+
+
+
+
+
+Taskflow 3.11.0 is the newest developing line of new features and improvements that we continue to support. It is also where this documentation is generated. Many things are considered experimental and may change or break from time to time. While it may be difficult to keep all things consistent when introducing new features, we continue to try our best to ensure backward compatibility.
+ +Codestin Search AppTo download the newest version of Taskflow, please clone the master branch from Taskflow's GitHub. + + +Codestin Search AppTo use Taskflow v3.11.0, you need a compiler that supports C++17: + +GNU C++ Compiler at least v8.4 with -std=c++17 + +Clang C++ Compiler at least v6.0 with -std=c++17 + +Microsoft Visual Studio at least v19.27 with /std:c++17 + +Apple Clang Xcode Version at least v12.0 with -std=c++17 + +Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17 + +Intel C++ Compiler at least v19.0.1 with -std=c++17 + +Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 + + +Taskflow works on Linux, Windows, and Mac OS X. +Although Taskflow supports primarily C++17, you can enable C++20 compilation through -std=c++20 to achieve better performance due to new C++20 features. + + + + +Codestin Search App + +Codestin Search App +Codestin Search App +added examples/task_visitor.cpp to demonstrate how to traverse a taskflow (#699) +added five benchmarks to showcase the capability of tf::Runtime +fibonacci +skynet +integrate +nqueens +primes + + + + + + +Codestin Search App + + +Codestin Search App +fixed missing exception on thread creation failure in tf::Executor (#693) +fixed segmentation fault caused by empty async dependency (#700) + + + + +Codestin Search App + +Codestin Search App +revised Static Tasking +revised Conditional Tasking +revised Runtime Tasking +revised Asynchronous Tasking +revised Asynchronous Tasking with Dependencies +revised Exception Handling +revised Request Cancellation + + + + +Codestin Search AppIf you are interested in collaborating with us on applying Taskflow to your projects, please feel free to reach out to Dr. Tsung-Wei Huang! + + + + + diff --git a/docs/xml/release-3-2-0.xml b/docs/xml/release-3-2-0.xml index 0f2a8d975..78a14e04c 100644 --- a/docs/xml/release-3-2-0.xml +++ b/docs/xml/release-3-2-0.xml @@ -1,5 +1,5 @@ - + release-3-2-0 Codestin Search App @@ -7,77 +7,75 @@ Download release-3-2-0_1release-3-2-0_download - + System Requirements release-3-2-0_1release-3-2-0_system_requirements - + Working Items release-3-2-0_1release-3-2-0_working_items - + New Features release-3-2-0_1release-3-2-0_new_features - - - Taskflow Core - release-3-2-0_1release-3-2-0_taskflow_core - - - cudaFlow - release-3-2-0_1release-3-2-0_cudaflow - - - syclFlow - release-3-2-0_1release-3-2-0_syclflow - - - CUDA Standard Parallel Algorithms - release-3-2-0_1release-3-2-0_cuda_std_algorithms - - - Utilities - release-3-2-0_1release-3-2-0_utilities - - - Taskflow Profiler (TFProf) - release-3-2-0_1release-3-2-0_profiler - - - + + + Taskflow Core + release-3-2-0_1release-3-2-0_taskflow_core + + + cudaFlow + release-3-2-0_1release-3-2-0_cudaflow + + + syclFlow + release-3-2-0_1release-3-2-0_syclflow + + + CUDA Standard Parallel Algorithms + release-3-2-0_1release-3-2-0_cuda_std_algorithms + + + Utilities + release-3-2-0_1release-3-2-0_utilities + + + Taskflow Profiler (TFProf) + release-3-2-0_1release-3-2-0_profiler + + + Bug Fixes release-3-2-0_1release-3-2-0_bug_fixes - + Breaking Changes release-3-2-0_1release-3-2-0_breaking_changes - + Deprecated and Removed Items release-3-2-0_1release-3-2-0_deprecated_items - + Documentation release-3-2-0_1release-3-2-0_documentation - + Miscellaneous Items release-3-2-0_1release-3-2-0_miscellaneous_items - + Taskflow 3.2.0 is the 3rd release in the 3.x line! 
This release includes several new changes such as CPU-GPU tasking, algorithm collection, enhanced web-based profiler, documentation, and unit tests. -Codestin Search App -Taskflow 3.2.0 can be downloaded from here. +Codestin Search AppTaskflow 3.2.0 can be downloaded from here. -Codestin Search App -To use Taskflow v3.2.0, you need a compiler that supports C++17: +Codestin Search AppTo use Taskflow v3.2.0, you need a compiler that supports C++17: GNU C++ Compiler at least v8.4 with -std=c++17 @@ -97,8 +95,7 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - +Codestin Search App enhancing support for SYCL with Intel DPC++ enhancing parallel CPU and GPU algorithms @@ -109,10 +106,8 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - -Codestin Search App - +Codestin Search App +Codestin Search App added tf::SmallVector optimization for optimizing the dependency storage in a graph added move constructor and move assignment operator for tf::Taskflow @@ -135,12 +130,11 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - -improved the execution flow of tf::cudaFlowCapturer when updates involve +Codestin Search App +improved the execution flow of tf::cudaFlowCapturer when updates involve -New algorithms in tf::cudaFlow and tf::cudaFlowCapturer: +New algorithms in tf::cudaFlow and tf::cudaFlowCapturer: added tf::cudaFlow::reduce @@ -208,53 +202,50 @@ New algorithms in tf::cudaFl -Codestin Search App - +Codestin Search App -Codestin Search App - -added tf::cuda_for_each +Codestin Search App +added tf::cuda_for_each -added tf::cuda_for_each_index +added tf::cuda_for_each_index -added tf::cuda_transform +added tf::cuda_transform -added tf::cuda_reduce +added tf::cuda_reduce -added tf::cuda_uninitialized_reduce +added tf::cuda_uninitialized_reduce -added tf::cuda_transform_reduce +added tf::cuda_transform_reduce added tf::cuda_transform_uninitialized_reduce -added tf::cuda_inclusive_scan +added tf::cuda_inclusive_scan -added tf::cuda_exclusive_scan +added tf::cuda_exclusive_scan -added tf::cuda_transform_inclusive_scan +added tf::cuda_transform_inclusive_scan -added tf::cuda_transform_exclusive_scan +added tf::cuda_transform_exclusive_scan -added tf::cuda_merge +added tf::cuda_merge -added tf::cuda_merge_by_key +added tf::cuda_merge_by_key -added tf::cuda_sort +added tf::cuda_sort -added tf::cuda_sort_by_key +added tf::cuda_sort_by_key -added tf::cuda_find_if +added tf::cuda_find_if -added tf::cuda_min_element +added tf::cuda_min_element -added tf::cuda_max_element +added tf::cuda_max_element -Codestin Search App - +Codestin Search App added CUDA meta programming added SYCL meta programming @@ -263,12 +254,10 @@ New algorithms in tf::cudaFl -Codestin Search App - +Codestin Search App -Codestin Search App - +Codestin Search App fixed compilation errors in constructing tf::cudaRoundRobinCapturing fixed compilation errors of TLS worker pointer in tf::Executor @@ -285,12 +274,10 @@ New algorithms in tf::cudaFl -Codestin Search App -There are no breaking changes in this release. +Codestin Search AppThere are no breaking changes in this release. 
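As a brief illustration of the move constructor and move assignment operator added to tf::Taskflow in the Taskflow Core list above, the following minimal sketch (assuming the v3.2 interface) transfers a task graph between taskflow objects without copying it:

#include <taskflow/taskflow.hpp>
#include <iostream>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;
  taskflow.emplace([](){ std::cout << "task\n"; });

  // move-construct another taskflow; the moved-from taskflow becomes empty
  tf::Taskflow moved(std::move(taskflow));

  // move-assignment works the same way
  tf::Taskflow another;
  another = std::move(moved);

  executor.run(another).wait();
}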
-Codestin Search App - +Codestin Search App removed tf::cudaFlow::kernel_on method removed explicit partitions in parallel iterations and reductions @@ -299,32 +286,20 @@ New algorithms in tf::cudaFl removed tf::cublasFlowCapturer -renamed update and rebind methods in tf::cudaFlow and tf::cudaFlowCapturer to overloads +renamed update and rebind methods in tf::cudaFlow and tf::cudaFlowCapturer to overloads -Codestin Search App - +Codestin Search App revised Static Tasking Move a Taskflow revised Executor -Execute a Taskflow with Transferred Ownership - - - -added cudaFlow Algorithms - -added CUDA Standard Algorithms -Execution Policy -Parallel Reduction -Parallel Scan -Parallel Merge -Parallel Find +Execute a Taskflow with Transferred Ownership @@ -332,13 +307,12 @@ New algorithms in tf::cudaFl -Codestin Search App -We have published tf::cudaFlow in the following conference: +Codestin Search AppWe have published tf::cudaFlow in the following conference: Dian-Lun Lin and Tsung-Wei Huang, "Efficient GPU Computation using Task Graph Parallelism," European Conference on Parallel and Distributed Computing (EuroPar), 2021 - + diff --git a/docs/xml/release-3-3-0.xml b/docs/xml/release-3-3-0.xml index cdc5a57c8..e4fa0eaa0 100644 --- a/docs/xml/release-3-3-0.xml +++ b/docs/xml/release-3-3-0.xml @@ -1,5 +1,5 @@ - + release-3-3-0 Codestin Search App @@ -7,76 +7,74 @@ Download release-3-3-0_1release-3-3-0_download - + System Requirements release-3-3-0_1release-3-3-0_system_requirements - + Release Summary release-3-3-0_1release-3-3-0_summary - + New Features release-3-3-0_1release-3-3-0_new_features - - - Taskflow Core - release-3-3-0_1release-3-3-0_taskflow_core - - - cudaFlow - release-3-3-0_1release-3-3-0_cudaflow - - - syclFlow - release-3-3-0_1release-3-3-0_syclflow - - - Utilities - release-3-3-0_1release-3-3-0_utilities - - - Taskflow Profiler (TFProf) - release-3-3-0_1release-3-3-0_profiler - - - + + + Taskflow Core + release-3-3-0_1release-3-3-0_taskflow_core + + + cudaFlow + release-3-3-0_1release-3-3-0_cudaflow + + + syclFlow + release-3-3-0_1release-3-3-0_syclflow + + + Utilities + release-3-3-0_1release-3-3-0_utilities + + + Taskflow Profiler (TFProf) + release-3-3-0_1release-3-3-0_profiler + + + Bug Fixes release-3-3-0_1release-3-3-0_bug_fixes - + Breaking Changes release-3-3-0_1release-3-3-0_breaking_changes - + Deprecated and Removed Items release-3-3-0_1release-3-3-0_deprecated_items - + Documentation release-3-3-0_1release-3-3-0_documentation - + Miscellaneous Items release-3-3-0_1release-3-3-0_miscellaneous_items - + Taskflow 3.3.0 is the 4th release in the 3.x line! This release includes several new changes, such as sanitized data race, pipeline parallelism, documentation, and unit tests. -We highly recommend that adopting Taskflow v3.3 in your projects if possible. This release has resolved pretty much all the potential data-race issues induced by incorrect memory order. +We highly recommend that adopting Taskflow v3.3 in your projects if possible. This release has resolved pretty much all the potential data-race issues induced by incorrect memory order. -Codestin Search App -Taskflow 3.3.0 can be downloaded from here. +Codestin Search AppTaskflow 3.3.0 can be downloaded from here. -Codestin Search App -To use Taskflow v3.3.0, you need a compiler that supports C++17: +Codestin Search AppTo use Taskflow v3.3.0, you need a compiler that supports C++17: GNU C++ Compiler at least v8.4 with -std=c++17 @@ -96,8 +94,7 @@ Taskflow works on Linux, Windows, and Mac OS X. 
-Codestin Search App
-
+Codestin Search App
This release has resolved data race issues reported by tsan and has incorporated essential sanitizers into the continuous integration workflows for detecting data race, illegal memory access, and memory leak of the Taskflow codebase.
This release has introduced a new pipeline interface (tf::Pipeline) that allows users to create a pipeline scheduling framework for implementing pipeline algorithms.
This release has introduced a new thread-id mapping algorithm to resolve unexpected thread-local storage (TLS) errors when building Taskflow projects in a shared library environment.
@@ -105,10 +102,8 @@ Taskflow works on Linux, Windows, and Mac OS X.
-Codestin Search App
-
-Codestin Search App
-
+Codestin Search App
+Codestin Search App
Changed all lambda operators in parallel algorithms to copy by default
Cleaned up data race errors in tsan caused by incorrect memory order
Enhanced scheduling performance by caching tasks in the invoke loop
@@ -126,31 +121,26 @@ Taskflow works on Linux, Windows, and Mac OS X.
-Codestin Search App
-Starting from v3.3, using tf::cudaFlow needs to include the header, taskflow/cuda/cudaflow.hpp. See Breaking Changes.
+Codestin Search AppStarting from v3.3, using tf::cudaFlow needs to include the header, taskflow/cuda/cudaflow.hpp. See Breaking Changes.
-Codestin Search App
-This release does not have any update on syclFlow.
+Codestin Search AppThis release does not have any update on syclFlow.
-Codestin Search App
-
+Codestin Search App
Added tf::SmallVector to the documentation
Added relax_cpu call to optimize the work-stealing loop
-Codestin Search App
-This release does not have any update on the profiler.
+Codestin Search AppThis release does not have any update on the profiler.
-Codestin Search App
-
+Codestin Search App
Fixed incorrect static TLS access when building Taskflow in a shared lib
-Fixed memory leak in updating tf::cudaFlowCapturer of undestroyed graph
+Fixed memory leak in updating tf::cudaFlowCapturer of undestroyed graph
Fixed data race in the object-pool when accessing the heap pointer
Fixed invalid lambda capture by reference in tf::Taskflow::sort
Fixed invalid lambda capture by reference in tf::Taskflow::reduce
@@ -162,13 +152,12 @@ Taskflow works on Linux, Windows, and Mac OS X.
If you encounter any potential bugs, please submit an issue at issue tracker.
-Codestin Search App
-For the purpose of compilation speed, you will need to separately include the follwoing files for using specific features and algorithms:
+Codestin Search AppFor the purpose of compilation speed, you will need to separately include the following files for using specific features and algorithms:
taskflow/algorithm/reduce.hpp for creating a parallel-reduction task
taskflow/algorithm/sort.hpp for creating a parallel-sort task
taskflow/algorithm/transform.hpp for creating a parallel-transform task
taskflow/algorithm/pipeline.hpp for creating a parallel-pipeline task
-taskflow/cuda/cudaflow.hpp for creating a tf::cudaFlow and a tf::cudaFlowCapturer tasks
+taskflow/cuda/cudaflow.hpp for creating tf::cudaFlow and tf::cudaFlowCapturer tasks
taskflow/cuda/algorithm/for_each.hpp for creating a single-threaded task on a CUDA GPU
taskflow/cuda/algorithm/for_each.hpp for creating a parallel-iteration task on a CUDA GPU
taskflow/cuda/algorithm/transform.hpp for creating a parallel-transform task on a CUDA GPU
@@ -181,12 +170,10 @@ Taskflow works on Linux, Windows, and Mac OS X.
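To illustrate the header-include requirement described above, a minimal sketch (assuming the v3.3 header layout) that creates a parallel-reduction task now looks like this:

#include <taskflow/taskflow.hpp>
#include <taskflow/algorithm/reduce.hpp>  // required since v3.3 for Taskflow::reduce
#include <vector>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  std::vector<int> data{1, 2, 3, 4};
  int sum = 0;

  // sum the elements of data into sum using a parallel-reduction task
  taskflow.reduce(data.begin(), data.end(), sum,
                  [](int a, int b){ return a + b; });

  executor.run(taskflow).wait();  // sum == 10
}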
-Codestin Search App -This release does not have any deprecated and removed items. +Codestin Search AppThis release does not have any deprecated and removed items. -Codestin Search App - +Codestin Search App Revised Building and Installing Build Sanitizers @@ -203,25 +190,21 @@ Taskflow works on Linux, Windows, and Mac OS X. Create a Multi-condition Task -Revised GPU Tasking (cudaFlow) -Revised GPU Tasking (cudaFlowCapturer) +Revised GPU Tasking Revised Limit the Maximum Concurrency Define a Conflict Graph Revised Parallel Sort to add header-include information Revised Parallel Reduction to add header-include information -Revised cudaFlow Algorithms to add header-include information -Revised CUDA Standard Algorithms to add header-include information -Added Interact with the Runtime +Added Runtime Tasking Added Parallel Transforms Added Task-parallel Pipeline -Codestin Search App -We have published Taskflow in the following venues: +Codestin Search AppWe have published Taskflow in the following venues: Tsung-Wei Huang, Dian-Lun Lin, Chun-Xun Lin, and Yibo Lin, "Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System," IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 6, pp. 1303-1320, June 2022 Tsung-Wei Huang, "TFProf: Profiling Large Taskflow Programs with Modern D3 and C++," IEEE International Workshop on Programming and Performance Visualization Tools (ProTools), St. Louis, Missouri, 2021 @@ -229,6 +212,6 @@ Taskflow works on Linux, Windows, and Mac OS X. Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects. - + diff --git a/docs/xml/release-3-4-0.xml b/docs/xml/release-3-4-0.xml index 4562e0ff7..a6b0773e8 100644 --- a/docs/xml/release-3-4-0.xml +++ b/docs/xml/release-3-4-0.xml @@ -1,5 +1,5 @@ - + release-3-4-0 Codestin Search App @@ -7,69 +7,67 @@ Download release-3-4-0_1release-3-4-0_download - + System Requirements release-3-4-0_1release-3-4-0_system_requirements - + Release Summary release-3-4-0_1release-3-4-0_summary - + New Features release-3-4-0_1release-3-4-0_new_features - - - Taskflow Core - release-3-4-0_1release-3-4-0_taskflow_core - - - cudaFlow - release-3-4-0_1release-3-4-0_cudaflow - - - syclFlow - release-3-4-0_1release-3-4-0_syclflow - - - Utilities - release-3-4-0_1release-3-4-0_utilities - - - + + + Taskflow Core + release-3-4-0_1release-3-4-0_taskflow_core + + + cudaFlow + release-3-4-0_1release-3-4-0_cudaflow + + + syclFlow + release-3-4-0_1release-3-4-0_syclflow + + + Utilities + release-3-4-0_1release-3-4-0_utilities + + + Bug Fixes release-3-4-0_1release-3-4-0_bug_fixes - + Breaking Changes release-3-4-0_1release-3-4-0_breaking_changes - + Deprecated and Removed Items release-3-4-0_1release-3-4-0_deprecated_items - + Documentation release-3-4-0_1release-3-4-0_documentation - + Miscellaneous Items release-3-4-0_1release-3-4-0_miscellaneous_items - + Taskflow 3.4.0 is the 5th release in the 3.x line! This release includes several new changes, such as pipeline parallelism, deadlock-free execution methods, documentation, examples, and unit tests. -Codestin Search App -Taskflow 3.4.0 can be downloaded from here. +Codestin Search AppTaskflow 3.4.0 can be downloaded from here. 
-Codestin Search App -To use Taskflow v3.4.0, you need a compiler that supports C++17: +Codestin Search AppTo use Taskflow v3.4.0, you need a compiler that supports C++17: GNU C++ Compiler at least v8.4 with -std=c++17 @@ -89,14 +87,11 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App -This release enhances our task-parallel pipeline programming model and executor methods, supplied with several new examples and unit tests. +Codestin Search AppThis release enhances our task-parallel pipeline programming model and executor methods, supplied with several new examples and unit tests. -Codestin Search App - -Codestin Search App - +Codestin Search App +Codestin Search App Improved the pipeline performance using vertical stack optimization Added tf::ScalablePipeline to allow programming variable lengths of pipes Added tf::Runtime::run_and_wait to allow spawning a subflow @@ -110,28 +105,24 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - -Added tf::cudaStream as a move-only, RAII-styled wrapper over a native CUDA stream -Added tf::cudaEvent as a move-only, RAII-styled wrapper over a native CUDA event +Codestin Search App +Added tf::cudaStream as a move-only, RAII-styled wrapper over a native CUDA stream +Added tf::cudaEvent as a move-only, RAII-styled wrapper over a native CUDA event -Codestin Search App -There is no update on syclFlow in this release. +Codestin Search AppThere is no update on syclFlow in this release. -Codestin Search App - +Codestin Search App Removed serializer to improve compilation speed -Codestin Search App - +Codestin Search App Fixed the compilation error due to non-portable include of immintrin.h (#371) Fixed the compilation error due to using old version of doctest (#372) Fixed the infinite loop bug due to unexpected share states in pipeline (#402) @@ -140,24 +131,20 @@ Taskflow works on Linux, Windows, and Mac OS X. If you encounter any potential bugs, please submit an issue at issue tracker. -Codestin Search App - +Codestin Search App Replaced tf::Runtime::run with tf::Runtime::run_and_wait to comply with tf::Executor::run_and_wait -Codestin Search App -There are no deprecated items in this release. +Codestin Search AppThere are no deprecated items in this release. -Codestin Search App - +Codestin Search App Revised Executor Added Execute a Taskflow from an Internal Worker -Revised Execution Policy Revised Task-parallel Pipeline Added Learn More about Taskflow Pipeline @@ -173,8 +160,7 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App -We have published Taskflow in the following venues: +Codestin Search AppWe have published Taskflow in the following venues: Dian-Lun Lin and Tsung-Wei Huang, "Accelerating Large Sparse Neural Network Inference using GPU Task Graph Parallelism," IEEE Transactions on Parallel and Distributed Systems (TPDS), 2022 Cheng-Hsiang Chiu and Tsung-Wei Huang, "Composing Pipeline Parallelism using Control Taskflow Graph," ACM International Symposium on High-Performance Parallel and Distributed Computing (HPDC), Minneapolis, Minnesota, 2022 Cheng-Hsiang Chiu and Tsung-Wei Huang, "Efficient Timing Propagation with Simultaneous Structural and Pipeline Parallelisms," ACM/IEEE Design Automation Conference (DAC), San Francisco, CA, 2022 @@ -183,6 +169,6 @@ Taskflow works on Linux, Windows, and Mac OS X. Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects. 
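As a quick illustration of tf::Runtime::run_and_wait introduced in this release, the sketch below is a hedged example assuming the v3.4 interface, where run_and_wait accepts a callable taking tf::Subflow&; it spawns and joins a subflow from inside a runtime task:

#include <taskflow/taskflow.hpp>
#include <cstdio>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  taskflow.emplace([](tf::Runtime& rt){
    // spawn a subflow from the runtime and wait until it completes,
    // without the deadlock risk of blocking an executor thread
    rt.run_and_wait([](tf::Subflow& sf){
      sf.emplace([](){ std::printf("subflow task A\n"); });
      sf.emplace([](){ std::printf("subflow task B\n"); });
    });
  });

  executor.run(taskflow).wait();
}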
- + diff --git a/docs/xml/release-3-5-0.xml b/docs/xml/release-3-5-0.xml index 46c6a4d86..695075f8d 100644 --- a/docs/xml/release-3-5-0.xml +++ b/docs/xml/release-3-5-0.xml @@ -1,5 +1,5 @@ - + release-3-5-0 Codestin Search App @@ -7,69 +7,67 @@ Download release-3-5-0_1release-3-5-0_download - + System Requirements release-3-5-0_1release-3-5-0_system_requirements - + Release Summary release-3-5-0_1release-3-5-0_summary - + New Features release-3-5-0_1release-3-5-0_new_features - - - Taskflow Core - release-3-5-0_1release-3-5-0_taskflow_core - - - cudaFlow - release-3-5-0_1release-3-5-0_cudaflow - - - Utilities - release-3-5-0_1release-3-5-0_utilities - - - Taskflow Profiler (TFProf) - release-3-5-0_1release-3-5-0_profiler - - - + + + Taskflow Core + release-3-5-0_1release-3-5-0_taskflow_core + + + cudaFlow + release-3-5-0_1release-3-5-0_cudaflow + + + Utilities + release-3-5-0_1release-3-5-0_utilities + + + Taskflow Profiler (TFProf) + release-3-5-0_1release-3-5-0_profiler + + + Bug Fixes release-3-5-0_1release-3-5-0_bug_fixes - + Breaking Changes release-3-5-0_1release-3-5-0_breaking_changes - + Deprecated and Removed Items release-3-5-0_1release-3-5-0_deprecated_items - + Documentation release-3-5-0_1release-3-5-0_documentation - + Miscellaneous Items release-3-5-0_1release-3-5-0_miscellaneous_items - + Taskflow 3.5.0 is the 6th release in the 3.x line! This release includes several new changes, such as pipeline parallelism, improved work-stealing performance, profiling, documentation, examples, and unit tests. -Codestin Search App -Taskflow 3.5.0 can be downloaded from here. +Codestin Search AppTaskflow 3.5.0 can be downloaded from here. -Codestin Search App -To use Taskflow v3.5.0, you need a compiler that supports C++17: +Codestin Search AppTo use Taskflow v3.5.0, you need a compiler that supports C++17: GNU C++ Compiler at least v8.4 with -std=c++17 @@ -89,46 +87,36 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App -This release introduces a new data-parallel pipeline programming model, solves the busy-waiting problem in our work-stealing scheduler, and adds a new text-based feature for profiler report. +Codestin Search AppThis release introduces a new data-parallel pipeline programming model, solves the busy-waiting problem in our work-stealing scheduler, and adds a new text-based feature for profiler report. -Codestin Search App - -Codestin Search App - -Added tf::WorkerInterface to allow changing properties of workers upon their creations +Codestin Search App +Codestin Search App +Added tf::WorkerInterface to allow changing properties of workers upon their creations Added tf::Executor::loop_until to allow looping a worker with a custom stop predicate Added tf::DataPipeline to implement data-parallel algorithms See Data-parallel Pipeline -Extended tf::TaskQueue to include priority (tf::TaskPriority) -See Prioritized Tasking - - -Extended tf::Executor to include tf::WorkerInterface +Extended tf::Executor to include tf::WorkerInterface Improved parallel algorithms (e.g., tf::Taskflow::for_each) with tail optimization Resolved the busy-waiting problem in our work-stealing algorithm (#400) -Codestin Search App -This release has no update on tf::cudaFlow. +Codestin Search AppThis release has no update on tf::cudaFlow. 
-Codestin Search App - +Codestin Search App Added tf::unroll to unroll loops using template techniques -Added tf::CachelineAligned to create a cacheline-aligned object -Replaced std::aligned_union (deprecated in C++23) with a custom byte type (#445) +Added tf::CachelineAligned to create a cacheline-aligned object +Replaced std::aligned_union (deprecated in C++23) with a custom byte type (#445) -Codestin Search App - +Codestin Search App Added a new feature to generate a profile summary report See Display Profile Summary @@ -138,8 +126,7 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App - +Codestin Search App Fixed the compilation error in taking move-only types for tf::Taskflow::transform_reduce Fixed the compilation error in the graph pipeline benchmark Fixed the compilation error in unknown OS (replaced with TF_OS_UNKNOWN) @@ -148,28 +135,23 @@ Taskflow works on Linux, Windows, and Mac OS X. If you encounter any potential bugs, please submit an issue at issue tracker. -Codestin Search App -This release has no breaking changes. +Codestin Search AppThis release has no breaking changes. -Codestin Search App -This release has no deprecated and removed items. +Codestin Search AppThis release has no deprecated and removed items. -Codestin Search App - +Codestin Search App Revised Executor Added Execute a Taskflow from an Internal Worker -Added Prioritized Tasking Added Data-parallel Pipeline -Codestin Search App -We have published Taskflow in the following venues: +Codestin Search AppWe have published Taskflow in the following venues: Tsung-Wei Huang and Leslie Hwang, "Task-Parallel Programming with Constrained Parallelism," IEEE High-Performance Extreme Computing Conference (HPEC), MA, 2022 Tsung-Wei Huang, "Enhancing the Performance Portability of Heterogeneous Circuit Analysis Programs," IEEE High-Performance Extreme Computing Conference (HPEC), MA, 2022 Dian-Lun Lin, Haoxing Ren, Yanqing Zhang, and Tsung-Wei Huang, "From RTL to CUDA: A GPU Acceleration Flow for RTL Simulation with Batch Stimulus," ACM International Conference on Parallel Processing (ICPP), Bordeaux, France, 2022 @@ -178,6 +160,6 @@ Taskflow works on Linux, Windows, and Mac OS X. Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects. 
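The prioritized-tasking extension listed above can be sketched as follows, assuming the v3.5 interface with tf::TaskPriority and tf::Task::priority (note this feature was later removed in v3.8.0, as described in the notes below):

#include <taskflow/taskflow.hpp>
#include <cstdio>

int main() {
  // one worker so that ready tasks are drained strictly by priority
  tf::Executor executor(1);
  tf::Taskflow taskflow;

  auto [src, A, B, C] = taskflow.emplace(
    [](){},
    [](){ std::printf("high-priority task\n");   },
    [](){ std::printf("normal-priority task\n"); },
    [](){ std::printf("low-priority task\n");    }
  );

  // A, B, and C become ready at the same time after src finishes
  src.precede(A, B, C);

  A.priority(tf::TaskPriority::HIGH);
  B.priority(tf::TaskPriority::NORMAL);
  C.priority(tf::TaskPriority::LOW);

  executor.run(taskflow).wait();
}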
- + diff --git a/docs/xml/release-3-6-0.xml b/docs/xml/release-3-6-0.xml index 6fb852923..1f117246e 100644 --- a/docs/xml/release-3-6-0.xml +++ b/docs/xml/release-3-6-0.xml @@ -1,5 +1,5 @@ - + release-3-6-0 Codestin Search App @@ -7,65 +7,63 @@ Download release-3-6-0_1release-3-6-0_download - + System Requirements release-3-6-0_1release-3-6-0_system_requirements - + Release Summary release-3-6-0_1release-3-6-0_summary - + New Features release-3-6-0_1release-3-6-0_new_features - - - Taskflow Core - release-3-6-0_1release-3-6-0_taskflow_core - - - cudaFlow - release-3-6-0_1release-3-6-0_cudaflow - - - Utilities - release-3-6-0_1release-3-6-0_utilities - - - Taskflow Profiler (TFProf) - release-3-6-0_1release-3-6-0_profiler - - - + + + Taskflow Core + release-3-6-0_1release-3-6-0_taskflow_core + + + cudaFlow + release-3-6-0_1release-3-6-0_cudaflow + + + Utilities + release-3-6-0_1release-3-6-0_utilities + + + Taskflow Profiler (TFProf) + release-3-6-0_1release-3-6-0_profiler + + + Bug Fixes release-3-6-0_1release-3-6-0_bug_fixes - + Breaking Changes release-3-6-0_1release-3-6-0_breaking_changes - + Documentation release-3-6-0_1release-3-6-0_documentation - + Miscellaneous Items release-3-6-0_1release-3-6-0_miscellaneous_items - + Taskflow 3.6.0 is the 7th release in the 3.x line! This release includes several new changes, such as dynamic task graph parallelism, improved parallel algorithms, modified GPU tasking interface, documentation, examples, and unit tests. -Codestin Search App -Taskflow 3.6.0 can be downloaded from here. +Codestin Search AppTaskflow 3.6.0 can be downloaded from here. -Codestin Search App -To use Taskflow v3.6.0, you need a compiler that supports C++17: +Codestin Search AppTo use Taskflow v3.6.0, you need a compiler that supports C++17: GNU C++ Compiler at least v8.4 with -std=c++17 @@ -85,14 +83,11 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App -This release contains several changes to largely enhance the programmability of GPU tasking and standard parallel algorithms. More importantly, we have introduced a new dependent asynchronous tasking model that offers great flexibility for expressing dynamic task graph parallelism. +Codestin Search AppThis release contains several changes to largely enhance the programmability of GPU tasking and standard parallel algorithms. More importantly, we have introduced a new dependent asynchronous tasking model that offers great flexibility for expressing dynamic task graph parallelism. -Codestin Search App - -Codestin Search App - +Codestin Search App +Codestin Search App Added new async methods to support dynamic task graph creation tf::Executor::dependent_async(F&& func, Tasks&&... tasks) tf::Executor::dependent_async(F&& func, I first, I last) @@ -114,12 +109,12 @@ Taskflow works on Linux, Windows, and Mac OS X. 
Added parallel-scan algorithms to Taskflow
-tf::Taskflow::inclusive_scan(B first, E last, D d_first, BOP bop)
-tf::Taskflow::inclusive_scan(B first, E last, D d_first, BOP bop, T init)
-tf::Taskflow::transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop)
-tf::Taskflow::transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init)
-tf::Taskflow::exclusive_scan(B first, E last, D d_first, T init, BOP bop)
-tf::Taskflow::transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop)
+tf::Taskflow::inclusive_scan(B first, E last, D d_first, BOP bop)
+tf::Taskflow::inclusive_scan(B first, E last, D d_first, BOP bop, T init)
+tf::Taskflow::transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop)
+tf::Taskflow::transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init)
+tf::Taskflow::exclusive_scan(B first, E last, D d_first, T init, BOP bop)
+tf::Taskflow::transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop)
Added parallel-find algorithms to Taskflow
@@ -148,44 +143,39 @@ Taskflow works on Linux, Windows, and Mac OS X.
-Codestin Search App
-
-removed algorithms that require buffer from tf::cudaFlow due to update limitation
+Codestin Search App
+removed algorithms that require a buffer from tf::cudaFlow due to update limitations
removed support for a dedicated cudaFlow task in Taskflow
-all usage of tf::cudaFlow and tf::cudaFlowCapturer are standalone now
+all usage of tf::cudaFlow and tf::cudaFlowCapturer is standalone now
-Codestin Search App
-
+Codestin Search App
Added all_same templates to check if a parameter pack has the same type
-Codestin Search App
-
+Codestin Search App
Removed cudaFlow and syclFlow tasks
-Codestin Search App
-
+Codestin Search App
Fixed the compilation error caused by clashing MAX_PRIORITY with winspool.h (#459)
-Fixed the compilation error caused by tf::TaskView::for_each_successor and tf::TaskView::for_each_dependent
+Fixed the compilation error caused by tf::TaskView::for_each_successor and tf::TaskView::for_each_dependent
Fixed the infinite-loop bug when corunning a module task from tf::Runtime
If you encounter any potential bugs, please submit an issue at issue tracker.
-Codestin Search App
-
+Codestin Search App
Dropped support for cancelling asynchronous tasks
std::optional<int> res = fu.get();  // res may be std::nullopt or 1

// now - use std::future instead
-std::future<int> fu = executor.async([](){
+std::future<int> fu = executor.async([](){
return 1;
});
int res = fu.get();
-Dropped in-place support for running tf::cudaFlow from a dedicated task
+Dropped in-place support for running tf::cudaFlow from a dedicated task
// previous - no longer supported
-taskflow.emplace([](tf::cudaFlow& cf){
+taskflow.emplace([](tf::cudaFlow& cf){
cf.offload();
});
// now - users fully control tf::cudaFlow for maximum flexibility
taskflow.emplace([](){
-tf::cudaFlow cf;
+tf::cudaFlow cf;
// offload the cudaflow asynchronously through a stream
-tf::cudaStream stream;
-cf.run(stream);
+tf::cudaStream stream;
+cf.run(stream);
// wait until the cudaflow completes
-stream.synchronize();
+stream.synchronize();
});
-Dropped in-place support for running tf::cudaFlowCapturer from a dedicated task
+Dropped in-place support for running tf::cudaFlowCapturer from a dedicated task
// previous - no longer supported
-taskflow.emplace([](tf::cudaFlowCapturer& cf){
+taskflow.emplace([](tf::cudaFlowCapturer& cf){
cf.offload();
});
// now - users fully control tf::cudaFlowCapturer for maximum flexibility
taskflow.emplace([](){
-tf::cudaFlowCapturer cf;
+tf::cudaFlowCapturer cf;
// offload the cudaflow asynchronously through a stream
-tf::cudaStream stream;
-cf.run(stream);
+tf::cudaStream stream;
+cf.run(stream);
// wait until the cudaflow completes
-stream.synchronize();
+stream.synchronize();
});
@@ -250,11 +240,11 @@ Taskflow works on Linux, Windows, and Mac OS X.
Move all buffer query methods of CUDA standard algorithms inside execution policy
-tf::cudaExecutionPolicy<NT, VT>::reduce_bufsz
-tf::cudaExecutionPolicy<NT, VT>::scan_bufsz
-tf::cudaExecutionPolicy<NT, VT>::merge_bufsz
-tf::cudaExecutionPolicy<NT, VT>::min_element_bufsz
-tf::cudaExecutionPolicy<NT, VT>::max_element_bufsz
+tf::cudaExecutionPolicy<NT, VT>::reduce_bufsz
+tf::cudaExecutionPolicy<NT, VT>::scan_bufsz
+tf::cudaExecutionPolicy<NT, VT>::merge_bufsz
+tf::cudaExecutionPolicy<NT, VT>::min_element_bufsz
+tf::cudaExecutionPolicy<NT, VT>::max_element_bufsz
@@ -263,7 +253,7 @@ Taskflow works on Linux, Windows, and Mac OS X.
tf::cuda_reduce_buffer_size<tf::cudaDefaultExecutionPolicy, int>(N);
// now (and similarly for other parallel algorithms)
-tf::cudaDefaultExecutionPolicy policy(stream);
+tf::cudaDefaultExecutionPolicy policy(stream);
policy.reduce_bufsz<int>(N);
@@ -277,10 +267,10 @@ Taskflow works on Linux, Windows, and Mac OS X.
// previous - async allows passing arguments to the callable
-executor.async([](int i){ std::cout << i << std::endl; }, 4);
+executor.async([](int i){ std::cout << i << std::endl; }, 4);
// now - users are responsible for wrapping the arguments into a callable
-executor.async([i=4](){ std::cout << i << std::endl; });
+executor.async([i=4](){ std::cout << i << std::endl; });
Replaced named_async with an overload that takes the name string on the first argument
@@ -294,8 +284,7 @@ Taskflow works on Linux, Windows, and Mac OS X.
-Codestin Search App
-
+Codestin Search App
Parallel Reduction -Revised CUDA standard algorithms to correct the use of buffer query methods -Parallel Reduction -Parallel Find -Parallel Merge -Parallel Scan - - Added Task-parallel Pipeline with Token Dependencies Added Parallel Scan Added Asynchronous Tasking with Dependencies @@ -322,8 +304,7 @@ Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App -We have published Taskflow in the following venues: +Codestin Search AppWe have published Taskflow in the following venues: Dian-Lun Lin, Yanqing Zhang, Haoxing Ren, Shih-Hsin Wang, Brucek Khailany and Tsung-Wei Huang, "GenFuzz: GPU-accelerated Hardware Fuzzing using Genetic Algorithm with Multiple Inputs," ACM/IEEE Design Automation Conference (DAC), San Francisco, CA, 2023 Tsung-Wei Huang, "qTask: Task-parallel Quantum Circuit Simulation with Incrementality," IEEE International Parallel and Distributed Processing Symposium (IPDPS), St. Petersburg, Florida, 2023 @@ -333,6 +314,6 @@ Taskflow works on Linux, Windows, and Mac OS X. Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects. - + diff --git a/docs/xml/release-3-7-0.xml b/docs/xml/release-3-7-0.xml index 84506cde6..16d35ed2b 100644 --- a/docs/xml/release-3-7-0.xml +++ b/docs/xml/release-3-7-0.xml @@ -1,63 +1,61 @@ - + release-3-7-0 - Codestin Search App + Codestin Search App Download release-3-7-0_1release-3-7-0_download - + System Requirements release-3-7-0_1release-3-7-0_system_requirements - + Release Summary release-3-7-0_1release-3-7-0_summary - + New Features release-3-7-0_1release-3-7-0_new_features - - - Taskflow Core - release-3-7-0_1release-3-7-0_taskflow_core - - - Utilities - release-3-7-0_1release-3-7-0_utilities - - - + + + Taskflow Core + release-3-7-0_1release-3-7-0_taskflow_core + + + Utilities + release-3-7-0_1release-3-7-0_utilities + + + Bug Fixes release-3-7-0_1release-3-7-0_bug_fixes - + Breaking Changes release-3-7-0_1release-3-7-0_breaking_changes - + Documentation release-3-7-0_1release-3-7-0_documentation - + Miscellaneous Items release-3-7-0_1release-3-7-0_miscellaneous_items - + -Taskflow 3.7.0 is the newest developing line to new features and improvements we continue to support. It is also where this documentation is generated. Many things are considered experimental and may change or break from time to time. While it may be difficult to be keep all things consistent when introducing new features, we continue to try our best to ensure backward compatibility. +Taskflow 3.7.0 is the 8th release in the 3.x line! This release includes several new changes, such as exception support, improved scheduling algorithms, documentation, examples, and unit tests. -Codestin Search App -To download the newest version of Taskflow, please clone the master branch from Taskflow's GitHub. +Codestin Search AppTaskflow 3.7.0 can be downloaded from here. -Codestin Search App -To use Taskflow v3.7.0, you need a compiler that supports C++17: +Codestin Search AppTo use Taskflow v3.7.0, you need a compiler that supports C++17: GNU C++ Compiler at least v8.4 with -std=c++17 @@ -71,20 +69,17 @@ Intel C++ Compiler at least v19.0.1 with -std=c++17 -Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 and SYCL20 +Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 Taskflow works on Linux, Windows, and Mac OS X. -Codestin Search App -This release introduces a new exception interface to help identify C++ errors in taskflow programs. 
Additionally, this release enhances the scheduling performance through integration of C++20 atomic-wait into scheduler, executor, and notifier.
+Codestin Search AppThis release introduces a new exception interface to help identify C++ errors in taskflow programs.
-Codestin Search App
-
-Codestin Search App
-
+Codestin Search App
+Codestin Search App
Improved scheduling performance of dependent asynchronous tasks
Improved scheduling performance of module task by removing busy looping
Improved tf::Executor::wait_for_all using C++20 atomic wait
@@ -98,12 +93,12 @@ Taskflow works on Linux, Windows, and Mac OS X.
tf::Executor executor;
tf::Taskflow taskflow;
-taskflow.emplace([](){ throw std::runtime_error("exception"); });
+taskflow.emplace([](){ throw std::runtime_error("exception"); });
try {
executor.run(taskflow).get();
}
-catch(const std::runtime_error& e) {
-std::cerr << e.what() << std::endl;
+catch(const std::runtime_error& e) {
+std::cerr << e.what() << std::endl;
}
@@ -111,11 +106,11 @@ Taskflow works on Linux, Windows, and Mac OS X.
Modified the tf::PartitionerBase to allow defining custom closure wrappers
-std::atomic<int> count = 0;
+std::atomic<int> count = 0;
tf::Taskflow taskflow;
taskflow.for_each_index(0, 100, 1,
[](int i){
-printf("%d\n", i);
+printf("%d\n", i);
},
tf::StaticPartitioner(0, [](auto&& closure){
// do something before invoking the partitioned task
@@ -132,35 +127,31 @@ Taskflow works on Linux, Windows, and Mac OS X.
-Codestin Search App
-
+Codestin Search App
-Codestin Search App
-
+Codestin Search App
Fixed compilation error of CUDA examples caused by not including for_each.hpp
Fixed the runtime error of tf::Taskflow::for_each_index when the range is invalid
-Codestin Search App
-
+Codestin Search App
Renamed tf::Runtime::join to tf::Runtime::corun_all
-Removed tf::WorkerInterface due to the support of exception
+Removed tf::WorkerInterface due to the support of exceptions
-Codestin Search App
-
+Codestin Search App
Revised Asynchronous Tasking with Dependencies
Added Query the Completion Status of Dependent Async Tasks
Revised Exception Handling
Revised Executor
-Removed the section of tf::WorkerInterface
+Removed the section of tf::WorkerInterface
Revised Partitioning Algorithm
@@ -168,8 +159,7 @@ Taskflow works on Linux, Windows, and Mac OS X.
-Codestin Search App
-We have published Taskflow in the following venues:
+Codestin Search AppWe have published Taskflow in the following venues:
Cheng-Hsiang Chiu, Zhicheng Xiong, Zizheng Guo, Tsung-Wei Huang, and Yibo Lin, "An Efficient Task-parallel Pipeline Programming Framework," ACM International Conference on High-performance Computing in Asia-Pacific Region (HPC Asia), Nagoya, Japan, 2024
Cheng-Hsiang Chiu, Dian-Lun Lin, and Tsung-Wei Huang, "Programming Dynamic Task Parallelism for Heterogeneous EDA Algorithms," IEEE/ACM International Conference on Computer-aided Design (ICCAD), San Francisco, CA, 2023
@@ -178,6 +168,6 @@ Taskflow works on Linux, Windows, and Mac OS X.
Please do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects.
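The rename of tf::Runtime::join to tf::Runtime::corun_all noted in the breaking changes above can be sketched like this, assuming the v3.7 interface where a runtime task spawns asynchronous tasks and then waits for all of them:

#include <taskflow/taskflow.hpp>
#include <cstdio>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  taskflow.emplace([](tf::Runtime& rt){
    rt.silent_async([](){ std::printf("async task 1\n"); });
    rt.silent_async([](){ std::printf("async task 2\n"); });
    rt.corun_all();  // previously rt.join(); waits for all spawned async tasks
  });

  executor.run(taskflow).wait();
}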
-
+
diff --git a/docs/xml/release-3-8-0.xml b/docs/xml/release-3-8-0.xml
new file mode 100644
index 000000000..8ec13485a
--- /dev/null
+++ b/docs/xml/release-3-8-0.xml
@@ -0,0 +1,152 @@
+
+
+
+ release-3-8-0
+ Codestin Search App
+
+
+ Release Summary
+ release-3-8-0_1release-3-8-0_summary
+
+
+ Download
+ release-3-8-0_1release-3-8-0_download
+
+
+ System Requirements
+ release-3-8-0_1release-3-8-0_system_requirements
+
+
+ New Features
+ release-3-8-0_1release-3-8-0_new_features
+
+
+ Taskflow Core
+ release-3-8-0_1release-3-8-0_taskflow_core
+
+
+ Utilities
+ release-3-8-0_1release-3-8-0_utilities
+
+
+
+
+ Bug Fixes
+ release-3-8-0_1release-3-8-0_bug_fixes
+
+
+ Breaking Changes
+ release-3-8-0_1release-3-8-0_breaking_changes
+
+
+ Documentation
+ release-3-8-0_1release-3-8-0_documentation
+
+
+ Miscellaneous Items
+ release-3-8-0_1release-3-8-0_miscellaneous_items
+
+
+
+
+
+
+Codestin Search AppThis release (1) enhances the scheduling performance through C++20 atomic notification and a bounded queue strategy, and (2) revises the semaphore model for better runtime control.
+
+
+Codestin Search AppTaskflow 3.8.0 can be downloaded from here.
+
+
+Codestin Search AppTo use Taskflow v3.8.0, you need a compiler that supports C++17:
+
+GNU C++ Compiler at least v8.4 with -std=c++17
+
+Clang C++ Compiler at least v6.0 with -std=c++17
+
+Microsoft Visual Studio at least v19.27 with /std:c++17
+
+AppleClang Xcode Version at least v12.0 with -std=c++17
+
+Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
+
+Intel C++ Compiler at least v19.0.1 with -std=c++17
+
+Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17
+
+
+Taskflow works on Linux, Windows, and Mac OS X.
+Although Taskflow supports primarily C++17, you can enable C++20 compilation through -std=c++20 to achieve better performance due to new C++20 features.
+
+
+
+
+Codestin Search App
+Codestin Search App
+Enhanced the core scheduling algorithm using a new bounded queue strategy
+Enhanced the core scheduling performance using C++20 atomic notification
+
+
+# compile your taskflow program with C++20 enabled
+~$ g++ -std=c++20 my_taskflow.cpp
+
+
+Revised the semaphore programming model for better runtime control through tf::Runtime
+
+
+tf::Executor executor(8);    // create an executor of 8 workers
+tf::Taskflow taskflow;
+tf::Semaphore semaphore(1);  // create a semaphore with initial count 1
+for(size_t i=0; i<1000; i++) {
+  taskflow.emplace([&](tf::Runtime& rt){
+    rt.acquire(semaphore);
+    std::cout << "critical section here (one worker here only)\n";
+    critical_section();
+    rt.release(semaphore);
+  });
+}
+executor.run(taskflow).wait();
+
+
+Enhanced async-tasking performance through TLS
+Added async-task benchmark
+Added non-blocking notifier and atomic notifier modules
+Added tf::BoundedTaskQueue and tf::UnboundedTaskQueue
+Added tf::Freelist module to replace the centralized overflow queue
+Removed the redundant exception handling in object pool
+
+
+
+Codestin Search App
+
+
+Codestin Search App
+Fixed the compilation error for not finding the C++ atomic library
+Fixed the missing tf::Runtime in asynchronous tasking
+Fixed the non-heterogeneity of tf::Taskflow::for_each_index
+Fixed the bug of UUID unit test in a multithreaded environment
+
+
+
+Codestin Search App
+Removed the support of object pool by default
+Removed the support of prioritized tasking due to inconsistency with work stealing
+
+
+
+Codestin Search App
+Revised Limit the Maximum Concurrency
+Removed Prioritized Tasking
+Fixed typos in multiple pages
+
+
+
+Codestin Search AppPlease do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects.
+
+
+
+
+
diff --git a/docs/xml/release-3-9-0.xml b/docs/xml/release-3-9-0.xml
new file mode 100644
index 000000000..456f87707
--- /dev/null
+++ b/docs/xml/release-3-9-0.xml
@@ -0,0 +1,175 @@
+
+
+
+ release-3-9-0
+ Codestin Search App
+
+
+ Release Summary
+ release-3-9-0_1release-3-9-0_summary
+
+
+ Download
+ release-3-9-0_1release-3-9-0_download
+
+
+ System Requirements
+ release-3-9-0_1release-3-9-0_system_requirements
+
+
+ New Features
+ release-3-9-0_1release-3-9-0_new_features
+
+
+ Taskflow Core
+ release-3-9-0_1release-3-9-0_taskflow_core
+
+
+ Utilities
+ release-3-9-0_1release-3-9-0_utilities
+
+
+
+
+ Bug Fixes
+ release-3-9-0_1release-3-9-0_bug_fixes
+
+
+ Breaking Changes
+ release-3-9-0_1release-3-9-0_breaking_changes
+
+
+ Documentation
+ release-3-9-0_1release-3-9-0_documentation
+
+
+ Miscellaneous Items
+ release-3-9-0_1release-3-9-0_miscellaneous_items
+
+
+
+
+
+
+Codestin Search AppThis release improves scheduling performance with a decentralized work-stealing strategy and enhances exception handling across all task types.
+
+
+Codestin Search AppTaskflow 3.9.0 can be downloaded from here.
+
+
+Codestin Search AppTo use Taskflow v3.9.0, you need a compiler that supports C++17:
+
+GNU C++ Compiler at least v8.4 with -std=c++17
+
+Clang C++ Compiler at least v6.0 with -std=c++17
+
+Microsoft Visual Studio at least v19.27 with /std:c++17
+
+AppleClang Xcode Version at least v12.0 with -std=c++17
+
+Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
+
+Intel C++ Compiler at least v19.0.1 with -std=c++17
+
+Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17
+
+
+Taskflow works on Linux, Windows, and Mac OS X.
+Although Taskflow supports primarily C++17, you can enable C++20 compilation through -std=c++20 to achieve better performance due to new C++20 features. + + + + +Codestin Search App +Codestin Search App +improved the core scheduling algorithm using a decentralized work-stealing strategy +tf::BoundedTaskQueue to optimize per-thread work-stealing latency +tf::UnboundedTaskQueue to handle overflowed tasks + + +enhanced tf::Runtime to support preemptible execution flows +optimized task storage by storing detached tasks in their original subflows +optimized the query efficiency for strong dependencies by embedding their values in node states +updated tf::Graph to derive from a vector of unique pointers to nodes +Graph node lifetimes are managed by std::unique_ptr +Asynchronous task node lifetimes are managed by tf::Executor. + + +expanded unit tests to include more exception handling scenarios +decoupled tf::Runtime from static task to accommodate distinct execution logic +removed the blocking behavior to avoid underutilized threads for the following tasks: +module task (#649) +subflow task +all parallel algorithms (through preemptible async tasks) + + +removed std::bind from asynchronous tasks to ensure proper constexpr switch +added compile-time macros to enable specific features +TF_ENABLE_TASK_POOL to enable the use of task pool + + +added taskflow execution through asynchronous tasking with tf::make_module_task +details can be referred to Module Algorithm + + +added tf::WorkerInterface for users to configure the behaviors of workers +details can be referred to Executor + + +added worker interface example and unit tests + + + + +Codestin Search App +added tf::pause to relax CPU during busy spinning loop +added tf::seed to generate a random seed based on calling time point +added tf::atomic_min to update an atomic variable with the minimum value +added tf::atomic_max to update an atomic variable with the maximum value +added TF_CPP20 and TF_CPP17 macro for testing cpp versions + + + + + +Codestin Search App +fixed AppleClang compile error in tsq.hpp (#651) +fixed wrong range in uuid test (#632) +fixed the exception bug in tf::Subflow::join (#602) +fixed the wrong prefix of target when running benchmark.py +fixed a bug in the join counter reset logic for scheduling condition tasks (#652) + + + + +Codestin Search App +decoupled tf::Subflow from inheriting tf::Runtime to accommodate distinct execution logic +tf::Subflow no longer supports tf::Runtime-specific features + + +removed tf::Runtime::corun_until as it duplicates tf::Executor::corun_until +removed tf::Runtime-based semaphore interface due to significant flaws of blocking corun (#647) +details can be referred to Limit the Maximum Concurrency + + + + + + +Codestin Search App +fixed missing documentation of tf::Executor due to Doxygen bugs (#625) +fixed benchmark instance names in documentation (#621) +revised Exception Handling +revised Asynchronous Tasking +revised Limit the Maximum Concurrency +added Module Algorithm + + + + +Codestin Search AppPlease do not hesitate to contact Dr. Tsung-Wei Huang if you intend to collaborate with us on using Taskflow in your scientific computing projects. 
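The tf::make_module_task feature listed above can be sketched as follows; this is a hedged example assuming the v3.9 interface, including the taskflow/algorithm/module.hpp header (see Module Algorithm for the authoritative usage):

#include <taskflow/taskflow.hpp>
#include <taskflow/algorithm/module.hpp>  // assumed header for tf::make_module_task
#include <cstdio>

int main() {
  tf::Executor executor;
  tf::Taskflow A, B;

  A.emplace([](){ std::printf("task inside taskflow A\n"); });
  B.emplace([](){ std::printf("task inside taskflow B\n"); });

  // run taskflow A as an asynchronous module task, then run B
  executor.async(tf::make_module_task(A)).get();
  executor.run(B).wait();
}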
+ + + + + diff --git a/docs/xml/release-3_80_80_8dox.xml b/docs/xml/release-3_80_80_8dox.xml index 811809359..2c8ced0d7 100644 --- a/docs/xml/release-3_80_80_8dox.xml +++ b/docs/xml/release-3_80_80_8dox.xml @@ -1,5 +1,5 @@ - + release-3.0.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/kmeans__cudaflow_8dox.xml b/docs/xml/release-3_810_80_8dox.xml similarity index 55% rename from docs/xml/kmeans__cudaflow_8dox.xml rename to docs/xml/release-3_810_80_8dox.xml index 4c52df0b6..b0d431e1b 100644 --- a/docs/xml/kmeans__cudaflow_8dox.xml +++ b/docs/xml/release-3_810_80_8dox.xml @@ -1,12 +1,12 @@ - - - kmeans_cudaflow.dox + + + release-3.10.0.dox tf - + diff --git a/docs/xml/cuda__std__merge_8dox.xml b/docs/xml/release-3_811_80_8dox.xml similarity index 55% rename from docs/xml/cuda__std__merge_8dox.xml rename to docs/xml/release-3_811_80_8dox.xml index 5c5caffaf..21a4c339a 100644 --- a/docs/xml/cuda__std__merge_8dox.xml +++ b/docs/xml/release-3_811_80_8dox.xml @@ -1,12 +1,12 @@ - - - cuda_std_merge.dox + + + release-3.11.0.dox tf - + diff --git a/docs/xml/release-3_81_80_8dox.xml b/docs/xml/release-3_81_80_8dox.xml index 2405a1ffe..155386bdc 100644 --- a/docs/xml/release-3_81_80_8dox.xml +++ b/docs/xml/release-3_81_80_8dox.xml @@ -1,5 +1,5 @@ - + release-3.1.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-3_82_80_8dox.xml b/docs/xml/release-3_82_80_8dox.xml index 2cc327cd9..b619e5ca5 100644 --- a/docs/xml/release-3_82_80_8dox.xml +++ b/docs/xml/release-3_82_80_8dox.xml @@ -1,5 +1,5 @@ - + release-3.2.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-3_83_80_8dox.xml b/docs/xml/release-3_83_80_8dox.xml index 1d7981eb8..3a7be4396 100644 --- a/docs/xml/release-3_83_80_8dox.xml +++ b/docs/xml/release-3_83_80_8dox.xml @@ -1,5 +1,5 @@ - + release-3.3.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-3_84_80_8dox.xml b/docs/xml/release-3_84_80_8dox.xml index 30126566e..f4ccb3dbe 100644 --- a/docs/xml/release-3_84_80_8dox.xml +++ b/docs/xml/release-3_84_80_8dox.xml @@ -1,5 +1,5 @@ - + release-3.4.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-3_85_80_8dox.xml b/docs/xml/release-3_85_80_8dox.xml index f5e29429b..c00f63ed7 100644 --- a/docs/xml/release-3_85_80_8dox.xml +++ b/docs/xml/release-3_85_80_8dox.xml @@ -1,5 +1,5 @@ - + release-3.5.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-3_86_80_8dox.xml b/docs/xml/release-3_86_80_8dox.xml index 5521bacd6..9d5348909 100644 --- a/docs/xml/release-3_86_80_8dox.xml +++ b/docs/xml/release-3_86_80_8dox.xml @@ -1,5 +1,5 @@ - + release-3.6.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/release-3_87_80_8dox.xml b/docs/xml/release-3_87_80_8dox.xml index ee419dc44..e78ada042 100644 --- a/docs/xml/release-3_87_80_8dox.xml +++ b/docs/xml/release-3_87_80_8dox.xml @@ -1,5 +1,5 @@ - + release-3.7.0.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/cuda__std__scan_8dox.xml b/docs/xml/release-3_88_80_8dox.xml similarity index 55% rename from docs/xml/cuda__std__scan_8dox.xml rename to docs/xml/release-3_88_80_8dox.xml index 4bd634a62..63e126c1a 100644 --- a/docs/xml/cuda__std__scan_8dox.xml +++ b/docs/xml/release-3_88_80_8dox.xml @@ -1,12 +1,12 @@ - - - cuda_std_scan.dox + + + release-3.8.0.dox tf - + diff --git a/docs/xml/cuda__std__find_8dox.xml b/docs/xml/release-3_89_80_8dox.xml similarity index 55% rename from docs/xml/cuda__std__find_8dox.xml rename to docs/xml/release-3_89_80_8dox.xml index 620d7736a..2346e494e 100644 --- a/docs/xml/cuda__std__find_8dox.xml +++ b/docs/xml/release-3_89_80_8dox.xml @@ -1,12 
+1,12 @@ - - - cuda_std_find.dox + + + release-3.9.0.dox tf - + diff --git a/docs/xml/release-roadmap.xml b/docs/xml/release-roadmap.xml index 4440ace52..45c133e67 100644 --- a/docs/xml/release-roadmap.xml +++ b/docs/xml/release-roadmap.xml @@ -1,5 +1,5 @@ - + release-roadmap Codestin Search App @@ -7,7 +7,7 @@ Milestone Summary release-roadmap_1MilestoneSummary - + @@ -16,88 +16,72 @@ -Codestin Search App -The table below summarizes the milestones of Taskflow we plan to achieve by the end of 2021. Each milestone releases technical items that significantly enhances the capability of Taskflow. - +Codestin Search AppThe table below summarizes the milestones of Taskflow we plan to achieve by the end of 2021. Each milestone releases technical items that significantly enhances the capability of Taskflow. +
 Milestone
-Release
-Time of Arrival
+Release

 Migrate the codebase to C++20
-v4.x
-(under progress)
+v4.x

 Design a custom thread-creation interface
-TBD
-(under progress)
+TBD

 Design a distributed tasking interface with scheduling
-TBD
-(under progress)
+TBD

 Design a pipeline scheduling framework with token dependency
-v3.x
-(under progress)
+Release 3.7.0 (2024/05/07)

 Design a dynamic task graph model
-v3.6
-2023/05/08 (done)
+Release 3.6.0 (2023/05/07)

 Design a pipeline scheduling framework
-v3.3
-2022/01/03 (done)
+Release 3.3.0 (2022/01/03)

 Integrate thread sanitizer into the CI
-v3.3
-2022/01/03 (done)
+Release 3.3.0 (2022/01/03)

 Integrate OpenCL and SYCL to tf::syclFlow
-v3.1
-2021/04/14 (done)
+Release 3.1.0 (2021/04/14)

-Integrate cuBLAS into tf::cudaFlow
-v3.0
-2020/01/01 (done)
+Integrate cuBLAS into tf::cudaFlow
+Release 3.0.0 (2021/01/01)

 Support building cudaFlow through stream capture
-v3.0
-2021/01/01 (done)
+Release 3.0.0 (2021/01/01)

 Support profiling large data in tfprof
-v3.0
-2021/01/01 (done)
+Release 3.0.0 (2021/01/01)

 Support cancelling Taskflow
-v3.0
-2021/01/01 (done)
+Release 3.0.0 (2021/01/01)

 Support limiting maximum concurrency
-v3.0
-2021/01/01 (done)
+Release 3.0.0 (2021/01/01)

 Migrate the codebase to C++17
-v3.0
-2021/01/01 (done)
+Release 3.0.0 (2021/01/01)
 Along with the project development, we expect to have multiple releases for feature requests, bug fixes, and technical improvements.
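As a concrete illustration of the pipeline-scheduling milestone delivered in Release 3.3.0, the following minimal sketch builds a two-stage pipeline with Taskflow's documented tf::Pipeline API; the stage bodies and the token count of 10 are hypothetical placeholders, not part of this changeset.

    #include <taskflow/taskflow.hpp>
    #include <taskflow/algorithm/pipeline.hpp>

    int main() {
      tf::Executor executor;
      tf::Taskflow taskflow;

      const size_t num_lines = 4;  // number of concurrent pipeline lines

      tf::Pipeline pipeline(num_lines,
        // the first stage must be serial: emit tokens 0..9, then stop
        tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
          if(pf.token() == 10) { pf.stop(); }
        }},
        // the second stage processes tokens in parallel across lines
        tf::Pipe{tf::PipeType::PARALLEL, [](tf::Pipeflow& pf) {
          // process token pf.token() here
        }}
      );

      taskflow.composed_of(pipeline);
      executor.run(taskflow).wait();
    }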
diff --git a/docs/xml/release-roadmap_8dox.xml b/docs/xml/release-roadmap_8dox.xml
index ee1fca0fa..54d41beb4 100644
--- a/docs/xml/release-roadmap_8dox.xml
+++ b/docs/xml/release-roadmap_8dox.xml
@@ -1,5 +1,5 @@
 release-roadmap.dox
 tf
@@ -7,6 +7,6 @@

diff --git a/docs/xml/releases_8dox.xml b/docs/xml/releases_8dox.xml
index 12d94240b..ed22299ee 100644
--- a/docs/xml/releases_8dox.xml
+++ b/docs/xml/releases_8dox.xml
@@ -1,5 +1,5 @@
 releases.dox
 tf
@@ -7,6 +7,6 @@

diff --git a/docs/xml/rules.xml b/docs/xml/rules.xml
index 744a4c080..175bd936e 100644
--- a/docs/xml/rules.xml
+++ b/docs/xml/rules.xml
@@ -1,5 +1,5 @@
 rules
@@ -7,31 +7,31 @@
 The Project Overview
 rules_1TheProjectOverview
 Joining Core Members
 rules_1JoiningCoreMembers
 Rules for Roles
 rules_1RulesForRoles
 Software Decisions
 rules_1SoftwareDecisions
 Financial Decisions
 rules_1FinancialDecisions
 Community Partners
 rules_1CommunityPartners
 Changing the Governance Rules
 rules_1ChangingTheRules
@@ -39,31 +39,25 @@
 This page summarizes coordination rules fulfilled by the Taskflow Core Members. We impose these rules to ensure scientific excellence, continuity, and transparency. Since the Taskflow community has matured substantially in recent months, we will revisit these rules as the community needs. We accomplish these rules with reference to the governance documents of Dask and TARDIS.
-The Project Overview
-Taskflow (The Project) is an open-source software project that aims to simplify parallel and heterogeneous computing in the C++ software ecosystem. We release Taskflow under the non-viral MIT license, developed openly and hosted in public GitHub repositories under the Project GitHub. Examples of project software include the Taskflow core library, the Taskflow profiler (tfprof), and applications to other domains such as computer-aided design (CAD) and machine learning. We host a Project Website to highlight these components.
+The Project Overview: Taskflow (The Project) is an open-source software project that aims to simplify parallel and heterogeneous computing in the C++ software ecosystem. We release Taskflow under the non-viral MIT license, developed openly and hosted in public GitHub repositories under the Project GitHub. Examples of project software include the Taskflow core library, the Taskflow profiler (tfprof), and applications to other domains such as computer-aided design (CAD) and machine learning. We host a Project Website to highlight these components.
 Taskflow is developed by a distributed team of developers, called Contributors. Contributors are individuals who have contributed code, documentation, designs, user support, or other work to one or more project repositories. Anyone can be a Contributor. Contributors can be affiliated with any legal entity or none. Contributors participate in the project by submitting, reviewing, and discussing GitHub Pull Requests and Issues, and by participating in open and public project discussions on GitHub, Stack Overflow, Gitter chat rooms, and mailing lists. The foundation of project participation is openness and transparency.
 The Taskflow community consists of all contributors and users. Contributors work on behalf of, and are responsible to, the larger project community, and we strive to keep the barrier between contributors and users as low as possible.
-Joining Core Members
-Core Members are essential to the growth of Taskflow because they provide the core technical development, maintenance, and support for the community.
New members are nominated by current members or our sponsors. All core members can vote on nominated candidates, who require a 2/3 majority in their favor in order to be approved.
+Joining Core Members: Core Members are essential to the growth of Taskflow because they provide the core technical development, maintenance, and support for the community. New members are nominated by current members or our sponsors. All core members can vote on nominated candidates, who require a 2/3 majority in their favor in order to be approved.
-Rules for Roles
-Every core member of Taskflow can vote, and the election will go through an anonymous ranked voting system. If there is a tie, the principal investigator will facilitate a discussion to make a runoff decision.
+Rules for Roles: Every core member of Taskflow can vote, and the election will go through an anonymous ranked voting system. If there is a tie, the principal investigator will facilitate a discussion to make a runoff decision.
 Depending on the funding status, the principal investigator may change and be re-selected. Such a change will be broadcast to all core members, and we will strike a balance between how each member is funded and how the funding may direct Taskflow. At this stage, Dr. Tsung-Wei Huang is the principal investigator and will remain in the role for another 3-4 years.
-Software Decisions
-Decisions about software architecture, design, and releases should take into account consistency across the Taskflow codebase and best practices. The final decision rests with the core members by a 2/3 majority.
+Software Decisions: Decisions about software architecture, design, and releases should take into account consistency across the Taskflow codebase and best practices. The final decision rests with the core members by a 2/3 majority.
-Financial Decisions
-Financial decisions, such as research grants and company gifts, are made by the Principal Investigator, Dr. Tsung-Wei Huang. We will inform the core members with reasonable lead time to allow them to raise any objections, for example, features biased toward an individual's interest. The core members can veto decisions with a 2/3 majority.
+Financial Decisions: Financial decisions, such as research grants and company gifts, are made by the Principal Investigator, Dr. Tsung-Wei Huang. We will inform the core members with reasonable lead time to allow them to raise any objections, for example, features biased toward an individual's interest. The core members can veto decisions with a 2/3 majority.
-Community Partners
-We acknowledge the importance of Community Partners in disseminating Taskflow to external communities.
A Community Partner is a set of individuals (it does not need legal recognition) that effectively supports and communicates the needs of an external community in using the Project. External communities might be focused around a specific scientific or social discipline (like biology or education), a social grouping (like Chinese speakers), or another such group that benefits from a collective voice. Community Partners will have demonstrated technical expertise in using the Project, as well as social expertise in effectively filtering concerns and questions from their community to keep our project thriving.
 We acknowledge Community Partners in the following ways:
 Public acknowledgement of their community on Taskflow webpages and other promotional material if that community is organized enough to have a central brand.
@@ -74,10 +68,9 @@
 Please also visit How Can I Get Credit? to understand how we acknowledge contributors.
-Changing the Governance Rules
-Changes to the governance rules are submitted via a pull request to edit this documentation. The pull request is then refined in response to public comment and review, with the goal being consensus in the community.
+Changing the Governance Rules: Changes to the governance rules are submitted via a pull request to edit this documentation. The pull request is then refined in response to public comment and review, with the goal being consensus in the community.

diff --git a/docs/xml/rules_8dox.xml b/docs/xml/rules_8dox.xml
index eb08638ee..c9754be06 100644
--- a/docs/xml/rules_8dox.xml
+++ b/docs/xml/rules_8dox.xml
@@ -1,5 +1,5 @@
 rules.dox
 tf
@@ -7,6 +7,6 @@

diff --git a/docs/xml/runtime_8hpp.xml b/docs/xml/runtime_8hpp.xml
new file mode 100644
index 000000000..df87b85f8
--- /dev/null
+++ b/docs/xml/runtime_8hpp.xml
@@ -0,0 +1,331 @@
+runtime.hpp
+executor.hpp
+taskflow/taskflow.hpp
+tf::Runtime
+tf::PreemptionGuard
+tf
+TF_RUNTIME_CHECK_CALLER
+msg
+if(pt::this_worker != &_worker) { \
+  TF_THROW(msg); \
+}

diff --git a/docs/xml/runtime__tasking_8dox.xml b/docs/xml/runtime__tasking_8dox.xml
index 501ef6a54..740bb4b7c 100644
--- a/docs/xml/runtime__tasking_8dox.xml
+++ b/docs/xml/runtime__tasking_8dox.xml
@@ -1,5 +1,5 @@
 runtime_tasking.dox
 tf
@@ -7,6 +7,6 @@

diff --git a/docs/xml/scalable__pipeline_8dox.xml b/docs/xml/scalable__pipeline_8dox.xml
index 67872d110..8c66d42ae 100644
--- a/docs/xml/scalable__pipeline_8dox.xml
+++ b/docs/xml/scalable__pipeline_8dox.xml
@@ -1,5 +1,5 @@
 scalable_pipeline.dox
 tf
@@ -7,6 +7,6 @@

diff --git a/docs/xml/scalable_pipeline_2.dot b/docs/xml/scalable_pipeline_2.dot
index 01dec862d..70a051b5f 100644
--- a/docs/xml/scalable_pipeline_2.dot
+++ b/docs/xml/scalable_pipeline_2.dot
@@ -87,6 +87,7 @@
 p20 -> p21;
 p21 -> p22;
 p30 -> p31;
 p31 -> p32;
+p32 -> p33; // Added this line
 p00 -> p10;
 p01 -> p11;
 p02 -> p12;
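The new runtime_8hpp.xml above documents tf::Runtime, tf::PreemptionGuard, and a TF_RUNTIME_CHECK_CALLER macro that throws when a Runtime call is issued from a thread other than its owning worker. Below is a minimal sketch of a runtime task, assuming the tf::Runtime interface documented for recent Taskflow releases (rt.silent_async and rt.corun_all); the child workloads are hypothetical.

    #include <taskflow/taskflow.hpp>

    int main() {
      tf::Executor executor;
      tf::Taskflow taskflow;

      // a runtime task receives a handle to the scheduling runtime
      taskflow.emplace([](tf::Runtime& rt) {
        rt.silent_async([](){ /* child work 1 */ });
        rt.silent_async([](){ /* child work 2 */ });
        // join the spawned children; calls like this must come from rt's
        // own worker thread, which is what a caller check such as
        // TF_RUNTIME_CHECK_CALLER enforces internally
        rt.corun_all();
      });

      executor.run(taskflow).wait();
    }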
diff --git a/docs/xml/scan_8dox.xml b/docs/xml/scan_8dox.xml
index 0d4860091..bea484ef3 100644
--- a/docs/xml/scan_8dox.xml
+++ b/docs/xml/scan_8dox.xml
@@ -1,5 +1,5 @@
 scan.dox
 tf
@@ -7,6 +7,6 @@

diff --git a/docs/xml/scan_8hpp.xml b/docs/xml/scan_8hpp.xml
deleted file mode 100644
index cd9fd9095..000000000
--- a/docs/xml/scan_8hpp.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-scan.hpp
-tf::detail::cudaScanResult
-tf::detail::cudaScanResult< T, vt, true >
-tf::detail::cudaBlockScan
-tf::detail::cudaBlockScan::storage_t
-tf
-tf::detail
-CUDA scan algorithm include file.

diff --git a/docs/xml/semaphore_8dox.xml b/docs/xml/semaphore_8dox.xml
index 4c4430856..3ecf34130 100644
--- a/docs/xml/semaphore_8dox.xml
+++ b/docs/xml/semaphore_8dox.xml
@@ -1,5 +1,5 @@
 semaphore.dox
 tf
@@ -7,6 +7,6 @@

diff --git a/docs/xml/semaphore_8hpp.xml b/docs/xml/semaphore_8hpp.xml
index 81c2b48d7..726f3445e 100644
--- a/docs/xml/semaphore_8hpp.xml
+++ b/docs/xml/semaphore_8hpp.xml
@@ -1,7 +1,180 @@
 semaphore.hpp
+mutex
+declarations.hpp
+../utility/small_vector.hpp
+taskflow/core/graph.hpp
 tf::Semaphore
 tf
@@ -9,6 +182,6 @@

diff --git a/docs/xml/small__vector_8hpp.xml b/docs/xml/small__vector_8hpp.xml
index 2ba360b0c..4679d936b 100644
--- a/docs/xml/small__vector_8hpp.xml
+++ b/docs/xml/small__vector_8hpp.xml
@@ -1,7 +1,179 @@
 small_vector.hpp
+macros.hpp
+algorithm
+cassert
+cstddef
+cstdlib
+cstring
+initializer_list
+iterator
+memory
+taskflow/core/graph.hpp
+taskflow/core/semaphore.hpp
 tf::IsPod
 tf::SmallVectorBase
 tf::SmallVectorTemplateCommon
@@ -21,6 +193,6 @@

diff --git a/docs/xml/sort_8dox.xml b/docs/xml/sort_8dox.xml
index a80ed6528..a834efba3 100644
--- a/docs/xml/sort_8dox.xml
+++ b/docs/xml/sort_8dox.xml
@@ -1,5 +1,5 @@
 sort.dox
 tf
@@ -7,6 +7,6 @@

diff --git a/docs/xml/sort_8hpp.xml b/docs/xml/sort_8hpp.xml
deleted file mode 100644
index 517c20ddf..000000000
--- a/docs/xml/sort_8hpp.xml
+++ /dev/null
@@ -1,16 +0,0 @@
-sort.hpp
-tf::detail::cudaBlockSort
-tf::detail::cudaBlockSort::Storage
-tf
-tf::detail
-CUDA sort algorithm include file.
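The semaphore_8hpp.xml change above reflects that semaphore.hpp now pulls in mutex, declarations.hpp, small_vector.hpp, and graph.hpp; the tf::Semaphore it documents still serves to cap concurrency among tasks. A minimal sketch of that use case, assuming the documented acquire/release task interface; the worker count and loop bound are arbitrary.

    #include <taskflow/taskflow.hpp>

    int main() {
      tf::Executor executor(4);
      tf::Taskflow taskflow;
      tf::Semaphore semaphore(1);  // at most one concurrent holder

      for(int i = 0; i < 4; ++i) {
        tf::Task task = taskflow.emplace([](){
          // tasks guarded by the same semaphore never overlap in time
        });
        task.acquire(semaphore);
        task.release(semaphore);
      }

      executor.run(taskflow).wait();
    }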
- - - - - - diff --git a/docs/xml/static__tasking_8dox.xml b/docs/xml/static__tasking_8dox.xml index 145544a84..47d3edb9e 100644 --- a/docs/xml/static__tasking_8dox.xml +++ b/docs/xml/static__tasking_8dox.xml @@ -1,5 +1,5 @@ - + static_tasking.dox tf @@ -7,6 +7,6 @@ - + diff --git a/docs/xml/structtf_1_1ChromeObserver_1_1Segment.xml b/docs/xml/structtf_1_1ChromeObserver_1_1Segment.xml index 29e3c95d4..9d25f86cb 100644 --- a/docs/xml/structtf_1_1ChromeObserver_1_1Segment.xml +++ b/docs/xml/structtf_1_1ChromeObserver_1_1Segment.xml @@ -1,64 +1,68 @@ - + tf::ChromeObserver::Segment - + - std::string + std::string std::string tf::ChromeObserver::Segment::name name + tf::ChromeObserver::Segment::name - + - observer_stamp_t + observer_stamp_t observer_stamp_t tf::ChromeObserver::Segment::beg beg + tf::ChromeObserver::Segment::beg - + - observer_stamp_t + observer_stamp_t observer_stamp_t tf::ChromeObserver::Segment::end end + tf::ChromeObserver::Segment::end - + - - + + tf::ChromeObserver::Segment::Segment (const std::string &n, observer_stamp_t b, observer_stamp_t e) Segment + tf::ChromeObserver::Segment::Segment - const std::string & + const std::string & n - observer_stamp_t + observer_stamp_t b - observer_stamp_t + observer_stamp_t e @@ -67,32 +71,32 @@ - + - + + + + + + + - beg - end + name - name + beg + end - - - - - - - + tf::ChromeObserver::Segmentbeg tf::ChromeObserver::Segmentend diff --git a/docs/xml/structtf_1_1ChromeObserver_1_1Timeline.xml b/docs/xml/structtf_1_1ChromeObserver_1_1Timeline.xml index 3191c9696..7007035d8 100644 --- a/docs/xml/structtf_1_1ChromeObserver_1_1Timeline.xml +++ b/docs/xml/structtf_1_1ChromeObserver_1_1Timeline.xml @@ -1,76 +1,79 @@ - + tf::ChromeObserver::Timeline - + - observer_stamp_t + observer_stamp_t observer_stamp_t tf::ChromeObserver::Timeline::origin origin + tf::ChromeObserver::Timeline::origin - + - std::vector< std::vector< Segment > > + std::vector< std::vector< Segment > > std::vector<std::vector<Segment> > tf::ChromeObserver::Timeline::segments segments + tf::ChromeObserver::Timeline::segments - + - std::vector< std::stack< observer_stamp_t > > + std::vector< std::stack< observer_stamp_t > > std::vector<std::stack<observer_stamp_t> > tf::ChromeObserver::Timeline::stacks stacks + tf::ChromeObserver::Timeline::stacks - + - + + + + + + + - segments + origin - stacks + segments - origin + stacks - - - - - - - + tf::ChromeObserver::Timelineorigin tf::ChromeObserver::Timelinesegments diff --git a/docs/xml/structtf_1_1DataPipeline_1_1Line.xml b/docs/xml/structtf_1_1DataPipeline_1_1Line.xml index 0b6322ed7..bc799b57b 100644 --- a/docs/xml/structtf_1_1DataPipeline_1_1Line.xml +++ b/docs/xml/structtf_1_1DataPipeline_1_1Line.xml @@ -1,38 +1,39 @@ - + tf::DataPipeline::Line - + - std::atomic< size_t > + std::atomic< size_t > std::atomic<size_t> tf::DataPipeline< Ps >::Line::join_counter join_counter + tf::DataPipeline::Line::join_counter - + - + + + + join_counter - - - - + tf::DataPipeline::Linejoin_counter diff --git a/docs/xml/structtf_1_1DataPipeline_1_1PipeMeta.xml b/docs/xml/structtf_1_1DataPipeline_1_1PipeMeta.xml index bb8510476..661c5767b 100644 --- a/docs/xml/structtf_1_1DataPipeline_1_1PipeMeta.xml +++ b/docs/xml/structtf_1_1DataPipeline_1_1PipeMeta.xml @@ -1,27 +1,28 @@ - + tf::DataPipeline::PipeMeta - + PipeType PipeType tf::DataPipeline< Ps >::PipeMeta::type type + tf::DataPipeline::PipeMeta::type - + - + - + tf::DataPipeline::PipeMetatype diff --git a/docs/xml/structtf_1_1DefaultClosureWrapper.xml 
b/docs/xml/structtf_1_1DefaultClosureWrapper.xml deleted file mode 100644 index bb8616566..000000000 --- a/docs/xml/structtf_1_1DefaultClosureWrapper.xml +++ /dev/null @@ -1,15 +0,0 @@ - - - - tf::DefaultClosureWrapper - partitioner.hpp - -default closure wrapper that simplies runs the given closure as is - - - - - - - - diff --git a/docs/xml/structtf_1_1DefaultTaskParams.xml b/docs/xml/structtf_1_1DefaultTaskParams.xml deleted file mode 100644 index 4088d6064..000000000 --- a/docs/xml/structtf_1_1DefaultTaskParams.xml +++ /dev/null @@ -1,15 +0,0 @@ - - - - tf::DefaultTaskParams - graph.hpp - -empty task parameter type for compile-time optimization - - - - - - - - diff --git a/docs/xml/structtf_1_1IsPartitioner.xml b/docs/xml/structtf_1_1IsPartitioner.xml index 1c3f9546c..16d836b34 100644 --- a/docs/xml/structtf_1_1IsPartitioner.xml +++ b/docs/xml/structtf_1_1IsPartitioner.xml @@ -1,5 +1,5 @@ - + tf::IsPartitioner tf::PartitionerBase< DefaultClosureWrapper > @@ -9,37 +9,25 @@ - - - - - - - - - - - - - - - + + + - - - - + + + + @@ -48,8 +36,20 @@ + + + + + + + + + + + + - + diff --git a/docs/xml/structtf_1_1IsPod.xml b/docs/xml/structtf_1_1IsPod.xml index 027d5f5ce..cfcd42085 100644 --- a/docs/xml/structtf_1_1IsPod.xml +++ b/docs/xml/structtf_1_1IsPod.xml @@ -1,5 +1,5 @@ - + tf::IsPod std::integral_constant< bool, std::is_standard_layout< T >::value &&std::is_trivial< T >::value > @@ -32,7 +32,7 @@ - + diff --git a/docs/xml/structtf_1_1NodeDeleter.xml b/docs/xml/structtf_1_1NodeDeleter.xml deleted file mode 100644 index b21443b7b..000000000 --- a/docs/xml/structtf_1_1NodeDeleter.xml +++ /dev/null @@ -1,33 +0,0 @@ - - - - tf::NodeDeleter - - - void - void tf::NodeDeleter::operator() - (Node *ptr) - operator() - - Node * - ptr - - - - - - - - - - - - - - - - - tf::NodeDeleteroperator() - - - diff --git a/docs/xml/structtf_1_1Node_1_1Async.xml b/docs/xml/structtf_1_1Node_1_1Async.xml index 8c9ca95b9..63c29514b 100644 --- a/docs/xml/structtf_1_1Node_1_1Async.xml +++ b/docs/xml/structtf_1_1Node_1_1Async.xml @@ -1,23 +1,24 @@ - + tf::Node::Async - - - std::variant< std::function< void()>, std::function< void(Runtime &)> > - std::variant< std::function<void()>, std::function<void(Runtime&)> > tf::Node::Async::work + + + std::variant< std::function< void()>, std::function< void(tf::Runtime &)>, std::function< void(tf::Runtime &, bool)> > + std::variant< std::function<void()>, std::function<void(tf::Runtime&)>, std::function<void(tf::Runtime&, bool)> > tf::Node::Async::work work + tf::Node::Async::work - + - - + + @@ -28,6 +29,7 @@ tf::Node::Async::Async (T &&) Async + tf::Node::Async::Async T && @@ -37,7 +39,7 @@ - + @@ -49,6 +51,7 @@ tf::Node::Async::Async (C &&c) Async + tf::Node::Async::Async C && c @@ -59,18 +62,18 @@ - + - + - + tf::Node::AsyncAsync tf::Node::AsyncAsync - tf::Node::Asyncwork + tf::Node::Asyncwork diff --git a/docs/xml/structtf_1_1Node_1_1Condition.xml b/docs/xml/structtf_1_1Node_1_1Condition.xml index 6e8cb0206..46d8b3cf6 100644 --- a/docs/xml/structtf_1_1Node_1_1Condition.xml +++ b/docs/xml/structtf_1_1Node_1_1Condition.xml @@ -1,23 +1,24 @@ - + tf::Node::Condition - - - std::variant< std::function< int()>, std::function< int(Runtime &)> > - std::variant< std::function<int()>, std::function<int(Runtime&)> > tf::Node::Condition::work + + + std::function< int()> + std::function<int()> tf::Node::Condition::work work + tf::Node::Condition::work - + - - + + @@ -28,6 +29,7 @@ tf::Node::Condition::Condition (C &&) Condition + tf::Node::Condition::Condition C && c @@ -38,17 +40,28 @@ 
- + - + - + + + + + + + + work + + + + tf::Node::ConditionCondition - tf::Node::Conditionwork + tf::Node::Conditionwork diff --git a/docs/xml/structtf_1_1Node_1_1DependentAsync.xml b/docs/xml/structtf_1_1Node_1_1DependentAsync.xml index 703c63987..b6d49cf4a 100644 --- a/docs/xml/structtf_1_1Node_1_1DependentAsync.xml +++ b/docs/xml/structtf_1_1Node_1_1DependentAsync.xml @@ -1,26 +1,28 @@ - + tf::Node::DependentAsync - - - std::variant< std::function< void()>, std::function< void(Runtime &)> > - std::variant< std::function<void()>, std::function<void(Runtime&)> > tf::Node::DependentAsync::work + + + std::variant< std::function< void()>, std::function< void(tf::Runtime &)>, std::function< void(tf::Runtime &, bool)> > + std::variant< std::function<void()>, std::function<void(tf::Runtime&)>, std::function<void(tf::Runtime&, bool)> > tf::Node::DependentAsync::work work + tf::Node::DependentAsync::work - + - std::atomic< size_t > + std::atomic< size_t > std::atomic<size_t> tf::Node::DependentAsync::use_count use_count + tf::Node::DependentAsync::use_count {1} @@ -28,24 +30,25 @@ - + - - std::atomic< AsyncState > - std::atomic<AsyncState> tf::Node::DependentAsync::state + + std::atomic< ASTATE::underlying_type > + std::atomic<ASTATE::underlying_type> tf::Node::DependentAsync::state state - {AsyncState::UNFINISHED} + tf::Node::DependentAsync::state + {ASTATE::UNFINISHED} - + - - + + @@ -56,6 +59,7 @@ tf::Node::DependentAsync::DependentAsync (C &&) DependentAsync + tf::Node::DependentAsync::DependentAsync C && c @@ -66,14 +70,20 @@ - + - + + + + + + + @@ -83,19 +93,13 @@ state - - - - - - - + tf::Node::DependentAsyncDependentAsync - tf::Node::DependentAsyncstate + tf::Node::DependentAsyncstate tf::Node::DependentAsyncuse_count - tf::Node::DependentAsyncwork + tf::Node::DependentAsyncwork diff --git a/docs/xml/structtf_1_1Node_1_1Module.xml b/docs/xml/structtf_1_1Node_1_1Module.xml index 3b3213d83..548527da7 100644 --- a/docs/xml/structtf_1_1Node_1_1Module.xml +++ b/docs/xml/structtf_1_1Node_1_1Module.xml @@ -1,23 +1,24 @@ - + tf::Node::Module - + Graph & Graph& tf::Node::Module::graph graph + tf::Node::Module::graph - + - - + + @@ -28,6 +29,7 @@ tf::Node::Module::Module (T &) Module + tf::Node::Module::Module T & obj @@ -38,17 +40,22 @@ - + - + + + + + + @@ -57,7 +64,7 @@ - + tf::Node::Modulegraph tf::Node::ModuleModule diff --git a/docs/xml/structtf_1_1Node_1_1MultiCondition.xml b/docs/xml/structtf_1_1Node_1_1MultiCondition.xml index c55bdc277..cf8adfcba 100644 --- a/docs/xml/structtf_1_1Node_1_1MultiCondition.xml +++ b/docs/xml/structtf_1_1Node_1_1MultiCondition.xml @@ -1,23 +1,24 @@ - + tf::Node::MultiCondition - - - std::variant< std::function< SmallVector< int >)>, std::function< SmallVector< int >Runtime &)> > - std::variant< std::function<SmallVector<int>)>, std::function<SmallVector<int>Runtime&)> > tf::Node::MultiCondition::work + + + std::function< SmallVector< int >()> + std::function<SmallVector<int>()> tf::Node::MultiCondition::work work + tf::Node::MultiCondition::work - + - - + + @@ -28,6 +29,7 @@ tf::Node::MultiCondition::MultiCondition (C &&) MultiCondition + tf::Node::MultiCondition::MultiCondition C && c @@ -38,17 +40,28 @@ - + - + - + + + + + + + + work + + + + tf::Node::MultiConditionMultiCondition - tf::Node::MultiConditionwork + tf::Node::MultiConditionwork diff --git a/docs/xml/structtf_1_1Node_1_1Runtime.xml b/docs/xml/structtf_1_1Node_1_1Runtime.xml new file mode 100644 index 000000000..5a6349c55 --- /dev/null +++ b/docs/xml/structtf_1_1Node_1_1Runtime.xml @@ -0,0 +1,67 @@ 
+ + + + tf::Node::Runtime + + + std::function< void(tf::Runtime &)> + std::function<void(tf::Runtime&)> tf::Node::Runtime::work + + work + tf::Node::Runtime::work + + + + + + + + + + + + + + typename C + + + + tf::Node::Runtime::Runtime + (C &&) + Runtime + tf::Node::Runtime::Runtime + + C && + c + + + + + + + + + + + + + + + + + + + + + + work + + + + + + tf::Node::RuntimeRuntime + tf::Node::Runtimework + + + diff --git a/docs/xml/structtf_1_1Node_1_1Semaphores.xml b/docs/xml/structtf_1_1Node_1_1Semaphores.xml index 737fcee39..ab01466e4 100644 --- a/docs/xml/structtf_1_1Node_1_1Semaphores.xml +++ b/docs/xml/structtf_1_1Node_1_1Semaphores.xml @@ -1,35 +1,37 @@ - + tf::Node::Semaphores - + SmallVector< Semaphore * > SmallVector<Semaphore*> tf::Node::Semaphores::to_acquire to_acquire + tf::Node::Semaphores::to_acquire - + SmallVector< Semaphore * > SmallVector<Semaphore*> tf::Node::Semaphores::to_release to_release + tf::Node::Semaphores::to_release - + - + @@ -42,32 +44,32 @@ to_release - - - - - + + + - - - + + + + + - + tf::Node::Semaphoresto_acquire tf::Node::Semaphoresto_release diff --git a/docs/xml/structtf_1_1Node_1_1Static.xml b/docs/xml/structtf_1_1Node_1_1Static.xml index 6ee6ce26e..72544e042 100644 --- a/docs/xml/structtf_1_1Node_1_1Static.xml +++ b/docs/xml/structtf_1_1Node_1_1Static.xml @@ -1,23 +1,24 @@ - + tf::Node::Static - - - std::variant< std::function< void()>, std::function< void(Runtime &)> > - std::variant< std::function<void()>, std::function<void(Runtime&)> > tf::Node::Static::work + + + std::function< void()> + std::function<void()> tf::Node::Static::work work + tf::Node::Static::work - + - - + + @@ -28,6 +29,7 @@ tf::Node::Static::Static (C &&) Static + tf::Node::Static::Static C && c @@ -38,17 +40,28 @@ - + - + - + + + + + + + + work + + + + tf::Node::StaticStatic - tf::Node::Staticwork + tf::Node::Staticwork diff --git a/docs/xml/structtf_1_1Node_1_1Subflow.xml b/docs/xml/structtf_1_1Node_1_1Subflow.xml index 8a456c3ef..027e227bb 100644 --- a/docs/xml/structtf_1_1Node_1_1Subflow.xml +++ b/docs/xml/structtf_1_1Node_1_1Subflow.xml @@ -1,36 +1,38 @@ - + tf::Node::Subflow - + - std::function< void(tf::Subflow &)> + std::function< void(tf::Subflow &)> std::function<void(tf::Subflow&)> tf::Node::Subflow::work work + tf::Node::Subflow::work - + Graph Graph tf::Node::Subflow::subgraph subgraph + tf::Node::Subflow::subgraph - + - - + + @@ -41,6 +43,7 @@ tf::Node::Subflow::Subflow (C &&) Subflow + tf::Node::Subflow::Subflow C && c @@ -51,20 +54,25 @@ - + - + + + + + + + - - - + + @@ -76,7 +84,7 @@ - + tf::Node::SubflowSubflow tf::Node::Subflowsubgraph diff --git a/docs/xml/structtf_1_1Pipeline_1_1Line.xml b/docs/xml/structtf_1_1Pipeline_1_1Line.xml index 4704eeed6..a66a4f680 100644 --- a/docs/xml/structtf_1_1Pipeline_1_1Line.xml +++ b/docs/xml/structtf_1_1Pipeline_1_1Line.xml @@ -1,38 +1,39 @@ - + tf::Pipeline::Line - + - std::atomic< size_t > + std::atomic< size_t > std::atomic<size_t> tf::Pipeline< Ps >::Line::join_counter join_counter + tf::Pipeline::Line::join_counter - + - + + + + join_counter - - - - + tf::Pipeline::Linejoin_counter diff --git a/docs/xml/structtf_1_1Pipeline_1_1PipeMeta.xml b/docs/xml/structtf_1_1Pipeline_1_1PipeMeta.xml index 85bd677dd..5547aed69 100644 --- a/docs/xml/structtf_1_1Pipeline_1_1PipeMeta.xml +++ b/docs/xml/structtf_1_1Pipeline_1_1PipeMeta.xml @@ -1,27 +1,28 @@ - + tf::Pipeline::PipeMeta - + PipeType PipeType tf::Pipeline< Ps >::PipeMeta::type type + tf::Pipeline::PipeMeta::type - + - + - + tf::Pipeline::PipeMetatype diff --git 
a/docs/xml/structtf_1_1ProfileData.xml b/docs/xml/structtf_1_1ProfileData.xml index dd972d97e..0d1f752e2 100644 --- a/docs/xml/structtf_1_1ProfileData.xml +++ b/docs/xml/structtf_1_1ProfileData.xml @@ -1,41 +1,44 @@ - + tf::ProfileData - + - std::vector< Timeline > + std::vector< Timeline > std::vector<Timeline> tf::ProfileData::timelines timelines + tf::ProfileData::timelines - + - - + + tf::ProfileData::ProfileData ()=default ProfileData + tf::ProfileData::ProfileData - + tf::ProfileData::ProfileData (const ProfileData &rhs)=delete ProfileData + tf::ProfileData::ProfileData const ProfileData & rhs @@ -46,13 +49,14 @@ - + tf::ProfileData::ProfileData (ProfileData &&rhs)=default ProfileData + tf::ProfileData::ProfileData ProfileData && rhs @@ -63,13 +67,14 @@ - + - + ProfileData & - ProfileData& tf::ProfileData::operator= + ProfileData & tf::ProfileData::operator= (const ProfileData &rhs)=delete operator= + tf::ProfileData::operator= const ProfileData & rhs @@ -80,13 +85,14 @@ - + - + ProfileData & - ProfileData& tf::ProfileData::operator= + ProfileData & tf::ProfileData::operator= (ProfileData &&)=default operator= + tf::ProfileData::operator= ProfileData && @@ -96,7 +102,7 @@ - + @@ -108,6 +114,7 @@ auto tf::ProfileData::save (Archiver &ar) const save + tf::ProfileData::save Archiver & ar @@ -118,7 +125,7 @@ - + @@ -130,6 +137,7 @@ auto tf::ProfileData::load (Archiver &ar) load + tf::ProfileData::load Archiver & ar @@ -140,29 +148,29 @@ - + - + + + + timelines - - - - + tf::ProfileDataload - tf::ProfileDataoperator= - tf::ProfileDataoperator= + tf::ProfileDataoperator= + tf::ProfileDataoperator= tf::ProfileDataProfileData tf::ProfileDataProfileData tf::ProfileDataProfileData diff --git a/docs/xml/structtf_1_1ScalablePipeline_1_1Line.xml b/docs/xml/structtf_1_1ScalablePipeline_1_1Line.xml index 73b0c8899..186786deb 100644 --- a/docs/xml/structtf_1_1ScalablePipeline_1_1Line.xml +++ b/docs/xml/structtf_1_1ScalablePipeline_1_1Line.xml @@ -1,38 +1,39 @@ - + tf::ScalablePipeline::Line - + - std::atomic< size_t > + std::atomic< size_t > std::atomic<size_t> tf::ScalablePipeline< P >::Line::join_counter join_counter + tf::ScalablePipeline::Line::join_counter - + - + + + + join_counter - - - - + tf::ScalablePipeline::Linejoin_counter diff --git a/docs/xml/structtf_1_1Segment.xml b/docs/xml/structtf_1_1Segment.xml index 0e7eb05f9..aae381863 100644 --- a/docs/xml/structtf_1_1Segment.xml +++ b/docs/xml/structtf_1_1Segment.xml @@ -1,62 +1,66 @@ - + tf::Segment - + - std::string + std::string std::string tf::Segment::name name + tf::Segment::name - + TaskType TaskType tf::Segment::type type + tf::Segment::type - + - observer_stamp_t + observer_stamp_t observer_stamp_t tf::Segment::beg beg + tf::Segment::beg - + - observer_stamp_t + observer_stamp_t observer_stamp_t tf::Segment::end end + tf::Segment::end - + - - + + @@ -67,6 +71,7 @@ auto tf::Segment::save (Archiver &ar) const save + tf::Segment::save Archiver & ar @@ -77,7 +82,7 @@ - + @@ -89,6 +94,7 @@ auto tf::Segment::load (Archiver &ar) load + tf::Segment::load Archiver & ar @@ -99,28 +105,30 @@ - + tf::Segment::Segment ()=default Segment + tf::Segment::Segment - + tf::Segment::Segment (const std::string &n, TaskType t, observer_stamp_t b, observer_stamp_t e) Segment + tf::Segment::Segment - const std::string & + const std::string & n @@ -128,11 +136,11 @@ t - observer_stamp_t + observer_stamp_t b - observer_stamp_t + observer_stamp_t e @@ -141,45 +149,46 @@ - + auto auto tf::Segment::span () const span + tf::Segment::span - + - + - + + + + - beg 
- end + name - name + beg + end - - - - + tf::Segmentbeg tf::Segmentend diff --git a/docs/xml/structtf_1_1SmallVectorStorage.xml b/docs/xml/structtf_1_1SmallVectorStorage.xml index b3ae11e0b..ef520bdb8 100644 --- a/docs/xml/structtf_1_1SmallVectorStorage.xml +++ b/docs/xml/structtf_1_1SmallVectorStorage.xml @@ -1,5 +1,5 @@ - + tf::SmallVectorStorage @@ -12,26 +12,27 @@ N - + SmallVectorTemplateCommon< T >::U SmallVectorTemplateCommon<T>::U tf::SmallVectorStorage< T, N >::InlineElts[N - 1] [N - 1] InlineElts + tf::SmallVectorStorage::InlineElts - + - + - + tf::SmallVectorStorageInlineElts diff --git a/docs/xml/structtf_1_1SmallVectorStorage_3_01T_00_010_01_4.xml b/docs/xml/structtf_1_1SmallVectorStorage_3_01T_00_010_01_4.xml index 9b86dbcaa..410ffcddf 100644 --- a/docs/xml/structtf_1_1SmallVectorStorage_3_01T_00_010_01_4.xml +++ b/docs/xml/structtf_1_1SmallVectorStorage_3_01T_00_010_01_4.xml @@ -1,5 +1,5 @@ - + tf::SmallVectorStorage< T, 0 > @@ -11,7 +11,7 @@ - + diff --git a/docs/xml/structtf_1_1SmallVectorStorage_3_01T_00_011_01_4.xml b/docs/xml/structtf_1_1SmallVectorStorage_3_01T_00_011_01_4.xml index 655232868..728d01972 100644 --- a/docs/xml/structtf_1_1SmallVectorStorage_3_01T_00_011_01_4.xml +++ b/docs/xml/structtf_1_1SmallVectorStorage_3_01T_00_011_01_4.xml @@ -1,5 +1,5 @@ - + tf::SmallVectorStorage< T, 1 > @@ -11,7 +11,7 @@ - + diff --git a/docs/xml/structtf_1_1SmallVectorTemplateCommon_1_1AlignedUnionType.xml b/docs/xml/structtf_1_1SmallVectorTemplateCommon_1_1AlignedUnionType.xml index 5465289c7..870278c90 100644 --- a/docs/xml/structtf_1_1SmallVectorTemplateCommon_1_1AlignedUnionType.xml +++ b/docs/xml/structtf_1_1SmallVectorTemplateCommon_1_1AlignedUnionType.xml @@ -1,5 +1,5 @@ - + tf::SmallVectorTemplateCommon::AlignedUnionType @@ -7,28 +7,58 @@ typename X - - + + + std::size_t + std::size_t tf::SmallVectorTemplateCommon< T, typename >::AlignedUnionType< X >::max_size + + max_size + tf::SmallVectorTemplateCommon::AlignedUnionType::max_size + = (sizeof(std::byte) > sizeof(X)) ? 
sizeof(std::byte) : sizeof(X) + + + + + + + + + + + std::byte - std::byte tf::SmallVectorTemplateCommon< T, typename >::AlignedUnionType< X >::buff[std::max(sizeof(std::byte), sizeof(X))] - [std::max(sizeof(std::byte), sizeof(X))] + std::byte tf::SmallVectorTemplateCommon< T, typename >::AlignedUnionType< X >::buff[max_size] + [max_size] buff + tf::SmallVectorTemplateCommon::AlignedUnionType::buff - + - + - + + + + + + + + max_size + + + + - tf::SmallVectorTemplateCommon::AlignedUnionTypebuff + tf::SmallVectorTemplateCommon::AlignedUnionTypebuff + tf::SmallVectorTemplateCommon::AlignedUnionTypemax_size diff --git a/docs/xml/structtf_1_1TFProfObserver_1_1Summary.xml b/docs/xml/structtf_1_1TFProfObserver_1_1Summary.xml index 5ac2a634e..6409809bd 100644 --- a/docs/xml/structtf_1_1TFProfObserver_1_1Summary.xml +++ b/docs/xml/structtf_1_1TFProfObserver_1_1Summary.xml @@ -1,43 +1,46 @@ - + tf::TFProfObserver::Summary - + - std::array< TaskSummary, TASK_TYPES.size()> + std::array< TaskSummary, TASK_TYPES.size()> std::array<TaskSummary, TASK_TYPES.size()> tf::TFProfObserver::Summary::tsum tsum + tf::TFProfObserver::Summary::tsum - + - std::vector< WorkerSummary > + std::vector< WorkerSummary > std::vector<WorkerSummary> tf::TFProfObserver::Summary::wsum wsum + tf::TFProfObserver::Summary::wsum - + - - + + void void tf::TFProfObserver::Summary::dump_tsum (std::ostream &) const dump_tsum + tf::TFProfObserver::Summary::dump_tsum - std::ostream & + std::ostream & os @@ -46,15 +49,16 @@ - + void void tf::TFProfObserver::Summary::dump_wsum (std::ostream &) const dump_wsum + tf::TFProfObserver::Summary::dump_wsum - std::ostream & + std::ostream & os @@ -63,15 +67,16 @@ - + void void tf::TFProfObserver::Summary::dump (std::ostream &) const dump + tf::TFProfObserver::Summary::dump - std::ostream & + std::ostream & os @@ -80,31 +85,31 @@ - + - + + + + + + + - wsum + tsum - tsum + wsum - - - - - - - + tf::TFProfObserver::Summarydump tf::TFProfObserver::Summarydump_tsum diff --git a/docs/xml/structtf_1_1TFProfObserver_1_1TaskSummary.xml b/docs/xml/structtf_1_1TFProfObserver_1_1TaskSummary.xml index 009c77e0a..33140036b 100644 --- a/docs/xml/structtf_1_1TFProfObserver_1_1TaskSummary.xml +++ b/docs/xml/structtf_1_1TFProfObserver_1_1TaskSummary.xml @@ -1,13 +1,14 @@ - + tf::TFProfObserver::TaskSummary - + size_t size_t tf::TFProfObserver::TaskSummary::count count + tf::TFProfObserver::TaskSummary::count {0} @@ -15,13 +16,14 @@ - + size_t size_t tf::TFProfObserver::TaskSummary::total_span total_span + tf::TFProfObserver::TaskSummary::total_span {0} @@ -29,56 +31,58 @@ - + size_t size_t tf::TFProfObserver::TaskSummary::min_span min_span + tf::TFProfObserver::TaskSummary::min_span - + size_t size_t tf::TFProfObserver::TaskSummary::max_span max_span + tf::TFProfObserver::TaskSummary::max_span - + - - + + float float tf::TFProfObserver::TaskSummary::avg_span () const avg_span + tf::TFProfObserver::TaskSummary::avg_span - + - + -overall task summary - + tf::TFProfObserver::TaskSummaryavg_span tf::TFProfObserver::TaskSummarycount diff --git a/docs/xml/structtf_1_1TFProfObserver_1_1WorkerSummary.xml b/docs/xml/structtf_1_1TFProfObserver_1_1WorkerSummary.xml index e70128112..142add85f 100644 --- a/docs/xml/structtf_1_1TFProfObserver_1_1WorkerSummary.xml +++ b/docs/xml/structtf_1_1TFProfObserver_1_1WorkerSummary.xml @@ -1,39 +1,42 @@ - + tf::TFProfObserver::WorkerSummary - + size_t size_t tf::TFProfObserver::WorkerSummary::id id + tf::TFProfObserver::WorkerSummary::id - + size_t size_t 
tf::TFProfObserver::WorkerSummary::level level + tf::TFProfObserver::WorkerSummary::level - + size_t size_t tf::TFProfObserver::WorkerSummary::count count + tf::TFProfObserver::WorkerSummary::count {0} @@ -41,13 +44,14 @@ - + size_t size_t tf::TFProfObserver::WorkerSummary::total_span total_span + tf::TFProfObserver::WorkerSummary::total_span {0} @@ -55,13 +59,14 @@ - + size_t size_t tf::TFProfObserver::WorkerSummary::min_span min_span + tf::TFProfObserver::WorkerSummary::min_span {0} @@ -69,13 +74,14 @@ - + size_t size_t tf::TFProfObserver::WorkerSummary::max_span max_span + tf::TFProfObserver::WorkerSummary::max_span {0} @@ -83,41 +89,42 @@ - + - std::array< TaskSummary, TASK_TYPES.size()> + std::array< TaskSummary, TASK_TYPES.size()> std::array<TaskSummary, TASK_TYPES.size()> tf::TFProfObserver::WorkerSummary::tsum tsum + tf::TFProfObserver::WorkerSummary::tsum - + - - + + float float tf::TFProfObserver::WorkerSummary::avg_span () const avg_span + tf::TFProfObserver::WorkerSummary::avg_span - + - + -worker summary at a level @@ -130,7 +137,7 @@ - + tf::TFProfObserver::WorkerSummaryavg_span tf::TFProfObserver::WorkerSummarycount diff --git a/docs/xml/structtf_1_1TaskParams.xml b/docs/xml/structtf_1_1TaskParams.xml deleted file mode 100644 index 33ac3911d..000000000 --- a/docs/xml/structtf_1_1TaskParams.xml +++ /dev/null @@ -1,76 +0,0 @@ - - - - tf::TaskParams - graph.hpp - - - std::string - std::string tf::TaskParams::name - - name - -name of the task - - - - - - - - - unsigned - unsigned tf::TaskParams::priority - - priority - {0} - -priority of the tassk - - - - - - - - - void * - void* tf::TaskParams::data - - data - {nullptr} - -C-styled pointer to user data. - - - - - - - - - -task parameters to use when creating an asynchronous task - - - - - - - - - name - - - - - - - - - tf::TaskParamsdata - tf::TaskParamsname - tf::TaskParamspriority - - - diff --git a/docs/xml/structtf_1_1TaskQueue_1_1Array.xml b/docs/xml/structtf_1_1TaskQueue_1_1Array.xml deleted file mode 100644 index 86b2c9a53..000000000 --- a/docs/xml/structtf_1_1TaskQueue_1_1Array.xml +++ /dev/null @@ -1,178 +0,0 @@ - - - - tf::TaskQueue::Array - - - int64_t - int64_t tf::TaskQueue< T, TF_MAX_PRIORITY >::Array::C - - C - - - - - - - - - - int64_t - int64_t tf::TaskQueue< T, TF_MAX_PRIORITY >::Array::M - - M - - - - - - - - - - std::atomic< T > * - std::atomic<T>* tf::TaskQueue< T, TF_MAX_PRIORITY >::Array::S - - S - - - - - - - - - - - - - tf::TaskQueue< T, TF_MAX_PRIORITY >::Array::Array - (int64_t c) - Array - - int64_t - c - - - - - - - - - - - - tf::TaskQueue< T, TF_MAX_PRIORITY >::Array::~Array - () - ~Array - - - - - - - - - - int64_t - int64_t tf::TaskQueue< T, TF_MAX_PRIORITY >::Array::capacity - () const noexcept - capacity - - - - - - - - - - void - void tf::TaskQueue< T, TF_MAX_PRIORITY >::Array::push - (int64_t i, T o) noexcept - push - - int64_t - i - - - T - o - - - - - - - - - - - T - T tf::TaskQueue< T, TF_MAX_PRIORITY >::Array::pop - (int64_t i) noexcept - pop - - int64_t - i - - - - - - - - - - - Array * - Array* tf::TaskQueue< T, TF_MAX_PRIORITY >::Array::resize - (int64_t b, int64_t t) - resize - - int64_t - b - - - int64_t - t - - - - - - - - - - - - - - - - - - - S - - - - - - - - - tf::TaskQueue::ArrayArray - tf::TaskQueue::ArrayC - tf::TaskQueue::Arraycapacity - tf::TaskQueue::ArrayM - tf::TaskQueue::Arraypop - tf::TaskQueue::Arraypush - tf::TaskQueue::Arrayresize - tf::TaskQueue::ArrayS - tf::TaskQueue::Array~Array - - - diff --git a/docs/xml/structtf_1_1Taskflow_1_1Dumper.xml 
b/docs/xml/structtf_1_1Taskflow_1_1Dumper.xml index 3eb4a8172..5940eaa55 100644 --- a/docs/xml/structtf_1_1Taskflow_1_1Dumper.xml +++ b/docs/xml/structtf_1_1Taskflow_1_1Dumper.xml @@ -1,70 +1,73 @@ - + tf::Taskflow::Dumper - + size_t size_t tf::Taskflow::Dumper::id id + tf::Taskflow::Dumper::id - + - std::stack< std::pair< const Node *, const Graph * > > + std::stack< std::pair< const Node *, const Graph * > > std::stack<std::pair<const Node*, const Graph*> > tf::Taskflow::Dumper::stack stack + tf::Taskflow::Dumper::stack - + - std::unordered_map< const Graph *, size_t > + std::unordered_map< const Graph *, size_t > std::unordered_map<const Graph*, size_t> tf::Taskflow::Dumper::visited visited + tf::Taskflow::Dumper::visited - + - + + + + + + + - visited + stack - stack + visited - - - - - - - + tf::Taskflow::Dumperid tf::Taskflow::Dumperstack diff --git a/docs/xml/structtf_1_1Timeline.xml b/docs/xml/structtf_1_1Timeline.xml index d62422258..e3ff7220d 100644 --- a/docs/xml/structtf_1_1Timeline.xml +++ b/docs/xml/structtf_1_1Timeline.xml @@ -1,67 +1,72 @@ - + tf::Timeline - + size_t size_t tf::Timeline::uid uid + tf::Timeline::uid - + - observer_stamp_t + observer_stamp_t observer_stamp_t tf::Timeline::origin origin + tf::Timeline::origin - + - std::vector< std::vector< std::vector< Segment > > > + std::vector< std::vector< std::vector< Segment > > > std::vector<std::vector<std::vector<Segment> > > tf::Timeline::segments segments + tf::Timeline::segments - + - - + + tf::Timeline::Timeline ()=default Timeline + tf::Timeline::Timeline - + tf::Timeline::Timeline (const Timeline &rhs)=delete Timeline + tf::Timeline::Timeline const Timeline & rhs @@ -72,13 +77,14 @@ - + tf::Timeline::Timeline (Timeline &&rhs)=default Timeline + tf::Timeline::Timeline Timeline && rhs @@ -89,13 +95,14 @@ - + - + Timeline & - Timeline& tf::Timeline::operator= + Timeline & tf::Timeline::operator= (const Timeline &rhs)=delete operator= + tf::Timeline::operator= const Timeline & rhs @@ -106,13 +113,14 @@ - + - + Timeline & - Timeline& tf::Timeline::operator= + Timeline & tf::Timeline::operator= (Timeline &&rhs)=default operator= + tf::Timeline::operator= Timeline && rhs @@ -123,7 +131,7 @@ - + @@ -135,6 +143,7 @@ auto tf::Timeline::save (Archiver &ar) const save + tf::Timeline::save Archiver & ar @@ -145,7 +154,7 @@ - + @@ -157,6 +166,7 @@ auto tf::Timeline::load (Archiver &ar) load + tf::Timeline::load Archiver & ar @@ -167,35 +177,35 @@ - + - + + + + + + + - segments + origin - origin + segments - - - - - - - + tf::Timelineload - tf::Timelineoperator= - tf::Timelineoperator= + tf::Timelineoperator= + tf::Timelineoperator= tf::Timelineorigin tf::Timelinesave tf::Timelinesegments diff --git a/docs/xml/structtf_1_1UnboundedTaskQueue_1_1Array.xml b/docs/xml/structtf_1_1UnboundedTaskQueue_1_1Array.xml new file mode 100644 index 000000000..82ed00ccb --- /dev/null +++ b/docs/xml/structtf_1_1UnboundedTaskQueue_1_1Array.xml @@ -0,0 +1,187 @@ + + + + tf::UnboundedTaskQueue::Array + + + int64_t + int64_t tf::UnboundedTaskQueue< T >::Array::C + + C + tf::UnboundedTaskQueue::Array::C + + + + + + + + + + int64_t + int64_t tf::UnboundedTaskQueue< T >::Array::M + + M + tf::UnboundedTaskQueue::Array::M + + + + + + + + + + std::atomic< T > * + std::atomic<T>* tf::UnboundedTaskQueue< T >::Array::S + + S + tf::UnboundedTaskQueue::Array::S + + + + + + + + + + + + + tf::UnboundedTaskQueue< T >::Array::Array + (int64_t c) + Array + tf::UnboundedTaskQueue::Array::Array + + int64_t + c + + + + + + + + + + + + tf::UnboundedTaskQueue< T 
>::Array::~Array + () + ~Array + tf::UnboundedTaskQueue::Array::~Array + + + + + + + + + + int64_t + int64_t tf::UnboundedTaskQueue< T >::Array::capacity + () const noexcept + capacity + tf::UnboundedTaskQueue::Array::capacity + + + + + + + + + + void + void tf::UnboundedTaskQueue< T >::Array::push + (int64_t i, T o) noexcept + push + tf::UnboundedTaskQueue::Array::push + + int64_t + i + + + T + o + + + + + + + + + + + T + T tf::UnboundedTaskQueue< T >::Array::pop + (int64_t i) noexcept + pop + tf::UnboundedTaskQueue::Array::pop + + int64_t + i + + + + + + + + + + + Array * + Array * tf::UnboundedTaskQueue< T >::Array::resize + (int64_t b, int64_t t) + resize + tf::UnboundedTaskQueue::Array::resize + + int64_t + b + + + int64_t + t + + + + + + + + + + + + + + + + + + + + + + S + + + + + + tf::UnboundedTaskQueue::ArrayArray + tf::UnboundedTaskQueue::ArrayC + tf::UnboundedTaskQueue::Arraycapacity + tf::UnboundedTaskQueue::ArrayM + tf::UnboundedTaskQueue::Arraypop + tf::UnboundedTaskQueue::Arraypush + tf::UnboundedTaskQueue::Arrayresize + tf::UnboundedTaskQueue::ArrayS + tf::UnboundedTaskQueue::Array~Array + + + diff --git a/docs/xml/structtf_1_1cudaDeviceAllocator_1_1rebind.xml b/docs/xml/structtf_1_1cudaDeviceAllocator_1_1rebind.xml index 364813f2d..0ff548069 100644 --- a/docs/xml/structtf_1_1cudaDeviceAllocator_1_1rebind.xml +++ b/docs/xml/structtf_1_1cudaDeviceAllocator_1_1rebind.xml @@ -1,19 +1,20 @@ - + tf::cudaDeviceAllocator::rebind - cuda_memory.hpp + taskflow/cuda/cuda_memory.hpp typename U - + - cudaDeviceAllocator< U > + cudaDeviceAllocator< U > using tf::cudaDeviceAllocator< T >::rebind< U >::other = cudaDeviceAllocator<U> other + tf::cudaDeviceAllocator::rebind::other allocator of a different data type @@ -21,15 +22,15 @@ - + - + its member type U is the equivalent allocator type to allocate elements of type U - + tf::cudaDeviceAllocator::rebindother diff --git a/docs/xml/structtf_1_1cudaEventCreator.xml b/docs/xml/structtf_1_1cudaEventCreator.xml deleted file mode 100644 index 8d11b68bd..000000000 --- a/docs/xml/structtf_1_1cudaEventCreator.xml +++ /dev/null @@ -1,47 +0,0 @@ - - - - tf::cudaEventCreator - - - cudaEvent_t - cudaEvent_t tf::cudaEventCreator::operator() - () const - operator() - - - - - - - - - - cudaEvent_t - cudaEvent_t tf::cudaEventCreator::operator() - (unsigned int flag) const - operator() - - unsigned int - flag - - - - - - - - - - - - - - - - - tf::cudaEventCreatoroperator() - tf::cudaEventCreatoroperator() - - - diff --git a/docs/xml/structtf_1_1cudaEventDeleter.xml b/docs/xml/structtf_1_1cudaEventDeleter.xml deleted file mode 100644 index ebde5fb02..000000000 --- a/docs/xml/structtf_1_1cudaEventDeleter.xml +++ /dev/null @@ -1,33 +0,0 @@ - - - - tf::cudaEventDeleter - - - void - void tf::cudaEventDeleter::operator() - (cudaEvent_t event) const - operator() - - cudaEvent_t - event - - - - - - - - - - - - - - - - - tf::cudaEventDeleteroperator() - - - diff --git a/docs/xml/structtf_1_1cudaFlowCapturer_1_1External.xml b/docs/xml/structtf_1_1cudaFlowCapturer_1_1External.xml deleted file mode 100644 index eb3a0e5ff..000000000 --- a/docs/xml/structtf_1_1cudaFlowCapturer_1_1External.xml +++ /dev/null @@ -1,29 +0,0 @@ - - - - tf::cudaFlowCapturer::External - - - cudaFlowGraph - cudaFlowGraph tf::cudaFlowCapturer::External::graph - - graph - - - - - - - - - - - - - - - - tf::cudaFlowCapturer::Externalgraph - - - diff --git a/docs/xml/structtf_1_1cudaFlowCapturer_1_1Internal.xml b/docs/xml/structtf_1_1cudaFlowCapturer_1_1Internal.xml deleted file mode 100644 index 
d810b3d87..000000000 --- a/docs/xml/structtf_1_1cudaFlowCapturer_1_1Internal.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - tf::cudaFlowCapturer::Internal - - - - - - - - - diff --git a/docs/xml/structtf_1_1cudaSharedMemory.xml b/docs/xml/structtf_1_1cudaSharedMemory.xml index 342844046..9cefc7b36 100644 --- a/docs/xml/structtf_1_1cudaSharedMemory.xml +++ b/docs/xml/structtf_1_1cudaSharedMemory.xml @@ -1,5 +1,5 @@ - + tf::cudaSharedMemory @@ -7,28 +7,29 @@ typename T - - + + __device__ T * - __device__ T* tf::cudaSharedMemory< T >::get + __device__ T * tf::cudaSharedMemory< T >::get () get + tf::cudaSharedMemory::get - + - + - + - tf::cudaSharedMemoryget + tf::cudaSharedMemoryget diff --git a/docs/xml/structtf_1_1cudaSharedMemory_3_01bool_01_4.xml b/docs/xml/structtf_1_1cudaSharedMemory_3_01bool_01_4.xml index 39b1e1ada..179e657ef 100644 --- a/docs/xml/structtf_1_1cudaSharedMemory_3_01bool_01_4.xml +++ b/docs/xml/structtf_1_1cudaSharedMemory_3_01bool_01_4.xml @@ -1,31 +1,32 @@ - + tf::cudaSharedMemory< bool > - - + + __device__ bool * - __device__ bool* tf::cudaSharedMemory< bool >::get + __device__ bool * tf::cudaSharedMemory< bool >::get () get + tf::cudaSharedMemory< bool >::get - + - + - + - tf::cudaSharedMemory< bool >get + tf::cudaSharedMemory< bool >get diff --git a/docs/xml/structtf_1_1cudaSharedMemory_3_01char_01_4.xml b/docs/xml/structtf_1_1cudaSharedMemory_3_01char_01_4.xml index 085557e6a..82ad11fdc 100644 --- a/docs/xml/structtf_1_1cudaSharedMemory_3_01char_01_4.xml +++ b/docs/xml/structtf_1_1cudaSharedMemory_3_01char_01_4.xml @@ -1,31 +1,32 @@ - + tf::cudaSharedMemory< char > - - + + __device__ char * - __device__ char* tf::cudaSharedMemory< char >::get + __device__ char * tf::cudaSharedMemory< char >::get () get + tf::cudaSharedMemory< char >::get - + - + - + - tf::cudaSharedMemory< char >get + tf::cudaSharedMemory< char >get diff --git a/docs/xml/structtf_1_1cudaSharedMemory_3_01double_01_4.xml b/docs/xml/structtf_1_1cudaSharedMemory_3_01double_01_4.xml index ceee5fb4c..41a3e0816 100644 --- a/docs/xml/structtf_1_1cudaSharedMemory_3_01double_01_4.xml +++ b/docs/xml/structtf_1_1cudaSharedMemory_3_01double_01_4.xml @@ -1,31 +1,32 @@ - + tf::cudaSharedMemory< double > - - + + __device__ double * - __device__ double* tf::cudaSharedMemory< double >::get + __device__ double * tf::cudaSharedMemory< double >::get () get + tf::cudaSharedMemory< double >::get - + - + - + - tf::cudaSharedMemory< double >get + tf::cudaSharedMemory< double >get diff --git a/docs/xml/structtf_1_1cudaSharedMemory_3_01float_01_4.xml b/docs/xml/structtf_1_1cudaSharedMemory_3_01float_01_4.xml index 9715b35aa..160a90d99 100644 --- a/docs/xml/structtf_1_1cudaSharedMemory_3_01float_01_4.xml +++ b/docs/xml/structtf_1_1cudaSharedMemory_3_01float_01_4.xml @@ -1,31 +1,32 @@ - + tf::cudaSharedMemory< float > - - + + __device__ float * - __device__ float* tf::cudaSharedMemory< float >::get + __device__ float * tf::cudaSharedMemory< float >::get () get + tf::cudaSharedMemory< float >::get - + - + - + - tf::cudaSharedMemory< float >get + tf::cudaSharedMemory< float >get diff --git a/docs/xml/structtf_1_1cudaSharedMemory_3_01int_01_4.xml b/docs/xml/structtf_1_1cudaSharedMemory_3_01int_01_4.xml index fe9716702..abcae16eb 100644 --- a/docs/xml/structtf_1_1cudaSharedMemory_3_01int_01_4.xml +++ b/docs/xml/structtf_1_1cudaSharedMemory_3_01int_01_4.xml @@ -1,31 +1,32 @@ - + tf::cudaSharedMemory< int > - - + + __device__ int * - __device__ int* tf::cudaSharedMemory< int >::get + __device__ int * tf::cudaSharedMemory< int >::get () 
get
 + tf::cudaSharedMemory< int >::get
[generated Doxygen XML continues: the hunk respaces the get() signature from "__device__ int*" to "__device__ int *" and adds the qualified name tf::cudaSharedMemory< int >::get.]

diff --git a/docs/xml/structtf_1_1cudaSharedMemory_3_01long_01_4.xml b/docs/xml/structtf_1_1cudaSharedMemory_3_01long_01_4.xml
index 4ebabfb3d..a9c02215c 100644
--- a/docs/xml/structtf_1_1cudaSharedMemory_3_01long_01_4.xml
+++ b/docs/xml/structtf_1_1cudaSharedMemory_3_01long_01_4.xml
@@ -1,31 +1,32 @@
[generated Doxygen XML for tf::cudaSharedMemory< long >: version bump; get() respaced to "__device__ long *"; qualified name tf::cudaSharedMemory< long >::get added.]

diff --git a/docs/xml/structtf_1_1cudaSharedMemory_3_01short_01_4.xml b/docs/xml/structtf_1_1cudaSharedMemory_3_01short_01_4.xml
index 98f3eee95..0d4c51fdc 100644
--- a/docs/xml/structtf_1_1cudaSharedMemory_3_01short_01_4.xml
+++ b/docs/xml/structtf_1_1cudaSharedMemory_3_01short_01_4.xml
@@ -1,31 +1,32 @@
[same generated change for tf::cudaSharedMemory< short >.]

diff --git a/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01char_01_4.xml b/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01char_01_4.xml
index df2860511..bb27308dd 100644
--- a/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01char_01_4.xml
+++ b/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01char_01_4.xml
@@ -1,31 +1,32 @@
[same generated change for tf::cudaSharedMemory< unsigned char >.]

diff --git a/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01int_01_4.xml b/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01int_01_4.xml
index 8215f8bc5..0b870aa3a 100644
--- a/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01int_01_4.xml
+++ b/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01int_01_4.xml
@@ -1,31 +1,32 @@
[same generated change for tf::cudaSharedMemory< unsigned int >.]

diff --git a/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01long_01_4.xml b/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01long_01_4.xml
index 5f47f6cf7..2c314cecd 100644
--- a/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01long_01_4.xml
+++ b/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01long_01_4.xml
@@ -1,31 +1,32 @@
[same generated change for tf::cudaSharedMemory< unsigned long >.]

diff --git a/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01short_01_4.xml b/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01short_01_4.xml
index 41f516382..73b2126c8 100644
--- a/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01short_01_4.xml
+++ b/docs/xml/structtf_1_1cudaSharedMemory_3_01unsigned_01short_01_4.xml
@@ -1,31 +1,32 @@
[same generated change for tf::cudaSharedMemory< unsigned short >.]

diff --git a/docs/xml/structtf_1_1cudaStreamCreator.xml b/docs/xml/structtf_1_1cudaStreamCreator.xml
deleted file mode 100644
index 555045c50..000000000
--- a/docs/xml/structtf_1_1cudaStreamCreator.xml
+++ /dev/null
@@ -1,29 +0,0 @@
[deletes the generated page for tf::cudaStreamCreator and its member cudaStream_t operator()() const.]

diff --git a/docs/xml/structtf_1_1cudaStreamDeleter.xml b/docs/xml/structtf_1_1cudaStreamDeleter.xml
deleted file mode 100644
index e6f3522dd..000000000
--- a/docs/xml/structtf_1_1cudaStreamDeleter.xml
+++ /dev/null
@@ -1,33 +0,0 @@
[deletes the generated page for tf::cudaStreamDeleter and its member void operator()(cudaStream_t stream) const.]

diff --git a/docs/xml/structtf_1_1cudaUSMAllocator_1_1rebind.xml b/docs/xml/structtf_1_1cudaUSMAllocator_1_1rebind.xml
index 7b070a78a..b662a9971 100644
--- a/docs/xml/structtf_1_1cudaUSMAllocator_1_1rebind.xml
+++ b/docs/xml/structtf_1_1cudaUSMAllocator_1_1rebind.xml
@@ -1,19 +1,20 @@
[generated page for tf::cudaUSMAllocator::rebind, the allocator of a different data type: the include path changes from cuda_memory.hpp to taskflow/cuda/cuda_memory.hpp, and the qualified name tf::cudaUSMAllocator::rebind::other is added for the member type "using other = cudaUSMAllocator<U>", whose member type U is the equivalent allocator type to allocate elements of type U.]

diff --git a/docs/xml/structtf_1_1detail_1_1cudaBlockReduce.xml b/docs/xml/structtf_1_1detail_1_1cudaBlockReduce.xml
deleted file mode 100644
index 24de2d194..000000000
--- a/docs/xml/structtf_1_1detail_1_1cudaBlockReduce.xml
+++ /dev/null
@@ -1,117 +0,0 @@
[deletes the generated page for tf::detail::cudaBlockReduce< nt, T >, including group_size = std::min(nt, CUDA_WARP_SIZE), num_passes = log2(group_size), num_items = nt / group_size, and __device__ T operator()(unsigned tid, T x, Storage& storage, unsigned count, op_t op, bool ret = true) const.]

diff --git a/docs/xml/structtf_1_1detail_1_1cudaBlockReduce_1_1Storage.xml b/docs/xml/structtf_1_1detail_1_1cudaBlockReduce_1_1Storage.xml
deleted file mode 100644
index 186df75a0..000000000
--- a/docs/xml/structtf_1_1detail_1_1cudaBlockReduce_1_1Storage.xml
+++ /dev/null
@@ -1,29 +0,0 @@
[deletes tf::detail::cudaBlockReduce::Storage and its member T data[std::max(nt, 2 * group_size)].]

diff --git a/docs/xml/structtf_1_1detail_1_1cudaBlockScan.xml b/docs/xml/structtf_1_1detail_1_1cudaBlockScan.xml
deleted file mode 100644
index 0dd8c41d2..000000000
--- a/docs/xml/structtf_1_1detail_1_1cudaBlockScan.xml
+++ /dev/null
@@ -1,190 +0,0 @@
[deletes the generated page for tf::detail::cudaBlockScan< nt, T >, including num_warps = nt / CUDA_WARP_SIZE, num_passes = log2(nt), capacity = nt + num_warps, and two __device__ operator() overloads returning cudaScanResult< T > and cudaScanResult< T, vt > with parameters op_t op = op_t(), T init = T(), and cudaScanType type = cudaScanType::EXCLUSIVE.]

diff --git a/docs/xml/structtf_1_1detail_1_1cudaBlockSort.xml b/docs/xml/structtf_1_1detail_1_1cudaBlockSort.xml
deleted file mode 100644
index 84b42fab3..000000000
--- a/docs/xml/structtf_1_1detail_1_1cudaBlockSort.xml
+++ /dev/null
@@ -1,148 +0,0 @@
[deletes the generated page for tf::detail::cudaBlockSort< nt, vt, K, V >, including has_values = !std::is_same<V, cudaEmpty>::value, num_passes = log2(nt), and the __device__ member functions merge_pass(cudaKVArray< K, V, vt > x, unsigned tid, unsigned count, unsigned pass, C comp, Storage& storage) const and block_sort(cudaKVArray< K, V, vt > x, unsigned tid, unsigned count, C comp, Storage& storage) const.]

diff --git a/docs/xml/structtf_1_1detail_1_1cudaFindPair.xml b/docs/xml/structtf_1_1detail_1_1cudaFindPair.xml
deleted file mode 100644
index 47969e053..000000000
--- a/docs/xml/structtf_1_1detail_1_1cudaFindPair.xml
+++ /dev/null
@@ -1,64 +0,0 @@
[deletes tf::detail::cudaFindPair< T >: members T key and unsigned index, plus __device__ operator unsigned() const.]

diff --git a/docs/xml/structtf_1_1detail_1_1cudaMergePair.xml b/docs/xml/structtf_1_1detail_1_1cudaMergePair.xml
deleted file mode 100644
index 1943fb30c..000000000
--- a/docs/xml/structtf_1_1detail_1_1cudaMergePair.xml
+++ /dev/null
@@ -1,53 +0,0 @@
[deletes tf::detail::cudaMergePair< T, N >: members cudaArray<T, N> keys and cudaArray<unsigned, N> indices.]

diff --git a/docs/xml/structtf_1_1detail_1_1cudaMergeRange.xml b/docs/xml/structtf_1_1detail_1_1cudaMergeRange.xml
deleted file mode 100644
index 3bbb2e428..000000000
--- a/docs/xml/structtf_1_1detail_1_1cudaMergeRange.xml
+++ /dev/null
@@ -1,237 +0,0 @@
[deletes tf::detail::cudaMergeRange: members a_begin, a_end, b_begin, b_end and the __device__ helpers a_count(), b_count(), total(), a_range(), b_range(), to_local(), partition(unsigned mp0, unsigned diag) const, partition(unsigned mp0, unsigned diag0, unsigned mp1, unsigned diag1) const, a_valid(), and b_valid().]

diff --git a/docs/xml/structtf_1_1detail_1_1cudaScanResult.xml b/docs/xml/structtf_1_1detail_1_1cudaScanResult.xml
deleted file mode 100644
index d2e0e6307..000000000
--- a/docs/xml/structtf_1_1detail_1_1cudaScanResult.xml
+++ /dev/null
@@ -1,60 +0,0 @@
[deletes tf::detail::cudaScanResult< T, vt = 0, is_array = (vt > 0) >: members T scan and T reduction.]

diff --git a/docs/xml/structtf_1_1detail_1_1cudaScanResult_3_01T_00_01vt_00_01true_01_4.xml b/docs/xml/structtf_1_1detail_1_1cudaScanResult_3_01T_00_01vt_00_01true_01_4.xml
deleted file mode 100644
index bb35b62da..000000000
--- a/docs/xml/structtf_1_1detail_1_1cudaScanResult_3_01T_00_01vt_00_01true_01_4.xml
+++ /dev/null
@@ -1,53 +0,0 @@
[deletes the partial specialization tf::detail::cudaScanResult< T, vt, true >: members cudaArray<T, vt> scan and T reduction.]

diff --git a/docs/xml/structtf_1_1has__graph.xml b/docs/xml/structtf_1_1has__graph.xml
new file mode 100644
index 000000000..73d299055
--- /dev/null
+++ b/docs/xml/structtf_1_1has__graph.xml
@@ -0,0 +1,43 @@
[adds the generated page for tf::has_graph< T, void >, a detection trait deriving from std::false_type.]

diff --git a/docs/xml/structtf_1_1is__runtime__task.xml b/docs/xml/structtf_1_1is__runtime__task.xml
new file mode 100644
index 000000000..534ec43d3
--- /dev/null
+++ b/docs/xml/structtf_1_1is__runtime__task.xml
@@ -0,0 +1,43 @@
[adds the generated page for tf::is_runtime_task< C, void >, a detection trait deriving from std::false_type.]

diff --git a/docs/xml/structtf_1_1is__static__task.xml b/docs/xml/structtf_1_1is__static__task.xml
new file mode 100644
index 000000000..a66ea6b41
--- /dev/null
+++ b/docs/xml/structtf_1_1is__static__task.xml
@@ -0,0 +1,43 @@
[adds the generated page for tf::is_static_task< C, void >, a detection trait deriving from std::false_type.]

diff --git a/docs/xml/structtf_1_1is__subflow__task.xml b/docs/xml/structtf_1_1is__subflow__task.xml
new file mode 100644
index 000000000..dc7a7aef6
--- /dev/null
+++ b/docs/xml/structtf_1_1is__subflow__task.xml
@@ -0,0 +1,43 @@
[adds the generated page for tf::is_subflow_task< C, void >, a detection trait deriving from std::false_type.]

diff --git a/docs/xml/subflow-detach.dot b/docs/xml/subflow-detach.dot
deleted file mode 100644
index 0bd7bd532..000000000
--- a/docs/xml/subflow-detach.dot
+++ /dev/null
@@ -1,22 +0,0 @@
[deletes the generated DOT dump of the detached-subflow example: tasks A -> B, C; B, C -> D; with a blue cluster "Subflow: B" containing B1, B2 -> B3.]

diff --git a/docs/xml/subflow__tasking_8dox.xml b/docs/xml/subflow__tasking_8dox.xml
index cd2b8eb08..6ab2bf8d9 100644
--- a/docs/xml/subflow__tasking_8dox.xml
+++ b/docs/xml/subflow__tasking_8dox.xml
@@ -1,5 +1,5 @@
[doxygen version bump only.]

diff --git a/docs/xml/subflow_detach_5.dot b/docs/xml/subflow_detach_5.dot
deleted file mode 100644
index 2ce7f08d6..000000000
--- a/docs/xml/subflow_detach_5.dot
+++ /dev/null
@@ -1,35 +0,0 @@
[deletes the generated DOT dump of the five-run detached-subflow example: tasks A -> B, C; B, C -> D; plus five detached triplets B1, B2 -> B3.]

diff --git a/docs/xml/task_8hpp.xml b/docs/xml/task_8hpp.xml
index 224552e6f..5d921ed3a 100644
--- a/docs/xml/task_8hpp.xml
+++ b/docs/xml/task_8hpp.xml
@@ -1,7 +1,275 @@
[regenerated page for task.hpp: version bump; adds the includes graph.hpp, taskflow/core/flow_builder.hpp, and taskflow/core/observer.hpp; adds the classes tf::is_static_task, tf::is_subflow_task, and tf::is_runtime_task alongside tf::Task and tf::TaskView in namespace tf.]
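The new trait pages above all derive from std::false_type and take a second defaulted template parameter, the usual shape of a detection idiom specialized on the callable's signature. A minimal sketch of how such traits would be consumed, assuming they flip to std::true_type for the matching callable kinds (the lambdas are hypothetical placeholders):

@code{.cpp}
#include <taskflow/taskflow.hpp>

int main() {
  auto static_work  = [](){};               // no argument: a static task
  auto runtime_work = [](tf::Runtime&){};   // takes tf::Runtime&: a runtime task
  auto subflow_work = [](tf::Subflow&){};   // takes tf::Subflow&: a subflow task

  static_assert(tf::is_static_task<decltype(static_work)>::value);
  static_assert(tf::is_runtime_task<decltype(runtime_work)>::value);
  static_assert(tf::is_subflow_task<decltype(subflow_work)>::value);
}
@endcode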
diff --git a/docs/xml/task_level_scheduling.dot b/docs/xml/task_level_scheduling.dot
index 4fc1a5d10..3e822e0be 100644
--- a/docs/xml/task_level_scheduling.dot
+++ b/docs/xml/task_level_scheduling.dot
@@ -1,15 +1,17 @@
 digraph G {
-atask [label="a task T"];
+atask [label="pop a task T from the queue"];
 cond [label="is T a condition task?" shape=diamond color=black fillcolor=aquamarine style=filled];
 atask->cond
 invokeN [label="invoke(T)"]
 invokeY [label="R = invoke(T)"]
 enqueueR [label="enqueue the R-th successor of T"]
 decrement [label="decrement strong dependencies of each successor of T by one"]
-enqueueS [label="enqueue successors of zero strong dpendencies"]
+enqueueS [label="enqueue successors of zero strong dependencies"]
 invokeN->decrement;
 decrement->enqueueS;
 invokeY->enqueueR;
 cond->invokeY[style=dashed,label="yes"];
 cond->invokeN[style=dashed,label="no"];
+enqueueS->atask;
+enqueueR->atask;
 }

diff --git a/docs/xml/taskflow_8hpp.xml b/docs/xml/taskflow_8hpp.xml
index 20fbdcf13..78cfec2c6 100644
--- a/docs/xml/taskflow_8hpp.xml
+++ b/docs/xml/taskflow_8hpp.xml
@@ -1,14 +1,388 @@
[regenerated page for taskflow.hpp, the main taskflow include file: version bump; adds the includes core/executor.hpp, core/runtime.hpp, core/async.hpp, algorithm/algorithm.hpp, taskflow/algorithm/module.hpp, taskflow/algorithm/pipeline.hpp, and taskflow/cuda/cudaflow.hpp; lists the namespaces tf and tf::detail; and documents the version macros:
+ TF_VERSION = 301100, the version of Taskflow (currently 3.11.0). The version system is made of a major version number, a minor version number, and a patch number: TF_VERSION % 100 is the patch level, TF_VERSION / 100 % 1000 is the minor version, and TF_VERSION / 100000 is the major version.
+ TF_MAJOR_VERSION = TF_VERSION / 100000, the major version of Taskflow.
+ TF_MINOR_VERSION = TF_VERSION / 100 % 1000, the minor version of Taskflow.
+ TF_PATCH_VERSION = TF_VERSION % 100, the patch version of Taskflow.]
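A quick arithmetic check of the decomposition rules documented above; the value 301100 is the one consistent with release 3.11.0:

@code{.cpp}
#include <cstdio>

#define TF_VERSION 301100   // assumed encoding of Taskflow 3.11.0

int main() {
  std::printf("major = %d\n", TF_VERSION / 100000);      // 3
  std::printf("minor = %d\n", TF_VERSION / 100 % 1000);  // 11
  std::printf("patch = %d\n", TF_VERSION % 100);         // 0
}
@endcode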
diff --git a/docs/xml/taskflow__pipeline_8dox.xml b/docs/xml/taskflow__pipeline_8dox.xml
index c53f66a12..b4f1a81d7 100644
--- a/docs/xml/taskflow__pipeline_8dox.xml
+++ b/docs/xml/taskflow__pipeline_8dox.xml
@@ -1,5 +1,5 @@
[doxygen version bump only.]

diff --git a/docs/xml/team.xml b/docs/xml/team.xml
index bb2a30527..86944fd0d 100644
--- a/docs/xml/team.xml
+++ b/docs/xml/team.xml
@@ -1,5 +1,5 @@
[version bump; the Team page keeps its sections Core Members (team_1CoreMembers), Alumni (team_1Alumni), and Freelance Developers (team_1FreelanceDevelopers), whose titles and lead paragraphs are merged in the new layout.]
@@ -7,31 +7,30 @@
 Taskflow consists of a multidisciplinary team with different areas of expertise.
 We adhere to our Code of Conduct.
 Core Members: Core members provide the essential development, maintenance, and support of Taskflow in all aspects.
 Principal Investigator: Dr. Tsung-Wei Huang
-Software Developers: Tsung-Wei Huang, Dian-Lun Lin, Cheng-Hsiang Chiu
+Software Developers: Tsung-Wei Huang, Cheng-Hsiang Chiu, Boyang Zhang, Chih-Chun Chang
-Financial Manager: Aidza Cruz (aidza dot cruz at utah dot edu)
+Financial Manager: Jessica Murnane
-Ombudsperson: Jennifer Hoskins (jennifer dot hoskins at osp dot utah dot edu)
+Ombudsperson: Jessica Murnane
 Diversity, Equity, and Inclusion: Tsung-Wei Huang
@@ -41,9 +40,10 @@
 Alumni: Taskflow would not have reached this far without the work of these individuals who have participated in its development.
+Dian-Lun Lin
 Guannan Guo
 Martin Wong
@@ -56,10 +56,9 @@
 Freelance Developers: Taskflow is contributed by a distributed set of Contributors all around the world.

diff --git a/docs/xml/team_8dox.xml b/docs/xml/team_8dox.xml
index 7ea6ff5ce..615454b4d 100644
--- a/docs/xml/team_8dox.xml
+++ b/docs/xml/team_8dox.xml
@@ -1,5 +1,5 @@
[doxygen version bump only.]

diff --git a/docs/xml/text__pipeline_8dox.xml b/docs/xml/text__pipeline_8dox.xml
index f99c411c0..759df04bb 100644
--- a/docs/xml/text__pipeline_8dox.xml
+++ b/docs/xml/text__pipeline_8dox.xml
@@ -1,5 +1,5 @@
[doxygen version bump only.]

diff --git a/docs/xml/transform_8dox.xml b/docs/xml/transform_8dox.xml
index e5c1c9c3f..721db1907 100644
--- a/docs/xml/transform_8dox.xml
+++ b/docs/xml/transform_8dox.xml
@@ -1,5 +1,5 @@
[doxygen version bump only.]

diff --git a/docs/xml/transform_8hpp.xml b/docs/xml/transform_8hpp.xml
index eefc9aeae..9036a9f00 100644
--- a/docs/xml/transform_8hpp.xml
+++ b/docs/xml/transform_8hpp.xml
@@ -1,7 +1,345 @@
[regenerated page for taskflow/cuda/algorithm/transform.hpp: version bump; adds the include ../cudaflow.hpp and expands the generated member listings for namespaces tf and tf::detail.]
diff --git a/docs/xml/tsq_8hpp.xml b/docs/xml/tsq_8hpp.xml
index ba4a265cb..ab201b9fe 100644
--- a/docs/xml/tsq_8hpp.xml
+++ b/docs/xml/tsq_8hpp.xml
@@ -1,15 +1,172 @@
[regenerated page for tsq.hpp, the task queue include file: version bump; removes tf::TaskQueue and tf::TaskQueue::Array; adds the includes ../utility/macros.hpp, ../utility/traits.hpp, taskflow/core/graph.hpp, and taskflow/core/worker.hpp; adds the classes tf::UnboundedTaskQueue, tf::UnboundedTaskQueue::Array, and tf::BoundedTaskQueue in namespace tf; and documents two macros:
+ TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE = 8. This macro defines the default size of the bounded task queue in log2. The bounded task queue is used by each worker.
+ TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE = 10. This macro defines the default size of the unbounded task queue in log2. The unbounded task queue is used by the executor.]

diff --git a/docs/xml/uniontf_1_1detail_1_1cudaBlockScan_1_1storage__t.xml b/docs/xml/uniontf_1_1detail_1_1cudaBlockScan_1_1storage__t.xml
deleted file mode 100644
index 24ce7d049..000000000
--- a/docs/xml/uniontf_1_1detail_1_1cudaBlockScan_1_1storage__t.xml
+++ /dev/null
@@ -1,70 +0,0 @@
[deletes tf::detail::cudaBlockScan::storage_t: members T data[2 * nt], T threads[nt], T warps[num_warps], and an unnamed struct member.]

diff --git a/docs/xml/uniontf_1_1detail_1_1cudaBlockSort_1_1Storage.xml b/docs/xml/uniontf_1_1detail_1_1cudaBlockSort_1_1Storage.xml
deleted file mode 100644
index 43ab016dd..000000000
--- a/docs/xml/uniontf_1_1detail_1_1cudaBlockSort_1_1Storage.xml
+++ /dev/null
@@ -1,43 +0,0 @@
[deletes tf::detail::cudaBlockSort::Storage: members K keys[nt * vt + 1] and V vals[nt * vt].]

diff --git a/docs/xml/usecases.xml b/docs/xml/usecases.xml
index ab1ccb164..2a743442b 100644
--- a/docs/xml/usecases.xml
+++ b/docs/xml/usecases.xml
@@ -1,5 +1,5 @@
[doxygen version bump only.]

diff --git a/docs/xml/usecases_8dox.xml b/docs/xml/usecases_8dox.xml
index d3883ae86..57f2a2076 100644
--- a/docs/xml/usecases_8dox.xml
+++ b/docs/xml/usecases_8dox.xml
@@ -1,5 +1,5 @@
[doxygen version bump only.]

diff --git a/docs/xml/uw-madison-ece-logo.png b/docs/xml/uw-madison-ece-logo.png
new file mode 100644
index 000000000..42258c755
Binary files /dev/null and b/docs/xml/uw-madison-ece-logo.png differ
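The two queue-size macros documented in the tsq.hpp hunk above carry plain default values, so they can presumably be overridden before Taskflow is included. A minimal sketch, assuming the definitions are #ifndef-guarded as the defaults suggest:

@code{.cpp}
#include <cstdio>

// assumption: pre-defining the macros overrides the guarded defaults,
// enlarging each worker's bounded queue from 2^8 to 2^10 slots and the
// executor's unbounded queue from 2^10 to 2^12 initial slots
#define TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE 10
#define TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE 12
#include <taskflow/taskflow.hpp>

int main() {
  tf::Executor executor;   // workers now start with the larger queues
  tf::Taskflow taskflow;
  taskflow.emplace([](){ std::printf("hello\n"); });
  executor.run(taskflow).wait();
}
@endcode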
diff --git a/docs/xml/wavefront.xml b/docs/xml/wavefront.xml
index fe24e4744..35a666ae6 100644
--- a/docs/xml/wavefront.xml
+++ b/docs/xml/wavefront.xml
@@ -1,5 +1,5 @@
[version bump; the Wavefront page keeps its sections Problem Formulation (wavefront_1WavefrontComputingFormulation) and Wavefront Task Graph (wavefront_1WavefrontTaskGraph), whose titles and lead paragraphs are merged in the new layout.]
@@ -7,33 +7,31 @@
 We study the wavefront parallelism, which is a common pattern in dynamic programming to sweep elements in a diagonal direction.

 Problem Formulation: The computation starts at a singular point at a corner of a data plane (e.g., grid) and propagates its effect diagonally to other elements. This sweep of computation is known as wavefront. Each point in the wavefront can be computed in parallel. The following example shows a wavefront parallelism in a 2D matrix.
 We partition the 9x9 grid into a 3x3 block and assign a task to one block. The wavefront propagates task dependencies from the top-left block all the way to the bottom-right block. Each task precedes two tasks, one to the right and another below.

 Wavefront Task Graph: We can describe the wavefront parallelism in a simple two-level loop. Since we need to address the two tasks upper and left to a task when creating its dependencies, we use a 2D vector to pre-allocate all tasks via tf::Taskflow::placeholder.

 #include <taskflow/taskflow.hpp>

 int main() {

   tf::Executor executor;
   tf::Taskflow taskflow;

   int num_blocks = 3;
-  std::vector<std::vector<tf::Task>> node(num_blocks);
+  std::vector<std::vector<tf::Task>> node(num_blocks);

   // create num_blocks*num_blocks placeholder tasks
   for(auto& n : node) {
@@ -46,7 +44,7 @@
   for(int i=num_blocks; --i>=0; ) {
     for(int j=num_blocks; --j>=0; ) {
       // deferred task assignment
-      node[i][j].work([=](){ printf("compute block (%d, %d)", i, j); });
+      node[i][j].work([=](){ printf("compute block (%d, %d)", i, j); });

       // wavefront dependency
       if(j+1 < num_blocks) node[i][j].precede(node[i][j+1]);
@@ -57,15 +55,15 @@
   executor.run(taskflow).wait();

   // dump the taskflow
-  taskflow.dump(std::cout);
+  taskflow.dump(std::cout);
 }

 The figure below shows the wavefront parallelism in a 3x3 grid:
 Wavefront parallelism has many variations in different applications, for instance, Smith-Waterman sequencing, video encoding algorithms, image analysis, and pipeline parallelism. The parallel pattern propagates along a diagonal direction.

diff --git a/docs/xml/wavefront_8dox.xml b/docs/xml/wavefront_8dox.xml
index 084ecf401..f17eee63b 100644
--- a/docs/xml/wavefront_8dox.xml
+++ b/docs/xml/wavefront_8dox.xml
@@ -1,5 +1,5 @@
[doxygen version bump only.]

diff --git a/docs/xml/work-stealing.png b/docs/xml/work-stealing.png
new file mode 100644
index 000000000..95bf39ff8
Binary files /dev/null and b/docs/xml/work-stealing.png differ

diff --git a/docs/xml/worker_8hpp.xml b/docs/xml/worker_8hpp.xml
index 514b1fd1b..d255d2d80 100644
--- a/docs/xml/worker_8hpp.xml
+++ b/docs/xml/worker_8hpp.xml
@@ -1,15 +1,127 @@
[regenerated page for worker.hpp, the worker include file: version bump; adds the includes declarations.hpp, tsq.hpp, atomic_notifier.hpp, nonblocking_notifier.hpp, and taskflow/core/observer.hpp; adds tf::WorkerInterface and the namespace tf::pt alongside tf::Worker and tf::WorkerView.]

diff --git a/doxygen/Doxyfile b/doxygen/Doxyfile
index f41727b81..7a36fb9b1 100644
--- a/doxygen/Doxyfile
+++ b/doxygen/Doxyfile
@@ -172,7 +172,7 @@ INLINE_INHERITED_MEMB = NO
 # shortest path that makes the file name unique will be used
 # The default value is: YES.

-FULL_PATH_NAMES = NO
+FULL_PATH_NAMES = YES

 # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
# Stripping is only done if one of the specified strings matches the left-hand @@ -184,7 +184,7 @@ FULL_PATH_NAMES = NO # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = .. # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -193,7 +193,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = .. # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -863,7 +863,7 @@ CITE_BIB_FILES = # messages are off. # The default value is: NO. -QUIET = NO +QUIET = YES # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES @@ -964,6 +964,9 @@ WARN_LOGFILE = # Note: If this tag is empty the current directory is searched. INPUT = ../taskflow/utility/small_vector.hpp \ + ../taskflow/utility/math.hpp \ + ../taskflow/utility/os.hpp \ + ../taskflow/utility/iterator.hpp \ ../taskflow/core/graph.hpp \ ../taskflow/core/tsq.hpp \ ../taskflow/core/flow_builder.hpp \ @@ -971,32 +974,30 @@ INPUT = ../taskflow/utility/small_vector.hpp \ ../taskflow/core/executor.hpp \ ../taskflow/core/task.hpp \ ../taskflow/core/async_task.hpp \ + ../taskflow/core/runtime.hpp \ ../taskflow/core/semaphore.hpp \ ../taskflow/core/taskflow.hpp \ ../taskflow/core/observer.hpp \ ../taskflow/algorithm/partitioner.hpp \ - ../taskflow/algorithm/critical.hpp \ ../taskflow/algorithm/pipeline.hpp \ ../taskflow/algorithm/data_pipeline.hpp \ + ../taskflow/algorithm/module.hpp \ ../taskflow/cuda/cuda_device.hpp \ ../taskflow/cuda/cuda_memory.hpp \ ../taskflow/cuda/cuda_stream.hpp \ - ../taskflow/cuda/cuda_task.hpp \ + ../taskflow/cuda/cuda_graph.hpp \ + ../taskflow/cuda/cuda_graph_exec.hpp \ ../taskflow/cuda/cudaflow.hpp \ - ../taskflow/cuda/cuda_optimizer.hpp \ - ../taskflow/cuda/cuda_capturer.hpp \ - ../taskflow/cuda/cuda_execution_policy.hpp \ ../taskflow/cuda/algorithm/for_each.hpp \ ../taskflow/cuda/algorithm/transform.hpp \ - ../taskflow/cuda/algorithm/reduce.hpp \ - ../taskflow/cuda/algorithm/scan.hpp \ - ../taskflow/cuda/algorithm/merge.hpp \ - ../taskflow/cuda/algorithm/sort.hpp \ - ../taskflow/cuda/algorithm/find.hpp \ ../taskflow/taskflow.hpp \ QuickStart.dox \ releases/releases.dox \ releases/release-roadmap.dox \ + releases/release-3.11.0.dox \ + releases/release-3.10.0.dox \ + releases/release-3.9.0.dox \ + releases/release-3.8.0.dox \ releases/release-3.7.0.dox \ releases/release-3.6.0.dox \ releases/release-3.5.0.dox \ @@ -1026,13 +1027,11 @@ INPUT = ../taskflow/utility/small_vector.hpp \ cookbook/conditional_tasking.dox \ cookbook/composable_tasking.dox \ cookbook/runtime_tasking.dox \ - cookbook/prioritized_tasking.dox \ cookbook/semaphore.dox \ cookbook/async_tasking.dox \ cookbook/dependent_async_tasking.dox \ cookbook/exception.dox \ - cookbook/gpu_tasking_cudaflow.dox \ - cookbook/gpu_tasking_cudaflow_capturer.dox \ + cookbook/gpu_tasking.dox \ cookbook/cancellation.dox \ cookbook/profiler.dox \ algorithms/partitioner.dox \ @@ -1043,29 +1042,17 @@ INPUT = ../taskflow/utility/small_vector.hpp \ algorithms/sort.dox \ algorithms/scan.dox \ algorithms/find.dox \ + algorithms/module.dox \ 
algorithms/pipeline.dox \ algorithms/scalable_pipeline.dox \ algorithms/data_pipeline.dox \ algorithms/pipeline_with_token_dependencies.dox \ - cudaflow_algorithms/cudaflow_algorithms.dox \ - cudaflow_algorithms/cudaflow_single_task.dox \ - cudaflow_algorithms/cudaflow_for_each.dox \ - cudaflow_algorithms/cudaflow_transform.dox \ - cuda_std_algorithms/cuda_std_algorithms.dox \ - cuda_std_algorithms/cuda_std_execution_policy.dox \ - cuda_std_algorithms/cuda_std_single_task.dox \ - cuda_std_algorithms/cuda_std_for_each.dox \ - cuda_std_algorithms/cuda_std_transform.dox \ - cuda_std_algorithms/cuda_std_reduce.dox \ - cuda_std_algorithms/cuda_std_scan.dox \ - cuda_std_algorithms/cuda_std_merge.dox \ - cuda_std_algorithms/cuda_std_find.dox \ examples/examples.dox \ examples/wavefront.dox \ - examples/matrix_multiplication.dox \ - examples/matrix_multiplication_cudaflow.dox \ + examples/matmul.dox \ + examples/matmul_cuda.dox \ examples/kmeans.dox \ - examples/kmeans_cudaflow.dox \ + examples/kmeans_cuda.dox \ examples/fibonacci.dox \ examples/flipcoins.dox \ examples/graph_traversal.dox \ @@ -1517,15 +1504,6 @@ HTML_COLORSTYLE_SAT = 100 HTML_COLORSTYLE_GAMMA = 80 -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via JavaScript. If disabled, the navigation index will @@ -2186,14 +2164,6 @@ LATEX_HIDE_INDICES = NO LATEX_BIB_STYLE = plain -# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated -# page will contain the date and time when the page was generated. Setting this -# to NO can help when comparing the output of multiple runs. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_TIMESTAMP = NO - # The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) # path from which the emoji images will be read. If a relative path is entered, # it will be relative to the LATEX_OUTPUT directory. If left blank the @@ -2412,7 +2382,7 @@ PERLMOD_MAKEVAR_PREFIX = # C-preprocessor directives found in the sources and include files. # The default value is: YES. -ENABLE_PREPROCESSING = NO +ENABLE_PREPROCESSING = YES # If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names # in the source code. If set to NO, only conditional compilation will be @@ -2462,7 +2432,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = DOXYGEN_GENERATING_OUTPUT # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. 
The
diff --git a/doxygen/QuickStart.dox b/doxygen/QuickStart.dox
index d2b8f3b32..6f3e8719d 100644
--- a/doxygen/QuickStart.dox
+++ b/doxygen/QuickStart.dox
@@ -1,6 +1,6 @@
 namespace tf {

-/** @mainpage Modern C++ Parallel Task Programming
+/** @mainpage A General-purpose Task-parallel Programming System

 %Taskflow helps you quickly write parallel and heterogeneous task programs
 with high performance
@@ -13,14 +13,13 @@ The source code is available in our @ProjectGitHub.

 @section ASimpleFirstProgram Start Your First Taskflow Program

-The following program (@c simple.cpp) creates four tasks
+The following program (@c simple.cpp) creates a taskflow of four tasks
 @c A, @c B, @c C, and @c D, where @c A runs before @c B and @c C,
 and @c D runs after @c B and @c C.
 When @c A finishes, @c B and @c C can run in parallel.
-
-@dotfile images/simple.dot
+

 @code{.cpp}
 #include <taskflow/taskflow.hpp>  // Taskflow is header-only
@@ -46,13 +45,16 @@ int main(){
 }
 @endcode
+
+@dotfile images/simple.dot
+
 %Taskflow is *header-only* and there is no wrangle with installation.
 To compile the program, clone the %Taskflow project and
 tell the compiler to include the headers under @c taskflow/.

-@code{.shell-session}
+@code{.bash}
 ~$ git clone https://github.com/taskflow/taskflow.git  # clone it only once
-~$ g++ -std=c++17 simple.cpp -I taskflow/ -O2 -pthread -o simple
+~$ g++ -std=c++20 simple.cpp -I taskflow/ -O2 -pthread -o simple
 ~$ ./simple
 TaskA
 TaskC
@@ -66,7 +68,7 @@ in an easy-to-use web-based interface.

 @image html images/tfprof.png

-@code{.shell-session}
+@code{.bash}
 # run the program with the environment variable TF_ENABLE_PROFILER enabled
 ~$ TF_ENABLE_PROFILER=simple.json ./simple
 ~$ cat simple.json
@@ -120,31 +122,6 @@ cond.precede(cond, stop);  // moves on to 'cond' on returning 0, or 'stop' on 1

 @dotfile images/conditional-tasking-1.dot

-@section QuickStartOffloadTasksToGPU Offload Tasks to a GPU
-
-%Taskflow supports GPU tasking for you to accelerate a wide range of scientific computing applications by harnessing the power of CPU-GPU collaborative computing using CUDA.
-
-@code{.cpp}
-__global__ void saxpy(int n, float a, float *x, float *y) {
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  if (i < n) {
-    y[i] = a*x[i] + y[i];
-  }
-}
-tf::Task cudaflow = taskflow.emplace([&](tf::cudaFlow& cf) {
-  tf::cudaTask h2d_x = cf.copy(dx, hx.data(), N).name("h2d_x");
-  tf::cudaTask h2d_y = cf.copy(dy, hy.data(), N).name("h2d_y");
-  tf::cudaTask d2h_x = cf.copy(hx.data(), dx, N).name("d2h_x");
-  tf::cudaTask d2h_y = cf.copy(hy.data(), dy, N).name("d2h_y");
-  tf::cudaTask saxpy = cf.kernel((N+255)/256, 256, 0, saxpy, N, 2.0f, dx, dy)
-                         .name("saxpy");  // parameters to the saxpy kernel
-  saxpy.succeed(h2d_x, h2d_y)
-       .precede(d2h_x, d2h_y);
-}).name("cudaFlow");
-@endcode
-
-@dotfile images/saxpy_1_cudaflow.dot
-
 @section QuickStartComposeTaskGraphs Compose Task Graphs

 %Taskflow is composable. You can create large parallel graphs through composition of modular and reusable blocks that are easier to optimize at an individual scope.
@@ -194,30 +171,6 @@ executor.wait_for_all();

-@section QuickStartRunATaskflowThroughAnExecution Run a Taskflow through an Executor
-
-The executor provides several @em thread-safe methods to run a taskflow.
-You can run a taskflow once, multiple times, or until a stopping criteria is met.
-These methods are non-blocking with a @c tf::Future return
-to let you query the execution status.
-
-@code{.cpp}
-// runs the taskflow once
-tf::Future<void> run_once = executor.run(taskflow);
-
-// wait on this run to finish
-run_once.get();
-
-// run the taskflow four times
-executor.run_n(taskflow, 4);
-
-// runs the taskflow five times
-executor.run_until(taskflow, [counter=5](){ return --counter == 0; });
-
-// blocks the executor until all submitted taskflows complete
-executor.wait_for_all();
-@endcode
-
 @section QuickStartLeverageStandardParallelAlgorithms Leverage Standard Parallel Algorithms

 %Taskflow defines algorithms for you to quickly express common parallel patterns
@@ -259,6 +212,61 @@ taskflow.composed_of(pl)
 executor.run(taskflow).wait();
 @endcode

+@section QuickStartRunATaskflowThroughAnExecution Run a Taskflow through an Executor
+
+The executor provides several @em thread-safe methods to run a taskflow.
+You can run a taskflow once, multiple times, or until a stopping criterion is met.
+These methods are non-blocking with a @c tf::Future return
+to let you query the execution status.
+
+@code{.cpp}
+// runs the taskflow once
+tf::Future<void> run_once = executor.run(taskflow);
+
+// wait on this run to finish
+run_once.get();
+
+// run the taskflow four times
+executor.run_n(taskflow, 4);
+
+// runs the taskflow five times
+executor.run_until(taskflow, [counter=5](){ return --counter == 0; });
+
+// blocks the executor until all submitted taskflows complete
+executor.wait_for_all();
+@endcode
+
+@section QuickStartOffloadTasksToGPU Offload Tasks to a GPU
+
+%Taskflow supports GPU tasking for you to accelerate a wide range of scientific computing applications by harnessing the power of CPU-GPU collaborative computing using Nvidia CUDA Graph.
+
+@code{.cpp}
+__global__ void saxpy(int n, float a, float *x, float *y) {
+  int i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i < n) {
+    y[i] = a*x[i] + y[i];
+  }
+}
+// create a CUDA Graph task
+tf::Task cudaflow = taskflow.emplace([&]() {
+  tf::cudaGraph cg;
+  tf::cudaTask h2d_x = cg.copy(dx, hx.data(), N);
+  tf::cudaTask h2d_y = cg.copy(dy, hy.data(), N);
+  tf::cudaTask d2h_x = cg.copy(hx.data(), dx, N);
+  tf::cudaTask d2h_y = cg.copy(hy.data(), dy, N);
+  tf::cudaTask saxpy = cg.kernel((N+255)/256, 256, 0, saxpy, N, 2.0f, dx, dy);
+  saxpy.succeed(h2d_x, h2d_y)
+       .precede(d2h_x, d2h_y);
+
+  // instantiate an executable CUDA graph and run it through a stream
+  tf::cudaGraphExec exec(cg);
+  tf::cudaStream stream;
+  stream.run(exec).synchronize();
+}).name("CUDA Graph Task");
+@endcode
+
+@dotfile images/saxpy_1_cudaflow.dot
+
 @section QuickStartVisualizeATaskflow Visualize Taskflow Graphs

 You can dump a taskflow graph to a DOT format and visualize it
 using a number of free GraphViz tools such as @GraphVizOnline.
@code{.cpp} tf::Taskflow taskflow; -tf::Task A = taskflow.emplace([] () {}).name("A"); -tf::Task B = taskflow.emplace([] () {}).name("B"); -tf::Task C = taskflow.emplace([] () {}).name("C"); -tf::Task D = taskflow.emplace([] () {}).name("D"); -tf::Task E = taskflow.emplace([] () {}).name("E"); +tf::Task A = taskflow.emplace([](){}).name("A"); +tf::Task B = taskflow.emplace([](){}).name("B"); +tf::Task C = taskflow.emplace([](){}).name("C"); +tf::Task D = taskflow.emplace([](){}).name("D"); +tf::Task E = taskflow.emplace([](){}).name("E"); A.precede(B, C, E); C.precede(D); B.precede(D, E); @@ -289,14 +297,18 @@ To use %Taskflow, you only need a compiler that supports C++17: @li GNU C++ Compiler at least v8.4 with -std=c++17 @li Clang C++ Compiler at least v6.0 with -std=c++17 -@li Microsoft Visual Studio at least v19.27 with /std:c++17 -@li AppleClang Xcode Version at least v12.0 with -std=c++17 +@li Microsoft Visual Studio at least v19.14 with /std:c++17 +@li Apple Clang Xcode Version at least v12.0 with -std=c++17 @li Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17 @li Intel C++ Compiler at least v19.0.1 with -std=c++17 @li Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 and SYCL20 %Taskflow works on Linux, Windows, and Mac OS X. +@attention +Although %Taskflow supports primarily C++17, you can enable C++20 compilation +through `-std=c++20` (or `/std:c++20` for MSVC) to achieve better performance due to new C++20 features. + @section QuickStartGetInvolved Get Involved Visit our @ProjectWebsite and @ShowcasePresentation @@ -311,7 +323,7 @@ to learn more about %Taskflow. To get involved: We are committed to support trustworthy developments for both academic and industrial research projects in parallel and heterogeneous computing. -If you are using %Taskflow, please cite the following paper we publised at 2022 IEEE TPDS: +If you are using %Taskflow, please cite the following paper we published at 2022 IEEE TPDS: + Tsung-Wei Huang, Dian-Lun Lin, Chun-Xun Lin, and Yibo Lin, "[Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System](https://tsung-wei-huang.github.io/papers/tpds21-taskflow.pdf)," IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 6, pp. 1303-1320, June 2022 @@ -321,7 +333,7 @@ the following organizations for sponsoring the %Taskflow project! 
 |  |  |  |  |
 |:--------:|:--------:|:--------:|:--------:|
 |@image html "images/utah-ece-logo.png" |@image html "images/nsf.png"|@image html "images/darpa.png"|@image html "images/NumFocus.png"|
-|@image html "images/nvidia-logo.png" | | | |
+|@image html "images/nvidia-logo.png" | @image html "images/uw-madison-ece-logo.png" | | |

diff --git a/doxygen/__pycache__/conf.cpython-310.pyc b/doxygen/__pycache__/conf.cpython-310.pyc
deleted file mode 100644
index b1c3a8df7..000000000
Binary files a/doxygen/__pycache__/conf.cpython-310.pyc and /dev/null differ

diff --git a/doxygen/algorithms/algorithms.dox b/doxygen/algorithms/algorithms.dox
index 28e29a61d..e86872a79 100644
--- a/doxygen/algorithms/algorithms.dox
+++ b/doxygen/algorithms/algorithms.dox
@@ -11,6 +11,7 @@ namespace tf {
   + @subpage ParallelSort
   + @subpage ParallelScan
   + @subpage ParallelFind
+  + @subpage ModuleAlgorithm
   + @subpage TaskParallelPipeline
   + @subpage TaskParallelScalablePipeline
   + @subpage TaskParallelPipelineWithTokenDependencies

diff --git a/doxygen/algorithms/data_pipeline.dox b/doxygen/algorithms/data_pipeline.dox
index 07196fa00..b88a43691 100644
--- a/doxygen/algorithms/data_pipeline.dox
+++ b/doxygen/algorithms/data_pipeline.dox
@@ -33,8 +33,7 @@ The following example creates a data-parallel pipeline that generates
 a total of five dataflow tokens
 from `void` to `int` at the first stage,
 from `int` to `%std::string` at the second stage,
-from `%std::string` to `float` at the third stage,
-and `float` to `void` at the final stage.
+and `%std::string` to `void` at the final stage.
 Data storage between stages is automatically managed by tf::DataPipeline.

 @code{.cpp}
 #include <taskflow/taskflow.hpp>

 int main() {

-  // data flow => void -> int -> std::string -> float -> void
+  // data flow => void -> int -> std::string -> void
   tf::Taskflow taskflow("pipeline");
   tf::Executor executor;
@@ -63,7 +62,7 @@ int main() {
   }),

   tf::make_data_pipe<int, std::string>(tf::PipeType::SERIAL, [](int& input) {
-    printf("second pipe returns a strong of %d\n", input + 100);
+    printf("second pipe returns a string of %d\n", input + 100);
     return std::to_string(input + 100);
   }),
@@ -124,7 +123,7 @@ tf::make_data_pipe(
 )
 @endcode

-@note
+@attention
 By default, tf::DataPipeline passes the data in reference to your callable at which you can take
 it in copy or in reference depending on application needs.

diff --git a/doxygen/algorithms/find.dox b/doxygen/algorithms/find.dox
index 1126d37a8..f1e77e9df 100644
--- a/doxygen/algorithms/find.dox
+++ b/doxygen/algorithms/find.dox
@@ -25,10 +25,10 @@ The algorithm returns an iterator to the first found element in the range
 or returns @c last if there is no such iterator.
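Before the API list that follows, a minimal sketch of the first variant; the container contents and the predicate are illustrative only, and the partitioner argument is left at its default:

@code{.cpp}
std::vector<int> vec{1, 9, 22, 3, -6, 13};
std::vector<int>::iterator result;

// find the first even element in parallel
taskflow.find_if(vec.begin(), vec.end(), result,
  [](int i){ return i % 2 == 0; }
);

executor.run(taskflow).wait();
assert(result == vec.begin() + 2);  // points to 22
@endcode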
 %Taskflow provides the following parallel-find algorithms:
-+ tf::Taskflow::find_if(B first, E last, T& result, UOP predicate, P&& part)
-+ tf::Taskflow::find_if_not(B first, E last, T& result, UOP predicate, P&& part)
-+ tf::Taskflow::min_element(B first, E last, T& result, C comp, P&& part)
-+ tf::Taskflow::max_element(B first, E last, T& result, C comp, P&& part)
++ tf::Taskflow::find_if(B first, E last, T& result, UOP predicate, P part)
++ tf::Taskflow::find_if_not(B first, E last, T& result, UOP predicate, P part)
++ tf::Taskflow::min_element(B first, E last, T& result, C comp, P part)
++ tf::Taskflow::max_element(B first, E last, T& result, C comp, P part)

 @section CreateAParallelFindIfTask Create a Parallel Find-If Task
@@ -174,7 +174,7 @@ executor.run(taskflow).wait();
 assert(*result == 2);
 @endcode

-@note
+@attention
 When using tf::Taskflow::max_element to find the largest element,
 we will still need to use std::less as our comparison function. Details can be referred to
@@ -195,8 +195,9 @@ another one with the guided partitioning algorithm:

 std::vector<int> vec(1024, -1);
 std::vector<int>::iterator result;

-tf::ExecutionPolicy static_partitioner;
-tf::ExecutionPolicy guided_partitioner;
+// create two partitioners with a chunk size of 10
+tf::StaticPartitioner static_partitioner(10);
+tf::GuidedPartitioner guided_partitioner(10);

 // create a parallel-find task with a static partitioner
 taskflow.find_if(
@@ -209,7 +210,7 @@ taskflow.find_if(
 );
 @endcode

-@note
+@attention
 By default, parallel-find tasks use tf::DefaultPartitioner
 if no partitioner is specified.

diff --git a/doxygen/algorithms/for_each.dox b/doxygen/algorithms/for_each.dox
index 42090a66a..d5399a3c2 100644
--- a/doxygen/algorithms/for_each.dox
+++ b/doxygen/algorithms/for_each.dox
@@ -19,7 +19,7 @@ for using parallel-iteration algorithms.

 @section A1IndexBasedParallelFor Create an Index-based Parallel-Iteration Task

 Index-based parallel-for performs parallel iterations over a range [first, last) with the given @c step size.
-The task created by tf::Taskflow::for_each_index(B first, E last, S step, C callable, P&& part)
+The task created by tf::Taskflow::for_each_index(B first, E last, S step, C callable, P part)
 represents parallel execution of the following loop:

 @code{.cpp}
@@ -47,9 +47,39 @@ In the positive case, the 50 items are 0, 2, 4, 6, 8, ..., 96, 98.
 In the negative case, the 50 items are 100, 98, 96, 94, ..., 4, 2.
 An example of the %Taskflow graph for the positive case under 12 workers is depicted below:
+
 @dotfile images/parallel_for_1.dot

+Instead of explicitly specifying the index range and the callable for each index invocation,
+the overload tf::Taskflow::for_each_by_index(R range, C callable, P part) provides you with a more flexible way to
+iterate over subranges of indices.
+This overload uses tf::IndexRange to partition the range into subranges,
+allowing finer control over how each subrange is processed.
+For instance, the code below does the same thing using two different approaches:
+
+@code{.cpp}
+std::vector<int> data1(100), data2(100);
+
+// Approach 1: initialize data1 using explicit index range
+taskflow.for_each_index(0, 100, 1, [&](int i){ data1[i] = 10; });
+
+// Approach 2: initialize data2 using tf::IndexRange
+tf::IndexRange<int> range(0, 100, 1);
+taskflow.for_each_by_index(range, [&](tf::IndexRange<int> subrange){
+  for(int i=subrange.begin(); i<subrange.end(); i+=subrange.step_size()) {
+    data2[i] = 10;
+  }
+});
+@endcode
+
 @section A1IteratorBasedParallelFor Create an Iterator-based Parallel-Iteration Task

 Iterator-based parallel-for performs parallel iterations over a range specified
 by two STL-styled iterators, @c first and @c last.
-The task created by tf::Taskflow::for_each(B first, E last, C callable, P&& part) represents
+The task created by tf::Taskflow::for_each(B first, E last, C callable, P part) represents
 a parallel execution of the following loop:

 @code{.cpp}
@@ -157,8 +187,9 @@ another one with the guided partitioning algorithm:

 @code{.cpp}
 std::vector<int> vec(1024, 0);

-tf::ExecutionPolicy static_partitioner;
-tf::ExecutionPolicy guided_partitioner;
+// create two partitioners with a chunk size of 10
+tf::StaticPartitioner static_partitioner(10);
+tf::GuidedPartitioner guided_partitioner(10);

 // create a parallel-iteration task with static partitioner
 taskflow.for_each(
@@ -177,7 +208,7 @@ taskflow.for_each(
 );
 @endcode

-@note
+@attention
 By default, parallel-iteration tasks use tf::DefaultPartitioner
 if no partitioner is specified.

diff --git a/doxygen/algorithms/module.dox b/doxygen/algorithms/module.dox
new file mode 100644
index 000000000..77ac93b79
--- /dev/null
+++ b/doxygen/algorithms/module.dox
@@ -0,0 +1,169 @@
+namespace tf {
+
+/** @page ModuleAlgorithm Module Algorithm
+
+%Taskflow provides template methods that let users create reusable building blocks
+called @em modules.
+Users can connect modules together to build more complex parallel algorithms.
+
+@tableofcontents
+
+@section ModuleAlgorithmInclude Include the Header
+
+You need to include the header file, taskflow/algorithm/module.hpp,
+for creating a module task over a schedulable graph target.
+
+@code{.cpp}
+#include <taskflow/algorithm/module.hpp>
+@endcode
+
+@section WhatIsAModuleTask What is a Module Task
+
+Similar to @ref ComposableTasking, but in a more general setting,
+the template function tf::make_module_task
+allows you to create a task over a Taskflow graph that can be executed by an executor.
+This provides a flexible mechanism to encapsulate and reuse complex task logic within your %Taskflow applications.
+The following example demonstrates how to create and launch multiple Taskflow graphs in parallel using asynchronous tasking:
+
+@code{.cpp}
+#include <taskflow/taskflow.hpp>
+#include <taskflow/algorithm/module.hpp>
+
+int main() {
+
+  tf::Executor executor;
+
+  tf::Taskflow A;
+  tf::Taskflow B;
+  tf::Taskflow C;
+  tf::Taskflow D;
+
+  A.emplace([](){ printf("Taskflow A\n"); });
+  B.emplace([](){ printf("Taskflow B\n"); });
+  C.emplace([](){ printf("Taskflow C\n"); });
+  D.emplace([](){ printf("Taskflow D\n"); });
+
+  // launch the four taskflows using asynchronous tasking
+  executor.async(tf::make_module_task(A));
+  executor.async(tf::make_module_task(B));
+  executor.async(tf::make_module_task(C));
+  executor.async(tf::make_module_task(D));
+  executor.wait_for_all();
+
+  return 0;
+}
+@endcode
+
+@dotfile images/module_task_1.dot
+
+Since the four taskflows are launched asynchronously without any dependencies between them,
+we can observe any order of the output messages:
+
+@code{.bash}
+# one possible output
+Taskflow B
+Taskflow C
+Taskflow A
+Taskflow D
+
+# another possible output
+Taskflow D
+Taskflow A
+Taskflow B
+Taskflow C
+@endcode
+
+If you need to enforce dependencies among these four taskflows,
+you can use dependent-async tasks.
+The example below launches the four taskflows sequentially, one after another:
+
+@code{.cpp}
+tf::Executor executor;
+
+tf::Taskflow A;
+tf::Taskflow B;
+tf::Taskflow C;
+tf::Taskflow D;
+
+A.emplace([](){ printf("Taskflow A\n"); });
+B.emplace([](){ printf("Taskflow B\n"); });
+C.emplace([](){ printf("Taskflow C\n"); });
+D.emplace([](){ printf("Taskflow D\n"); });
+
+auto TA = executor.silent_dependent_async(tf::make_module_task(A));
+auto TB = executor.silent_dependent_async(tf::make_module_task(B), TA);
+auto TC = executor.silent_dependent_async(tf::make_module_task(C), TB);
+auto [TD, FD] = executor.dependent_async(tf::make_module_task(D), TC);
+FD.get();
+@endcode
+
+@dotfile images/module_task_2.dot
+
+@code{.bash}
+# dependent-async tasks enforce a sequential execution of the four taskflows
+Taskflow A
+Taskflow B
+Taskflow C
+Taskflow D
+@endcode
+
+The module task maker, tf::make_module_task, operates similarly to tf::Taskflow::composed_of,
+but provides a more general interface that can be used beyond %Taskflow.
+Specifically, the following two approaches achieve equivalent functionality:
+
+@code{.cpp}
+// approach 1: composition using composed_of
+tf::Task m1 = taskflow1.composed_of(taskflow2);
+
+// approach 2: composition using make_module_task
+tf::Task m1 = taskflow1.emplace(tf::make_module_task(taskflow2));
+@endcode
+
+@attention
+Similar to tf::Taskflow::composed_of, tf::make_module_task does not assume ownership of
+the provided taskflow but only holds a soft reference to it.
+You are responsible for ensuring that the encapsulated taskflow remains valid
+throughout its execution.
+
+@section CreateAModuleTaskOverACustomGraph Create a Module Task over a Custom Graph
+
+In addition to encapsulating taskflow graphs, you can create a module task to schedule
+a custom graph target.
+A schedulable target (of type `T`) must define the method `T::graph()` that returns a reference
+to the tf::Graph object managed by `T`.
+The following example defines a custom graph that can be scheduled by making module tasks:
+
+@code{.cpp}
+struct CustomGraph {
+  tf::Graph graph_;
+  CustomGraph() {
+    // use flow builder to inherit all task creation methods in tf::Taskflow
+    tf::FlowBuilder builder(graph_);
+    tf::Task task = builder.emplace([](){
+      std::cout << "a task\n";  // static task
+    });
+  }
+  // returns a reference to the graph for taskflow composition
+  tf::Graph& graph() { return graph_; }
+};
+
+CustomGraph target;
+executor.async(tf::make_module_task(target));
+@endcode
+
+@attention
+Users are responsible for ensuring the given custom graph remains valid throughout its execution.
+The executor does not assume ownership of the custom graph.
+
+*/
+
+}

diff --git a/doxygen/algorithms/partitioner.dox b/doxygen/algorithms/partitioner.dox
index f7336167d..6da6df513 100644
--- a/doxygen/algorithms/partitioner.dox
+++ b/doxygen/algorithms/partitioner.dox
@@ -47,7 +47,7 @@ tf::StaticPartitioner may deliver the best performance.
 On the other hand, if the work unit per iteration is irregular and unbalanced,
 tf::GuidedPartitioner or tf::DynamicPartitioner can outperform tf::StaticPartitioner.

-@note
+@attention
 By default, all parallel algorithms in %Taskflow use tf::DefaultPartitioner,
 which is based on guided scheduling via tf::GuidedPartitioner.
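To make the trade-off concrete, a minimal sketch that runs the same parallel-iteration task under both partitioners (the chunk size 10 mirrors the examples elsewhere in this changeset, and the loop body is illustrative only):

@code{.cpp}
std::vector<int> vec(65536, 0);

// static partitioning: iterations are split into fixed chunks up front,
// which fits loops whose per-iteration cost is uniform
tf::StaticPartitioner static_partitioner(10);
taskflow.for_each(vec.begin(), vec.end(), [](int& v){ v += 1; }, static_partitioner);

// guided partitioning: chunk sizes shrink over time so idle workers can
// pick up the remaining iterations of irregular loops
tf::GuidedPartitioner guided_partitioner(10);
taskflow.for_each(vec.begin(), vec.end(), [](int& v){ v += 1; }, guided_partitioner);
@endcode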
diff --git a/doxygen/algorithms/pipeline.dox b/doxygen/algorithms/pipeline.dox index 66a891868..de278c551 100644 --- a/doxygen/algorithms/pipeline.dox +++ b/doxygen/algorithms/pipeline.dox @@ -50,7 +50,7 @@ a parallel type, where a serial pipe processes data tokens sequentially and a parallel pipe processes different data tokens simultaneously. -@note +@attention Due to the nature of pipeline, %Taskflow requires the first pipe to be a serial type. The pipeline scheduling algorithm operates in a circular fashion with a factor of line count. @@ -139,7 +139,7 @@ Debrief: the pipeline scheduling framework. The taskflow graph of this pipeline example is shown as follows, where 1) one condition task is used to decide which runtime task to run and -2) four runtime tasks is used to schedule tokens at four parallel lines, respectively. +2) four runtime tasks are used to schedule tokens at four parallel lines, respectively. @dotfile images/pipeline_basic_dependency_graph.dot @@ -151,7 +151,7 @@ The following figure shows the data layout of @c buffer. @dotfile images/pipeline_memory_layout.dot -@note +@attention In practice, you may need to add padding to the data type of the buffer or align it with the cacheline size to avoid false sharing. If the data type varies at different pipes, you can use @std_variant to store the @@ -169,7 +169,7 @@ after the callable. As we can see from this example, tf::Pipeline gives you the full control to customize your application data on top of a pipeline scheduling framework. -@note +@attention 1. Calling tf::Pipeflow::stop() not at the first pipe has no effect on the pipeline scheduling. 2. In most cases, std::thread::hardware_concurrency is a good number for line count. diff --git a/doxygen/algorithms/pipeline_with_token_dependencies.dox b/doxygen/algorithms/pipeline_with_token_dependencies.dox index 83261bf16..e17cac504 100644 --- a/doxygen/algorithms/pipeline_with_token_dependencies.dox +++ b/doxygen/algorithms/pipeline_with_token_dependencies.dox @@ -47,41 +47,41 @@ The whole process has the following steps: 1. Token 1 is not a deferred token and then 1 is finished. Now the execution sequence is {1}. 2. Token 2 defers to 8. We insert DT[2]={8} and TD[8]={2}. - The black cicle 2 in the above image illustrates this step. + The black circle 2 in the above image illustrates this step. 3. Token 3 is not a deferred token and then 3 is finished. Now the execution sequence is {1,3}. 4. Token 4 is not a deferred token and then 4 is finished. Now the execution sequence is {1,3,4}. 5. Token 5 defers to 2 and 7. We insert DT[5]={2,7}, TD[2]={5}, and TD[7]={5}. - The black cicle 5 in the above image illustrates this step. + The black circle 5 in the above image illustrates this step. 6. Token 6 is not a deferred token and then 6 is finished. Now the execution sequence is {1,3,4,6}. 7. Token 7 is not a deferred token and then 7 is finished. Now the execution sequence is {1,3,4,6,7}. Since TD[7]={5}, we directly remove 7 from DT[5]. - The black cicle 7 in the above image illustrates this step. + The black circle 7 in the above image illustrates this step. 8. Token 8 is not a deferred token and then 8 is finished. Now the execution sequence is {1,3,4,6,7,8}. Since TD[8]={2}, we directly remove 8 from DT[2] and find out DT[2] is empty. Now token 2 is no longer a deferred token and we move 2 to RT. - The black cicle 8 in the above image illustrates this step. + The black circle 8 in the above image illustrates this step. 9. RT is not empty and has a token 2. 
Then we finish running 2. Now the execution sequence is {1,3,4,6,7,8,2}. Since TD[2]={5}, we directly remove 2 from DT[5] and find out DT[5] is empty. Now token 5 is no longer a deferred token and we move 5 to RT. - The black cicle 9 in the above image illustrates this step. + The black circle 9 in the above image illustrates this step. 10. RT is not empty and has a token 5. Then we run 5 and find out token 5 defers the second time, defers to 9. We insert DT[5]={9} and TD[9]={5}. - The black cicle 20 in the above image illustrates this step. + The black circle 10 in the above image illustrates this step. 11. Token 9 is not a deferred token and then 9 is finished. Now the execution sequence is {1,3,4,6,7,8,2,9}. Since TD[9]={5}, we directly remove 9 from DT[5] and find out DT[5] is empty. Now token 5 is no longer a deferred token and we move 5 to RT. - The black cicle 11 in the above image illustrates this step. + The black circle 11 in the above image illustrates this step. 12. RT is not empty and has a token 5. Then we finish running 5. Now the execution sequence is {1,3,4,6,7,8,2,9,5}. - The black cicle 12 in the above image illustrates this step. + The black circle 12 in the above image illustrates this step. 13. Token 10 is not a deferred token and then 10 is finished. Now the execution sequence is {1,3,4,6,7,8,2,9,5,10}. @@ -199,7 +199,7 @@ Debrief: @li Line 64 defines the pipeline taskflow graph using composition @li Line 67 executes the taskflow -The following is one of the possible outcomes of the exmaple. +The following is one of the possible outcomes of the example. @code{.bash} stage 1: Non-deferred token 0 @@ -242,7 +242,7 @@ stage 3: input token 10 @endcode -@note +@attention You can only specify the token dependencies at the first pipe to get the serial execution of tokens. diff --git a/doxygen/algorithms/reduce.dox b/doxygen/algorithms/reduce.dox index 30c47d527..53055eaa6 100644 --- a/doxygen/algorithms/reduce.dox +++ b/doxygen/algorithms/reduce.dox @@ -18,7 +18,7 @@ for creating a parallel-reduction task. @section A2ParallelReduction Create a Parallel-Reduction Task The reduction task created by -tf::Taskflow::reduce(B first, E last, T& result, O bop, P&& part) performs +tf::Taskflow::reduce(B first, E last, T& result, O bop, P part) performs parallel reduction over a range of elements specified by [first, last) using the binary operator @c bop and stores the reduced result in @c result. It represents the parallel execution of the following reduction loop: @@ -93,7 +93,7 @@ as a result of passing iterators by reference. It is common to transform each element into a new data type and then perform reduction on the transformed elements. %Taskflow provides a method, -tf::Taskflow::transform_reduce(B first, E last, T& result, BOP bop, UOP uop, P&& part), +tf::Taskflow::transform_reduce(B first, E last, T& result, BOP bop, UOP uop, P part), that applies @c uop to transform each element in the specified range and then perform parallel reduction over @c result and transformed elements. It represents the parallel execution of the following reduction loop: @@ -128,7 +128,49 @@ It is possible that the binary operator will take @em r-value in both arguments, When data passing is expensive, you may define the result type @c T to be move-constructible.
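+
+As an editorial illustration (a hedged sketch, not from the original file), a typical tf::Taskflow::transform_reduce call sums the lengths of a set of strings, where @c uop transforms each element and @c bop combines the transformed values together with the initial value of the result:
+
+@code{.cpp}
+tf::Executor executor;
+tf::Taskflow taskflow;
+
+std::vector<std::string> words = {"hi", "taskflow", "rocks"};
+size_t total = 0;  // initial value participates in the reduction
+
+taskflow.transform_reduce(
+  words.begin(), words.end(), total,
+  std::plus<size_t>{},                          // bop: reduce transformed values
+  [](const std::string& w){ return w.size(); }  // uop: transform each element
+);
+
+executor.run(taskflow).wait();
+assert(total == 15);  // 2 + 8 + 5
+@endcode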
-@section ParallelReductionCfigureAPartitioner Configure a Partitioner +@section ParallelReductionCreateAReduceByIndexTask Create a Reduce-by-Index Task + +Unlike @c tf::Taskflow::reduce, the @c tf::Taskflow::reduce_by_index function lets you perform a +parallel reduction over an index range, but with more control over how each part of the range is processed. +This is useful when you need to customize the reduction process for each subrange +or you want to incorporate optimizations like SIMD. +The example below performs a sum-reduction over all elements in @c data with @c res: + +@code{.cpp} +const size_t N = 100000; +std::vector<double> data(N); +double res = 1.0; +taskflow.reduce_by_index( + // index range + tf::IndexRange<size_t>(0, N, 1), + // final result + res, + // local reducer + [&](tf::IndexRange<size_t> subrange, std::optional<double> running_total) { + double residual = running_total ? *running_total : 0.0; + for(size_t i=subrange.begin(); i<subrange.end(); i+=subrange.step_size()) { + data[i] = 1.0; // initialize the element to one + residual += data[i]; + } + return residual; + }, + // global reducer + std::plus<double>() +); + +executor.run(taskflow).wait(); +assert(res == 100001); +@endcode + +The local reducer @c lop computes a partial sum for each subrange, +and the global reducer @c gop combines the partial results into the final result and stores it in @c res, +whose initial value (i.e., @c 1.0 here) also participates in the reduction process. +The second argument of the local reducer is a @std_optional type, which indicates the partial sum +accumulated up to this subrange. +The first subrange does not have any partial sum, since there is no running total from previous +subranges (i.e., @c running_total is @std_nullopt). + +@section ParallelReductionConfigureAPartitioner Configure a Partitioner You can configure a partitioner for parallel-reduction tasks to run with different scheduling methods, such as guided partitioning, dynamic partitioning, and static partitioning. @@ -156,7 +198,7 @@ taskflow.reduce(vec.begin(), vec.end(), sum2, ); @endcode -@note +@attention By default, parallel-reduction tasks use tf::DefaultPartitioner if no partitioner is specified. diff --git a/doxygen/algorithms/sort.dox b/doxygen/algorithms/sort.dox index c19ff2c72..da3d3803b 100644 --- a/doxygen/algorithms/sort.dox +++ b/doxygen/algorithms/sort.dox @@ -37,7 +37,7 @@ executor.run(taskflow).wait(); assert(std::is_sorted(data.begin(), data.end())); @endcode -@note +@attention Elements are compared using the operator @c <. @section SortARangeOfItemsWithACustomComparator Sort a Range of Items with a Custom Comparator @@ -61,7 +61,7 @@ executor.run(taskflow).wait(); assert(std::is_sorted(data.begin(), data.end(), std::greater{})); @endcode -@note +@attention tf::Taskflow::sort is not stable. That is, two or more objects with equal keys may not appear in the same order after sorting. diff --git a/doxygen/algorithms/transform.dox b/doxygen/algorithms/transform.dox index f778a41cf..3434008ab 100644 --- a/doxygen/algorithms/transform.dox +++ b/doxygen/algorithms/transform.dox @@ -20,7 +20,7 @@ for creating a parallel-transform task. Parallel-transform transforms a range of items, possibly with a different type for the transformed data, and stores the result in another range.
-The task created by tf::Taskflow::transform(B first1, E last1, O d_first, C c, P&& part) +The task created by tf::Taskflow::transform(B first1, E last1, O d_first, C c, P part) is equivalent to a parallel execution of the following loop: @code{.cpp} @@ -65,7 +65,7 @@ tf::Task init = taskflow.emplace([&](){ d_first = tgt.begin(); }); -tf::Task transform = taskflow.for_each( +tf::Task transform = taskflow.transform( std::ref(first), std::ref(last), std::ref(d_first), [&](int i) { std::cout << "transforming item " << i << " to " << i + 1 << '\n'; @@ -86,7 +86,7 @@ in another range starting at @c d_first. @section ParallelBinaryTransformsOverARange Create a Binary Parallel-Transform Task You can use the overload, -tf::Taskflow::transform(B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part), +tf::Taskflow::transform(B1 first1, E1 last1, B2 first2, O d_first, C c, P part), to perform parallel transforms on two source ranges pointed by @c first1 and @c first2 using the binary operator @c c @@ -151,7 +151,7 @@ taskflow.transform( ); @endcode -@note +@attention By default, parallel-transform tasks use tf::DefaultPartitioner if no partitioner is specified. diff --git a/doxygen/conf.py b/doxygen/conf.py index 94914f618..188a89d5b 100644 --- a/doxygen/conf.py +++ b/doxygen/conf.py @@ -3,7 +3,7 @@ MAIN_PROJECT_URL = 'https://taskflow.github.io' #HTML_EXTRA_STYLESHEET = ['taskflow.css'] VERSION_LABELS = True -FINE_PRINT = """

-    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2023.
-    Generated by Doxygen {doxygen_version} and m.css.
-
-    """
+FINE_PRINT = """
+
+    Taskflow handbook is part of the Taskflow project, copyright © Dr. Tsung-Wei Huang, 2018–2025.
+    Generated by Doxygen {doxygen_version} and m.css.
+
    """ LINKS_NAVBAR1 = [ ("Handbook", 'pages', []), ("Namespaces", 'namespaces', []) diff --git a/doxygen/contributing/contributors.dox b/doxygen/contributing/contributors.dox index 4c4f55e4e..a0101fe0e 100644 --- a/doxygen/contributing/contributors.dox +++ b/doxygen/contributing/contributors.dox @@ -9,16 +9,21 @@ namespace tf { We are grateful for the following contributors (alphabetic order) to the %Taskflow project: @li Alexander Neumann: made %Taskflow importable from external CMake projects + @li Andatr: improved the hashing performance in freelist + @li Anesthesia4: added unit tests for parallel-transform algorithms @li Antony Chan: added unit tests for parallel-transform algorithms @li Andreas Olofsson: supported the %Taskflow project through the DARPA IDEA program @li Aaron Boxer: fixed compiler warning caused by unsigned-signed conversion - @li Benson Muite: fixed compilation errors of the wavefront benchmark + @li Wolfgang Bangerth: fixed the redundant `nullptr` check + @li Benson Muite: fixed compilation errors of the BFS benchmark @li Cheng-Hsiang Chiu: improved the documentation, fixes typos, and test code examples @li Chandrahas Pundru: implemented cancellation of submitted taskflows @li Chun-Xun Lin: co-created the %Taskflow project and designed the core functionalities + @li Conrad Jones: added cancellation query support from the runtime task @li Craffael: improved the CMake to allow relocatable installation - @li Dan Kersten: designed an interface to allow customizing worker behaviors upon their creation in an executor - @li Daniel Jour: improved cmake through out-of-tree builds and designed the semaphore interface + @li Dan Kersten: designed an interface to allow customizing worker behaviors + @li Daniel Jour: improved cmake via out-of-tree builds and designed the semaphore interface @li Dian-Lun Lin: applied %Taskflow to win the champion award of the IEEE HPEC 2020 %Graph Challenge + @li Evgeny Gorodetskiy: fixed task queue compilation error due to wrong macro locations @li Filip Strugar: fixed the bugs in fire-and-get taskflow execution and parallel algorithms @li Foge Mistress: helped design the executor interface to avoid over-subscribed threads @li Francisco Facioni: improved the interface of %Taskflow exception support through macro @@ -27,14 +32,17 @@ We are grateful for the following contributors (alphabetic order) to the %Taskfl @li Guannan Guo: benchmarked different scheduling algorithms and architectures @li Hjxy2012: fixed the compilation error in nvcc due to removed features in C++17 @li Hoildkv: fixed documentation errors in explaining the observer interface of executor - @li Jean Michael: integrated %Taskflow to the OSSIA project and reported feedback in comparison to TBB + @li Isaac Yousuf: fixed the bug in exception handling for worker loop + @li Jean Michael: integrated %Taskflow to the OSSIA project @li Jiawei Liu: fixed typos in the documentation + @li Junlian Gilbey: added the explicit link to libatomic on some architectures @li Junlin Huang: fixed the erroneous template argument in serializer and deserializer @li KingDuckZ: helped discover memory leak in the object pool @li Levi Armstrong: added threads target to the CMake file as an interface library @li Lily: helped added %Taskflow to the MS vcpkg project @li Longpractice: fixed the MS compilation error for launch-loop algorithm @li Lukas Burgholzer: improved the MAC OS compatibility with the standard variant library + @li Lukasz Wojakowski: identified delayed execution bug in module task @li Luke 
Majors: implemented a sanitizer algorithm to sanitize deadlock control-flow tasks @li McKay Mower: implemented a sanitizer algorithm to sanitize non-reachable control-flow tasks @li Mamy Ratsimbazafy: fixed the reference link error in the documentation @@ -43,8 +51,9 @@ We are grateful for the following contributors (alphabetic order) to the %Taskfl @li Matthew Powelson: fixed the installation error in the cmake script @li Maxi-git: improved the scheduler by removing redundant iterations in the busy stealing loop @li Nate: fixed the compilation error of priority task queue on MS platforms - @li Netcan: designed a domain-specific graph language to simplify the creation of taskflows @li Nan Xiao: fixed compilation error of unit tests on the Arch platform + @li Netcan: designed a domain-specific graph language to simplify the creation of taskflows + @li Nevin: fixed the macro crash in Windows @li Ojas Mithbavkar: implemented cancellation of submitted taskflows @li Pancpp: removed hard-coded installation prefix with relative install path @li Paolo Bolzoni: helped remove extraneous semicolons to suppress extra warning @@ -54,7 +63,8 @@ We are grateful for the following contributors (alphabetic order) to the %Taskfl @li Zizheng Xiong: added data-parallel programming models through GSoC 2022 @li Pursche: fixed compilation warning on MSVC @li Remi Bedard-Couture: added big object compilation support on MSVC - @li Robin Soderholm: fixed the runtime error of cudaEvent destructor + @li Robin Soderholm: fixed the runtime error of %cudaEvent destructor + @li Ruixin Huang: fixed bugs in conditional tasking documentation @li Soonho Kong: fixed the compilation warning of unused lambda variables @li Sztergbaum Roman: improved the CMake file to remove global setting @li Timo Heister: fixed documentation typos and integrated %Taskflow to the deal.ii project @@ -63,11 +73,13 @@ We are grateful for the following contributors (alphabetic order) to the %Taskfl @li Vedanta Krishna Bhutani: implemented cancellation of submitted taskflows @li Vlad Serebrennikov: implemented the interface to attach user data in a task @li Vedran Miletic: patched the OS detection utility to include Solaris and illumos - @li Vladimir Vondrus: helped modernize %Taskflow handbook using m.css and make pages mobile-friendly + @li Vladimir Vondrus: helped modernize %Taskflow handbook using m.css @li Vladyslav: fixed comment errors in README.md and examples + @li WiCyn: identified a bug in scheduling condition tasks during run-n @li Yasin Zamani: benchmarked the parallel sort with the TBB baseline @li Yibo Lin: helped design the interface of conditional tasking @li Yilin Qiu: helped implement the dependency removal methods in %Taskflow + @li Yumeno Yan: fixed the C++ macro error in the MSVC environment @li Weile: helped added %Taskflow to the compiler explorer interface @li Zizheng Guo: applied %Taskflow to speed up VLSI timing analysis and shared his feedback @@ -96,7 +108,7 @@ We are grateful for the following organizations and projects that are using %Tas @li GROK: World's Leading Open Source JPEG 2000 Codec @li RavEngine: A fast, easy to use C++17 3D game library for modern computers @li RPGMPacker: CLI program for packaging RPG Maker games in an automated build/deploy pipeline.
-@li Leanify: A lightweight lossless file minifier and optimizer +@li Leanify: A lightweight lossless file compressor @li Xanadu AI: Accelerate simulation using quantum computing @li Operon: Modern C++ framework for Symbolic Regression using Genetic Programming @li Explosion: A modern cross-platform game engine @@ -108,7 +120,7 @@ We are grateful for the following organizations and projects that are using %Tas @li RapidFuxx: Rapid fuzzy string matching in Python using various string metrics @li AtomicDEX: Secure wallet and decentralized exchange rolled into one application @li OOX: Out-of-order task execution library in modern C++ -@li ReAgent: An open end-to-end platform for applied reinforcement learning developed and used at Facebook +@li ReAgent: An open-source platform for applied reinforcement learning developed by Meta @li Beast-Build: A build system built for speed and power @li Gate Sizing: A task-parallel gate sizing algorithm for VLSI design automation @li Shards: A scripting tool to build tools @@ -117,8 +129,9 @@ We are grateful for the following organizations and projects that are using %Tas @li NcEngine: 3D game engine written in C++20 targeting Windows @li AMD Vivao: AMD's software synthesis suite for hardware designs @li ModuleWorks: Industry-proven ModuleWorks CAD/CAM technology into software solutions +@li Nvidia std::exec: Nvidia's implementation for C++26 Standard executor libraries -... more at [GitHub](https://github.com/search?q=taskflow&type=Code). +... more at [GitHub](https://github.com/search?q=taskflow+c%2B%2B&type=commits). Please @ContactUs if we forgot your name! diff --git a/doxygen/contributing/guidelines.dox b/doxygen/contributing/guidelines.dox index 388788eb1..48aaffd79 100644 --- a/doxygen/contributing/guidelines.dox +++ b/doxygen/contributing/guidelines.dox @@ -163,7 +163,7 @@ either take lead or contribute: | Adding Benchmarks | need contributors | enhance the [benchmark pool](https://github.com/taskflow/taskflow/tree/master/benchmarks) to provide more parallel computing instances that can help profile %Taskflow | | Developing Algorithms | need contributors | enhance our generic @ref Algorithms collection by adding more parallel algorithm skeletons that can help developers quickly describe common parallel workloads (e.g., C++ 17/20 parallel algorithms) | | Developing Kernels Algorithms | need contributors | enhance our %cudaFlow by providing common GPU kernels (e.g., reduce, sort, scan, prefix_sum, etc.) 
that developers can quickly leverage when describing GPU work using cudaFlows | -| Integrating OpenCL| need leaders | design another task type, @em clFlow, to support OpenCL in a task-graph fasion and schedule OpenCL tasks using graph parallelism | +| Integrating OpenCL| need leaders | design another task type, @em clFlow, to support OpenCL in a task-graph fashion and schedule OpenCL tasks using graph parallelism | | Supporting pipeline | need leaders | design a tasking interface to support pipeline of a data stream over a taskflow graph, where we may resemble [tbb::parallel_pipeline](https://www.threadingbuildingblocks.org/docs/help/tbb_userguide/Working_on_the_Assembly_Line_pipeline.html) | | Diagnosing %Taskflow | need contributors | devise API and algorithms to diagnose if the given taskflow is properly conditioned under our @ref TaskSchedulingPolicy, for example, tf::Taskflow::diagnose, under two modes, before running and on the running | diff --git a/doxygen/cookbook/Cookbook.dox b/doxygen/cookbook/Cookbook.dox index 4ad0bf91e..53ddd76ba 100644 --- a/doxygen/cookbook/Cookbook.dox +++ b/doxygen/cookbook/Cookbook.dox @@ -13,12 +13,10 @@ namespace tf { + @subpage AsyncTasking + @subpage DependentAsyncTasking + @subpage RuntimeTasking - + @subpage PrioritizedTasking + @subpage ExceptionHandling - + @subpage GPUTaskingcudaFlow - + @subpage GPUTaskingcudaFlowCapturer + @subpage LimitTheMaximumConcurrency + @subpage RequestCancellation + + @subpage GPUTasking + @subpage Profiler */ diff --git a/doxygen/cookbook/async_tasking.dox b/doxygen/cookbook/async_tasking.dox index 456881975..81eb3acbe 100644 --- a/doxygen/cookbook/async_tasking.dox +++ b/doxygen/cookbook/async_tasking.dox @@ -9,37 +9,27 @@ so that you can incorporate independent, dynamic parallelism in your taskflows. @section LaunchAsynchronousTasksFromAnExecutor Launch Asynchronous Tasks from an Executor -%Taskflow executor provides an STL-styled method, -tf::Executor::async, -for you to run a callable object asynchronously. -The method returns a @std_future that will eventually hold the result -of that function call. +%Taskflow's executor provides an STL-style method, tf::Executor::async, +that allows you to run a callable object asynchronously. +This method returns a std::future which will eventually hold the result of the function call. @code{.cpp} std::future future = executor.async([](){ return 1; }); assert(future.get() == 1); @endcode -@note -Unlike std::async, the future object returned from tf::Executor::async does not block on destruction -until completing the function. - -If you do not need the return value or use a future to synchronize the execution, -you are encouraged to use tf::Executor::silent_async which returns nothing and thus -has less overhead (i.e., no shared state management) compared to tf::Executor::async. +If you do not need the return value or do not require a std::future for synchronization, +you should use tf::Executor::silent_async. +This method returns nothing and incurs less overhead than tf::Executor::async, +as it avoids the cost of managing a shared state for std::future. @code{.cpp} -executor.silent_async([](){ - // do some work without returning any result -}); +executor.silent_async([](){}); @endcode -Launching asynchronous tasks from an executor is -@em thread-safe and can be called by multiple threads both inside (i.e., worker) -and outside the executor. 
-Our scheduler autonomously detects whether an asynchronous task is submitted -from an external thread or a worker thread and schedules its execution -using work stealing. + +Launching asynchronous tasks from an executor is @em thread-safe and can be invoked from multiple threads, including both worker threads inside the executor and external threads outside of it. +The scheduler automatically detects the source of the submission and employs work-stealing to schedule the task efficiently, ensuring balanced workload distribution across workers. @code{.cpp} tf::Task my_task = taskflow.emplace([&](){ @@ -53,96 +43,17 @@ executor.run(taskflow); executor.wait_for_all(); // wait for all tasks to finish @endcode -@note -Asynchronous tasks created from an executor does not belong to any taskflows. -The lifetime of an asynchronous task is managed automatically by the -executor that creates the task. - -You can name an asynchronous task using the overloads, -tf::Executor::async(const std::string& name, F&& f) and -tf::Executor::silent_async(const std::string& name, F&& f), -that take a string in the first argument. -Assigned names will appear in the observers of the executor. - -@code{.cpp} -std::future fu = executor.async("async task", [](){}); -executor.silent_async("sileng async task", [](){}); -@endcode - -@section LaunchAsynchronousTasksFromAnSubflow Launch Asynchronous Tasks from a Subflow - -You can launch asynchronous tasks from tf::Subflow using -tf::Subflow::async. -Asynchronous tasks are independent tasks spawned -during the execution of a subflow. -When the subflow joins, all asynchronous tasks are guaranteed to finish. -The following code creates 100 asynchronous tasks from a subflow -and joins their executions explicitly using tf::Subflow::join. - -@code{.cpp} -tf::Taskflow taskflow; -tf::Executor executor; - -std::atomic counter{0}; - -taskflow.emplace([&] (tf::Subflow& sf){ - std::vector> futures; - for(int i=0; i<100; i++) { - futures.emplace_back(sf.async([&](){ ++counter; })); - } - sf.join(); // all of the 100 asynchronous tasks will finish by this join - assert(counter == 100); -}); - -executor.run(taskflow).wait(); -@endcode - -If you do not need the return value or the future to synchronize the execution, -you can use tf::Subflow::silent_async which has less overhead -when creating an asynchronous task compared to tf::Subflow::async. - -@code{.cpp} -tf::Taskflow taskflow; -tf::Executor executor; - -std::atomic counter{0}; - -taskflow.emplace([&] (tf::Subflow& sf){ - for(int i=0; i<100; i++) { - sf.silent_async([&](){ ++counter; }); - } - sf.join(); // all of the 100 asynchronous tasks will finish by this join - assert(counter == 100); -}); - -executor.run(taskflow).wait(); -@endcode - @attention -You should only create asynchronous tasks from a joinable subflow. -Launching asynchronous tasks from a detached subflow results in -undefined behavior. +Asynchronous tasks created from an executor do not belong to any taskflow. +Their lifetime is automatically managed by the executor that created them. -You can assign an asynchronous task a name -using the two overloads, tf::Subflow::async(const std::string& name, F&& f) -and tf::Subflow::silent_async(const std::string& name, F&& f). -Both methods take an additional argument of a string. 
- -@code{.cpp} -taskflow.emplace([](tf::Subflow& sf){ - std::future future = sf.async("name of the task", [](){}); - sf.silent_async("another name of the task", [](){}); - sf.join(); -}); -@endcode @section LaunchAsynchronousTasksFromARuntime Launch Asynchronous Tasks from a Runtime -The asynchronous tasking feature of tf::Subflow is indeed derived from tf::Runtime. You can launch asynchronous tasks from tf::Runtime using tf::Runtime::async or tf::Runtime::silent_async. The following code creates 100 asynchronous tasks from a runtime -and joins their executions explicitly using tf::Runtime::corun_all. +and joins their executions explicitly using tf::Runtime::corun. @code{.cpp} tf::Taskflow taskflow; tf::Executor executor; std::atomic<int> counter{0}; taskflow.emplace([&] (tf::Runtime& rt){ for(int i=0; i<100; i++) { rt.silent_async([&](){ ++counter; }); } - rt.join(); // all of the 100 asynchronous tasks will finish by this join + rt.corun(); // all of the 100 asynchronous tasks will finish by this join assert(counter == 100); }); executor.run(taskflow).wait(); @endcode -Unlike tf::Subflow::join, you can call tf::Runtime::corun_all multiple times +Unlike tf::Subflow::join, you can call tf::Runtime::corun multiple times to synchronize the execution of asynchronous tasks between different runs. For example, the following code spawns 100 asynchronous tasks twice and joins each execution to ensure the spawned 100 asynchronous tasks have @@ -177,29 +88,71 @@ taskflow.emplace([&] (tf::Runtime& rt){ for(int i=0; i<100; i++) { rt.silent_async([&](){ ++counter; }); } - rt.join(); // all of the 100 asynchronous tasks will finish by this join + rt.corun(); // all of the 100 asynchronous tasks will finish by this join assert(counter == 100); // spawn another 100 asynchronous tasks and join for(int i=0; i<100; i++) { rt.silent_async([&](){ ++counter; }); } - rt.join(); // all of the 100 asynchronous tasks will finish by this join + rt.corun(); // all of the 100 asynchronous tasks will finish by this join assert(counter == 200); }); executor.run(taskflow).wait(); @endcode By default, tf::Runtime does not join like tf::Subflow. -All pending asynchronous tasks spawned by tf::Runtime -are no longer controllable when their parent runtime disappears. -It is your responsibility to properly synchronize spawned -asynchronous tasks using tf::Runtime::corun_all. - -@note -Creating asynchronous tasks from a runtime allows users to efficiently implement -parallel algorithms using recursion, such as parallel sort (tf::Taskflow::sort), -that demands dynamic parallelism at runtime. +All pending asynchronous tasks spawned from a tf::Runtime become uncontrollable once their +parent runtime goes out of scope. +It is the user's responsibility to explicitly synchronize these tasks using tf::Runtime::corun. + +@attention +Creating asynchronous tasks from a runtime enables efficient implementation of recursive +parallel algorithms, such as tf::Taskflow::sort, that require dynamic task creation at runtime. + +@section LaunchAsynchronousTasksRecursivelyFromARuntime Launch Asynchronous Tasks Recursively from a Runtime + +Asynchronous tasks can take a reference to tf::Runtime, allowing them to recursively launch additional asynchronous tasks. +Combined with tf::Runtime::corun, this enables the implementation of various recursive parallelism patterns, including parallel sort, divide-and-conquer algorithms, and the [fork-join model](https://en.wikipedia.org/wiki/Fork%E2%80%93join_model).
+For instance, the example below demonstrates a parallel recursive implementation of Fibonacci numbers using recursive asynchronous tasking from tf::Runtime: + +@code{.cpp} +#include <taskflow/taskflow.hpp> + +size_t fibonacci(size_t N, tf::Runtime& rt) { + + if(N < 2) return N; + + size_t res1, res2; + rt.silent_async([N, &res1](tf::Runtime& rt1){ res1 = fibonacci(N-1, rt1); }); + + // tail optimization for the right child + res2 = fibonacci(N-2, rt); + + // use corun to avoid blocking the worker while waiting for the two child tasks + // to finish + rt.corun(); + + return res1 + res2; +} + +int main() { + + tf::Executor executor; + + size_t N = 5, res; + executor.silent_async([N, &res](tf::Runtime& rt){ res = fibonacci(N, rt); }); + executor.wait_for_all(); + + std::cout << N << "-th Fibonacci number is " << res << '\n'; + + return 0; +} +@endcode + +The figure below shows the execution diagram, where the suffix *_1 represents the left child spawned by its parent runtime. + +@dotfile images/fibonacci_4_tail_optimized.dot */ diff --git a/doxygen/cookbook/cancellation.dox b/doxygen/cookbook/cancellation.dox index a2a7ab81c..d22eef3a6 100644 --- a/doxygen/cookbook/cancellation.dox +++ b/doxygen/cookbook/cancellation.dox @@ -2,20 +2,16 @@ namespace tf { /** @page RequestCancellation Request Cancellation -This chapters discusses how to cancel submitted tasks. +This chapter discusses how to cancel a running taskflow. @tableofcontents -@section CancelARunningTaskflow Cancel Execution of Taskflows +@section CancelARunningTaskflow Cancel a Running Taskflow -When you submit a taskflow to an executor (e.g., tf::Executor::run), -the executor returns a tf::Future object that will hold the result -of the execution. -tf::Future is a derived class from std::future. -In addition to base methods of std::future, -you can call tf::Future::cancel to cancel the execution of a running taskflow. -The following example cancels a submission of a taskflow that contains -1000 tasks each running one second. +When you submit a taskflow to an executor using the run series (e.g., tf::Executor::run), the executor returns a tf::Future object that holds the result of the execution. +tf::Future is derived from std::future. +In addition to the base methods of std::future, you can call tf::Future::cancel to cancel the execution of a running taskflow. +The following example demonstrates cancelling a submission of a taskflow containing 1000 tasks, each running for one second. @code{.cpp} tf::Executor executor; @@ -34,24 +30,16 @@ tf::Future<void> fu = executor.run(taskflow); fu.cancel(); // wait until the cancellation completes -fu.get(); +fu.wait(); @endcode -@note -tf::Future::cancel is @em non-deterministic and @em out-of-order. - -When you request a cancellation, the executor will stop scheduling -the rest tasks of the taskflow. -Tasks that are already running will continue to finish, -but their successor tasks will not be scheduled to run. -A cancellation is considered complete when all these running tasks finish. -To wait for a cancellation to complete, -you may explicitly call @c tf::Future::get. - -@attention -It is your responsibility to ensure that the taskflow remains alive before the -cancellation completes. - +When you request a cancellation, the executor will stop scheduling the remaining tasks of the taskflow. +Requesting a cancellation does not guarantee an immediate stop of a running taskflow. +Tasks that are already running will continue to finish, +but their successor tasks will not be scheduled.
+A cancellation is considered complete only after all running tasks have finished. +To wait for the cancellation to complete, you can explicitly call tf::Future::wait. +Note that it is your responsibility to ensure that the taskflow remains alive until the cancellation is complete, as there may still be running tasks that cannot be canceled. For instance, the following code results in undefined behavior: @code{.cpp} @@ -70,10 +58,7 @@ tf::Executor executor; } // destroying taskflow here can result in undefined behavior @endcode -The undefined behavior problem exists because tf::Future::cancel does not -guarantee an immediate cancellation. -To fix the problem, call @c get to ensure the cancellation completes -before the end of the scope destroys the taskflow. +To avoid this issue, call @c wait to ensure the cancellation completes before the taskflow is destroyed at the end of the scope. @code{.cpp} tf::Executor executor; @@ -87,16 +72,15 @@ tf::Executor executor; tf::Future<void> fu = executor.run(taskflow); fu.cancel(); // there can still be tasks running after cancellation - fu.get(); // waits until the cancellation completes + fu.wait(); // wait until the cancellation completes } @endcode @section UnderstandTheLimitationsOfCancellation Understand the Limitations of Cancellation -Canceling the execution of a running taskflow has the following limitations: - + Cancellation is non-preemptive. A running task will not be cancelled until it finishes. - + Cancelling a taskflow with tasks - acquiring and/or releasing tf::Semaphore results is currently not supported. +Due to its asynchronous and non-deterministic nature, taskflow cancellation has the following limitations: + + **Non-preemptive behavior**: Cancellation does not forcibly terminate running tasks. Any task already in execution will continue to completion before cancellation takes effect. + + **%Semaphore incompatibility**: Cancelling a taskflow that includes tasks involving tf::Semaphore (i.e., acquiring or releasing) is currently unsupported and may lead to undefined behavior. We may overcome these limitations in future releases. diff --git a/doxygen/cookbook/composable_tasking.dox b/doxygen/cookbook/composable_tasking.dox index 639ce371a..79fdc60ce 100644 --- a/doxygen/cookbook/composable_tasking.dox +++ b/doxygen/cookbook/composable_tasking.dox @@ -65,7 +65,7 @@ Debrief: @li Line 34 enforces the module task to run before task f2D -@section CreateAModuleTask Create a Module Task +@section CreateAModuleTaskFromATaskflow Create a Module Task from a %Taskflow The task created from Taskflow::composed_of is a @em module task that runs on a pre-defined taskflow. @@ -90,7 +90,8 @@ they are associated with the same graph. %Taskflow allows you to create a custom graph object that can participate in the scheduling using composition. To become a module task, -your class `T` must define a method `T::graph()` that returns a reference to a tf::Graph object. +your class `T` must define the method `T::graph()` that returns a reference to the tf::Graph object +managed by `T`.
The following example defines a custom graph object that can be assembled in a taskflow through composition: @code{.cpp} 1: struct CustomGraph { 2: tf::Graph graph; 3: CustomGraph() { - 4: tf::FlowBuilder builder(graph); + 4: tf::FlowBuilder builder(graph); // inherit all task builders in tf::Taskflow 5: tf::Task task = builder.emplace([](){ 6: std::cout << "a task\n"; // static task 7: }); @@ -121,18 +122,13 @@ Debrief: The composition method tf::Taskflow::composed_of requires the target to define the `graph()` method that returns a reference to a tf::Graph object defined by the target. -At runtime, the executor will run dependent tasks in that graph -using the same work-stealing scheduling algorithm as other taskflows. -%Taskflow leverages this powerful feature to design high-level algorithms, -such as tf::Pipeline. - -@note -While %Taskflow gives you the flexibility to create a composable graph object, -you should consider using tf::Graph as an opaque data structure just to interact -with the library. -Additionally, as other module tasks, %Taskflow does not own the lifetime of -a custom composable graph object but keeps a soft mapping to it. -You should keep the graph object alive during its execution. +At runtime, the executor will schedule tasks in that graph +using the same work-stealing algorithm as other taskflows. + +@attention +Users are responsible for ensuring the given target remains valid throughout its execution. +The executor does not assume ownership of the target object. + */ diff --git a/doxygen/cookbook/conditional_tasking.dox b/doxygen/cookbook/conditional_tasking.dox index a6c3acc3f..ab9615511 100644 --- a/doxygen/cookbook/conditional_tasking.dox +++ b/doxygen/cookbook/conditional_tasking.dox @@ -2,19 +2,16 @@ namespace tf { /** @page ConditionalTasking Conditional Tasking -Parallel workloads often require making control-flow decisions across dependent tasks. -%Taskflow supports an very efficient interface of conditional tasking -for users to implement general control flow such as dynamic flow, cycles, and conditionals -that are otherwise difficult to do with existing frameworks. +One of the most powerful features that distinguishes %Taskflow from other systems is its support for conditional tasking, also known as the control taskflow graph (CTFG) programming model. CTFG allows you to embed control flow directly within a taskflow graph, enabling tasks to make decisions dynamically during execution. +This mechanism supports advanced in-graph control flow patterns, such as dynamic branching, loops, and conditionals, which are typically difficult or impossible to express in traditional task graph models. @tableofcontents @section CreateAConditionTask Create a Condition Task -A condition task evalutes a set of instructions and returns an integer index -of the next successor task to execute. -The index is defined with respect to the order of its successor construction. -The following example creates an if-else block using a single condition task. +A condition task returns an integer index indicating which successor task to execute next. +The index corresponds to the position of the successor in the order it was added during task construction. +The following example creates an if-else block using a condition task. @code{.cpp} 1: tf::Taskflow taskflow; @@ -39,16 +36,13 @@ With this order, when @c cond returns 0, the execution moves on to task @c yes. When @c cond returns 1, the execution moves on to task @c no.
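+
+For reference, here is a hypothetical minimal version of such an if-else block (the hunk above elides the full listing; the task names @c init, @c cond, @c yes, and @c no follow the surrounding prose and are illustrative only):
+
+@code{.cpp}
+tf::Taskflow taskflow;
+
+tf::Task init = taskflow.emplace([](){ std::cout << "init\n"; });
+tf::Task cond = taskflow.emplace([](){ return std::rand() % 2; });  // condition task
+tf::Task yes  = taskflow.emplace([](){ std::cout << "yes\n"; });
+tf::Task no   = taskflow.emplace([](){ std::cout << "no\n"; });
+
+init.precede(cond);
+cond.precede(yes, no);  // index 0 goes to yes; index 1 goes to no
+@endcode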
@attention -It is your responsibility to ensure the return of a condition task goes to -a correct successor task. If the return falls beyond the range of the successors, -the executor will not schedule any tasks. +It is your responsibility to ensure that the return value of a condition task corresponds to a valid successor. +If the returned index is out of range, the executor will not schedule any successor tasks. -Condition task can go cyclic to describe @em iterative control flow. -The example below implements a simple yet commonly used feedback loop through -a condition task (line 7-10) that returns -a random binary value. -If the return value from @c cond is @c 0, it loops back to itself, -or otherwise to @c stop. + +A condition task can form a cycle to express @em iterative control flow. +The example below demonstrates a simple yet commonly used feedback loop implemented using a condition task (lines 7–10) that returns a random binary value. +If the return value from @c cond is @c 0, the task loops back to itself; otherwise, it proceeds to @c stop. @code{.cpp} 1: tf::Taskflow taskflow; @@ -72,9 +66,9 @@ or otherwise to @c stop. @dotfile images/conditional-tasking-1.dot -A taskflow of complex control flow often just takes a few lines of code -to implement, and different control flow blocks may run in parallel. -The code below creates another taskflow with three condition tasks. +Creating a taskflow with complex control flow often requires only a few lines of code to implement. +Different control flow paths can execute in parallel, making it easy to express both logic and concurrency. +The code below creates a taskflow with three condition tasks to demonstrate this capability: @code{.cpp} tf::Taskflow taskflow; @@ -112,62 +106,56 @@ cond_3.precede(cond_3, L); // return 0 to 'cond_3' or 1 to 'L' taskflow.dump(std::cout); @endcode -The above code creates three condition tasks: -(1) a condition task @c cond_1 that loops back +The above code creates three condition tasks to implement three different control-flow tasks: + 1. A condition task @c cond_1 that loops back to @c B on returning @c 0, or proceeds to @c E on returning @c 1, -(2) a condition task @c cond_2 that goes to @c G on returning @c 0, + 2. A condition task @c cond_2 that goes to @c G on returning @c 0, or @c H on returning @c 1, -(3) a condition task @c cond_3 that loops back to itself on returning @c 0, + 3. A condition task @c cond_3 that loops back to itself on returning @c 0, or proceeds to @c L on returning @c 1 @dotfile images/conditional-tasking-2.dot -You can use condition tasks to create cycles as long as the graph does not introduce task race during execution. However, cycles are not allowed in non-condition tasks. +In this particular example, we can clearly see the advantage of CTFG: the execution of @c cond_1 can overlap with @c cond_2 or @c cond_3, enabling greater concurrency in control-driven workloads. +Unlike traditional task graph models that require static structure or external orchestration to handle control flow, CTFG allows tasks to make decisions dynamically and continue execution without global synchronization barriers. +This design leads to better parallelism, reduced overhead, and more expressive task graphs, especially in workloads with branching or iterative control flows. 
-@note -Conditional tasking lets you make in-task control-flow decisions to -enable @em end-to-end parallelism, -instead of resorting to client-side partition or synchronizing your task graph -at the decision points of control flow. @section TaskSchedulingPolicy Understand our Task-level Scheduling In order to understand how an executor schedules condition tasks, we define two dependency types, strong dependency and weak dependency. -A strong dependency is a preceding link from a non-condition task to -another task. -A weak dependency is a preceding link from a condition task to -another task. -The number of dependents of a task is the sum of strong dependency -and weak dependency. -The table below lists the strong dependency and -weak dependency numbers of each task in the previous example. +A strong dependency is a preceding link from one non-condition task to another task. +A weak dependency is a preceding link from one condition task to another task. +The number of dependencies of a task is the sum of its strong dependencies and weak dependencies. +The table below lists the number of strong dependencies and weak dependencies +of each task in the previous example:
    -| task | strong dependency | weak dependency | dependents | -| :-: | :-: | :-: | | -| A | 0 | 0 | 0 | -| B | 1 | 1 | 2 | -| C | 1 | 0 | 1 | -| D | 1 | 0 | 1 | -| E | 0 | 1 | 1 | -| F | 1 | 0 | 1 | -| G | 0 | 1 | 1 | -| H | 0 | 1 | 1 | -| I | 1 | 0 | 1 | -| K | 1 | 0 | 1 | -| L | 0 | 1 | 1 | -| M | 1 | 0 | 1 | -| cond_1 | 1 | 0 | 1 | -| cond_2 | 1 | 0 | 1 | -| cond_3 | 1 | 1 | 2 | +| task | strong dependency | weak dependency | dependencies | +| :-: | :-: | :-: | | +| A | 0 | 0 | 0 | +| B | 1 | 1 | 2 | +| C | 1 | 0 | 1 | +| D | 1 | 0 | 1 | +| E | 0 | 1 | 1 | +| F | 1 | 0 | 1 | +| G | 0 | 1 | 1 | +| H | 0 | 1 | 1 | +| I | 1 | 0 | 1 | +| K | 1 | 0 | 1 | +| L | 0 | 1 | 1 | +| M | 1 | 0 | 1 | +| cond_1 | 1 | 0 | 1 | +| cond_2 | 1 | 0 | 1 | +| cond_3 | 1 | 1 | 2 |
    -You can query the number of strong dependents, -the number of weak dependents, -and the number of dependents of a task. +You can query the number of strong dependencies, +the number of weak dependencies, +and the number of dependencies of a task. @code{.cpp} 1: tf::Taskflow taskflow; @@ -176,13 +164,13 @@ and the number of dependents of a task. 4: 5: // ... add more tasks and preceding links 6: - 7: std::cout << task.num_dependents() << '\n'; - 8: std::cout << task.num_strong_dependents() << '\n'; - 9: std::cout << task.num_weak_dependents() << '\n'; + 7: std::cout << task.num_predecessors() << '\n'; + 8: std::cout << task.num_strong_dependencies() << '\n'; + 9: std::cout << task.num_weak_dependencies() << '\n'; @endcode When you submit a task to an executor, -the scheduler starts with tasks of zero dependents +the scheduler starts with tasks of zero dependencies (both zero strong and weak dependencies) and continues to execute successive tasks whenever their strong dependencies are met. @@ -192,7 +180,7 @@ and jumps directly to its successors indexed by the return value. @dotfile images/task_level_scheduling.dot -Each task has an @em atomic join counter to keep track of strong dependents +Each task has an @em atomic join counter to keep track of strong dependencies that are met at runtime. When a task completes, the join counter is restored to the task's strong dependency number @@ -216,7 +204,7 @@ If @c cond returns @c 1, the scheduler enqueues @c stop and then moves on. @section AvoidCommonPitfalls Avoid Common Pitfalls -Condition tasks are handy in creasing dynamic and cyclic control flows, +Condition tasks are handy in creating dynamic and cyclic control flows, but they are also easy to make mistakes. It is your responsibility to ensure a taskflow is properly conditioned. Top things to avoid include no source tasks to start with @@ -228,7 +216,7 @@ The figure below shows common pitfalls and their remedies. In the @c error1 scenario, there is no source task for the scheduler to start with, -and the simplest fix is to add a task @c S that has no dependents. +and the simplest fix is to add a task @c S that has no dependencies. In the @c error2 scenario, @c D might be scheduled twice by @c E through the strong dependency and @c C through the weak dependency (on returning @c 1). @@ -277,9 +265,8 @@ cond3.precede(equl3, grtr3); // goes to grtr3 if i>3 @subsection ImplementSwitchControlFlow Implement Switch Control Flow -You can use conditional tasking to implement @em switch control flow. -The following example creates a switch control flow diagram that -executes one of the three cases at random using four condition tasks. +You can use condition tasks to implement @em switch-style control flow. +The following example demonstrates this by creating a switch structure that randomly selects and executes one of three cases using four condition tasks. 
@code{.cpp} tf::Taskflow taskflow; @@ -302,7 +289,7 @@ target.succeed(case1, case2, case3); Assuming @c swcond returns 1, the program outputs: -@code{.shell-session} +@code{.bash} source switch case 2 @@ -366,7 +353,7 @@ cond.precede(body, done); The program outputs: -@code{.shell-session} +@code{.bash} i=0 i++ => i=1 i++ => i=2 @@ -405,7 +392,7 @@ back.precede(cond); The program outputs: -@code{.shell-session} +@code{.bash} i=0 while i<5 i++=0 @@ -495,52 +482,10 @@ executor.run(taskflow).wait(); @dotfile images/multi-condition-task-1.dot -@note +@attention The return type of a multi-condition task is tf::SmallVector, which provides C++ vector-style functionalities but comes with small buffer optimization. -One important application of conditional tasking is implementing -iterative control flow. -You can use multi-condition tasks to create multiple loops that run concurrently. -The following code creates a sequential chain of four loops in which -each loop increments a counter variable ten times. -When the program completes, the value of the counter variable is @c 40. - -@code{.cpp} -tf::Executor executor; -tf::Taskflow taskflow; -std::atomic counter{0}; - -auto loop = [&, i=bool{true}, c = int(0)]() mutable -> tf::SmallVector { - if(i) { - i = false; - return {0, -1}; - } - else { - counter.fetch_add(1, std::memory_order_relaxed); - return {++c < 10 ? 0 : -1}; - } -} -auto A = taskflow.emplace([](){}); -auto B = taskflow.emplace(loop); -auto C = taskflow.emplace(loop); -auto D = taskflow.emplace(loop); - -A.precede(B); -B.precede(B, C); -C.precede(C, D); -D.precede(D); - -executor.run(taskflow).wait(); // counter == 40 -@endcode - -@dotfile images/multi-condition-task-2.dot - -@attention -It is your responsibility to ensure the return of a multi-condition task -goes to a correct successor task. -If a returned index falls outside the successor range of a multi-condition task, -the scheduler will skip that index without doing anything. */ diff --git a/doxygen/cookbook/dependent_async_tasking.dox b/doxygen/cookbook/dependent_async_tasking.dox index 48befaff0..fd40b47b6 100644 --- a/doxygen/cookbook/dependent_async_tasking.dox +++ b/doxygen/cookbook/dependent_async_tasking.dox @@ -3,7 +3,7 @@ namespace tf { /** @page DependentAsyncTasking Asynchronous Tasking with Dependencies This chapters discusses how to create a task graph dynamically -using asynchronous tasks, +using dependent asynchronous (dependent-async) tasks, which is extremely beneficial for workloads that want to (1) explore task graph parallelism out of dynamic control flow or @@ -16,11 +16,10 @@ We recommend that you first read @ref AsyncTasking before digesting this chapter When the construct-and-run model of a task graph is not possible in your application, you can use tf::Executor::dependent_async and tf::Executor::silent_dependent_async -to create a task graph dynamically. -This type of parallelism is also known as on-the-fly task graph parallelism, -which offers great flexibility for expressing dynamic task graph parallelism. +to create a task graph on the fly. +This style of execution is commonly referred to as dynamic task graph parallelism and provides greater flexibility in expressing parallelism that adapts to runtime conditions. 
The example below dynamically creates a task graph of -four dependent async tasks, @c A, @c B, @c C, and @c D, where @c A runs before @c B and @c C +four dependent-async tasks, @c A, @c B, @c C, and @c D, where @c A runs before @c B and @c C and @c D runs after @c B and @c C: @dotfile images/simple.dot @@ -31,11 +30,11 @@ tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }, A); tf::AsyncTask C = executor.silent_dependent_async([](){ printf("C\n"); }, A); auto [D, fuD] = executor.dependent_async([](){ printf("D\n"); }, B, C); -fuD.get(); // wait for D to finish, which in turns means A, B, C finish +fuD.get(); // wait for D to finish, which in turn means A, B, C have finished @endcode Both tf::Executor::dependent_async and tf::Executor::silent_dependent_async -create a task of type tf::AsyncTask to run the given function asynchronously. +create a dependent-async task of type tf::AsyncTask to run the given function asynchronously. Additionally, tf::Executor::dependent_async returns a @std_future that eventually holds the result of the execution. When returning from both calls, the executor has scheduled a worker @@ -61,10 +60,10 @@ tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); tf::AsyncTask C = executor.silent_dependent_async([](){ printf("C\n"); }, A); tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }, A); auto [D, fuD] = executor.dependent_async([](){ printf("D\n"); }, B, C); -fuD.get(); // wait for D to finish, which in turns means A, B, C finish +fuD.get(); // wait for D to finish, which in turn means A, B, C have finished @endcode -In addition to using @std_future to synchronize the execution, +In addition to using @std_future to synchronize the execution at a particular task point, you can use tf::Executor::wait_for_all to wait for all scheduled tasks to finish: @@ -79,35 +78,38 @@ executor.wait_for_all(); @section SpecifyARagneOfDependentAsyncTasks Specify a Range of Dependent Async Tasks -Both tf::Executor::dependent_async(F&& func, Tasks&&... tasks) and -tf::Executor::silent_dependent_async(F&& func, Tasks&&... tasks) +Both tf::Executor::dependent_async and +tf::Executor::silent_dependent_async accept an arbitrary number of tasks in the dependency list. -If the number of dependent tasks is unknown at programming time, +If the number of task dependencies (i.e., predecessors) is unknown at programming time, such as those relying on runtime variables, you can use the following two overloads -to specify dependent tasks in an iterable range [first, last): +to specify predecessor tasks in an iterable range [first, last): + tf::Executor::dependent_async(F&& func, I first, I last) + tf::Executor::silent_dependent_async(F&& func, I first, I last) -The code below creates an asynchronous task that depends on -@c N previously created asynchronous tasks stored in a vector, +The range must be an input iterator whose deferenced type is convertible to tf::AsyncTask. 
+The following example creates a dependent-async task that depends on +@c N previously created dependent-async tasks stored in a vector, where @c N is a runtime variable: @code{.cpp} tf::Executor executor; -std::vector dependents; +std::vector predecessors; for(size_t i=0; i= 1); // main thread holds a shared ownership to A // task A remains alive (i.e., at least one ref count by the main thread) // when being added to the dependency list of async task B tf::AsyncTask B = executor.silent_dependent_async([](){}, A); +assert(B.use_count() >= 1); // main thread holds a shared ownership to B @endcode -Currently, tf::AsyncTask is implemented based on the logic of C++ smart pointer -std::shared_ptr and is considered cheap to copy or move as long as only -a handful of objects own it. -When a worker completes an async task, it will remove the task from the executor, +Currently, tf::AsyncTask is implemented based on C++ smart pointer (std::shared_ptr) +and is considered cheap to copy or move as long as only a handful of objects own it. +When a worker completes a dependent-async task, it will remove the task from the executor, decrementing the number of shared owners by one. If that counter reaches zero, the task is destroyed. @@ -139,7 +142,7 @@ where task @c A runs before task @c B and task @c C: @code{.cpp} tf::Executor executor; -// main thread creates a dependent async task A +// main thread creates a dependent-async task A tf::AsyncTask A = executor.silent_dependent_async([](){}); // spawn a new thread to create an async task B that runs after A @@ -157,32 +160,31 @@ t1.join(); t2.join(); @endcode -Regardless of @c t1 runs before or after @c t2, -the resulting topological order is always correct with the graph definition, -either @c ABC or @c ACB. +Regardless of whether @c t1 runs before or after @c t2, the resulting topological order remains valid with respect to the graph definition. +In this example, either @c ABC or @c ACB is a correct ordering. @section QueryTheComppletionStatusOfDependentAsyncTasks Query the Completion Status of Dependent Async Tasks -When you create a dependent async task, you can query its completion status by tf::AsyncTask::is_done, -which returns @c true upon completion or @c false otherwise. -A completed dependent async task indicates that a worker has executed its associated callable. +When you create a dependent-async task, you can query its completion status using tf::AsyncTask::is_done, +which returns @c true if the task has completed its execution, or @c false otherwise. +A task is considered completed once a worker has finished executing its associated callable. @code{.cpp} -// create a dependent async task that returns 100 +// create a dependent-async task that returns 100 auto [task, fu] = executor.dependent_async([](){ return 100; }); -// loops until the dependent async task completes +// loops until the dependent-async task completes while(!task.is_done()); assert(fu.get() == 100); @endcode -tf::AsyncTask::is_done is useful when you need to wait on the result of a dependent async task +tf::AsyncTask::is_done is useful when you need to wait on the result of a dependent-async task before moving onto the next program instruction. Often, tf::AsyncTask is used together with tf::Executor::corun_until to keep a worker awake in its work-stealing loop to avoid deadlock (see @ref ExecuteATaskflowFromAnInternalWorker for more details). 
For instance, the code below implements the famous Fibonacci sequence using recursive -asynchronous tasking: +dependent-async tasking: @code{.cpp} tf::Executor executor; diff --git a/doxygen/cookbook/exception.dox b/doxygen/cookbook/exception.dox index d5bf4a8e6..5978cbfc0 100644 --- a/doxygen/cookbook/exception.dox +++ b/doxygen/cookbook/exception.dox @@ -27,7 +27,7 @@ catch(const std::runtime_error& e) { } @endcode -@note +@attention As tf::Future is derived from @std_future, it inherits all the exception handling behaviors defined by the C++ standard. @@ -57,7 +57,7 @@ catch(const std::runtime_error& e) { } @endcode -@code{.shell-session} +@code{.bash} ~$ exception on A # execution of taskflow is cancelled after an execution is thrown @endcode @@ -93,11 +93,91 @@ try { executor.run(taskflow).get(); } catch(const std::runtime_error& e) { - // catched either B's or C's exception + // caught either B's or C's exception std::cout << e.what() << std::endl; } @endcode +@section CatchAnExceptionFromASubflow Catch an Exception from a Subflow + +When you join a subflow using tf::Subflow::join, you can catch an exception thrown by +its child tasks. +For example, the following code catches an exception from the child task `A` of the +subflow `sf`: + +@code{.cpp} +tf::Executor executor; +tf::Taskflow taskflow; + +taskflow.emplace([](tf::Subflow& sf) { + tf::Task A = sf.emplace([]() { + std::cout << "Task A\n"; + throw std::runtime_error("exception on A"); + }); + tf::Task B = sf.emplace([]() { + std::cout << "Task B\n"; + }); + A.precede(B); + + // catch the exception + try { + sf.join(); + } + catch(const std::runtime_error& re) { + std::cout << "exception thrown during subflow joining: " << re.what() << '\n'; + } +}); + +executor.run(taskflow).get(); +@endcode + +When an exception is thrown, it will cancel the execution of the parent subflow. +All subsequent tasks that depend on the task that threw the exception will not run. +The above code example has the following output: + +@code{.bash} +Task A +exception thrown during subflow joining: exception on A +@endcode + +An uncaught exception will be propagated to the parent level until it is explicitly caught. +For example, the code below will propagate the exception to the parent of the subflow, +which in this case is its taskflow.
+
+@code{.cpp}
+tf::Executor executor;
+tf::Taskflow taskflow;
+
+taskflow.emplace([](tf::Subflow& sf) {
+  tf::Task A = sf.emplace([]() {
+    std::cout << "Task A\n";
+    throw std::runtime_error("exception on A");
+  });
+  tf::Task B = sf.emplace([]() {
+    std::cout << "Task B\n";
+  });
+  A.precede(B);
+
+  // uncaught exception will propagate to the parent
+  sf.join();
+});
+
+try
+{
+  executor.run(taskflow).get();
+}
+catch (const std::runtime_error& re)
+{
+  std::cout << "exception thrown from running the taskflow: " << re.what() << '\n';
+}
+@endcode
+
+@code{.bash}
+Task A
+exception thrown from running the taskflow: exception on A
+@endcode
+
+
@section CatchAnExceptionFromAnAsyncTask Catch an Exception from an Async Task

Similar to @std_future, tf::Executor::async will store the exception in the shared
@@ -116,7 +196,7 @@ catch(const std::runtime_error& e) {

Running the program will show the exception message on the async task:

-@code{.shell-session}
+@code{.bash}
~$ exception
@endcode

@@ -129,7 +209,7 @@ executor and (1) propagated to its parent task if the parent task exists or

tf::Taskflow taskflow;
tf::Executor executor;

-// execption will be silently ignored
+// exception will be silently ignored
executor.silent_async([](){ throw std::runtime_error("exception"); });

// exception will be propagated to the parent tf::Runtime task and then its Taskflow
@@ -216,6 +296,23 @@ For the above example, if the exception is not caught with tf::Runtime::corun,
it will be propagated to its parent task, which is the tf::Runtime object `rt` in this case.
Then, the exception will be propagated to `taskflow2`.

+@section TurnOffExceptionHandling Turn Off Exception Handling
+
+In some applications, exception handling may not be desirable due to performance concerns, coding style preferences, or platform constraints. %Taskflow allows you to disable exception handling entirely at compile time.
+To do this, simply define the macro `TF_DISABLE_EXCEPTION_HANDLING` when compiling your program:
+
+@code{.bash}
+~$ g++ -DTF_DISABLE_EXCEPTION_HANDLING your_taskflow_prog.cpp
+@endcode
+
+Disabling exception handling removes all try-catch blocks from the %Taskflow runtime, resulting in a leaner binary and potentially faster execution.
+
+@attention
+Disabling exception handling means that %Taskflow will not catch or report runtime exceptions.
+Any exception thrown during execution will propagate unchecked and may cause your program to behave abnormally.
+Use this option only if you are confident that your application does not rely on exception safety.
+
*/

}
diff --git a/doxygen/cookbook/executor.dox b/doxygen/cookbook/executor.dox
index d45802924..5c19d012b 100644
--- a/doxygen/cookbook/executor.dox
+++ b/doxygen/cookbook/executor.dox
@@ -24,9 +24,34 @@ tf::Executor executor1;    // create an executor with the number of workers
tf::Executor executor2(4); // create an executor of 4 worker threads
@endcode

-An executor can be reused to execute multiple taskflows.
-In most workloads, you may need only one executor to run multiple taskflows
-where each taskflow represents a part of a parallel decomposition.
+@attention
+Creating a tf::Executor has non-negligible overhead.
+Unless your application requires multiple executors, we recommend creating a single tf::Executor
+and reusing it to run multiple taskflows.
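A minimal sketch of this single-executor pattern is shown below; the two taskflows and their task bodies are placeholders:

@code{.cpp}
#include <taskflow/taskflow.hpp>

int main() {
  tf::Executor executor;        // one executor for the whole application
  tf::Taskflow phase1, phase2;  // hypothetical phases of a larger computation

  phase1.emplace([](){ /* e.g., load and prepare data */ });
  phase2.emplace([](){ /* e.g., run the computation */ });

  // reuse the same pool of worker threads for both taskflows
  executor.run(phase1).wait();
  executor.run(phase2).wait();
}
@endcode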
+
+@section UnderstandWorkStealingInExecutor Understand Work-stealing in Executor
+
+%Taskflow implements a highly efficient @em work-stealing algorithm to schedule and run tasks in an executor.
+Work-stealing is a dynamic scheduling algorithm widely used in parallel computing to distribute and balance workload
+among multiple threads or cores.
+Specifically, within an executor, each worker maintains its own local queue of tasks.
+When a worker finishes its own tasks, instead of becoming idle or going to sleep, it (the @em thief) tries to @em steal a task
+from the queue of another worker (the @em victim).
+The figure below illustrates the idea of work-stealing:
+
+@image html images/work-stealing.png
+
+The key advantage of work-stealing lies in its *decentralized* nature and efficiency.
+Most of the time, worker threads work on their local queues without contention.
+Stealing only occurs when a worker becomes idle, minimizing the overhead associated with synchronization and task distribution.
+This decentralized strategy effectively balances the workload, ensuring that idle workers are put to work and that the overall computation progresses efficiently.
+
+That being said, the internal scheduling mechanisms in tf::Executor are not trivial,
+and it's not easy to explain every detail in just a few sentences.
+If you're interested in learning more about the technical details, please refer to our paper published in
+2022 *IEEE Transactions on Parallel and Distributed Systems (TPDS)*:
+
++ Tsung-Wei Huang, Dian-Lun Lin, Chun-Xun Lin, and Yibo Lin, "[Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System](https://tsung-wei-huang.github.io/papers/tpds21-taskflow.pdf)," IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 6, pp. 1303-1320, June 2022

@section ExecuteATaskflow Execute a Taskflow

@@ -68,7 +93,7 @@ Debrief:

@li Lines 13-14 run the taskflow once and wait for completion
@li Line 16 runs the taskflow once with a callback to invoke when the execution finishes
@li Lines 17-18 run the taskflow four times and use tf::Executor::wait_for_all to wait for completion
-@li Line 19 runs the taskflow four times and invokes a callback at the end of the forth execution
+@li Line 19 runs the taskflow four times and invokes a callback at the end of the fourth execution
@li Line 20 keeps running the taskflow until the predicate returns true

Issuing multiple runs on the same taskflow will automatically @em synchronize
@@ -98,7 +123,7 @@ tf::Executor executor; // create an executor

// ...
// run the taskflow
-  executor.run(f);
+  executor.run(taskflow);
}
// leaving the scope will destroy taskflow while it is running,
// resulting in undefined behavior
@@ -115,10 +140,10 @@ tf::Taskflow taskflow;

// Declare an executor
tf::Executor executor;

-tf::Future<void> future = taskflow.run(f);  // non-blocking return
+tf::Future<void> future = executor.run(taskflow);  // non-blocking return

// alter the taskflow while running leads to undefined behavior
-f.emplace([](){ std::cout << "Add a new task\n"; });
+taskflow.emplace([](){ std::cout << "Add a new task\n"; });
@endcode

You must always keep a taskflow alive and must not modify it while
@@ -201,11 +226,9 @@ tf::Executor executor(2);
tf::Taskflow taskflow;

std::array<tf::Taskflow, 1000> others;
-std::atomic<size_t> counter{0};
-
for(size_t n=0; n<1000; n++) {
  for(size_t i=0; i<500; i++) {
-    others[n].emplace([&](){ counter++; });
+    others[n].emplace([&](){});
  }
  taskflow.emplace([&executor, &tf=others[n]](){
    // blocking the worker can introduce deadlock where
@@ -263,33 +286,30 @@ taskflow.emplace([&](){

You must call tf::Executor::corun_until and tf::Executor::corun
from a worker of the calling executor or an exception will be thrown.

-@section ThreadSafety Touch an Executor from Multiple Threads
+@section ThreadSafetyOfExecution Thread Safety of Executor

-All @c run\_* methods are @em thread-safe.
-You can have multiple threads call these methods from an executor to run different taskflows.
-However, the order which taskflow runs first is non-deterministic and is up to the
-runtime.
+All `run_*` methods of tf::Executor are @em thread-safe.
+You can safely invoke these methods from multiple threads to run different taskflows concurrently.
+However, the execution order of the submitted taskflows is non-deterministic and determined by the runtime scheduler.

@code{.cpp}
- 1: tf::Executor executor;
- 2:
- 3: for(int i=0; i<10; ++i) {
- 4:   std::thread([i, &](){
- 5:     // ... modify my taskflow at i
- 6:     executor.run(taskflows[i]); // run my taskflow at i
- 7:   }).detach();
- 8: }
- 9:
-10: executor.wait_for_all();
+tf::Executor executor;
+for(int i=0; i<10; ++i) {
+  std::thread([&, i](){
+    // ... modify my taskflow at i
+    executor.run(taskflows[i]); // run my taskflow at i
+  }).detach();
+}
+executor.wait_for_all();
@endcode

@section QueryTheWorkerID Query the Worker ID

-Each worker in an executor has an unique integer identifier in the range
-[0, N) that can be queried by the caller thread using tf::Executor::this_worker_id.
-If the caller thread is not a worker in the executor, @c -1 is returned.
-This method is convenient for users to maintain a one-to-one mapping between
-a worker and its application data structure.
+Each worker thread in a tf::Executor is assigned a *unique* integer identifier in the range [0, N),
+where `N` is the number of worker threads in the executor.
+You can query the identifier of the calling thread using tf::Executor::this_worker_id.
+If the calling thread is not a worker of the executor, the method returns -1.
+This functionality is particularly useful for establishing a one-to-one mapping between worker threads and application-specific data structures.

@code{.cpp}
std::vector<int> worker_vectors[8]; // one vector per worker
@@ -437,6 +457,111 @@ However, the @em ready message always appears before the corresponding task mess
(e.g., numbers) and then the @em finished message.
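To make the observer messages described above concrete, the following is a minimal sketch of attaching an observer to an executor; it assumes the tf::ObserverInterface API (set_up, on_entry, on_exit), and the class name and printed text are illustrative only:

@code{.cpp}
struct EchoObserver : public tf::ObserverInterface {
  void set_up(size_t num_workers) override final {
    std::cout << "observing " << num_workers << " workers\n";
  }
  void on_entry(tf::WorkerView w, tf::TaskView tv) override final {
    std::cout << "worker " << w.id() << " ready to run " << tv.name() << '\n';
  }
  void on_exit(tf::WorkerView w, tf::TaskView tv) override final {
    std::cout << "worker " << w.id() << " finished " << tv.name() << '\n';
  }
};

// attach the observer to a running executor
auto observer = executor.make_observer<EchoObserver>();
@endcode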
+@section ModifyWorkerProperty Modify Worker Property
+
+You can change the property of each worker thread from its executor,
+such as assigning thread-processor affinity before the worker enters the scheduler loop
+and post-processing additional information after the worker leaves the scheduler loop,
+by passing an instance derived from tf::WorkerInterface to the executor.
+The following example demonstrates the usage of tf::WorkerInterface to affine
+a worker to the CPU core whose index equals its worker ID on a Linux platform:
+
+@code{.cpp}
+// affine the given thread to the given core index (Linux-specific)
+bool affine(std::thread& thread, unsigned int core_id) {
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  pthread_t native_handle = thread.native_handle();
+  return pthread_setaffinity_np(native_handle, sizeof(cpu_set_t), &cpuset) == 0;
+}
+
+class CustomWorkerBehavior : public tf::WorkerInterface {
+
+  public:
+
+  // to call before the worker enters the scheduling loop
+  void scheduler_prologue(tf::Worker& w) override {
+    printf("worker %lu prepares to enter the work-stealing loop\n", w.id());
+
+    // now affine the worker to a particular CPU core equal to its id
+    if(affine(w.thread(), w.id())) {
+      printf("successfully affines worker %lu to CPU core %lu\n", w.id(), w.id());
+    }
+    else {
+      printf("failed to affine worker %lu to CPU core %lu\n", w.id(), w.id());
+    }
+  }
+
+  // to call after the worker leaves the scheduling loop
+  void scheduler_epilogue(tf::Worker& w, std::exception_ptr) override {
+    printf("worker %lu left the work-stealing loop\n", w.id());
+  }
+};
+
+int main() {
+  tf::Executor executor(4, tf::make_worker_interface<CustomWorkerBehavior>());
+  return 0;
+}
+@endcode
+
+When running the program, one possible output is shown below:
+
+@code{.bash}
+worker 3 prepares to enter the work-stealing loop
+successfully affines worker 3 to CPU core 3
+worker 3 left the work-stealing loop
+worker 0 prepares to enter the work-stealing loop
+successfully affines worker 0 to CPU core 0
+worker 0 left the work-stealing loop
+worker 1 prepares to enter the work-stealing loop
+worker 2 prepares to enter the work-stealing loop
+successfully affines worker 1 to CPU core 1
+worker 1 left the work-stealing loop
+successfully affines worker 2 to CPU core 2
+worker 2 left the work-stealing loop
+@endcode
+
+
+When you create an executor, it spawns a set of worker threads to run tasks
+using a work-stealing scheduling algorithm.
+The execution logic of the scheduler and its interaction with each spawned worker
+via tf::WorkerInterface is given below:
+
+@code{.cpp}
+for(size_t n=0; n<num_workers; n++) {
+  spawn_this_worker_thread([&](){
+    // enters the scheduling loop
+    // Here, WorkerInterface::scheduler_prologue is invoked, if any
+    worker_interface->scheduler_prologue(worker);
+
+    try {
+      while(1) {
+        perform_work_stealing_algorithm();
+        if(stop) {
+          break;
+        }
+      }
+    } catch(...) {
+      exception_ptr = std::current_exception();
+    }
+
+    // leaves the scheduling loop and joins this worker thread
+    // Here, WorkerInterface::scheduler_epilogue is invoked, if any
+    worker_interface->scheduler_epilogue(worker, exception_ptr);
+  );
+}
+@endcode
+
+@attention
+tf::WorkerInterface::scheduler_prologue and tf::WorkerInterface::scheduler_epilogue
+are invoked concurrently by all workers.
+It is your responsibility to ensure no data race occurs during their invocation.
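For instance, the following is a minimal sketch of guarding shared state touched inside the prologue (the class and member names are illustrative; `<mutex>` and `<vector>` are assumed to be included):

@code{.cpp}
class SafeBookkeeping : public tf::WorkerInterface {

  std::mutex _mutex;             // serializes access to _entered
  std::vector<size_t> _entered;  // shared across all workers

  public:

  void scheduler_prologue(tf::Worker& w) override {
    // prologues run concurrently on all workers; lock before touching shared state
    std::scoped_lock lock(_mutex);
    _entered.push_back(w.id());
  }

  void scheduler_epilogue(tf::Worker&, std::exception_ptr) override {
  }
};
@endcode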
+
*/

}
diff --git a/doxygen/cookbook/gpu_tasking_cudaflow.dox b/doxygen/cookbook/gpu_tasking.dox
similarity index 58%
rename from doxygen/cookbook/gpu_tasking_cudaflow.dox
rename to doxygen/cookbook/gpu_tasking.dox
index fd5d9ff93..091059283 100644
--- a/doxygen/cookbook/gpu_tasking_cudaflow.dox
+++ b/doxygen/cookbook/gpu_tasking.dox
@@ -1,18 +1,18 @@
namespace tf {

-/** @page GPUTaskingcudaFlow GPU Tasking (%cudaFlow)
+/** @page GPUTasking GPU Tasking

Modern scientific computing typically leverages
GPU-powered parallel processing cores to speed up large-scale applications.
This chapter discusses how to implement CPU-GPU heterogeneous tasking algorithms
-with @NvidiaCUDA.
+with Nvidia @cudaGraph.

@tableofcontents

-@section GPUTaskingcudaFlowIncludeTheHeader Include the Header
+@section GPUTaskingIncludeTheHeader Include the Header

You need to include the header file, `%taskflow/cuda/cudaflow.hpp`,
-for creating a GPU task graph using tf::cudaFlow.
+for creating a GPU task graph using tf::cudaGraph.

@code{.cpp}
#include <taskflow/cuda/cudaflow.hpp>
@endcode

@@ -41,18 +41,16 @@ for example, many training epochs in machine learning workloads.
In that case, the initial costs of building and launching the graph
will be amortized over the entire set of training iterations.

-@note
+@attention
A comprehensive introduction to CUDA %Graph can be found in the
[CUDA %Graph Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-graphs).

-@section Create_a_cudaFlow Create a cudaFlow
+@section CreateACUDAGraph Create a CUDA Graph

-%Taskflow leverages @cudaGraph to enable concurrent CPU-GPU tasking
-using a task graph model called tf::cudaFlow.
-A %cudaFlow manages a CUDA graph explicitly
-to execute dependent GPU operations in a single CPU call.
-The following example implements a %cudaFlow that performs
-an saxpy (A·X Plus Y) workload:
+%Taskflow leverages @cudaGraph to enable concurrent CPU-GPU tasking using a task graph model called tf::cudaGraph.
+A tf::cudaGraph is essentially a C++ wrapper over a native CUDA graph, designed to simplify GPU task graph programming
+by eliminating much of the boilerplate code required in raw CUDA %Graph programming.
+The following example creates a CUDA graph to perform the saxpy (A·X Plus Y) workload:

@code{.cpp}
#include <taskflow/cuda/cudaflow.hpp>
@@ -79,33 +77,33 @@ int main() {
  cudaMalloc(&dx, N*sizeof(float));
  cudaMalloc(&dy, N*sizeof(float));

-  tf::cudaFlow cudaflow;
+  tf::cudaGraph cg;

  // create data transfer tasks
-  tf::cudaTask h2d_x = cudaflow.copy(dx, hx.data(), N).name("h2d_x");
-  tf::cudaTask h2d_y = cudaflow.copy(dy, hy.data(), N).name("h2d_y");
-  tf::cudaTask d2h_x = cudaflow.copy(hx.data(), dx, N).name("d2h_x");
-  tf::cudaTask d2h_y = cudaflow.copy(hy.data(), dy, N).name("d2h_y");
+  tf::cudaTask h2d_x = cg.copy(dx, hx.data(), N);
+  tf::cudaTask h2d_y = cg.copy(dy, hy.data(), N);
+  tf::cudaTask d2h_x = cg.copy(hx.data(), dx, N);
+  tf::cudaTask d2h_y = cg.copy(hy.data(), dy, N);

  // launch saxpy<<<(N+255)/256, 256, 0>>>(N, 2.0f, dx, dy)
-  tf::cudaTask kernel = cudaflow.kernel(
+  tf::cudaTask kernel = cg.kernel(
    (N+255)/256, 256, 0, saxpy, N, 2.0f, dx, dy
  ).name("saxpy");

  kernel.succeed(h2d_x, h2d_y)
        .precede(d2h_x, d2h_y);

-  // run the cudaflow through a stream
+  // instantiate a CUDA graph executable and run it through a stream
+  tf::cudaGraphExec exec(cg);
  tf::cudaStream stream;
-  cudaflow.run(stream)
-  stream.synchronize();
+  stream.run(exec).synchronize();

-  // dump the cudaflow
-  cudaflow.dump(std::cout);
+  // dump the graph
+  cg.dump(std::cout);
}
@endcode

-The %cudaFlow graph consists of two CPU-to-GPU data copies (@c h2d_x and @c h2d_y),
+The graph consists of two CPU-to-GPU data copies (@c h2d_x and @c h2d_y),
one kernel (@c saxpy),
and two GPU-to-CPU data copies (@c d2h_x and @c d2h_y),
in this order of their task dependencies.

@@ -115,46 +113,46 @@ in this order of their task dependencies.
We do not expend yet another effort on simplifying kernel programming
but focus on tasking CUDA operations and their dependencies.
-In other words, tf::cudaFlow is a lightweight C++ abstraction over CUDA %Graph.
+That is, tf::cudaGraph is simply a lightweight C++ wrapper over the native CUDA %Graph.
This organization lets users fully take advantage of CUDA features
that are commensurate with their domain knowledge,
while leaving difficult task parallelism details to %Taskflow.

-@section Compile_a_cudaFlow_program Compile a cudaFlow Program
+@section CompileACUDAGraphProgram Compile a CUDA Graph Program

-Use @nvcc to compile a %cudaFlow program:
+Use @nvcc to compile a CUDA %Graph program:

-@code{.shell-session}
-~$ nvcc -std=c++17 my_cudaflow.cu -I path/to/include/taskflow -O2 -o my_cudaflow
+@code{.bash}
+~$ nvcc -std=c++20 my_cudaflow.cu -I path/to/include/taskflow -O2 -o my_cudaflow
~$ ./my_cudaflow
@endcode

Please visit the page @ref CompileTaskflowWithCUDA for more details.

-@section run_a_cudaflow_on_a_specific_gpu Run a cudaFlow on Specific GPU
+@section RunACUDAGraphOnASpecificGPU Run a CUDA Graph on a Specific GPU

-By default, a %cudaFlow runs on the current GPU context associated with the caller,
+By default, a tf::cudaGraph runs on the current GPU context associated with the caller,
which is typically GPU @c 0.
Each CUDA GPU has an integer identifier in the range of [0, N)
to represent the context of that GPU,
where @c N is the number of GPUs in the system.
-You can run a %cudaFlow on a specific GPU by switching the context to a different GPU
+You can run a CUDA graph on a specific GPU by switching the context to a different GPU
using tf::cudaScopedDevice.
-The code below creates a %cudaFlow and runs it on GPU @c 2.
+The code below creates a CUDA graph and runs it on GPU @c 2.
@code{.cpp}
{
  // create an RAII-styled switcher to the context of GPU 2
  tf::cudaScopedDevice context(2);

-  // create a cudaFlow capturer under GPU 2
-  tf::cudaFlowCapturer capturer;
+  // create a CUDA graph under GPU 2
+  tf::cudaGraph graph;
  // ...

  // create a stream under GPU 2 and offload the executable graph to that GPU
  tf::cudaStream stream;
-  capturer.run(stream);
-  stream.synchronize();
+  tf::cudaGraphExec exec(graph);
+  stream.run(exec).synchronize();
}
@endcode

@@ -163,7 +161,7 @@ to the given GPU context.
When the scope is destroyed, it switches back to the original context.

@attention
-tf::cudaScopedDeviceallows you to place a %cudaFlow on a particular GPU device,
+tf::cudaScopedDevice allows you to place a CUDA Graph on a particular GPU device,
but it is your responsibility to ensure correct memory access.
For example, you may not allocate a memory block on GPU @c 2
while accessing it from a kernel on GPU @c 0.
and let the CUDA runtime perform automatic memory migration between GPUs.

@section GPUMemoryOperations Create Memory Operation Tasks

-%cudaFlow provides a set of methods for users to manipulate device memory.
+tf::cudaGraph provides a set of methods for users to manipulate device memory.
There are two categories, @em raw data and @em typed data.
Raw data operations are methods with prefix @c mem,
such as @c memcpy and @c memset, that operate in @em bytes.
For instance, the following three methods have the same result of zeroing

int* target;
cudaMalloc(&target, count*sizeof(int));

-tf::cudaFlow cudaflow;
-memset_target = cudaflow.memset(target, 0, sizeof(int) * count);
-same_as_above = cudaflow.fill(target, 0, count);
-same_as_above_again = cudaflow.zero(target, count);
+tf::cudaGraph cg;
+memset_target = cg.memset(target, 0, sizeof(int) * count);
+same_as_above = cg.fill(target, 0, count);
+same_as_above_again = cg.zero(target, count);
@endcode

-The method tf::cudaFlow::fill is a more powerful variant of tf::cudaFlow::memset.
+The method tf::cudaGraph::fill is a more powerful variant of tf::cudaGraph::memset.
It can fill a memory area with any value of type @c T,
given that sizeof(T) is 1, 2, or 4 bytes.
The following example creates a GPU task to fill @c count elements
in the array @c target with value @c 1234.

cg.fill(target, 1234, count);
@endcode

-Similar concept applies to tf::cudaFlow::memcpy and tf::cudaFlow::copy as well.
+A similar concept applies to tf::cudaGraph::memcpy and tf::cudaGraph::copy as well.
The following two methods are equivalent to each other.

@code{.cpp}
-cudaflow.memcpy(target, source, sizeof(int) * count);
-cudaflow.copy(target, source, count);
+cg.memcpy(target, source, sizeof(int) * count);
+cg.copy(target, source, count);
@endcode

-@section OffloadAcudaFlow Offload a cudaFlow
+@section RunACUDAGraph Run a CUDA Graph

-To offload a %cudaFlow to a GPU, you need to use tf::cudaFlow::run
-and pass a tf::cudaStream created on that GPU.
-The run method is asynchronous and can be explicitly synchronized
-through the given stream.
+To offload a CUDA graph to a GPU, you need to instantiate an executable CUDA graph of tf::cudaGraphExec
+and create a tf::cudaStream to run the executable graph.
+The run method is asynchronous and can be explicitly synchronized on the given stream.

@code{.cpp}
+tf::cudaGraph graph;
+// modify the graph ...
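+// e.g., a hypothetical kernel task; my_kernel, N, and data are placeholders
+// for whatever work the application graph performs
+tf::cudaTask task = graph.kernel((N+255)/256, 256, 0, my_kernel, N, data);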
+
+// create an executable CUDA graph and run it through a stream
+tf::cudaGraphExec exec(graph);
tf::cudaStream stream;
-// launch a cudaflow asynchronously through a stream
-cudaflow.run(stream);
-// wait for the cudaflow to finish
+stream.run(exec);
+
+// wait for the executable cuda graph to finish
stream.synchronize();
@endcode

-When you offload a %cudaFlow using tf::cudaFlow::run,
-the runtime transforms that %cudaFlow (i.e., application GPU task graph)
-into a native executable instance and submit it to the CUDA runtime for execution.
-There is always an one-to-one mapping between
-%cudaFlow and its native CUDA graph representation (except those constructed
-by using tf::cudaFlowCapturer).
+There is always a one-to-one mapping between a tf::cudaGraphExec and its parent CUDA graph
+in terms of its graph structure.
+However, the executable graph is an independent entity and has no lifetime dependency on its parent CUDA graph.
+You can instantiate multiple executable graphs from the same CUDA graph.

-@section UpdateAcudaFlow Update a cudaFlow
+@section UpdateAnExecutableCUDAGraph Update an Executable CUDA Graph

-Many GPU applications require you to launch a %cudaFlow multiple times
-and update node parameters (e.g., kernel parameters and memory addresses)
-between iterations.
-%cudaFlow allows you to update the parameters of created tasks
-and
-run the updated %cudaFlow with new parameters.
-Every task-creation method in tf::cudaFlow has an overload
-to update the parameters of a created task by that method.
+Many GPU applications require launching a CUDA graph multiple times and updating node parameters (e.g., kernel arguments or memory addresses) between iterations.
+tf::cudaGraphExec allows you to update the parameters of tasks created from its parent CUDA graph.
+Every task creation method in tf::cudaGraph has a corresponding method in tf::cudaGraphExec for updating the parameters of that task.

@code{.cpp}
tf::cudaStream stream;
-tf::cudaFlow cf;
+tf::cudaGraph cg;

// create a kernel task
tf::cudaTask task = cg.kernel(grid1, block1, shm1, kernel, kernel_args_1);
-cf.run(stream);
-stream.synchronize();
+
+// instantiate an executable graph
+tf::cudaGraphExec exec(cg);
+stream.run(exec).synchronize();

// update the created kernel task with different parameters
-cf.kernel(task, grid2, block2, shm2, kernel, kernel_args_2);
-cf.run(stream);
-stream.synchronize();
+exec.kernel(task, grid2, block2, shm2, kernel, kernel_args_2);
+
+// run the updated executable graph
+stream.run(exec).synchronize();
@endcode

-Between successive offloads (i.e., iterative executions of a %cudaFlow),
+Between successive offloads (i.e., iterative executions of a CUDA graph),
you can @em ONLY update task parameters,
such as changing the kernel execution parameters and memory operation parameters.
-However, you must @em NOT change the topology of the %cudaFlow,
+However, you must @em NOT change the topology of the CUDA graph,
such as adding a new task or adding a new dependency.
-This is the limitation of CUDA %Graph.
+This is a limitation of Nvidia CUDA %Graph.

@attention
-There are a few restrictions on updating task parameters in a %cudaFlow.
-Notably, you must @em NOT change the topology of an offloaded graph.
-In addition, update methods have the following limitations:
+There are a few restrictions on updating task parameters in an executable CUDA graph:
++ You cannot change a task to a different type
++ kernel task
+  + The kernel function is not allowed to change.
This restriction applies to all algorithm tasks that are created using lambda.
+ memset and memcpy tasks:
@@ -276,28 +273,25 @@ In addition, update methods have the following limitations:
  + The source/destination memory must be allocated from the same
    contexts as the original source/destination memory.

-@section IntegrateCudaFlowIntoTaskflow Integrate a cudaFlow into Taskflow
+@section IntegrateACUDAGraphIntoTaskflow Integrate a CUDA Graph into Taskflow

-You can create a task to enclose a %cudaFlow and run it from a worker thread.
-The usage of the %cudaFlow remains the same except that the %cudaFlow is run by a worker thread
-from a taskflow task.
-The following example runs a %cudaFlow from a static task:
+As tf::cudaGraph is a standalone wrapper over Nvidia CUDA %Graph,
+you can simply run it as a task.
+The following example runs a CUDA graph from a static task:

@code{.cpp}
tf::Executor executor;
tf::Taskflow taskflow;

taskflow.emplace([](){
-  // create a cudaFlow inside a static task
-  tf::cudaFlow cudaflow;
-
-  // ... create a kernel task
-  cudaflow.kernel(...);
+  // create a CUDA graph inside a static task
+  tf::cudaGraph cg;
+  cg.kernel(...);

-  // run the capturer through a stream
+  // instantiate a CUDA graph executable and run it through a stream
+  tf::cudaGraphExec exec(cg);
  tf::cudaStream stream;
-  capturer.run(stream);
-  stream.synchronize();
+  stream.run(exec).synchronize();
});
@endcode
diff --git a/doxygen/cookbook/gpu_tasking_cudaflow_capturer.dox b/doxygen/cookbook/gpu_tasking_cudaflow_capturer.dox
deleted file mode 100644
index 16d13e8e8..000000000
--- a/doxygen/cookbook/gpu_tasking_cudaflow_capturer.dox
+++ /dev/null
@@ -1,248 +0,0 @@
-namespace tf {
-
-/** @page GPUTaskingcudaFlowCapturer GPU Tasking (%cudaFlowCapturer)
-
-You can create a %cudaFlow through stream capture, which allows you
-to implicitly capture a CUDA graph using stream-based interface.
-Compared to explicit CUDA %Graph construction (tf::cudaFlow),
-implicit CUDA %Graph capturing (tf::cudaFlowCapturer) is more flexible
-in building GPU task graphs.
-
-@tableofcontents
-
-@section GPUTaskingcudaFlowCapturerIncludeTheHeader Include the Header
-
-You need to include the header file, `%taskflow/cuda/cudaflow.hpp`,
-for capturing a GPU task graph using tf::cudaFlowCapturer.
-
-@code{.cpp}
-#include <taskflow/cuda/cudaflow.hpp>
-@endcode
-
-@section Capture_a_cudaFlow Capture a cudaFlow
-
-When your program has no access to direct kernel calls but can only
-invoke them through a stream-based interface (e.g., @cuBLAS and @cuDNN library functions),
-you can use tf::cudaFlowCapturer to capture the hidden GPU operations into a CUDA graph.
-A %cudaFlowCapturer is similar to a %cudaFlow except it constructs a GPU task graph
-through stream capture.
-You use the method tf::cudaFlowCapturer::on
-to capture a sequence of @em asynchronous GPU operations through the given stream.
-The following example creates a CUDA graph that captures two kernel tasks,
-@c task_1 (@c my_kernel_1)
-and
-@c task_2 (@c my_kernel_2) ,
-where @c task_1 runs before @c task_2.
- -@code{.cpp} -// create a cudaFlow capturer to run a CUDA graph using stream capturing -tf::cudaFlowCapturer capturer; - -// capture my_kernel_1 through a stream managed by capturer -tf::cudaTask task_1 = capturer.on([&](cudaStream_t stream){ - my_kernel_1<<>>(my_parameters_1); -}).name("my_kernel_1"); - -// capture my_kernel_2 through a stream managed by capturer -tf::cudaTask task_2 = capturer.on([&](cudaStream_t stream){ - my_kernel_2<<>>(my_parameters_2); -}).name("my_kernel_2"); - -// my_kernel_1 runs before my_kernel_2 -task_1.precede(task_2); - -// offload captured GPU tasks using the CUDA Graph execution model -tf::cudaStream stream; -capturer.run(stream); -stream.synchronize(); - -// dump the cudaFlow to a DOT format through std::cout -capturer.dump(std::cout) -@endcode - -@dotfile images/cudaflow_capturer_1.dot - -@warning -Inside tf::cudaFlowCapturer::on, you should @em NOT modify the properties of -the stream argument but only use it to capture @em asynchronous GPU operations -(e.g., @c kernel, @c cudaMemcpyAsync). -The stream argument is internal to the capturer use only. - -@section CommonCaptureMethods Common Capture Methods - -tf::cudaFlowCapturer defines a set of methods for capturing common GPU operations, -such as tf::cudaFlowCapturer::kernel, tf::cudaFlowCapturer::memcpy, -tf::cudaFlowCapturer::memset, and so on. -For example, the following code snippet uses these pre-defined methods -to construct a GPU task graph of one host-to-device copy, kernel, -and one device-to-host copy, in this order of their dependencies. - -@code{.cpp} -tf::cudaFlowCapturer capturer; - -// copy data from host_data to gpu_data -tf::cudaTask h2d = capturer.memcpy(gpu_data, host_data, bytes) - .name("h2d"); - -// capture my_kernel to do computation on gpu_data -tf::cudaTask kernel = capturer.kernel(grid, block, shm_size, kernel, kernel_args); - .name("my_kernel"); - -// copy data from gpu_data to host_data -tf::cudaTask d2h = capturer.memcpy(host_data, gpu_data, bytes) - .name("d2h"); - -// build task dependencies -h2d.precede(kernel); -kernel.precede(d2h); -@endcode - -@dotfile images/cudaflow_capturer_2.dot - -@section CreateACapturerOnASpecificGPU Create a Capturer on a Specific GPU - -You can run a %cudaFlow capturer on a specific GPU by switching to the context -of that GPU using tf::cudaScopedDevice, following the CUDA convention of multi-GPU programming. -The example below creates a %cudaFlow capturer and runs it on GPU @c 2: - -@code{.cpp} -{ - // create an RAII-styled switcher to the context of GPU 2 - tf::cudaScopedDevice context(2); - - // create a cudaFlow capturer under GPU 2 - tf::cudaFlowCapturer capturer; - // ... - - // create a stream under GPU 2 and offload the capturer to that GPU - tf::cudaStream stream; - capturer.run(stream); - stream.synchronize(); -} -@endcode - -tf::cudaScopedDevice is an RAII-styled wrapper to perform @em scoped switch -to the given GPU context. -When the scope is destroyed, it switches back to the original context. - -@note -By default, a %cudaFlow capturer runs on the current GPU associated with the caller, -which is typically @c 0. - -@section CreateACapturerWithinAcudaFlow Create a Capturer from a cudaFlow - -Within a parent %cudaFlow, you can capture a %cudaFlow to form a subflow that -eventually becomes a @em child node in the underlying CUDA task graph. -The following example defines a captured flow @c task2 of two dependent tasks, -@c task2_1 and @c task2_2, and @c task2 runs after @c task1. 
- -@code{.cpp} -tf::cudaFlow cudaflow; - -tf::cudaTask task1 = cudaflow.kernel(grid, block, shm, my_kernel, args...) - .name("kernel"); - -// task2 forms a subflow as a child node in the underlying CUDA graph -tf::cudaTask task2 = cudaflow.capture([&](tf::cudaFlowCapturer& capturer){ - - // capture kernel_1 using the given stream - tf::cudaTask task2_1 = capturer.on([&](cudaStream_t stream){ - kernel_2<<>>(args1...); - }).name("kernel_1"); - - // capture kernel_2 using the given stream - tf::cudaTask task2_2 = capturer.on([&](cudaStream_t stream){ - kernel_2<<>>(args2...); - }).name("kernel_2"); - - // kernel_1 runs before kernel_2 - task2_1.precede(task2_2); -}).name("capturer"); - -task1.precede(task2); -@endcode - -@dotfile images/cudaflow_capturer_3.dot - - -@section OffloadAcudaFlowCapturer Offload a cudaFlow Capturer - -When you offload a %cudaFlow capturer using tf::cudaFlowCapturer::run, -the runtime transforms that capturer (i.e., application GPU task graph) -into a native CUDA graph and an executable instance -both optimized for maximum kernel concurrency. -Depending on the optimization algorithm, -the application GPU task graph may be different -from the actual executable graph submitted to the CUDA runtime. - -@code{.cpp} -tf::cudaStream stream; -// launch a cudaflow capturer asynchronously through a stream -capturer.run(stream); -// wait for the cudaflow to finish -stream.synchronize(); -@endcode - -@section UpdateAcudaFlowCapturer Update a cudaFlow Capturer - -Between successive offloads (i.e., executions of a %cudaFlow capturer), -you can update the captured task with a different set of parameters. -Every task-creation method in tf::cudaFlowCapturer has an overload -to update the parameters of a created task by that method. -The following example creates a kernel task and updates its parameter -between successive runs: - -@code{.cpp} -tf::cudaStream stream; -tf::cudaFlowCapturer cf; - -// create a kernel task -tf::cudaTask task = cf.kernel(grid1, block1, shm1, kernel, kernel_args_1); -cf.run(stream); -stream.synchronize(); - -// update the created kernel task with different parameters -cf.kernel(task, grid2, block2, shm2, kernel, kernel_args_2); -cf.run(stream); -stream.synchronize(); -@endcode - - -When you run a updated %cudaFlow capturer, -%Taskflow will try to update the underlying executable -with the newly captured graph first. -If that update is unsuccessful, -%Taskflow will destroy the executable graph and re-instantiate -a new one from the newly captured graph. - -@section IntegrateCudaFlowCapturerIntoTaskflow Integrate a cudaFlow Capturer into Taskflow - -You can create a task to enclose a %cudaFlow capturer and run it from a worker thread. -The usage of the capturer remains the same except that the capturer is run by a worker thread -from a taskflow task. -The following example runs a %cudaFlow capturer from a static task: - -@code{.cpp} -tf::Executor executor; -tf::Taskflow taskflow; - -taskflow.emplace([](){ - // create a cudaFlow capturer inside a static task - tf::cudaFlowCapturer capturer; - - // ... 
capture a GPU task graph - capturer.kernel(...); - - // run the capturer through a stream - tf::cudaStream stream; - capturer.run(stream); - stream.synchronize(); -}); -@endcode - - -*/ - -} - - diff --git a/doxygen/cookbook/gpu_tasking_syclflow.dox b/doxygen/cookbook/gpu_tasking_syclflow.dox deleted file mode 100644 index 07b7b459e..000000000 --- a/doxygen/cookbook/gpu_tasking_syclflow.dox +++ /dev/null @@ -1,324 +0,0 @@ -namespace tf { - -/** @page GPUTaskingsyclFlow GPU Tasking (%syclFlow) - -%Taskflow supports SYCL, a general-purpose heterogeneous programming model, -to program heterogeneous tasks in a single-source C++ environment. -This chapter discusses how to write SYCL C++ kernel code with %Taskflow -based on @sycl20_spec. - -@tableofcontents - -@section GPUTaskingsyclFlowIncludeTheHeader Include the Header - -You need to include the header file, `%taskflow/sycl/syclflow.hpp`, -for using tf::syclFlow. - -@section Create_a_syclFlow Create a syclFlow - -%Taskflow introduces a task graph-based programming model, -tf::syclFlow, to program SYCL tasks and their dependencies. -A %syclFlow is a task in a taskflow and is associated with a -SYCL queue to execute kernels on a SYCL device. -To create a %syclFlow task, emplace a callable with an argument of type tf::syclFlow -and associate it with a SYCL queue. -The following example (@c saxpy.cpp) implements the canonical -saxpy (A·X Plus Y) task graph -using tf::syclFlow. - -@code{.cpp} - 1: #include - 2: - 3: constexpr size_t N = 1000000; - 4: - 5: int main() { - 6: - 7: tf::Executor executor; - 8: tf::Taskflow taskflow("saxpy example"); - 9: -10: sycl::queue queue{sycl::gpu_selector{}}; -11: -12: // allocate shared memory that is accessible on both host and device -13: float* X = sycl::malloc_shared(N, queue); -14: float* Y = sycl::malloc_shared(N, queue); -15: -16: // create a syclFlow to perform the saxpy operation -17: taskflow.emplace_on([&](tf::syclFlow& sf){ -18: tf::syclTask fillX = sf.fill(X, 1.0f, N).name("fillX"); -19: tf::syclTask fillY = sf.fill(Y, 2.0f, N).name("fillY"); -20: tf::syclTask saxpy = sf.parallel_for(sycl::range<1>(N), -21: [=] (sycl::id<1> id) { -22: X[id] = 3.0f * X[id] + Y[id]; -23: } -24: ).name("saxpy"); -25: saxpy.succeed(fillX, fillY); -26: }, queue).name("syclFlow"); -27: -28: executor.run(taskflow).wait(); // run the taskflow -29: taskflow.dump(std::cout); // dump the taskflow -30: -31: // free the shared memory to avoid memory leak -32: sycl::free(X, queue); -33: sycl::free(Y, queue); -34: } -@endcode - -@dotfile images/syclflow_saxpy.dot - -Debrief: - -@li Lines 7-8 create a taskflow and an executor -@li Lines 10 creates a SYCL queue on a default-selected GPU device -@li Lines 13-14 allocate shared memory that is accessible on both host and device -@li Lines 17-26 creates a %syclFlow to define the saxpy task graph that contains: - + one fill task to fill the memory area @c X with @c 1.0f - + one fill task to fill the memory area @c Y with @c 2.0f - + one kernel task to perform the saxpy operation on the GPU -@li Lines 28-29 executes the taskflow and dumps its graph to a DOT format -@li Lines 32-33 deallocates the shared memory to avoid memory leak - -tf::syclFlow is a lightweight task graph-based programming layer atop SYCL. -We do not expend yet another effort on simplifying kernel programming -but focus on tasking SYCL operations and their dependencies. 
-This organization lets users fully take advantage of SYCL features -that are commensurate with their domain knowledge, -while leaving difficult task parallelism details to %Taskflow. - -@section Compile_a_syclFlow_program Compile a syclFlow Program - -Use DPC++ clang to compile a %syclFlow program: - -@code{.shell-session} -~$ clang++ -fsycl -fsycl-unnamed-lambda \ - -fsycl-targets=nvptx64-nvidia-cuda \ # for CUDA target - -I path/to/taskflow -pthread -std=c++17 saxpy.cpp -o saxpy -~$ ./saxpy -@endcode - -Please visit the page @ref CompileTaskflowWithSYCL for more details. - -@section CreateMemoryOperationTasks Create Memory Operation Tasks - -tf::syclFlow provides a set of methods for creating tasks to perform common -memory operations, such as copy, set, and fill, -on memory area pointed to by unified shared memory (USM) pointers. -The following example creates a %syclFlow task of two copy operations -and one fill operation that set the first @c N/2 elements in the vector to @c -1. - -@code{.cpp} -sycl::queue queue; - -size_t N = 1000; -int* hvec = new int[N] (100); -int* dvec = sycl::malloc_device(N, queue); - -// create a syclflow task to set the first N/2 elements to -1 -taskflow.emplace_on([&](tf::syclFlow& syclflow){ - tf::syclTask ch2d = syclflow.copy(dvec, hvec, N); - tf::syclTask fill = syclflow.fill(dvec, -1, N/2); - tf::syclTask cd2h = syclflow.copy(hvec, dvec, N); - fill.precede(cd2h) - .succeed(ch2d); -}, queue); - -executor.run(taskflow).wait(); - -// inspect the result -for(size_t i=0; i, -where @c N is one, two or three. -Each work item in such a kernel executes independently -across a set of partitioned work groups. -tf::syclFlow::parallel_for defines several variants to create a kernel task. -The following variant pairs up a @c sycl::range and a @c sycl::id -to set each element in @c data to @c 1.0f -when it is not necessary to query the global range of the index space -being executed across. - -@code{.cpp} -tf::syclTask task = syclflow.parallel_for( - sycl::range<1>(N), [data](sycl::id<1> id){ data[id] = 1.0f; } -); -@endcode - -As the same example, -the following variant enables low-level functionality of -work items and work groups -using @c sycl::nd_range and @c sycl::nd_item. -This becomes valuable when an execution requires groups of work items -that communicate and synchronize. - -@code{.cpp} -// partition the N-element range to N/M work groups each of M work items -tf::syclTask task = syclflow.parallel_for( - sycl::nd_range<1>{sycl::range<1>(N), sycl::range<1>(M)}, - [data](sycl::nd_item<1> item){ - auto id = item.get_global_linear_id(); - data[id] = 1.0f; - - // query detailed work group information - // item.get_group_linear_id(); - // item.get_local_linear_id(); - // ... - } -); -@endcode - -All the kernel methods defined in the SYCL queue -are applicable for tf::syclFlow::parallel_for. - -@section CreateCommandGroupFunctionObjectTasks Create Command Group Function Object Tasks - -SYCL provides a way to encapsulate a device-side operation and all its -data and event dependencies in a single command group function object. -The function object accepts an argument of -command group @em handler constructed by the SYCL runtime. -Command group handler is the heart of SYCL programming as it defines -pretty much all kernel-related methods, -including submission, execution, and synchronization. -You can directly create a SYCL task from a command group function object -using tf::syclFlow::on. 
- -@code{.cpp} -tf::syclTask task = syclflow.on( - [=] (sycl::handler& handler) { - handler.require(accessor); - handler.single_task([=](){ // place a single-threaded kernel function - data[0] = 1; - ); - } -); -@endcode - -@section OffloadAsyclFlow Offload a syclFlow - -By default, the executor offloads and executes the %syclFlow once. -When a %syclFlow is being executed, its task graph will be materialized -by the %Taskflow runtime and submitted to its associated SYCL queue -in a topological order of task dependencies defined in that graph. -You can explicitly execute a %syclFlow using different offload methods: - -@code{.cpp} -taskflow.emplace_on([](tf::syclFlow& sf) { - // ... create SYCL tasks - sf.offload(); // offload the syclFlow and run it once - sf.offload_n(10); // offload the syclFlow and run it 10 times - sf.offload_until([repeat=5] () mutable { return repeat-- == 0; }) // five times -}, queue); -@endcode - - -After you offload a %syclFlow, -it is considered executed, and the executor will @em not run an offloaded %syclFlow -after leaving the %syclFlow task callable. -On the other hand, if a %syclFlow is not offloaded, -the executor runs it once. -For example, the following two versions represent the same execution logic. - -@code{.cpp} -// version 1: explicitly offload a syclFlow once -taskflow.emplace_on([](tf::syclFlow& sf) { - sf.single_task([](){}); - sf.offload(); -}, queue); - -// version 2 (same as version 1): executor offloads the syclFlow once -taskflow.emplace_on([](tf::syclFlow& sf) { - sf.single_task([](){}); -}, queue); -@endcode - - -@section UpdateAsyclFlow Update a syclFlow - -You can update a SYCL task from an offloaded %syclFlow and @em rebind it to another -task type. -For example, you can rebind a memory operation task to a parallel-for kernel -task from an offloaded %syclFlow and vice versa. - -@code{.cpp} -size_t N = 10000; -sycl::queue queue; -int* data = sycl::malloc_shared(N, queue); - -taskflow.emplace_on([&](tf::syclFlow& syclflow){ - - // create a task to set each element to -1 - tf::syclTask task = syclflow.fill(data, -1, N); - syclflow.offload(); - - std::for_each(data, data+N, [](int i){ assert(data[i] == -1); }); - - // rebind the task to a parallel-for kernel task setting each element to 100 - syclflow.rebind_parallel_for(task, sycl::range<1>(N), [](sycl::id<1> id){ - data[id] = 100; - }); - syclflow.offload(); - - std::for_each(data, data+N, [data](int i){ assert(data[i] == 100); }); -}, queue); - -executor.run(taskflow).wait(); -@endcode - -Each method of task creation in tf::syclFlow has a corresponding method of -rebinding a task to that task type -(e.g., tf::syclFlow::on and tf::syclFlow::rebind_on, - tf::syclFlow::parallel_for and tf::syclFlow::parallel_for). - -@section UsesyclFlowInAStandaloneEnvironment Use syclFlow in a Standalone Environment - -You can use tf::syclFlow in a standalone environment without going through -tf::Taskflow and offloads it to a SYCL device from the caller thread. -All the tasking methods we have discussed so far apply to the standalone use. 
- -@code{.cpp} -sycl::queue queue; -tf::syclFlow sf(queue); // create a standalone syclFlow - -tf::syclTask h2d_x = sf.copy(dx, hx.data(), N).name("h2d_x"); -tf::syclTask h2d_y = sf.copy(dy, hy.data(), N).name("h2d_y"); -tf::syclTask d2h_x = sf.copy(hx.data(), dx, N).name("d2h_x"); -tf::syclTask d2h_y = sf.copy(hy.data(), dy, N).name("d2h_y"); -tf::syclTask saxpy = sf.parallel_for( - sycl::range<1>(N), [=] (sycl::id<1> id) { - dx[id] = 2.0f * dx[id] + dy[id]; - } -).name("saxpy"); - -saxpy.succeed(h2d_x, h2d_y) // kernel runs after host-to-device copy - .precede(d2h_x, d2h_y); // kernel runs before device-to-host copy - -sf.offload(); // offload and run the standalone syclFlow once -@endcode - -@note -In the standalone mode, a written %syclFlow will not be executed untile -you explicitly call an offload method, as there is neither a taskflow nor an executor. - -*/ - -} - - diff --git a/doxygen/cookbook/prioritized_tasking.dox b/doxygen/cookbook/prioritized_tasking.dox deleted file mode 100644 index dc8a9292d..000000000 --- a/doxygen/cookbook/prioritized_tasking.dox +++ /dev/null @@ -1,84 +0,0 @@ -namespace tf { - -/** @page PrioritizedTasking Prioritized Tasking - -This chapter demonstrates how to assigns a task a priority -to @em hint the scheduler about one task of a higher priority -should start earlier than another task of a lower priority. -%Task priorities are useful in many cases. For instance, -we may prioritize some tasks over others -to improve responsiveness or data locality of parallel tasks. - -@tableofcontents - -@section AssignAPriorityToATask Assign a Priority to a Task - -%Taskflow supports three different priority levels, -tf::TaskPriority::HIGH, -tf::TaskPriority::NORMAL, and -tf::TaskPriority::LOW, -as defined in tf::TaskPriority. -When there are parallel tasks (i.e., no dependencies), -%Taskflow will @c try to execute tasks of higher priorities -before tasks of lower priorities. -By default, all tasks have the highest priorities (@c tf::TaskPriority::HIGH) -unless otherwise assigned. - -@code{.cpp} -tf::Executor executor(1); -tf::Taskflow taskflow; - -int counter = 0; - -auto [A, B, C, D, E] = taskflow.emplace( - [] () { }, - [&] () { - std::cout << "Task B: " << counter++ << '\n'; // 0 - }, - [&] () { - std::cout << "Task C: " << counter++ << '\n'; // 2 - }, - [&] () { - std::cout << "Task D: " << counter++ << '\n'; // 1 - }, - [] () { } -); - -A.precede(B, C, D); -E.succeed(B, C, D); - -B.priority(tf::TaskPriority::HIGH); -C.priority(tf::TaskPriority::LOW); -D.priority(tf::TaskPriority::NORMAL); - -executor.run(taskflow).wait(); -@endcode - -In the above code, we have a task graph of five tasks, -@c A, @c B, @c C, @c D, and @c E, in which @c B, @c C, and @c D -can run in simultaneously when @c A finishes. -Since we only uses one worker thread in the executor, -we can deterministically run @c B first, then @c D, and @c C -in order of their priority values. -The output of the above code is as follows: - -@code{.shell-session} -Task B: 0 -Task D: 1 -Task C: 2 -@endcode - -%Task priorities are just @em hints to %Taskflow's work-stealing scheduler -about which task should run before another. -Due to the randomness nature of work stealing, -there is no guarantee that the scheduler will always follow these hints -to run tasks when multiple workers exist. - -@note -Currently, %Taskflow does not have any high-level abstraction for assigning priorities -to threads but tasks. 
- -*/ - -} - diff --git a/doxygen/cookbook/profiler.dox b/doxygen/cookbook/profiler.dox index 7653f2ddf..3ada8052d 100644 --- a/doxygen/cookbook/profiler.dox +++ b/doxygen/cookbook/profiler.dox @@ -17,7 +17,7 @@ To enable the profiler, set the environment variable @c TF_ENABLE_PROFILER to a file name in which the profiling result will be stored. -@code{.shell-session} +@code{.bash} ~$ TF_ENABLE_PROFILER=result.json ./my_taskflow ~$ cat result.json [ @@ -56,7 +56,7 @@ To compile the server, enable the cmake option @c TF_BUILD_PROFILER. You may visit @ref install to understand %Taskflow's build environment. -@code{.shell-session} +@code{.bash} # under the build directory ~$ cmake ../ -DTF_BUILD_PROFILER=ON ~$ make @@ -67,7 +67,7 @@ you can find the executable at @c tfprof/server/tfprof. Now, generate profiling data from running a taskflow program but specify the output file with extension @c .tfp. -@code{.shell-session} +@code{.bash} ~$ TF_ENABLE_PROFILER=my_taskflow.tfp ./my_taskflow ~$ ls my_taskflow.tfp # my_taskflow.tfp is of binary format @@ -78,7 +78,7 @@ Launch the server program @c tfprof/server/tfprof and pass via the option @c --mount and (2) the @c my_taskflow.tfp via the option @c --input. -@code{.shell-session} +@code{.bash} # under the build/ directory ~$ ./tfprof/server/tfprof --mount ../tfprof/ --input my_taskflow.tfp @endcode @@ -102,7 +102,7 @@ You can display a profile summary by specifying only the environment variable The %Taskflow will generate a separate summary report of tasks and workers for each executor created by the program. -@code{.shell-session} +@code{.bash} # enable the environment variable without any value ~$ TF_ENABLE_PROFILER= ./my_taskflow_program diff --git a/doxygen/cookbook/runtime_tasking.dox b/doxygen/cookbook/runtime_tasking.dox index 99cc93f5c..46b50da1f 100644 --- a/doxygen/cookbook/runtime_tasking.dox +++ b/doxygen/cookbook/runtime_tasking.dox @@ -1,22 +1,19 @@ namespace tf { -/** @page RuntimeTasking Interact with the Runtime +/** @page RuntimeTasking Runtime Tasking %Taskflow allows you to interact with the scheduling runtime by taking a *runtime object* as an argument of a task. -This is mostly useful for designing specialized parallel algorithms -extended from the existing facility of %Taskflow. +This is mostly useful for designing recursive parallel algorithms that require dynamic +tasking on the fly. @tableofcontents -@section CreateARuntimeTask Create a Runtime Object +@section CreateARuntimeTask Create a Runtime Task -%Taskflow allows a static task and a condition task to take a referenced -tf::Runtime object that provides a set of methods to interact -with the scheduling runtime. -The following example creates a static task that leverages tf::Runtime to -explicitly schedule a conditioned task which would never run under -the normal scheduling circumstance: +%Taskflow allows users to define a runtime task that accepts a reference to a tf::Runtime object. +This object provides methods to interact with the underlying scheduling engine. +For example, a runtime task can be used to explicitly schedule another task that would not normally execute due to the graph's structure or conditional dependencies: @code{.cpp} tf::Task A, B, C, D; @@ -35,32 +32,30 @@ executor.run(taskflow).wait(); @dotfile images/runtime_task_1.dot -When the condition task @c A completes and returns @c 0, +In the above code, when the condition task @c A completes and returns @c 0, the scheduler moves on to task @c B. 
-Under the normal circumstance, tasks @c C and @c D will not run because their
-conditional dependencies never happen.
-This can be broken by forcefully scheduling @c C or/and @c D via a runtime
+Under normal circumstances, tasks @c C and @c D will not run because their
+conditional dependencies never occur.
+This behavior can be overridden by forcefully scheduling @c C and/or @c D via a runtime
object of a task that resides in the same graph.
-Here, task @c B call tf::Runtime::schedule to forcefully run task @c C
-even though the weak dependency between @c A and @c C will never happen
+Here, task @c B calls tf::Runtime::schedule to forcefully run task @c C,
+even though the weak dependency between @c A and @c C will never occur
based on the graph structure itself.
As a result, we will see both @c B and @c C in the output:

-@code{.shell-session}
-B    # B leverages a runtime object to schedule C out of its dependency constraint
+@code{.bash}
+B    # B uses a runtime object to schedule C out of its dependency constraint
C
@endcode

@attention
-You should only schedule an @em active task from a runtime object.
-An active task is a task in a running taskflow.
-The task may or may not be running, and scheduling that task
-will immediately put it into the task queue of the worker that
-is running the runtime object.
+You should only schedule an @em active task when using tf::Runtime::schedule.
+An active task is one that belongs to a currently running taskflow.
+The task may or may not be executing at the moment, but scheduling it will immediately place it into the task queue of the worker that invoked the runtime object.

@section AcquireTheRunningExecutor Acquire the Running Executor

-You can acquire the reference to the running executor using tf::Runtime::executor().
+You can acquire a reference to the running executor using tf::Runtime::executor.
The executor associated with a runtime object is the executor that runs the parent
task of that runtime object.

@@ -73,29 +68,12 @@ taskflow.emplace([&](tf::Runtime& rt){
executor.run(taskflow).wait();
@endcode

-@section RuntimeTaskingRunATaskGraphSynchronously Run a Task Graph Synchronously
-
-A runtime object can spawn and run a task graph synchronously using tf::Runtime::corun.
-This model allows you to leverage dynamic tasking to execute a parallel workload within
-a runtime object.
-The following code creates a subflow of two independent tasks and executes it synchronously
-via the given runtime object:
-
-@code{.cpp}
-taskflow.emplace([](tf::Runtime& rt){
-  rt.corun([](tf::Subflow& sf){
-    sf.emplace([](){ std::cout << "independent task 1\n"; });
-    sf.emplace([](){ std::cout << "independent task 2\n"; });
-    // subflow joins upon corun returns
-  });
-});
-@endcode
-
-You can also create a task graph yourself and execute it through a runtime object.
-This organization avoids repetitive creation of a subflow with the same topology,
-such as running a runtime object repetitively.
-The following code performs the same execution logic as the above example
-but using the given task graph to avoid repetitive creations of a subflow:
+@section CorunTaskflowsFromARuntimeTask Corun Taskflows from a Runtime Task
+
+One of the most powerful features of a runtime task is tf::Runtime::corun.
+The method tf::Runtime::corun provides a *non-blocking* mechanism that allows the calling worker to continue executing other available tasks in the executor while waiting for all tasks spawned from that runtime to complete.
+This behavior is critical for avoiding deadlock in nested or recursive tasking patterns, where workers may otherwise block while waiting on subgraphs of child tasks to finish, leading to a situation where no workers are left to make forward progress.
+The following example demonstrates how to use tf::Runtime::corun to run a predefined task graph during the execution of a runtime task, without blocking the calling worker:

@code{.cpp}
// create a custom graph
tf::Taskflow graph;
graph.emplace([](){ std::cout << "independent task 1\n"; });
graph.emplace([](){ std::cout << "independent task 2\n"; });

taskflow.emplace([&](tf::Runtime& rt){
-  // this worker coruns the graph through its work-stealing loop
+  // coruns the graph without blocking the calling worker of this runtime
  rt.corun(graph);
});
executor.run_n(taskflow, 10000);
@endcode

-Although tf::Runtime::corun blocks until the operation completes,
-the caller thread (worker) is not preempted (e.g., sleep or holding any lock).
-Instead, the caller thread joins the work-stealing loop of the executor
-and leaves whenever the spawned task graph completes.
-This is different from waiting for a submitted taskflow using tf::Future::wait
-which blocks the caller thread until the submitted taskflow completes.
-When multiple submitted taskflows are being waited,
-their executions can potentially lead to deadlock.
-For example, the code below creates a taskflow of 1000 tasks
-with each task running a taskflow of 500 tasks
-in a blocking fashion:
+Although tf::Runtime::corun does not return control to the program until the given graph finishes its execution,
+the calling worker (i.e., parent worker) of the runtime in fact joins the executor's work-stealing loop
+and continues executing other tasks together with graph execution.
+This behavior differs from waiting on a submitted taskflow using std::future::wait (i.e., the base class of tf::Future),
+which blocks the calling thread entirely until completion.
+If multiple taskflows are submitted and waited on in this blocking manner,
+it can potentially lead to deadlock, especially in recursive or nested patterns.
+For example, the code below submits a taskflow of 1000 tasks to an executor of two workers,
+where each worker blocks while waiting on another taskflow of 500 tasks, causing deadlock:

@code{.cpp}
tf::Executor executor(2);
tf::Taskflow taskflow;
std::array<tf::Taskflow, 1000> others;

-std::atomic<size_t> counter{0};
-
for(size_t n=0; n<1000; n++) {
  for(size_t i=0; i<500; i++) {
-    others[n].emplace([&](){ counter++; });
+    others[n].emplace([&](){});
  }
  taskflow.emplace([&executor, &tf=others[n]](){
    // blocking the worker can introduce deadlock where
@@ -142,23 +116,20 @@ executor.run(taskflow).wait();
@endcode

-Using tf::Runtime::corun allows each worker to corun these
-taskflows through its work-stealing loop, thus avoiding
-deadlock problem caused by blocking wait.
+To avoid this deadlock, you should instead use tf::Runtime::corun, which allows the calling worker
+to **corun** these taskflows without blocking its execution.
@code{.cpp}
tf::Executor executor(2);
tf::Taskflow taskflow;
std::array<tf::Taskflow, 1000> others;
-std::atomic<size_t> counter{0};
-
for(size_t n=0; n<1000; n++) {
  for(size_t i=0; i<500; i++) {
-    others[n].emplace([&](){ counter++; });
+    others[n].emplace([&](){});
  }
  taskflow.emplace([&tf=others[n]](tf::Runtime& rt){
-    // the caller worker will not block but corun these
+    // the caller worker will not block on wait but corun these
    // taskflows through its work-stealing loop
    rt.corun(tf);
  });
@@ -166,11 +137,59 @@ for(size_t n=0; n<1000; n++) {
executor.run(taskflow).wait();
@endcode

-@section LearnMoreAboutRuntime Learn More About Runtime
+@section CorunAsynchronousTasksFromARuntimeTask Corun Asynchronous Tasks from a Runtime Task
+
+Similar to tf::Executor, tf::Runtime allows you to create asynchronous tasks on the fly using tf::Runtime::async or tf::Runtime::silent_async.
+Asynchronous tasks spawned from a runtime task are logically parented to that runtime and can be explicitly synchronized using tf::Runtime::corun.
+Furthermore, each asynchronous task can itself be a runtime task, enabling recursive task creation and dynamic parallelism.
+This model is particularly powerful for implementing divide-and-conquer algorithms, such as parallel sort, graph traversal, and recursion.
+For instance, the example below demonstrates a parallel recursive implementation of Fibonacci numbers using recursive asynchronous tasking with tf::Runtime:
+
+@code{.cpp}
+#include <taskflow/taskflow.hpp>
+
+size_t fibonacci(size_t N, tf::Runtime& rt) {
+
+  if(N < 2) return N;
+
+  size_t res1, res2;
+  rt.silent_async([N, &res1](tf::Runtime& rt1){ res1 = fibonacci(N-1, rt1); });
+
+  // tail optimization for the right child
+  res2 = fibonacci(N-2, rt);
+
+  // use corun to avoid blocking the worker while waiting for the spawned
+  // child task to finish
+  rt.corun();
+
+  return res1 + res2;
+}
+
+int main() {
+
+  tf::Executor executor;
+
+  size_t N = 5, res;
+  executor.silent_async([N, &res](tf::Runtime& rt){ res = fibonacci(N, rt); });
+  executor.wait_for_all();
+
+  std::cout << N << "-th Fibonacci number is " << res << '\n';
+
+  return 0;
+}
+@endcode
+
+The figure below shows the execution diagram, where the task with suffix `*_1` represents the left child spawned by its parent runtime.
+
+@dotfile images/fibonacci_4_tail_optimized.dot
+
+For more details, please refer to @ref AsyncTasking and @ref fibonacci.
+
+@attention
+While asynchronous tasks spawned from a runtime task are parented to that runtime task, the runtime task does not automatically synchronize their execution or wait for their completion upon destruction.
+To ensure all spawned tasks finish before proceeding, you should explicitly call tf::Runtime::corun to synchronize them.
+This prevents potential issues such as tasks being destroyed prematurely or lost without execution.

-t the following pages to learn more about tf::Runtime:
-+ @ref LaunchAsynchronousTasksFromARuntime

*/

diff --git a/doxygen/cookbook/semaphore.dox b/doxygen/cookbook/semaphore.dox
index 8f09c0938..b0bb465bf 100644
--- a/doxygen/cookbook/semaphore.dox
+++ b/doxygen/cookbook/semaphore.dox
@@ -3,7 +3,7 @@ namespace tf {

/** @page LimitTheMaximumConcurrency Limit the Maximum Concurrency

This chapter discusses how to limit the concurrency or the maximum
-number of workers in subgraphs of a taskflow.
+number of workers in your %Taskflow applications.

@tableofcontents

@@ -15,10 +15,10 @@ You can let a task acquire/release one or multiple semaphores before/after
executing its work.
A task can acquire and release a semaphore,
or just acquire or just release it.
-A tf::Semaphore object starts with an initial count.
-As long as that count is above 0, tasks can acquire the semaphore and do
+A tf::Semaphore object starts with an initial value.
+As long as that value is above 0, tasks can acquire the semaphore and do
their work.
-If the count is 0 or less, a task trying to acquire the semaphore will not run
+If the value is 0 or less, a task trying to acquire the semaphore will not run
but goes to a waiting list of that semaphore.
When the semaphore is released by another task,
it reschedules all tasks on that waiting list.

@@ -27,7 +27,7 @@ it reschedules all tasks on that waiting list.
tf::Executor executor(8);   // create an executor of 8 workers
tf::Taskflow taskflow;

-tf::Semaphore semaphore(1); // create a semaphore with initial count 1
+tf::Semaphore semaphore(1); // create a semaphore with initial value of 1

std::vector<tf::Task> tasks {
  taskflow.emplace([](){ std::cout << "A" << std::endl; }),
@@ -49,13 +49,13 @@ executor.run(taskflow).wait();

The above example creates five tasks with no dependencies between them.
Under normal circumstances, the five tasks would be executed concurrently.
-However, this example has a semaphore with initial count 1,
+However, this example has a semaphore with initial value of 1,
and all tasks need to acquire that semaphore before running
and release that semaphore after they are done.
This organization limits the number of concurrently running tasks to only one.
One possible output is shown below:

-@code{.shell-session}
+@code{.bash}
# the output is a sequential chain of five tasks
A
B
@@ -78,7 +78,7 @@ which will limit only three workers to run the five tasks,
tf::Executor executor(8);   // create an executor of 8 workers
tf::Taskflow taskflow;

-tf::Semaphore semaphore(3); // create a semaphore with initial count 3
+tf::Semaphore semaphore(3); // create a semaphore with initial value of 3

std::vector<tf::Task> tasks {
  taskflow.emplace([](){ std::cout << "A" << std::endl; }),
@@ -96,48 +96,13 @@ for(auto & task : tasks) {  // each task acquires and releases the semaphore
executor.run(taskflow).wait();
@endcode

-@code{.shell-session}
+@code{.bash}
# One possible output: A, B, and C run concurrently, D and E run concurrently
ABC
ED
@endcode
-
-
-
Semaphores are powerful for limiting the maximum concurrency of
not only a section of tasks but also different sections of tasks.
Specifically, you can have one task acquire a semaphore and have another
@@ -178,37 +143,42 @@ is done.
This constraint forces each pair of tasks to run sequentially,
while the order of which pair runs first is up to the scheduler.

-@section DefineACriticalRegion Define a Critical Section
-tf::CriticalSection is a wrapper over tf::Semaphore specialized for
-limiting the maximum concurrency over a section of tasks.
-A critical section starts with an initial count representing that limit.
-When a task is added to the critical section,
-the task acquires and releases the semaphore internal to the critical section.
-This method tf::CriticalSection::add
-automatically calls tf::Task::acquire and tf::Task::release
-for each task added to the critical section.
-The following example creates a critical section of two workers to run
-five tasks in the critical section.
+@section UseSemaphoresAcrossDifferentTasks Use Semaphores Across Different Tasks
+
+You can use semaphores to limit the concurrency across different sections
+of taskflow graphs.
+When you submit multiple taskflows to an executor, the executor views them
+as a bag of dependent tasks.
+It does not matter which task in which taskflow graph acquires or releases
+a semaphore.

@code{.cpp}
tf::Executor executor(8);   // create an executor of 8 workers
-tf::Taskflow taskflow;
+tf::Taskflow taskflow1;
+tf::Taskflow taskflow2;

-// create a critical section of two workers
-tf::CriticalSection critical_section(2);
+tf::Semaphore semaphore(1); // create a semaphore with initial value of 1

-tf::Task A = taskflow.emplace([](){ std::cout << "A" << std::endl; });
-tf::Task B = taskflow.emplace([](){ std::cout << "B" << std::endl; });
-tf::Task C = taskflow.emplace([](){ std::cout << "C" << std::endl; });
-tf::Task D = taskflow.emplace([](){ std::cout << "D" << std::endl; });
-tf::Task E = taskflow.emplace([](){ std::cout << "E" << std::endl; });
+taskflow1.emplace([](){std::cout << "task in taskflow1"; })
+         .acquire(semaphore)
+         .release(semaphore);

-critical_section.add(A, B, C, D, E);
+taskflow2.emplace([](){std::cout << "task in taskflow2"; })
+         .acquire(semaphore)
+         .release(semaphore);

-executor.run(taskflow).wait();
+executor.run(taskflow1);
+executor.run(taskflow2);
+executor.wait_for_all();
@endcode

+The above example creates one task from each taskflow and submits
+the two taskflows to the executor.
+Again, under normal circumstances, the two tasks can run concurrently,
+but the semaphore restricts one worker to run the two tasks sequentially
+in arbitrary order.
+
@section DefineAConflictGraph Define a Conflict Graph

One important application of tf::Semaphore is conflict-aware scheduling
@@ -249,39 +219,89 @@ C.acquire(conflict_AC).release(conflict_AC);
executor.run(taskflow).wait();
@endcode

-@code{.shell-session}
+@code{.bash}
# One possible output: B and C run concurrently after A
A
BC
@endcode

-@note
-A task can acquire and release multiple semaphores. When the executor
-is running a task, it will first try to acquire all semaphores of that task.
-When the executor finishes a task, it will release all acquired semaphores of
-that task.
+@attention
+A task can acquire and release multiple semaphores.
+When the executor runs a task, it will try to acquire all semaphores needed by that task.
+When the executor finishes that task, it will release all semaphores acquired by that task.
+
+@section ResetASemaphore Reset a Semaphore
+
+You can reset a semaphore to its initial state using tf::Semaphore::reset(),
+or set a new maximum value with tf::Semaphore::reset(size_t new_max_value).
+The method tf::Semaphore::value() allows you to query the current value of the semaphore,
+which represents the number of available acquisitions.
+
+@code{.cpp}
+tf::Semaphore semaphore(4);
+assert(semaphore.value() == 4 && semaphore.max_value() == 4);

-The above code can be rewritten with tf::CriticalSection for simplicity, as
-shown below:
+// reset the semaphore to a new value
+semaphore.reset(11);
+assert(semaphore.value() == 11 && semaphore.max_value() == 11);
+@endcode
+
+@attention
+When a semaphore is acquired more times than its maximum value,
+an exception will be thrown.
+
+@section UnderstandTheLimitationOfSemaphores Understand the Limitation of Semaphores
+
+Currently, tf::Semaphore has limited support for exception handling and taskflow cancellation.
+If a task throws an exception or the taskflow is canceled,
+subsequent acquire and release operations on the semaphore may result in undefined behavior.
+To ensure correct behavior, you should call tf::Semaphore::reset before reusing the semaphore +in the next run. +For instance, in the code below, when task `B` throws an exception, the executor +will cancel the execution of the taskflow. +That is, tasks `C` and `D` will not run, and thus no task will release the +acquired semaphore. +To resolve this situation, we must reset the semaphore to a clean state +for the next run. @code{.cpp} tf::Executor executor; tf::Taskflow taskflow; +tf::Semaphore semaphore(1); -tf::CriticalSection critical_section_AB(1); -tf::CriticalSection critical_section_AC(1); +tf::Task A = taskflow.emplace([](){}); +tf::Task B = taskflow.emplace([](){ throw std::runtime_error("exception"); }); +tf::Task C = taskflow.emplace([](){}); +tf::Task D = taskflow.emplace([](){}); +A.precede(B); +B.precede(C); +C.precede(D); -tf::Task A = taskflow.emplace([](){ std::cout << "A" << std::endl; }); -tf::Task B = taskflow.emplace([](){ std::cout << "B" << std::endl; }); -tf::Task C = taskflow.emplace([](){ std::cout << "C" << std::endl; }); +A.acquire(semaphore); +D.release(semaphore); -// describe the conflict graph -critical_section_AB.add(A, B); -critical_section_AC.add(A, C); +// current semaphore has a value of 1 +assert(semaphore.value() == 1); -executor.run(taskflow).wait(); +// when B throws the exception, D will not run and thus semaphore is not released +try { + executor.run(taskflow).get(); +} +catch(std::runtime_error& e) { + std::cout << e.what() << std::endl; +} + +// since A acquired the semaphore, its value is 0 +assert(semaphore.value() == 0); + +// reset the semaphore to a clean state before running the taskflow again +semaphore.reset(); +assert(semaphore.value() == 1); + +executor.run(taskflow).get(); @endcode + */ } diff --git a/doxygen/cookbook/static_tasking.dox b/doxygen/cookbook/static_tasking.dox index 117f87177..ed1b346bf 100644 --- a/doxygen/cookbook/static_tasking.dox +++ b/doxygen/cookbook/static_tasking.dox @@ -52,10 +52,10 @@ such as adding dependencies, naming, and assigning a new work. 8: 9: std::cout << A.name() << std::endl; // TaskA 10: std::cout << A.num_successors() << std::endl; // 1 -11: std::cout << A.num_dependents() << std::endl; // 0 +11: std::cout << A.num_predecessors() << std::endl; // 0 12: 13: std::cout << B.num_successors() << std::endl; // 0 -14: std::cout << B.num_dependents() << std::endl; // 1 +14: std::cout << B.num_predecessors() << std::endl; // 1 @endcode Debrief: @@ -138,7 +138,7 @@ the task handler. 15: 16: for(auto task : tasks) { // print out each task's attributes 17: std::cout << task.name() << ": " -18: << "num_dependents=" << task.num_dependents() << ", " +18: << "num_predecessors=" << task.num_predecessors() << ", " 19: << "num_successors=" << task.num_successors() << '\n'; 20: } 21: @@ -154,8 +154,8 @@ the task handler. 
The output of this program looks like the following:

@code{.sh}
-This is Task 0: num_dependents=0, num_successors=1
-This is Task 1: num_dependents=1, num_successors=0
+This is Task 0: num_predecessors=0, num_successors=1
+This is Task 1: num_predecessors=1, num_successors=0
digraph Taskflow {
"This is Task 1";
"This is Task 0";
@@ -168,7 +168,7 @@ Debrief:

@li Lines 7-10 create two placeholder tasks with no work and store the corresponding task handles in a vector
@li Lines 12-13 name the two tasks with human-readable strings
@li Line 14 adds a dependency link from the first task to the second task
-@li Lines 16-20 print out the name of each task, the number of dependents, and the number of successors
+@li Lines 16-20 print out the name of each task, the number of predecessors, and the number of successors
@li Line 22 dumps the task dependency graph to a @GraphVizOnline format (dot)
@li Lines 24-25 assign a new target to each task

The later assignment overwrites the previous values.

@section TraverseAdjacentTasks Traverse Adjacent Tasks

-You can iterate the successor list and the dependent list of a task by using tf::Task::for_each_successor
-and tf::Task::for_each_dependent, respectively.
-Each method takes a lambda and applies it to a successor or a dependent being traversed.
+You can iterate the successor list and the predecessor list of a task by using tf::Task::for_each_successor
+and tf::Task::for_each_predecessor, respectively.
+Each method takes a lambda and applies it to a successor or a predecessor being traversed.

@code{.cpp}
// traverse all successors of my_task
my_task.for_each_successor([s=0] (tf::Task successor) mutable {
  std::cout << "successor " << s++ << '\n';
});

-// traverse all dependents of my_task
-my_task.for_each_dependent([d=0] (tf::Task dependent) mutable {
-  std::cout << "dependent " << d++ << '\n';
+// traverse all predecessors of my_task
+my_task.for_each_predecessor([d=0] (tf::Task predecessor) mutable {
+  std::cout << "predecessor " << d++ << '\n';
+});
+@endcode
+
+If the task contains a subflow, you can use tf::Task::for_each_subflow_task
+to iterate all tasks associated with that subflow.
+
+@code{.cpp}
+my_task.for_each_subflow_task([](tf::Task stask){
+  std::cout << "subflow task " << stask.name() << '\n';
});
@endcode

diff --git a/doxygen/cookbook/subflow_tasking.dox b/doxygen/cookbook/subflow_tasking.dox
index cf267acd8..524ae6038 100644
--- a/doxygen/cookbook/subflow_tasking.dox
+++ b/doxygen/cookbook/subflow_tasking.dox
@@ -31,7 +31,7 @@ All methods you find in tf::Taskflow are applicable for tf::Subflow.
 9:   tf::Task B1 = subflow.emplace([] () {}).name("B1");  // subflow task B1
10:   tf::Task B2 = subflow.emplace([] () {}).name("B2");  // subflow task B2
11:   tf::Task B3 = subflow.emplace([] () {}).name("B3");  // subflow task B3
-12:   B1.precede(B3);  // B1 runs bofore B3
+12:   B1.precede(B3);  // B1 runs before B3
13:   B2.precede(B3);  // B2 runs before B3
14: }).name("B");
15:
@@ -41,7 +41,6 @@ All methods you find in tf::Taskflow are applicable for tf::Subflow.
19: C.precede(D);  // D runs after C
20:
21: executor.run(taskflow).get();  // execute the graph to spawn the subflow
-22: taskflow.dump(std::cout);      // dump the taskflow to a DOT format
@endcode

@@ -56,24 +55,45 @@ Debrief:
@li Lines 8-14 create a task B that spawns a task dependency graph of three tasks B1, B2, and B3
@li Lines 16-19 add dependencies among A, B, C, and D
@li Line 21 submits the graph to an executor and waits until it finishes
-@li Line 22 dumps the entire task dependency graph

Lines 8-14 are the main block to enable subflow tasking at task B.
The runtime will create a tf::Subflow passing it to task B,
and spawn a dependency graph as described by the associated callable.
This new subflow graph will be added to the topology of its parent task B.
-Due to the property of subflow tasking,
-we cannot dump its structure before execution.
-We will need to run the graph first to spawn the graph and then
-call tf::Taskflow::dump.

-@section JoinASubflow Join a Subflow
+@section RetainASubflow Retain a Subflow

-By default, a subflow joins its parent task when the program leaves its execution context.
-All nodes of zero outgoing edges in the subflow precede its parent task.
-You can explicitly join a subflow within its execution context to
-carry out recursive patterns.
-A famous implementation is fibonacci recursion.
+By default, a tf::Subflow automatically clears its internal task graph once it is joined. After a subflow joins, its structure and associated resources are no longer accessible. This behavior is designed to reduce memory usage, particularly in applications that recursively spawn many subflows.
+For applications that require post-processing, such as visualizing the subflow through tf::Taskflow::dump,
+users can disable this default cleanup behavior by calling tf::Subflow::retain with `true`.
+This instructs the runtime to retain the subflow's task graph even after it has joined, enabling further inspection or visualization.
+
+@code{.cpp}
+tf::Taskflow taskflow;
+tf::Executor executor;
+
+taskflow.emplace([&](tf::Subflow& sf){
+  sf.retain(true);  // retain the subflow after join for visualization
+  auto A = sf.emplace([](){ std::cout << "A\n"; });
+  auto B = sf.emplace([](){ std::cout << "B\n"; });
+  auto C = sf.emplace([](){ std::cout << "C\n"; });
+  A.precede(B, C);  // A runs before B and C
+});  // subflow implicitly joins here
+
+executor.run(taskflow).wait();
+
+// The subflow graph is now retained and can be visualized using taskflow.dump(...)
+taskflow.dump(std::cout);
+@endcode
+
+@section JoinASubflow Join a Subflow Explicitly
+
+By default, a subflow *implicitly* joins its parent task when execution leaves its context.
+All terminal nodes (i.e., nodes with no outgoing edges) in the subflow are guaranteed to precede the parent task.
+Upon joining, the subflow's task graph and associated resources are automatically cleaned up.
+If your application needs to access variables defined within the subflow after it joins,
+you can explicitly join the subflow and handle post-processing accordingly.
+A common use case is parallelizing recursive computations such as the Fibonacci sequence:

@code{.cpp}
int spawn(int n, tf::Subflow& sbf) {
@@ -92,70 +112,19 @@ taskflow.emplace([&res] (tf::Subflow& sbf) {
executor.run(taskflow).wait();
@endcode

-The code above computes the fifth fibonacci number using recursive subflow.
+The code above computes the fifth Fibonacci number using recursive subflow.
Calling tf::Subflow::join @em immediately materializes the subflow by executing all associated -tasks to recursively compute fibonacci numbers. +tasks to recursively compute Fibonacci numbers. The taskflow graph is shown below: @dotfile images/fibonacci_7.dot -Our implementation to join subflows is @em recursive in order to -preserve the thread context in each subflow task. -Having a deep recursion of subflows may cause stack overflow. - -@section DetachASubflow Detach a Subflow +@attention +Using tf::Subflow to implement recursive parallelism like finding Fibonacci numbers may not be +as efficient as tf::Runtime due to additional task graph overhead. +For more details, readers can refer to @ref fibonacci. -In contract to joined subflow, -you can detach a subflow from its parent task, allowing its execution to flow independently. - -@code{.cpp} - 1: tf::Taskflow taskflow; - 2: - 3: tf::Task A = taskflow.emplace([] () {}).name("A"); // static task A - 4: tf::Task C = taskflow.emplace([] () {}).name("C"); // static task C - 5: tf::Task D = taskflow.emplace([] () {}).name("D"); // static task D - 6: - 7: tf::Task B = taskflow.emplace([] (tf::Subflow& subflow) { - 8: tf::Task B1 = subflow.emplace([] () {}).name("B1"); // static task B1 - 9: tf::Task B2 = subflow.emplace([] () {}).name("B2"); // static task B2 -10: tf::Task B3 = subflow.emplace([] () {}).name("B3"); // static task B3 -11: B1.precede(B3); // B1 runs bofore B3 -12: B2.precede(B3); // B2 runs before B3 -13: subflow.detach(); // detach this subflow -14: }).name("B"); -15: -16: A.precede(B); // B runs after A -17: A.precede(C); // C runs after A -18: B.precede(D); // D runs after B -19: C.precede(D); // D runs after C -20: -21: tf::Executor executor; -22: executor.run(taskflow).wait(); // execute the graph to spawn the subflow -22: taskflow.dump(std::cout); // dump the taskflow to DOT format -@endcode - -The figure below demonstrates a detached subflow based on the previous example. -A detached subflow will eventually join the topology of its parent task. - - -@dotfile images/subflow-detach.dot - -Detached subflow becomes an independent graph attached to the top-most taskflow. -Running a taskflow multiple times will accumulate all detached tasks -in the graph. -For example, running the above taskflow 5 times results in a total of 19 tasks. - -@code{.cpp} -executor.run_n(taskflow, 5).wait(); -assert(taskflow.num_tasks() == 19); -taskflow.dump(std::cout); -@endcode - -The dumped graph is shown as follows: - - -@dotfile images/subflow_detach_5.dot @section CreateANestedSubflow Create a Nested Subflow @@ -165,18 +134,18 @@ You can create another subflow from the execution of a subflow and so on. 
@code{.cpp}
 1: tf::Taskflow taskflow;
 2:
- 3: tf::Task A = taskflow.emplace([] (tf::Subflow& sbf){
+ 3: tf::Task A = taskflow.emplace([] (tf::Subflow& sf){
 4:   std::cout << "A spawns A1 & subflow A2\n";
- 5:   tf::Task A1 = sbf.emplace([] () {
+ 5:   tf::Task A1 = sf.emplace([] () {
 6:     std::cout << "subtask A1\n";
 7:   }).name("A1");
 8:
- 9:   tf::Task A2 = sbf.emplace([] (tf::Subflow& sbf2){
+ 9:   tf::Task A2 = sf.emplace([] (tf::Subflow& sf2){
10:     std::cout << "A2 spawns A2_1 & A2_2\n";
-11:     tf::Task A2_1 = sbf2.emplace([] () {
+11:     tf::Task A2_1 = sf2.emplace([] () {
12:       std::cout << "subtask A2_1\n";
13:     }).name("A2_1");
-14:     tf::Task A2_2 = sbf2.emplace([] () {
+14:     tf::Task A2_2 = sf2.emplace([] () {
15:       std::cout << "subtask A2_2\n";
16:     }).name("A2_2");
17:     A2_1.precede(A2_2);
@@ -186,7 +155,6 @@ You can create another subflow from the execution of a subflow and so on.
21:
22: // execute the graph to spawn the subflow
23: tf::Executor().run(taskflow).get();
-24: taskflow.dump(std::cout);
@endcode


@@ -196,12 +164,10 @@ Debrief:
@li Line 1 creates a taskflow object
@li Lines 3-20 create a task to spawn a subflow of two tasks A1 and A2
@li Lines 9-18 spawn another subflow of two tasks A2_1 and A2_2 out of its parent task A2
-@li Lines 23-24 runs the graph asynchronously and dump its structure when it finishes
+@li Line 23 runs the defined taskflow graph

-Similarly, you can detach a nested subflow from its parent subflow.
-A detached subflow will run independently and eventually join the topology
-of its parent subflow.
+@attention
+To properly visualize subflows, you must call tf::Subflow::retain on each subflow and execute the taskflow once to ensure all associated subflows are spawned.

*/

diff --git a/doxygen/cuda_std_algorithms/cuda_std_scan.dox b/doxygen/cuda_std_algorithms/cuda_std_scan.dox
index 633bd37b0..6d83c2a02 100644
--- a/doxygen/cuda_std_algorithms/cuda_std_scan.dox
+++ b/doxygen/cuda_std_algorithms/cuda_std_scan.dox
@@ -145,7 +145,7 @@
Similarly, tf::cuda_transform_exclusive_scan performs an exclusive prefix sum
over a range of transformed items.
The following code computes the exclusive prefix sum over 1000000 transformed items
-each multipled by 10.
+each multiplied by 10.
@code{.cpp}
const size_t N = 1000000;

diff --git a/doxygen/cudaflow_algorithms/cublas/cublas_flow_capturer.dox b/doxygen/cudaflow_algorithms/cublas/cublas_flow_capturer.dox
index 7ffd50f88..2cbc73898 100644
--- a/doxygen/cudaflow_algorithms/cublas/cublas_flow_capturer.dox
+++ b/doxygen/cudaflow_algorithms/cublas/cublas_flow_capturer.dox
@@ -77,7 +77,7 @@ int main() {
You need to link the @c cublas library when compiling a cublasFlow capturer
program:

-@code{.shell-session}
+@code{.bash}
~$ nvcc cublasflow.cpp -I path/to/taskflow/include -lcublas
@endcode

@@ -263,13 +263,13 @@ We currently support the following level-3 methods:
  + tf::cublasFlowCapturer::c_gemm_sbatched performs batched general matrix-matrix multiplication with strided memory access on row-major layout
  + tf::cublasFlowCapturer::symm performs symmetric matrix-matrix multiplication
-  + tf::cublasFlowCapturer::c_symm performs symmetric matrix-matrix multiplicaiton on row-major layout
+  + tf::cublasFlowCapturer::c_symm performs symmetric matrix-matrix multiplication on row-major layout
  + tf::cublasFlowCapturer::syrk performs symmetric rank-k update
  + tf::cublasFlowCapturer::c_syrk performs symmetric rank-k update on row-major layout
  + tf::cublasFlowCapturer::syr2k performs symmetric rank-2k update
  + tf::cublasFlowCapturer::c_syr2k performs symmetric rank-2k update on row-major layout
-  + tf::cublasFlowCapturer::syrkx performs a variantion of symmetric rank-k update
-  + tf::cublasFlowCapturer::c_syrkx performs a variantion of symmetric rank-k update on row-major layout
+  + tf::cublasFlowCapturer::syrkx performs a variation of symmetric rank-k update
+  + tf::cublasFlowCapturer::c_syrkx performs a variation of symmetric rank-k update on row-major layout
  + tf::cublasFlowCapturer::trmm performs triangular matrix-matrix multiplication
  + tf::cublasFlowCapturer::c_trmm performs triangular matrix-matrix multiplication on row-major layout
  + tf::cublasFlowCapturer::trsm solves a triangular linear system with multiple right-hand-sides

diff --git a/doxygen/cudaflow_algorithms/cudaflow_scan.dox b/doxygen/cudaflow_algorithms/cudaflow_scan.dox
index 41e4a256d..f70e0c804 100644
--- a/doxygen/cudaflow_algorithms/cudaflow_scan.dox
+++ b/doxygen/cudaflow_algorithms/cudaflow_scan.dox
@@ -96,7 +96,7 @@ for(size_t i=1; i
-int spawn(int n, tf::Subflow& sbf) {
-  if (n < 2) return n;
-  int res1, res2;
-  sbf.emplace([&res1, n] (tf::Subflow& sbf) { res1 = spawn(n - 1, sbf); } )
-     .name(std::to_string(n-1));
-  sbf.emplace([&res2, n] (tf::Subflow& sbf) { res2 = spawn(n - 2, sbf); } )
-     .name(std::to_string(n-2));
-  sbf.join();
+size_t fibonacci(size_t N, tf::Runtime& rt) {
+
+  if(N < 2) return N;
+
+  size_t res1, res2;
+  rt.silent_async([N, &res1](tf::Runtime& rt1){ res1 = fibonacci(N-1, rt1); });
+  rt.silent_async([N, &res2](tf::Runtime& rt2){ res2 = fibonacci(N-2, rt2); });
+
+  // use corun to avoid blocking the worker while waiting for the two
+  // child tasks to finish
+  rt.corun();
+
  return res1 + res2;
}

-int main(int argc, char* argv[]) {
-
-  int N = 5;
-  int res;
+int main() {

  tf::Executor executor;
-  tf::Taskflow taskflow("fibonacci");
+
+  size_t N = 5, res;
+  executor.silent_async([N, &res](tf::Runtime& rt){ res = fibonacci(N, rt); });
+  executor.wait_for_all();

-  taskflow.emplace([&res, N] (tf::Subflow& sbf) { res = spawn(N, sbf); })
-          .name(std::to_string(N));
+  std::cout << N << "-th Fibonacci number is " << res << '\n';

-  executor.run(taskflow).wait();
+  return 0;
+}
+@endcode

-  taskflow.dump(std::cout);
+The `fibonacci` function recursively
spawns two asynchronous tasks to compute `fibonacci(N-1)` and `fibonacci(N-2)` in parallel using `tf::Runtime::silent_async`.
+After spawning the two tasks, the function invokes tf::Runtime::corun() to wait until all tasks spawned by `rt` complete,
+without blocking the caller worker.
+In the main function, the executor creates an async task from the top Fibonacci number and waits for completion using tf::Executor::wait_for_all. Once finished, the result is printed.
+The figure below shows the execution diagram, where the suffixes *_1 and *_2 represent the left and right children spawned by their parent runtime:

-  std::cout << "Fib[" << N << "]: " << res << std::endl;
+@dotfile images/fibonacci_4.dot

-  return 0;
+
+@section TailRecursionOptimization Tail Recursion Optimization
+
+In recursive parallelism, especially for problems like Fibonacci computation,
+spawning both recursive branches as asynchronous tasks can lead to excessive task creation and stack growth, which may degrade performance and overwhelm the runtime scheduler. Additionally, when both child tasks are launched asynchronously, the parent task must wait for both to finish, potentially blocking a worker thread and reducing parallel throughput.
+To address these issues, we apply tail recursion optimization to one branch of the Fibonacci call.
+This allows one of the recursive calls to proceed immediately in the current execution context, reducing both scheduling overhead and stack usage.
+
+@code{.cpp}
+size_t fibonacci(size_t N, tf::Runtime& rt) {
+
+  if(N < 2) return N;
+
+  size_t res1, res2;
+  rt.silent_async([N, &res1](tf::Runtime& rt1){ res1 = fibonacci(N-1, rt1); });
+
+  // tail optimization for the right child
+  res2 = fibonacci(N-2, rt);
+
+  // use corun to avoid blocking the worker while waiting for the spawned
+  // child task to finish
+  rt.corun();
+
+  return res1 + res2;
}
@endcode

-The spawned taskflow graph for computing up to the fifth fibonacci number is shown below:
+The figure below shows the execution diagram, where the suffix *_1 represents the left child spawned by its parent runtime.
+As we can see, the right child is optimized out through tail recursion optimization.
+
+@dotfile images/fibonacci_4_tail_optimized.dot

+@section FibonacciNumberBenchmarking Benchmarking

-@dotfile images/fibonacci_7.dot
+Based on the discussion above, we compare the runtime of recursive Fibonacci parallelism
+(1) with tail recursion optimization and (2) without it, across different Fibonacci numbers.

-Even if recursive dynamic tasking or subflows are possible,
-the recursion depth may not be too deep or it can cause stack overflow.
-
    +| N | w/ tail recursion optimization | w/o tail recursion optimization | +| :-: | :-: | :-: | +| 20 | 0.23 ms | 0.31 ms | +| 25 | 2 ms | 4 ms | +| 30 | 23 ms | 42 ms | +| 35 | 269 ms | 483 ms | +| 40 | 3003 ms | 5124 ms | +
    +As `N` increases, the performance gap between the two versions widens significantly. +With tail recursion optimization, the program avoids spawning another async task, thereby reducing scheduling overhead and stack pressure. +This leads to better CPU utilization and lower task management cost. +For example, at `N = 40`, tail recursion optimization reduces the runtime by over 40%. */ diff --git a/doxygen/examples/graph_pipeline.dox b/doxygen/examples/graph_pipeline.dox index af9eda78a..dac8c9f2b 100644 --- a/doxygen/examples/graph_pipeline.dox +++ b/doxygen/examples/graph_pipeline.dox @@ -48,7 +48,7 @@ can run in parallel. This type of parallelism is also referred to as @em wavefront parallelism, which sweeps parallel elements in a diagonal direction. -@note +@attention Depending on the graph size and the number of stage tasks, task graph parallelism and pipeline parallelism can bring very different performance results. @@ -181,7 +181,7 @@ void f3(const std::string& node) { } @endcode -@note +@attention A key advantage of %Taskflow's pipeline programming model is that we do not provide any data abstraction but give users full control over data management, which is typically application-dependent. @@ -251,7 +251,7 @@ executor.run(taskflow).wait(); Three possible outputs are shown below: -@code{.shell-session} +@code{.bash} # possible output 1 ready f1(A) diff --git a/doxygen/examples/kmeans.dox b/doxygen/examples/kmeans.dox index 5a607c9d1..dd5f0f08d 100644 --- a/doxygen/examples/kmeans.dox +++ b/doxygen/examples/kmeans.dox @@ -245,7 +245,7 @@ The taskflow graph is illustrated below: @dotfile images/kmeans_2.dot The scheduler starts with @c init, moves on to @c clean_up, and then enters the -parallel-for task @c paralle-for that spawns a subflow of 12 workers to perform +parallel-for task @c parallel-for that spawns a subflow of 12 workers to perform parallel iterations. When @c parallel-for completes, it updates the cluster centroids and checks if they have converged through a condition task. diff --git a/doxygen/examples/kmeans_cudaflow.dox b/doxygen/examples/kmeans_cuda.dox similarity index 85% rename from doxygen/examples/kmeans_cudaflow.dox rename to doxygen/examples/kmeans_cuda.dox index f6c2097e3..96c0c3776 100644 --- a/doxygen/examples/kmeans_cudaflow.dox +++ b/doxygen/examples/kmeans_cuda.dox @@ -1,9 +1,9 @@ namespace tf { -/** @page kmeans_cudaflow k-means Clustering (cudaFlow) +/** @page KMeansWithCUDAGPU k-means Clustering with CUDA GPU Following up on @ref kmeans, this page studies how to accelerate -a k-means workload on a GPU using tf::cudaFlow. +a k-means workload on a GPU using tf::cudaGraph. @tableofcontents @@ -78,9 +78,9 @@ When we recompute the cluster centroids to be the mean of all points assigned to multiple GPU threads may access the sum arrays, @c sx and @c sy, and the count array, @c c. To avoid data race, we use a simple @c atomicAdd method. 
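For readers unfamiliar with this pattern, the sketch below illustrates how such an atomic accumulation could look inside the point-assignment kernel. This is an illustrative reconstruction only, not the exact kernel of this example: the parameter types and distance computation are assumptions inferred from the task creation code in the next section.

@code{.cpp}
// a hypothetical sketch of assign_clusters (argument types are assumed)
__global__ void assign_clusters(
  const float* px, const float* py, int N,
  const float* mx, const float* my,
  float* sx, float* sy, int K, int* c
) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i >= N) {
    return;
  }
  // find the centroid closest to point i
  float best_d = 3.402823e38f;  // FLT_MAX
  int best_k = 0;
  for(int k = 0; k < K; ++k) {
    float dx = px[i] - mx[k];
    float dy = py[i] - my[k];
    float d = dx*dx + dy*dy;
    if(d < best_d) {
      best_d = d;
      best_k = k;
    }
  }
  // many threads may hit the same cluster at the same time, so the
  // accumulation into sx, sy, and c must be atomic to avoid a data race
  atomicAdd(&sx[best_k], px[i]);
  atomicAdd(&sy[best_k], py[i]);
  atomicAdd(&c[best_k], 1);
}
@endcode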
-@section DefineTheKMeanscudaFlow Define the k-means cudaFlow
+@section DefineTheKMeansCUDAGraph Define the k-means CUDA Graph

-Based on the two kernels, we can define the %cudaFlow for the k-means workload below:
+Based on the two kernels, we can define a CUDA graph for the k-means workload below:

@code{.cpp}
// N: number of points
@@ -138,29 +138,35 @@ void kmeans_gpu(
  auto kmeans = taskflow.emplace([&](){

-    tf::cudaFlow cf;
+    tf::cudaGraph cg;

-    auto zero_c = cf.zero(d_c, K).name("zero_c");
-    auto zero_sx = cf.zero(d_sx, K).name("zero_sx");
-    auto zero_sy = cf.zero(d_sy, K).name("zero_sy");
+    auto zero_c = cg.zero(d_c, K);
+    auto zero_sx = cg.zero(d_sx, K);
+    auto zero_sy = cg.zero(d_sy, K);

-    auto cluster = cf.kernel(
+    auto cluster = cg.kernel(
      (N+512-1) / 512, 512, 0,
      assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c
-    ).name("cluster");
+    );

-    auto new_centroid = cf.kernel(
+    auto new_centroid = cg.kernel(
      1, K, 0,
      compute_new_means, d_mx, d_my, d_sx, d_sy, d_c
-    ).name("new_centroid");
+    );

    cluster.precede(new_centroid)
           .succeed(zero_c, zero_sx, zero_sy);

-    // Repeat the execution for M times
+    // dump the CUDA graph
+    cg.dump(std::cout);
+
+    // instantiate an executable CUDA graph
+    tf::cudaGraphExec exec(cg);
+
+    // Repeat the execution for M times and then synchronize
    tf::cudaStream stream;
    for(int i=0; i
@dotfile images/kmeans_3.dot

-The main %cudaFlow task, @c update_means, must not run before all required data has settled down.
+The main CUDA %Graph task, @c update_means, must not run before all required data has settled down.
It precedes a condition task that circles back to itself until we reach @c M iterations.
-When iteration completes, the condition task directs the execution path to the %cudaFlow, @c h2d,
+When iteration completes, the condition task directs the execution path to the CUDA graph, @c h2d,
to copy the results of clusters to @c h_mx and @c h_my and then deallocate all GPU memory.

-@section KMeanscudaFlowBenchmarking Benchmarking
+@section KMeansWithGPUBenchmarking Benchmarking

We run three versions of k-means,
sequential CPU, parallel CPUs, and one GPU,
@@ -230,9 +234,6 @@ a Nvidia RTX 2080 GPU using various numbers of 2D point counts and iterations.

When the number of points is larger than 10K,
both parallel CPU and GPU implementations start to pick up speed
over the sequential version.
-We can see that using the built-in predicate, tf::cudaFlow::offload_n,
-can avoid repetitively creating the graph over and over, resulting in
-two times faster than conditional tasking.

*/

diff --git a/doxygen/examples/matrix_multiplication.dox b/doxygen/examples/matmul.dox
similarity index 100%
rename from doxygen/examples/matrix_multiplication.dox
rename to doxygen/examples/matmul.dox
diff --git a/doxygen/examples/matrix_multiplication_cudaflow.dox b/doxygen/examples/matmul_cuda.dox
similarity index 83%
rename from doxygen/examples/matrix_multiplication_cudaflow.dox
rename to doxygen/examples/matmul_cuda.dox
index 2b2c08a9c..da1515baf 100644
--- a/doxygen/examples/matrix_multiplication_cudaflow.dox
+++ b/doxygen/examples/matmul_cuda.dox
@@ -1,9 +1,9 @@ namespace tf {

-/** @page matrix_multiplication_cudaflow Matrix Multiplication (cudaFlow)
+/** @page MatrixMultiplicationWithCUDAGPU Matrix Multiplication with CUDA GPU

Following up on @ref matrix_multiplication, this page studies how to accelerate
-a matrix multiplication workload on a GPU using tf::cudaFlow.
+a matrix multiplication workload on a GPU using tf::cudaGraph.
@tableofcontents @@ -37,11 +37,11 @@ can be addressed at x * width + y in the transformed 1D layout. @image html images/matrix_multiplication_4.png width=70% -@section DefineAcudaFlowForMatrixMultiplication Define a cudaFlow for Matrix Multiplication +@section DefineACUDAGraphForMatrixMultiplication Define a CUDA Graph for Matrix Multiplication The next step is to allocate memory for @c A, @c B, and @c C at a GPU. We create three tasks each calling @c cudaMalloc to allocate space for one matrix. -Then, we create a %cudaFlow to offload matrix multiplication to a GPU. +Then, we create a CUDA graph to offload matrix multiplication to a GPU. The entire code is described as follows: @code{.cpp} @@ -65,29 +65,32 @@ void matrix_multiplication(int* A, int* B, int* C, int M, int K, int N) { cudaMalloc(&dc, M*N*sizeof(int)); }).name("allocate_c"); - // create a cudaFlow task to run the matrix multiplication + // create a CUDA graph task to run the matrix multiplication tf::Task cudaFlow = taskflow.emplace([&](){ - tf::cudaFlow cf; + tf::cudaGraph cg; // copy data to da, db, and dc - tf::cudaTask copy_da = cf.copy(da, A, M*K).name("H2D_A"); - tf::cudaTask copy_db = cf.copy(db, B, K*N).name("H2D_B"); - tf::cudaTask copy_hc = cf.copy(C, dc, M*N).name("D2H_C"); + tf::cudaTask copy_da = cg.copy(da, A, M*K); + tf::cudaTask copy_db = cg.copy(db, B, K*N); + tf::cudaTask copy_hc = cg.copy(C, dc, M*N); dim3 grid ((K+16-1)/16, (M+16-1)/16); dim3 block (16, 16); - tf::cudaTask kmatmul = cf.kernel(grid, block, 0, matmul, da, db, dc, M, K, N) - .name("matmul"); + tf::cudaTask kmatmul = cg.kernel(grid, block, 0, matmul, da, db, dc, M, K, N); kmatmul.succeed(copy_da, copy_db) .precede(copy_hc); - // launch the cudaFlow + // dump the CUDA graph + cg.dump(std::cout); + + // instantiate an executable CUDA graph and run it through a stream tf::cudaStream stream; - cf.run(stream); - stream.synchronize(); + tf::cudaGraphExec exec(cg); + stream.run(exec) + .synchronize(); }).name("cudaFlow"); @@ -102,14 +105,8 @@ void matrix_multiplication(int* A, int* B, int* C, int M, int K, int N) { cudaFlow.succeed(allocate_a, allocate_b, allocate_c) .precede(free); - // dump the graph without unfolding the cudaFlow - taskflow.dump(std::cout); - // run the taskflow executor.run(taskflow).wait(); - - // dump the entire execution graph including unfolded cudaFlow - taskflow.dump(std::cout); } @endcode diff --git a/doxygen/examples/taskflow_pipeline.dox b/doxygen/examples/taskflow_pipeline.dox index 8d5d17177..a7e393252 100644 --- a/doxygen/examples/taskflow_pipeline.dox +++ b/doxygen/examples/taskflow_pipeline.dox @@ -222,7 +222,7 @@ tf::Pipe{tf::PipeType::SERIAL, [&](tf::Pipeflow& pf) { At each pipe, we use tf::Executor::corun to execute the corresponding taskflow and wait until the execution completes. -This is important because we want te caller thread, which is the worker that invokes +This is important because we want the caller thread, which is the worker that invokes the pipe callable, to not block (i.e., `executor.run(taskflows[pf.pipe()]).wait()`) but participate in the work-stealing loop of the scheduler to avoid deadlock. @@ -256,7 +256,7 @@ executor.run(taskflow).wait(); One possible output is shown below: -@code{.shell-session} +@code{.bash} ready begin token 0 A1 diff --git a/doxygen/examples/text_pipeline.dox b/doxygen/examples/text_pipeline.dox index 092cd9508..3d9fdb9ff 100644 --- a/doxygen/examples/text_pipeline.dox +++ b/doxygen/examples/text_pipeline.dox @@ -14,7 +14,7 @@ string to a final pair type. 
Given an input vector of strings, we want to compute the most frequent character for each string
using a series of transform operations.
For example:

-@code{.shell-session}
+@code{.bash}
# input strings
abade
ddddf
@@ -166,7 +166,7 @@ using data_type = std::variant<
std::array<data_type, num_lines> mybuffer;
@endcode

-@note
+@attention
One-dimensional buffer is sufficient because %Taskflow enables only one
scheduling token per line at a time.

@@ -251,7 +251,7 @@ executor.run(taskflow).wait();
As the second stage is a parallel pipe, the output may interleave.
One possible result is shown below:

-@code{.shell-session}
+@code{.bash}
ready
stage 1: input token = abade
stage 1: input token = ddddf

diff --git a/doxygen/governance/team.dox b/doxygen/governance/team.dox
index 706ddaeb3..35567d9f9 100644
--- a/doxygen/governance/team.dox
+++ b/doxygen/governance/team.dox
@@ -12,9 +12,9 @@ We adhere to our @ref codeofconduct.
Core members provide the essential development, maintenance, and support of %Taskflow in all aspects.

@li Principal Investigator: @twhuang
-@li Software Developers: Tsung-Wei Huang, Dian-Lun Lin, Cheng-Hsiang Chiu
-@li Financial Manager: Aidza Cruz (aidza dot cruz at utah dot edu)
-@li Ombudsperson: Jennifer Hoskins (jennifer dot hoskins at osp dot utah dot edu)
+@li Software Developers: Tsung-Wei Huang, Cheng-Hsiang Chiu, Boyang Zhang, Chih-Chun Chang
+@li Financial Manager: [Jessica Murnane](https://www.linkedin.com/in/jessica-murnane-95565b2/)
+@li Ombudsperson: [Jessica Murnane](https://www.linkedin.com/in/jessica-murnane-95565b2/)
@li Diversity, Equity, and Inclusion: Tsung-Wei Huang
@li Outreach and Education: Tsung-Wei Huang

@@ -22,6 +22,7 @@ Core members provide the essential development, maintenance, and support of %Tas
%Taskflow would not have reached this far without the work of these individuals
who ever participated in its development.
+@li Dian-Lun Lin @li Guannan Guo @li Martin Wong @li Chun-Xun Lin diff --git a/doxygen/images/fibonacci_4.dot b/doxygen/images/fibonacci_4.dot new file mode 100644 index 000000000..b9f9df7c6 --- /dev/null +++ b/doxygen/images/fibonacci_4.dot @@ -0,0 +1,26 @@ +digraph Fibonacci { + rankdir=TB; + node [shape=box]; + + F4 [label="fibonacci(4)\n[rt]"]; + F3_1 [label="fibonacci(3)\n[rt1]"]; + F2_1 [label="fibonacci(2)\n[rt1_1]"]; + F1_1 [label="fibonacci(1)\n[rt1_1_1]"]; + F0_1 [label="fibonacci(0)\n[rt1_1_2]"]; + F1_2 [label="fibonacci(1)\n[rt1_2]"]; + F2_2 [label="fibonacci(2)\n[rt2]"]; + F1_3 [label="fibonacci(1)\n[rt2_1]"]; + F0_2 [label="fibonacci(0)\n[rt2_2]"]; + + F4 -> F3_1; + F4 -> F2_2; + + F3_1 -> F2_1; + F3_1 -> F1_2; + + F2_1 -> F1_1; + F2_1 -> F0_1; + + F2_2 -> F1_3; + F2_2 -> F0_2; +} diff --git a/doxygen/images/fibonacci_4_tail_optimized.dot b/doxygen/images/fibonacci_4_tail_optimized.dot new file mode 100644 index 000000000..dfa3224dc --- /dev/null +++ b/doxygen/images/fibonacci_4_tail_optimized.dot @@ -0,0 +1,26 @@ +digraph Fibonacci { + rankdir=TB; + node [shape=box]; + + F4 [label="fibonacci(4)\n[rt]"]; + F3_1 [label="fibonacci(3)\n[rt1]"]; + F2_1 [label="fibonacci(2)\n[rt1_1]"]; + F1_1 [label="fibonacci(1)\n[rt1_1_1]"]; + F0_1 [label="fibonacci(0)\n[rt1_1]"]; + F1_2 [label="fibonacci(1)\n[rt1]"]; + F2_2 [label="fibonacci(2)\n[rt]"]; + F1_3 [label="fibonacci(1)\n[rt1]"]; + F0_2 [label="fibonacci(0)\n[rt]"]; + + F4 -> F3_1; + F4 -> F2_2; + + F3_1 -> F2_1; + F3_1 -> F1_2; + + F2_1 -> F1_1; + F2_1 -> F0_1; + + F2_2 -> F1_3; + F2_2 -> F0_2; +} diff --git a/doxygen/images/module_task_1.dot b/doxygen/images/module_task_1.dot new file mode 100644 index 000000000..814e8fc8e --- /dev/null +++ b/doxygen/images/module_task_1.dot @@ -0,0 +1,6 @@ +digraph Taskflow { +A; +B; +C; +D; +} diff --git a/doxygen/images/module_task_2.dot b/doxygen/images/module_task_2.dot new file mode 100644 index 000000000..3d64d2928 --- /dev/null +++ b/doxygen/images/module_task_2.dot @@ -0,0 +1,6 @@ +digraph Taskflow { +rankdir="LR"; +A->B; +B->C; +C->D; +} diff --git a/doxygen/images/scalable_pipeline_2.dot b/doxygen/images/scalable_pipeline_2.dot index 01dec862d..70a051b5f 100644 --- a/doxygen/images/scalable_pipeline_2.dot +++ b/doxygen/images/scalable_pipeline_2.dot @@ -87,6 +87,7 @@ p20 -> p21; p21 -> p22; p30 -> p31; p31 -> p32; +p32 -> p33; // Added this line p00 -> p10; p01 -> p11; p02 -> p12; diff --git a/doxygen/images/task_level_scheduling.dot b/doxygen/images/task_level_scheduling.dot index 4fc1a5d10..3e822e0be 100644 --- a/doxygen/images/task_level_scheduling.dot +++ b/doxygen/images/task_level_scheduling.dot @@ -1,15 +1,17 @@ digraph G { -atask [label="a task T"]; +atask [label="pop a task T from the queue"]; cond [label="is T a condition task?" 
shape=diamond color=black fillcolor=aquamarine style=filled];
atask->cond
invokeN [label="invoke(T)"]
invokeY [label="R = invoke(T)"]
enqueueR [label="enqueue the R-th successor of T"]
decrement [label="decrement strong dependencies of each successor of T by one"]
-enqueueS [label="enqueue successors of zero strong dpendencies"]
+enqueueS [label="enqueue successors of zero strong dependencies"]
invokeN->decrement;
decrement->enqueueS;
invokeY->enqueueR;
cond->invokeY[style=dashed,label="yes"];
cond->invokeN[style=dashed,label="no"];
+enqueueS->atask;
+enqueueR->atask;
}
diff --git a/doxygen/images/uw-madison-ece-logo.png b/doxygen/images/uw-madison-ece-logo.png
new file mode 100644
index 000000000..42258c755
Binary files /dev/null and b/doxygen/images/uw-madison-ece-logo.png differ
diff --git a/doxygen/images/work-stealing.png b/doxygen/images/work-stealing.png
new file mode 100644
index 000000000..95bf39ff8
Binary files /dev/null and b/doxygen/images/work-stealing.png differ
diff --git a/doxygen/install/benchmark_taskflow.dox b/doxygen/install/benchmark_taskflow.dox
index ed5b2dd58..7f125a01f 100644
--- a/doxygen/install/benchmark_taskflow.dox
+++ b/doxygen/install/benchmark_taskflow.dox
@@ -9,7 +9,7 @@ namespace tf {
To build the benchmark code, enable the CMake option
@c TF_BUILD_BENCHMARKS to @c ON as follows:

-@code{.shell-session}
+@code{.bash}
# under /taskflow/build
~$ cmake ../ -DTF_BUILD_BENCHMARKS=ON
~$ make
@endcode

After you successfully build the benchmark code, you can find all benchmark
instances in the @c benchmarks/ folder.
You can run the executable of each instance in the corresponding folder.

-@code{.shell-session}
+@code{.bash}
~$ cd benchmarks && ls
-black_scholes binary_tree graph_traversal ...
-~$ cd graph_traversal & ./graph_traversal
+bench_black_scholes bench_binary_tree bench_graph_traversal ...
+~$ ./bench_graph_traversal
|V|+|E| Runtime
2 0.197
842 0.198
@@ -37,10 +37,10 @@ black_scholes binary_tree graph_traversal ...

You can display the help message by giving the option @c --help.

-@code{.shell-session}
-~$ ./graph_traversal --help
Graph Traversal
-Usage: ./graph_traversal [OPTIONS]
+@code{.bash}
+~$ ./bench_graph_traversal --help
Graph Traversal
+Usage: ./bench_graph_traversal [OPTIONS]

Options:
-h,--help Print this help message and exit
@@ -54,18 +54,29 @@
the parallel computing community to evaluate the system performance.
| Instance | Description |
| :-: | :-: |
-| binary_tree | traverses a complete binary tree |
-| black_scholes | computes option pricing with Black-Shcoles Models |
-| graph_traversal | traverses a randomly generated direct acyclic graph |
-| linear_chain | traverses a linear chain of tasks |
-| mandelbrot | exploits imbalanced workloads in a Mandelbrot set |
-| matrix_multiplication | multiplies two 2D matrices |
-| mnist | trains a neural network-based image classifier on the MNIST dataset |
-| parallel_sort | sorts a range of items |
-| reduce_sum | sums a range of items using reduction |
-| wavefront | propagates computations in a 2D grid |
-| linear_pipeline | pipeline scheduling on a linear chain of pipes |
-| graph_pipeline | pipeline scheduling on a graph of pipes |
+| bench_binary_tree | traverses a complete binary tree |
+| bench_black_scholes | computes option pricing with Black-Scholes Models |
+| bench_graph_traversal | traverses a randomly generated directed acyclic graph |
+| bench_linear_chain | traverses a linear chain of tasks |
+| bench_mandelbrot | exploits imbalanced workloads in a Mandelbrot set |
+| bench_matrix_multiplication | multiplies two 2D matrices |
+| bench_mnist | trains a neural network-based image classifier on the MNIST dataset |
+| bench_parallel_sort | sorts a range of items |
+| bench_reduce_sum | sums a range of items using reduction |
+| bench_wavefront | propagates computations in a 2D grid |
+| bench_linear_pipeline | performs pipeline parallelism on a linear chain of pipes |
+| bench_graph_pipeline | performs pipeline parallelism on a graph of pipes |
+| bench_deferred_pipeline | performs pipeline parallelism with dependencies from future pipes |
+| bench_data_pipeline | performs pipeline parallelism on a cache-friendly data wrapper |
+| bench_thread_pool | uses our executor as a simple thread pool |
+| bench_for_each | performs parallel-iteration algorithms |
+| bench_scan | performs parallel-scan algorithms |
+| bench_async_task | creates asynchronous tasks |
+| bench_fibonacci | finds Fibonacci numbers using recursive asynchronous tasking |
+| bench_nqueens | parallelizes n-queen search using recursive asynchronous tasking |
+| bench_integrate | parallelizes integration using recursive asynchronous tasking |
+| bench_primes | finds a range of prime numbers using parallel-reduction algorithms |
+| bench_skynet | traverses a 10-ray tree using recursive asynchronous tasking |

@section ConfigureRunOptions Configure Run Options

@@ -75,10 +86,10 @@ Common options are:

| option | value | function |
| :-: | :-: | :-: |
-| @c -h | none | display the help message |
-| @c -t | integer | configure the number of threads to run |
-| @c -r | integer | configure the number of rounds to run |
-| @c -m | string | configure the baseline models to run, tbb, omp, or tf |
+| @c -h | none | displays the help message |
+| @c -t | integer | configures the number of threads to run |
+| @c -r | integer | configures the number of rounds to run |
+| @c -m | string | configures the baseline models to run, tbb, omp, or tf |

You can configure the benchmarking environment by giving different options.

@subsection SelectTheRunModel Select the Run Model

By default, the benchmark runs with %Taskflow implementation.
Depending on the available baseline implementations of the benchmark instance,
you can select other implementations of the same benchmark using two of the most famous parallel
programming libraries, @OpenMP and @TBB, to measure and evaluate the
performance of %Taskflow.
You can select different implementations by passing the option @c -m.
-@code{.shell-session}
-~$ ./graph_traversal -m tf  # run the Taskflow implementation (default)
-~$ ./graph_traversal -m tbb # run the TBB implementation
-~$ ./graph_traversal -m omp # run the OpenMP implementation
+@code{.bash}
+~$ ./bench_graph_traversal -m tf  # run the Taskflow implementation (default)
+~$ ./bench_graph_traversal -m tbb # run the TBB implementation
+~$ ./bench_graph_traversal -m omp # run the OpenMP implementation
@endcode

@subsection SpecifyTheNumberOfThreads Specify the Number of Threads

@@ -102,9 +113,9 @@ You can configure the number of threads to run a benchmark instance by
passing the option @c -t. The default value is one.

-@code{.shell-session}
+@code{.bash}
# run the Taskflow implementation using 4 threads
-~$ ./graph_traversal -m tf -t 4
+~$ ./bench_graph_traversal -m tf -t 4
@endcode

Depending on your environment, you may need to use @c taskset to set the CPU
@@ -112,9 +123,9 @@ affinity of the running process.
This allows the OS scheduler to keep process on the same CPU(s) as long as practical
for performance reason.

-@code{.shell-session}
+@code{.bash}
# affine the process to 4 CPUs, CPU 0, CPU 1, CPU 2, and CPU 3
-~$ taskset -c 0-3 graph_traversal -t 4
+~$ taskset -c 0-3 bench_graph_traversal -t 4
@endcode

@subsection SpecifyTheNumberOfRounds Specify the Number of Rounds

@@ -124,9 +135,9 @@ at different problem sizes.
Each problem size corresponds to one iteration.
You can configure the number of rounds per iteration to average the runtime.

-@code{.shell-session}
-# measure the runtime in an average of 10 runs
-~$ ./graph_traversal -r 10
+@code{.bash}
+# measure the %Taskflow runtime by averaging the results over 10 runs
+~$ ./bench_graph_traversal -r 10 -m tf
|V|+|E| Runtime
2 0.109 # the runtime value 0.109 is an average of 10 runs
842 0.298

diff --git a/doxygen/install/cuda_compile.dox b/doxygen/install/cuda_compile.dox
index f525d5487..5029d3c06 100644
--- a/doxygen/install/cuda_compile.dox
+++ b/doxygen/install/cuda_compile.dox
@@ -19,28 +19,19 @@ function to output a message:

@code{.cpp}
#include
#include
-#include

int main(int argc, const char** argv) {

-  tf::Executor executor;
-  tf::Taskflow taskflow;
+  // create a CUDA graph with a single-threaded task
+  tf::cudaGraph cg;
+  cg.single_task([] __device__ () { printf("hello CUDA Graph!\n"); });
+
+  // instantiate an executable CUDA graph and run it through a stream
+  tf::cudaStream stream;
+  tf::cudaGraphExec exec(cg);

-  tf::Task task1 = taskflow.emplace([](){}).name("cpu task");
-  tf::Task task2 = taskflow.emplace([](){
-    // create a cudaFlow of a single-threaded task
-    tf::cudaFlow cf;
-    cf.single_task([] __device__ () { printf("hello cudaFlow!\n"); });
-
-    // launch the cudaflow through a stream
-    tf::cudaStream stream;
-    cf.run(stream);
-    stream.synchronize();
-  }).name("gpu task");
+  stream.run(exec).synchronize();

-  task1.precede(task2);
-
-  executor.run(taskflow).wait();
  return 0;
}
@endcode

@@ -48,7 +39,7 @@ int main(int argc, const char** argv) {
The easiest way to compile %Taskflow with CUDA code (e.g., %cudaFlow, kernels)
is to use @nvcc:

-@code{.shell-session}
+@code{.bash}
~$ nvcc -std=c++17 -I path/to/taskflow/ --extended-lambda simple.cu -o simple
~$ ./simple
hello cudaFlow!
@@ -94,21 +85,22 @@ int main() {
tf::Task make_cudaflow(tf::Taskflow& taskflow) {
  return taskflow.emplace([](){
-    // create a cudaFlow of a single-threaded task
-    tf::cudaFlow cf;
-    cf.single_task([] __device__ () { printf("cudaflow.cpp!\n"); });
+    // create a CUDA graph with a single-threaded task
+    tf::cudaGraph cg;
+    cg.single_task([] __device__ () { printf("hello CUDA Graph!\n"); });

-    // launch the cudaflow through a stream
+    // instantiate an executable CUDA graph and run it through a stream
    tf::cudaStream stream;
-    cf.run(stream);
-    stream.synchronize();
+    tf::cudaGraphExec exec(cg);
+
+    stream.run(exec).synchronize();
  }).name("gpu task");
}
@endcode

Compile each source to an object (@c g++ as an example):

-@code{.shell-session}
+@code{.bash}
~$ g++ -std=c++17 -I path/to/taskflow -c main.cpp -o main.o
~$ nvcc -std=c++17 --extended-lambda -x cu -I path/to/taskflow \
   -dc cudaflow.cpp -o cudaflow.o
@@ -131,7 +123,7 @@
on a compatible SM architecture using the option @-arch.
For instance, the following command requires
device code linking to have compute capability 7.5 or later:

-@code{.shell-session}
+@code{.bash}
~$ nvcc -std=c++17 --extended-lambda -x cu -arch=sm_75 -I path/to/taskflow \
   -dc cudaflow.cpp -o cudaflow.o
@endcode
@@ -142,7 +134,7 @@
Using @c nvcc to link compiled object code is nothing special
but replacing the normal compiler with @c nvcc
and it takes care of all the necessary steps:

-@code{.shell-session}
+@code{.bash}
~$ nvcc main.o cudaflow.o -o main

# run the main program
@@ -158,20 +150,20 @@
Since your CPU compiler does not know how to link CUDA device code,
you have to add a step in your build to have @c nvcc link the CUDA device code,
using the option @c -dlink:

-@code{.shell-session}
+@code{.bash}
~$ nvcc -o gpuCode.o -dlink main.o cudaflow.o
@endcode

This step links all the device object code and places it into @c gpuCode.o.

-@note
+@attention
Note that this step does not link the CPU object code and discards
the CPU object code in @c main.o and @c cudaflow.o.

To complete the link to an executable, you can use, for example, @c ld or @c g++.

-@code{.shell-session}
+@code{.bash}
# replace /usr/local/cuda/lib64 with your own CUDA library installation path
~$ g++ -pthread -L /usr/local/cuda/lib64/ -lcudart \
   gpuCode.o main.o cudaflow.o -o main
@@ -189,7 +181,7 @@
does not conflict with the code in @c gpuCode.o.
@c g++ ignores device code because it does not know
how to link it, and the device code in @c gpuCode.o is already linked and
ready to go.

-@note
+@attention
This intentional ignorance is extremely useful in large builds
where intermediate objects may have both CPU and GPU code.
In this case, we just let the GPU and CPU linkers each do its own job,

diff --git a/doxygen/install/install.dox b/doxygen/install/install.dox
index 40af4cf9f..4b60a9e79 100644
--- a/doxygen/install/install.dox
+++ b/doxygen/install/install.dox
@@ -16,7 +16,7 @@ To use %Taskflow, you only need a compiler that supports C++17:
@li Microsoft Visual Studio at least v15.7 (MSVC++ 19.14)
@li AppleClang Xcode Version at least v12.0 with -std=c++17
@li Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
-@li Intel C++ Compiler (nvcc) at least v19.0.1 with -std=c++17
+@li Intel C++ Compiler (icpc) at least v19.0.1 with -std=c++17
@li Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 and SYCL20

%Taskflow works on Linux, Windows, and Mac OS X.
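If you would like to sanity-check your compiler setup before building anything larger, a minimal program in the spirit of the @c simple.cpp referenced by the compile commands on this page may help. The code below is only a sketch using the public %Taskflow API; the task bodies and messages are illustrative:

@code{.cpp}
// simple.cpp: a minimal sketch to verify that %Taskflow compiles and links
#include <taskflow/taskflow.hpp>
#include <iostream>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  // create two tasks where task A runs before task B
  tf::Task A = taskflow.emplace([](){ std::cout << "task A\n"; });
  tf::Task B = taskflow.emplace([](){ std::cout << "task B\n"; });
  A.precede(B);

  // run the taskflow and wait until it completes
  executor.run(taskflow).wait();
  return 0;
}
@endcode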
@@ -27,7 +27,7 @@ To use %Taskflow, you only need a compiler that supports C++17:
 
 Simply download the source and copy the headers under the directory @c taskflow/
 to your project.
 
-@code{.shell-session}
+@code{.bash}
 ~$ git clone https://github.com/taskflow/taskflow.git
 ~$ cd taskflow/
 ~$ cp -r taskflow myproject/include/
@@ -40,7 +40,7 @@ where to find the %Taskflow header files and link it through the system thread library
 (usually [POSIX threads](http://man7.org/linux/man-pages/man7/pthreads.7.html)
 in Linux-like systems).
 Take gcc for an example:
 
-@code{.shell-session}
+@code{.bash}
 ~$ g++ simple.cpp -std=c++17 -I myproject/include/ -O2 -pthread -o simple
 @endcode
 
@@ -50,7 +50,7 @@ Take gcc for an example:
 
 %Taskflow uses CMake to build examples and unit tests.
 We recommend an out-of-source build.
 
-@code{.shell-session}
+@code{.bash}
 ~$ cd path/to/taskflow
 ~$ mkdir build
 ~$ cd build
@@ -78,7 +78,7 @@ When the building completes, you can find the executables for examples and
 tests under the two folders, @c examples/ and @c unittests/.
 You can list the set of available options in CMake.
 
-@code{.shell-session}
+@code{.bash}
 ~$ cmake -LA
 ...
 TF_BUILD_EXAMPLES:BOOL=ON       # by default, we compile examples
@@ -103,7 +103,7 @@ Currently, our CMake script supports the following options:
 
 To enable or disable a specific option, use @c -D in the CMake build.
 For example:
 
-@code{.shell-session}
+@code{.bash}
 ~$ cmake ../ -DTF_BUILD_EXAMPLES=OFF
 @endcode
 
@@ -116,7 +116,7 @@ enable the CMake option @c TF_BUILD_CUDA to @c ON.
 CMake will automatically detect the existence of @c nvcc
 and use it to compile and link @c .cu code.
 
-@code{.shell-session}
+@code{.bash}
 ~$ cmake ../ -DTF_BUILD_CUDA=ON
 ~$ make
 @endcode
 
@@ -133,7 +133,7 @@ To enable a sanitizer, add the sanitizer flag to the CMake variable
 The following example enables thread sanitizer in building %Taskflow code
 to detect data race:
 
-@code{.shell-session}
+@code{.bash}
 # build Taskflow code with thread sanitizer to detect data race
 ~$ cmake ../ -DCMAKE_CXX_FLAGS="-fsanitize=thread -g"
 
@@ -153,7 +153,7 @@ To the best of our knowledge, %Taskflow is one of the very few parallel
 programming libraries that are free from data races.
 
-@note
+@attention
 Some sanitizers are only supported on certain computing architectures.
 You can find the information about architecture support of each sanitizer
 at [Clang Documentation](https://clang.llvm.org/docs/index.html) and
@@ -166,7 +166,7 @@ the performance of Taskflow with existing parallel programming libraries.
 To build the benchmark code, enable the CMake option
 @c TF_BUILD_BENCHMARKS to @c ON as follows:
 
-@code{.shell-session}
+@code{.bash}
 ~$ cmake ../ -DTF_BUILD_BENCHMARKS=ON
 ~$ make
 @endcode
 
@@ -180,7 +180,7 @@ The source of documentation is located in the folder @c taskflow/doxygen
 and the generated html is output to the folder @c taskflow/docs.
 To generate the documentation, you need to first install doxygen:
 
-@code{.shell-session}
+@code{.bash}
 # ubuntu as an example
 ~$ sudo apt-get install doxygen graphviz
 @endcode
 
@@ -188,7 +188,7 @@ Once you have doxygen and the dot graph generator installed,
 clone the m.css project and enter the @c m.css/documentation directory:
 
-@code{.shell-session}
+@code{.bash}
 ~$ git clone https://github.com/mosra/m.css.git
 ~$ cd m.css/documentation
 @endcode
 
@@ -198,7 +198,7 @@ The script @c doxygen.py requires Python 3.6 and depends on Pygments
 for code block highlighting.
You can install the dependencies via @c pip or your distribution package manager: -@code{.shell-session} +@code{.bash} # You may need sudo here # More details are available at https://mcss.mosra.cz/documentation/doxygen/ ~$ pip3 install jinja2 Pygments @@ -206,7 +206,7 @@ You can install the dependencies via @c pip or your distribution package manager Next, invoke @c doxygen.py and point it to the @c taskflow/doxygen/conf.py: -@code{.shell-session} +@code{.bash} ~$ ./doxygen.py path/to/taskflow/doxygen/conf.py @endcode diff --git a/doxygen/install/sycl_compile.dox b/doxygen/install/sycl_compile.dox deleted file mode 100644 index 2605b0537..000000000 --- a/doxygen/install/sycl_compile.dox +++ /dev/null @@ -1,145 +0,0 @@ -namespace tf { - -/** @page CompileTaskflowWithSYCL Compile Taskflow with SYCL - -@tableofcontents - -@section InstallSYCLCompiler Install SYCL Compiler - -To compile %Taskflow with SYCL code, you need the DPC++ clang compiler, -which can be acquired from -Getting -Started with oneAPI DPC++. - -@section CompileTaskflowWithSYCLDirectly Compile Source Code Directly - -%Taskflow's GPU programming interface for SYCL is tf::syclFlow. -Consider the following `simple.cpp` program that performs the canonical -saxpy (single-precision AX + Y) operation on a GPU: - -@code{.cpp} -#include // core taskflow routines -#include // core syclflow routines - -int main() { - - tf::Executor executor; - tf::Taskflow taskflow("saxpy example"); - - sycl::queue queue; - - auto X = sycl::malloc_shared(N, queue); - auto Y = sycl::malloc_shared(N, queue); - - taskflow.emplace_on([&](tf::syclFlow& sf){ - tf::syclTask fillX = sf.fill(X, 1.0f, N).name("fillX"); - tf::syclTask fillY = sf.fill(Y, 2.0f, N).name("fillY"); - tf::syclTask saxpy = sf.parallel_for(sycl::range<1>(N), - [=] (sycl::id<1> id) { - X[id] = 3.0f * X[id] + Y[id]; - } - ).name("saxpy"); - saxpy.succeed(fillX, fillY); - }, queue).name("syclFlow"); - - executor.run(taskflow).wait(); -} -@endcode - -Use DPC++ clang to compile the program with the following options: - -@li @c -fsycl: enable SYCL compilation mode -@li @c -fsycl-targets=nvptx64-nvidia-cuda-sycldevice: enable CUDA target -@li @c -fsycl-unnamed-lambda: enable unnamed SYCL lambda kernel - -@code{.shell-session} -~$ clang++ -fsycl -fsycl-unnamed-lambda \ - -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \ # for CUDA target - -I path/to/taskflow -pthread -std=c++17 simple.cpp -o simple -~$ ./simple -@endcode - - -@attention -You need to include @c taskflow/syclflow.hpp in order to use tf::syclFlow. - - -@section CompileTaskflowWithSYCLSeparately Compile Source Code Separately - -Large GPU applications often compile a program into separate objects -and link them together to form an executable or a library. -You can compile your SYCL code into separate object files and link them -to form the final executable. 
-Consider the following example that defines two tasks
-on two different pieces (@c main.cpp and @c syclflow.cpp) of source code:
-
-@code{.cpp}
-// main.cpp
-#include
-
-tf::Task make_syclflow(tf::Taskflow& taskflow);  // create a syclFlow task
-
-int main() {
-
-  tf::Executor executor;
-  tf::Taskflow taskflow;
-
-  tf::Task task1 = taskflow.emplace([](){ std::cout << "main.cpp!\n"; })
-                           .name("cpu task");
-  tf::Task task2 = make_syclflow(taskflow);
-
-  task1.precede(task2);
-
-  executor.run(taskflow).wait();
-
-  return 0;
-}
-@endcode
-
-@code{.cpp}
-// syclflow.cpp
-#include
-#include
-
-inline sycl::queue queue;  // create a global sycl queue
-
-tf::Task make_syclflow(tf::Taskflow& taskflow) {
-  return taskflow.emplace_on([](tf::syclFlow& cf){
-    printf("syclflow.cpp!\n");
-    cf.single_task([](){}).name("kernel");
-  }, queue).name("gpu task");
-}
-@endcode
-
-Compile each source to an object using DPC++ clang:
-
-@code{.shell-session}
-~$ clang++ -I path/to/taskflow/ -pthread -std=c++17 -c main.cpp -o main.o
-~$ clang++ -fsycl -fsycl-unnamed-lambda \
-           -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \
-           -I path/to/taskflow/ -pthread -std=c++17 -c syclflow.cpp -o syclflow.o
-
-# now we have the two compiled .o objects, main.o and syclflow.o
-~$ ls
-main.o syclflow.o
-@endcode
-
-Next, link the two object files to the final executable:
-
-@code{.shell-session}
-~$ clang++ -fsycl -fsycl-unnamed-lambda \
-           -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \  # for CUDA target
-           main.o syclflow.o -pthread -std=c++17 -o main
-
-# run the main program
-~$ ./main
-main.cpp!
-syclflow.cpp!
-@endcode
-
-*/
-
-
-}
-
-
diff --git a/doxygen/references/references.dox b/doxygen/references/references.dox
index 0f5695c42..01fc4046f 100644
--- a/doxygen/references/references.dox
+++ b/doxygen/references/references.dox
@@ -3,43 +3,16 @@ namespace tf {
 
 /** @page References References
 
 This page summarizes a list of publications related to %Taskflow.
-If you are using %Taskflow, please cite the following paper we publised at 2022 IEEE TPDS:
+If you are using %Taskflow, please cite the following paper we published at 2022 IEEE Transactions on Parallel and Distributed Systems (TPDS):
 
+ Tsung-Wei Huang, Dian-Lun Lin, Chun-Xun Lin, and Yibo Lin, &quot;[Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System](https://tsung-wei-huang.github.io/papers/tpds21-taskflow.pdf),&quot; IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 6, pp. 1303-1320, June 2022
 
-@tableofcontents
-
-
-@section RefConference Conference
-
-  1. Dian-Lun Lin, Yanqing Zhang, Haoxing Ren, Shih-Hsin Wang, Brucek Khailany and Tsung-Wei Huang, "[GenFuzz: GPU-accelerated Hardware Fuzzing using Genetic Algorithm with Multiple Inputs](https://tsung-wei-huang.github.io/papers/2023-dac.pdf)," ACM/IEEE Design Automation Conference (DAC), San Francisco, CA, 2023
-  2. Tsung-Wei Huang, "[qTask: Task-parallel Quantum Circuit Simulation with Incrementality](https://tsung-wei-huang.github.io/papers/ipdps23.pdf)," IEEE International Parallel and Distributed Processing Symposium (IPDPS), St. Petersburg, Florida, 2023
-  3. Elmir Dzaka, Dian-Lun Lin, and Tsung-Wei Huang, "[Parallel And-Inverter Graph Simulation Using a Task-graph Computing System](https://tsung-wei-huang.github.io/papers/pdco-23.pdf)," IEEE International Parallel and Distributed Processing Symposium Workshop (IPDPSW), St. Petersburg, Florida, 2023
-  4. Tsung-Wei Huang and Leslie Hwang, "[Task-Parallel Programming with Constrained Parallelism](https://tsung-wei-huang.github.io/papers/hpec22-semaphore.pdf)," IEEE High-Performance Extreme Computing Conference (HPEC), MA, 2022
-  5. Tsung-Wei Huang, "[Enhancing the Performance Portability of Heterogeneous Circuit Analysis Programs](https://tsung-wei-huang.github.io/papers/hpec22-ot.pdf)," IEEE High-Performance Extreme Computing Conference (HPEC), MA, 2022
-  6. Dian-Lun Lin, Haoxing Ren, Yanqing Zhang, and Tsung-Wei Huang, "[From RTL to CUDA: A GPU Acceleration Flow for RTL Simulation with Batch Stimulus](https://tsung-wei-huang.github.io/papers/icpp22-rtlflow.pdf)," ACM International Conference on Parallel Processing (ICPP), Bordeaux, France, 2022
-  7. Cheng-Hsiang Chiu and Tsung-Wei Huang, "[Composing %Pipeline Parallelism using Control %Taskflow %Graph](https://doi.org/10.1145/3502181.3533714)," ACM International Symposium on High-Performance Parallel and Distributed Computing (HPDC), Minneapolis, Minnesota, 2022
-  8. Cheng-Hsiang Chiu and Tsung-Wei Huang, "[Efficient Timing Propagation with Simultaneous Structural and Pipeline Parallelisms](https://tsung-wei-huang.github.io/papers/dac2022.pdf)," ACM/IEEE Design Automation Conference (DAC), San Francisco, CA, 2022
-  9. Dian-Lun Lin and Tsung-Wei Huang, "Efficient GPU Computation using %Task %Graph Parallelism," European Conference on Parallel and Distributed Computing (EuroPar), 2021
-  10. Tsung-Wei Huang, "[A General-purpose Parallel and Heterogeneous Task Programming System for VLSI CAD](iccad20.pdf)," IEEE/ACM International Conference on Computer-aided Design (ICCAD), CA, 2020
-  11. Chun-Xun Lin, Tsung-Wei Huang, and Martin Wong, "[An Efficient Work-Stealing Scheduler for Task Dependency Graph](icpads20.pdf)," IEEE International Conference on Parallel and Distributed Systems (ICPADS), Hong Kong, 2020
-  12. Tsung-Wei Huang, Chun-Xun Lin, Guannan Guo, and Martin Wong, "[Cpp-Taskflow: Fast Task-based Parallel Programming using Modern C++](ipdps19.pdf)," IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 974-983, Rio de Janeiro, Brazil, 2019
-  13. Chun-Xun Lin, Tsung-Wei Huang, Guannan Guo, and Martin Wong, "[A Modern C++ Parallel Task Programming Library](mm19.pdf)," ACM Multimedia Conference (MM), pp. 2284-2287, Nice, France, 2019
-  14. Chun-Xun Lin, Tsung-Wei Huang, Guannan Guo, and Martin Wong, "[An Efficient and Composable Parallel Task Programming Library](hpec19.pdf)," IEEE High-performance and Extreme Computing Conference (HPEC), pp. 1-7, Waltham, MA, 2019
-
-@section RefJournal Journal
-
-  1. Dian-Lun Lin and Tsung-Wei Huang, "[Accelerating Large Sparse Neural Network Inference using GPU Task Graph Parallelism](https://tsung-wei-huang.github.io/papers/tpds22-snig.pdf)," IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 11, pp. 3041-3052, Nov 2022
-  2. Tsung-Wei Huang, Dian-Lun Lin, Chun-Xun Lin, and Yibo Lin, "[Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System](https://tsung-wei-huang.github.io/papers/tpds21-taskflow.pdf)," IEEE Transactions on Parallel and Distributed Systems (TPDS), vol. 33, no. 6, pp. 1303-1320, June 2022
-  3. Tsung-Wei Huang, Dian-Lun Lin, Yibo Lin, and Chun-Xun Lin, "[Cpp-Taskflow: A General-purpose Parallel %Task Programming System at Scale](tcad21-taskflow.pdf)," IEEE Transactions on Computer-aided Design of Integrated Circuits and Systems (TCAD), vol. 40, no. 8, 2021
-
 @section RefRecognition Recognition
 
+  1. Second Place of Fast Code Programming Challenge at the 2025 ACM PPoPP
   2. Innovation Award of the 2023 IEEE HPEC/MIT/Amazon Stochastic Block Partition Challenge
   3. Champion of %Graph Challenge at the 2020 IEEE High-performance Extreme Computing Conference
   4. Second Prize of Open-Source Software Competition at the 2019 ACM Multimedia Conference
   5. ACM SIGDA Outstanding PhD Dissertation Award at the 2019 ACM/IEEE Design Automation Conference
diff --git a/doxygen/releases/release-2.4.0.dox b/doxygen/releases/release-2.4.0.dox
index f61d73289..30aca449e 100644
--- a/doxygen/releases/release-2.4.0.dox
+++ b/doxygen/releases/release-2.4.0.dox
@@ -14,7 +14,7 @@ Cpp-Taskflow 2.4.0 can be downloaded from webpage for %Taskflow!
 
 @section release-2-6-0_bug_fixes Bug Fixes
 
-@li fixed the bug of iteratively detaching a subflow from a run loop or a condition loop (see @ref DetachASubflow)
+@li fixed the bug of iteratively detaching a subflow from a run loop or a condition loop
 @li fixed the bug of conflict macro with boost (#184)
 
 @section release-2-6-0_deprecated_items Deprecated Items
diff --git a/doxygen/releases/release-3.0.0.dox b/doxygen/releases/release-3.0.0.dox
index 5d1d1496d..673bfe6b5 100644
--- a/doxygen/releases/release-3.0.0.dox
+++ b/doxygen/releases/release-3.0.0.dox
@@ -6,7 +6,7 @@ namespace tf {
 This release includes several new changes such as CPU-GPU tasking,
 algorithm collection, enhanced web-based profiler, documentation, and unit tests.
 
-@note
+@attention
 Starting from v3, we have migrated the codebase to the @CPP17 standard
 to largely improve the expressivity and efficiency of the codebase.
 
@@ -52,7 +52,7 @@ To use %Taskflow v3.0.0, you need a compiler that supports C++17:
 
 @subsection release-3-0-0_cudaflow cudaFlow
 
-@li added tf::cudaFlowCapturer for building a %cudaFlow through stream capture (see @ref GPUTaskingcudaFlowCapturer)
+@li added tf::cudaFlowCapturer for building a %cudaFlow through stream capture
 @li added tf::cudaFlowCapturerBase for creating custom capturers
 @li added tf::cudaFlow::capture for capturing a %cudaFlow within a parent %cudaFlow
 @li added tf::Taskflow::emplace_on to place a %cudaFlow on a GPU
@@ -83,8 +83,8 @@ To use %Taskflow v3.0.0, you need a compiler that supports C++17:
 
 @subsection release-3-0-0_gpu_algorithms GPU Algorithms
 
-@li added single task (see @ref SingleTaskCUDA)
-@li added parallel iterations (see @ref ForEachCUDA)
+@li added single task
+@li added parallel iterations
 @li added parallel transforms
 @li added parallel reduction
 
@@ -114,13 +114,9 @@ to support cancellation (see @ref AsyncTasking and @ref RequestCancellation)
 @li added @ref BenchmarkTaskflow
 @li added @ref LimitTheMaximumConcurrency
 @li added @ref AsyncTasking
-@li added @ref GPUTaskingcudaFlowCapturer
+@li added @ref GPUTasking
 @li added @ref RequestCancellation
 @li added @ref Profiler
-@li added @ref cudaFlowAlgorithms
-  + @ref SingleTaskCUDA to run a kernel function in just a single thread
-  + @ref ForEachCUDA to perform parallel iterations over a range of items
-  + @ref ParallelTransformsCUDA to perform parallel transforms over a range of items
 @li added @ref Governance
   + @ref rules
   + @ref team
diff --git a/doxygen/releases/release-3.1.0.dox b/doxygen/releases/release-3.1.0.dox
index 3caad64a6..73b102cd9 100644
--- a/doxygen/releases/release-3.1.0.dox
+++ b/doxygen/releases/release-3.1.0.dox
@@ -85,8 +85,6 @@ There are no deprecated or removed items in this release.
 @section release-3-1-0_documentation Documentation
 
 + added @ref QueryTheWorkerID to the cookbook page @ref ExecuteTaskflow
-+ revised update methods in @ref GPUTaskingcudaFlow
-+ revised rebind methods in @ref GPUTaskingcudaFlowCapturer
 
 @section release-3-1-0_miscellaneous_items Miscellaneous Items
 
diff --git a/doxygen/releases/release-3.10.0.dox b/doxygen/releases/release-3.10.0.dox
new file mode 100644
index 000000000..32d3e3e2b
--- /dev/null
+++ b/doxygen/releases/release-3.10.0.dox
@@ -0,0 +1,185 @@
+namespace tf {
+
+/** @page release-3-10-0 Release 3.10.0 (2025/05/01)
+
+@tableofcontents
+
+@section release-3-10-0_summary Release Summary
+
+This release improves scheduling performance through optimized work-stealing threshold tuning and a constrained decentralized buffer.
+It also introduces index-range-based parallel-for and parallel-reduction algorithms and modifies subflow tasking behavior to significantly enhance the performance of recursive parallelism.
+
+@section release-3-10-0_download Download
+
+%Taskflow 3.10.0 can be downloaded from here.
+
+@section release-3-10-0_system_requirements System Requirements
+
+To use %Taskflow v3.10.0, you need a compiler that supports C++17:
+
+@li GNU C++ Compiler at least v8.4 with -std=c++17
+@li Clang C++ Compiler at least v6.0 with -std=c++17
+@li Microsoft Visual Studio at least v19.27 with /std:c++17
+@li Apple Clang Xcode Version at least v12.0 with -std=c++17
+@li Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
+@li Intel C++ Compiler at least v19.0.1 with -std=c++17
+@li Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17
+
+%Taskflow works on Linux, Windows, and Mac OS X.
+
+@attention
+Although %Taskflow supports primarily C++17, you can enable C++20 compilation
+through `-std=c++20` to achieve better performance due to new C++20 features.
+
+
+@section release-3-10-0_new_features New Features
+
+@subsection release-3-10-0_taskflow_core Taskflow Core
+
++ optimized work-stealing loop with an adaptive breaking strategy
++ optimized shut-down signal detection using decentralized variables
++ optimized the memory layout of a node by combining successors and predecessors together
++ changed the default notifier to use the atomic notification algorithm under C++20
++ added debug mode for the Windows CI to GitHub actions
++ added index range-based parallel-for algorithm ([#551](https://github.com/taskflow/taskflow/issues/551))
+
+@code{.cpp}
+// initialize data1 and data2 to 10 using two different approaches
+std::vector<int> data1(100), data2(100);
+
+// Approach 1: initialize data1 using explicit index range
+taskflow.for_each_index(0, 100, 1, [&](int i){ data1[i] = 10; });
+
+// Approach 2: initialize data2 using tf::IndexRange
+tf::IndexRange<int> range(0, 100, 1);
+taskflow.for_each_by_index(range, [&](tf::IndexRange<int>& subrange){
+  for(int i=subrange.begin(); i<subrange.end(); i+=subrange.step_size()) {
+    data2[i] = 10;
+  }
+});
+@endcode
+
++ added index range-based parallel-reduction algorithm
+
+@code{.cpp}
+std::vector<double> data(100000);
+double res = 1.0;
+taskflow.reduce_by_index(
+  // index range
+  tf::IndexRange<size_t>(0, N, 1),
+  // final result
+  res,
+  // local reducer
+  [&](tf::IndexRange<size_t> subrange, std::optional<double> running_total) {
+    double residual = running_total ? *running_total : 0.0;
+    for(size_t i=subrange.begin(); i<subrange.end(); i+=subrange.step_size()) {
+      residual += data[i];
+    }
+    return residual;
+  },
+  // global reducer
+  std::plus<double>()
+);
+@endcode
+
++ added `static` keyword to the executor creation in taskflow benchmarks
++ added waiter test to detect over-subscription issues
++ added tf::Executor::num_waiters (C++20 only) for querying the number of non-stealing workers
++ added tf::make_module_task to the algorithm collection (see @ref ModuleAlgorithm)
++ added tf::Runtime::is_cancelled to query if the parent taskflow is cancelled
++ added tf::Runtime to async tasking to simplify designs of recursive parallelism (see @ref RuntimeTasking)
+
+@subsection release-3-10-0_utilities Utilities
+
++ added tf::IndexRange for index range-based parallel-for algorithm
++ added tf::distance to calculate the number of iterations in an index range
++ added tf::is_index_range_invalid to check if the given index range is valid
+
+@section release-3-10-0_bug_fixes Bug Fixes
+
++ fixed the compilation error of CLI11 due to version incompatibility ([#672](https://github.com/taskflow/taskflow/issues/672))
++ fixed the compilation error of template deduction on packaged_task ([#657](https://github.com/taskflow/taskflow/issues/657))
++ fixed the MSVC compilation error due to macro clash with std::min and std::max ([#670](https://github.com/taskflow/taskflow/issues/670))
++ fixed the runtime error due to the use of latch in tf::Executor::Executor ([#667](https://github.com/taskflow/taskflow/issues/667))
++ fixed the compilation error due to incorrect const qualifier used in algorithms ([#673](https://github.com/taskflow/taskflow/issues/673))
++ fixed the TSAN error when using find-if algorithm tasks with closure wrapper ([#675](https://github.com/taskflow/taskflow/issues/675))
++ fixed the task trait bug that incorrectly detected subflow and runtime tasks ([#679](https://github.com/taskflow/taskflow/issues/679))
++ fixed the infinite steal caused by incorrect `num_empty_steals` ([#681](https://github.com/taskflow/taskflow/issues/681))
+
+@section release-3-10-0_breaking_changes Breaking Changes
+
++ corrected the terminology by replacing 'dependents' with 'predecessors'
+  + tf::Task::num_predecessors (previously tf::Task::num_dependents)
+  + tf::Task::for_each_predecessor (previously tf::Task::for_each_dependent)
+  + tf::Task::num_strong_dependencies (previously tf::Task::num_strong_dependents)
+  + tf::Task::num_weak_dependencies (previously tf::Task::num_weak_dependents)
++ disabled the support for tf::Subflow::detach due to multiple intricate and unresolved issues:
+  + the execution logic of detached subflows is inherently difficult to reason about
+  + detached subflows can incur excessive memory consumption, especially in recursive workloads
+  + detached subflows lack a safe manner of life-cycle control and graph cleanup
+  + detached subflows have limited practical benefits for most use cases
+  + detached subflows can be re-implemented using taskflow composition
++ changed the default behavior of tf::Subflow to no longer retain its task graph after join
+  + default retention can incur a significant memory consumption problem ([#674](https://github.com/taskflow/taskflow/issues/674))
+  + users must explicitly call tf::Subflow::retain to retain a subflow after join
+
+@code{.cpp}
+tf::Taskflow taskflow;
+tf::Executor executor;
+
+taskflow.emplace([&](tf::Subflow& sf){
+  sf.retain(true);  // retain the subflow after join for visualization
+  auto A = sf.emplace([](){ std::cout << "A\n"; });
+  auto B = sf.emplace([](){ std::cout << "B\n"; });
+  auto C = sf.emplace([](){ std::cout << "C\n"; });
+  A.precede(B, C);  // A runs before B and C
+});  // subflow implicitly joins here
+
+executor.run(taskflow).wait();
+
+// The subflow graph is now retained and can be visualized using taskflow.dump(...)
+taskflow.dump(std::cout);
+@endcode
+
++ disabled the support for tf::cudaFlow and tf::cudaFlowCapturer
+  + introduced a cleaner interface tf::cudaGraph directly atop @cudaGraph (see @ref GPUTasking)
+  + tf::cudaGraph has a similar interface to tf::cudaFlow, and existing code can be changed as follows:
+
+@code{.cpp}
+// programming tf::cudaGraph is consistent with Nvidia CUDA Graph but offers a simpler
+// and more intuitive interface by abstracting away low-level CUDA Graph boilerplate.
+tf::cudaGraph cg;
+cg.kernel(...);  // same as cudaFlow/cudaFlowCapturer
+
+// unlike cudaFlow/cudaFlowCapturer, you need to explicitly instantiate an executable
+// CUDA graph now and submit it to a stream for execution
+tf::cudaGraphExec exec(cg);
+tf::cudaStream stream;
+stream.run(exec).synchronize();
+@endcode
+
+@section release-3-10-0_documentation Documentation
+
++ added @ref ModuleAlgorithm
++ revised @ref SubflowTasking
++ revised @ref AsyncTasking
++ revised @ref RuntimeTasking
++ revised @ref Executor
++ revised @ref ParallelIterations
++ revised @ref ParallelReduction
++ revised @ref ParallelFind
++ revised @ref fibonacci
+
+
+@section release-3-10-0_miscellaneous_items Miscellaneous Items
+
+If you are interested in collaborating with us on applying %Taskflow to your projects, please feel free to reach out to @twhuang!
+
+*/
+
+}
+
+
diff --git a/doxygen/releases/release-3.11.0.dox b/doxygen/releases/release-3.11.0.dox
new file mode 100644
index 000000000..45fd8eabb
--- /dev/null
+++ b/doxygen/releases/release-3.11.0.dox
@@ -0,0 +1,78 @@
+namespace tf {
+
+/** @page release-3-11-0 Release 3.11.0 (Master)
+
+%Taskflow 3.11.0 is the newest development line for new features and improvements
+we continue to support.
+It is also where this documentation is generated.
+Many things are considered @em experimental and may change or break from time to time.
+While it may be difficult to keep all things consistent when introducing new features,
+we continue to try our best to ensure backward compatibility.
+
+@tableofcontents
+
+@section release-3-11-0_download Download
+
+To download the newest version of %Taskflow, please clone the master branch
+from %Taskflow's GitHub.
+
+@section release-3-11-0_system_requirements System Requirements
+
+To use %Taskflow v3.11.0, you need a compiler that supports C++17:
+
+@li GNU C++ Compiler at least v8.4 with -std=c++17
+@li Clang C++ Compiler at least v6.0 with -std=c++17
+@li Microsoft Visual Studio at least v19.27 with /std:c++17
+@li Apple Clang Xcode Version at least v12.0 with -std=c++17
+@li Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
+@li Intel C++ Compiler at least v19.0.1 with -std=c++17
+@li Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17
+
+%Taskflow works on Linux, Windows, and Mac OS X.
+
+@attention
+Although %Taskflow supports primarily C++17, you can enable C++20 compilation
+through `-std=c++20` to achieve better performance due to new C++20 features.
+
+@section release-3-11-0_summary Release Summary
+
+@section release-3-11-0_new_features New Features
+
+@subsection release-3-11-0_taskflow_core Taskflow Core
+
++ added `examples/task_visitor.cpp` to demonstrate how to traverse a taskflow ([#699](https://github.com/taskflow/taskflow/issues/699))
++ added five benchmarks to showcase the capability of tf::Runtime
+  + fibonacci
+  + skynet
+  + integrate
+  + nqueens
+  + primes
+
+@subsection release-3-11-0_utilities Utilities
+
+@section release-3-11-0_bug_fixes Bug Fixes
+
++ fixed missing exception on thread creation failure in tf::Executor ([#693](https://github.com/taskflow/taskflow/issues/693))
++ fixed segmentation fault caused by empty async dependency ([#700](https://github.com/taskflow/taskflow/issues/700))
+
+@section release-3-11-0_breaking_changes Breaking Changes
+
+@section release-3-11-0_documentation Documentation
+
++ revised @ref StaticTasking
++ revised @ref ConditionalTasking
++ revised @ref RuntimeTasking
++ revised @ref AsyncTasking
++ revised @ref DependentAsyncTasking
++ revised @ref ExceptionHandling
++ revised @ref RequestCancellation
+
+@section release-3-11-0_miscellaneous_items Miscellaneous Items
+
+If you are interested in collaborating with us on applying %Taskflow to your projects, please feel free to reach out to @twhuang!
+
+*/
+
+}
+
+
diff --git a/doxygen/releases/release-3.2.0.dox b/doxygen/releases/release-3.2.0.dox
index d1cbfff0b..db6791f09 100644
--- a/doxygen/releases/release-3.2.0.dox
+++ b/doxygen/releases/release-3.2.0.dox
@@ -144,13 +144,6 @@ There are no breaking changes in this release.
   + @ref MoveATaskflow
 @li revised @ref ExecuteTaskflow
   + @ref ExecuteATaskflowWithTransferredOwnership
-@li added @ref cudaFlowAlgorithms
-@li added @ref cudaStandardAlgorithms
-  + @ref CUDASTDExecutionPolicy
-  + @ref CUDASTDReduce
-  + @ref CUDASTDScan
-  + @ref CUDASTDMerge
-  + @ref CUDASTDFind
 
 @section release-3-2-0_miscellaneous_items Miscellaneous Items
 
diff --git a/doxygen/releases/release-3.3.0.dox b/doxygen/releases/release-3.3.0.dox
index a261bc23d..91eede10d 100644
--- a/doxygen/releases/release-3.3.0.dox
+++ b/doxygen/releases/release-3.3.0.dox
@@ -6,7 +6,7 @@ namespace tf {
 This release includes several new changes, such as sanitized data race,
 pipeline parallelism, documentation, and unit tests.
 
-@note
+@attention
 We highly recommend adopting %Taskflow v3.3 in your projects if possible.
 This release has resolved nearly all the potential data-race issues
 induced by incorrect memory order.
 
@@ -123,14 +123,11 @@ This release does not have any deprecated and removed items.
   + @ref CreateACustomComposableGraph
 + Revised @ref ConditionalTasking
   + @ref CreateAMultiConditionTask
-+ Revised @ref GPUTaskingcudaFlow
-+ Revised @ref GPUTaskingcudaFlowCapturer
++ Revised @ref GPUTasking
 + Revised @ref LimitTheMaximumConcurrency
   + @ref DefineAConflictGraph
 + Revised @ref ParallelSort to add header-include information
 + Revised @ref ParallelReduction to add header-include information
-+ Revised @ref cudaFlowAlgorithms to add header-include information
-+ Revised @ref cudaStandardAlgorithms to add header-include information
 + Added @ref RuntimeTasking
 + Added @ref ParallelTransforms
 + Added @ref TaskParallelPipeline
diff --git a/doxygen/releases/release-3.4.0.dox b/doxygen/releases/release-3.4.0.dox
index 9bd77810d..5959e4646 100644
--- a/doxygen/releases/release-3.4.0.dox
+++ b/doxygen/releases/release-3.4.0.dox
@@ -78,7 +78,6 @@ There are no deprecated items in this release.
 + Revised @ref ExecuteTaskflow
   + Added @ref ExecuteATaskflowFromAnInternalWorker
-+ Revised @ref CUDASTDExecutionPolicy
 + Revised @ref TaskParallelPipeline
   + Added @ref TaskParallelPipelineLearnMore
 + Revised @ref Examples
diff --git a/doxygen/releases/release-3.5.0.dox b/doxygen/releases/release-3.5.0.dox
index 21ffb287c..7a457bb1d 100644
--- a/doxygen/releases/release-3.5.0.dox
+++ b/doxygen/releases/release-3.5.0.dox
@@ -40,8 +40,6 @@ and adds a new text-based feature for profiler report.
 + Added tf::Executor::loop_until to allow looping a worker with a custom stop predicate
 + Added tf::DataPipeline to implement data-parallel algorithms
   + See @ref DataParallelPipeline
-+ Extended tf::TaskQueue to include priority (tf::TaskPriority)
-  + See @ref PrioritizedTasking
 + Extended tf::Executor to include tf::WorkerInterface
 + Improved parallel algorithms (e.g., tf::Taskflow::for_each) with tail optimization
 + Resolved the busy-waiting problem in our work-stealing algorithm ([#440](https://github.com/taskflow/taskflow/pull/440))
@@ -81,7 +79,6 @@ This release has no deprecated and removed items.
 + Revised @ref ExecuteTaskflow
   + Added @ref ExecuteATaskflowFromAnInternalWorker
-+ Added @ref PrioritizedTasking
 + Added @ref DataParallelPipeline
 
 @section release-3-5-0_miscellaneous_items Miscellaneous Items
 
diff --git a/doxygen/releases/release-3.6.0.dox b/doxygen/releases/release-3.6.0.dox
index bea2943f1..695587de9 100644
--- a/doxygen/releases/release-3.6.0.dox
+++ b/doxygen/releases/release-3.6.0.dox
@@ -210,11 +210,6 @@ executor.async("name", [](){});
   + @ref ParallelIterations
   + @ref ParallelTransforms
   + @ref ParallelReduction
-+ Revised CUDA standard algorithms to correct the use of buffer query methods
-  + @ref CUDASTDReduce
-  + @ref CUDASTDFind
-  + @ref CUDASTDMerge
-  + @ref CUDASTDScan
 + Added @ref TaskParallelPipelineWithTokenDependencies
 + Added @ref ParallelScan
 + Added @ref DependentAsyncTasking
diff --git a/doxygen/releases/release-3.7.0.dox b/doxygen/releases/release-3.7.0.dox
index 0d42c6e0e..3b0652cb2 100644
--- a/doxygen/releases/release-3.7.0.dox
+++ b/doxygen/releases/release-3.7.0.dox
@@ -1,20 +1,16 @@
 namespace tf {
 
-/** @page release-3-7-0 Release 3.7.0 (Master)
+/** @page release-3-7-0 Release 3.7.0 (2024/05/07)
 
-%Taskflow 3.7.0 is the newest developing line to new features and improvements
-we continue to support.
-It is also where this documentation is generated.
-Many things are considered @em experimental and may change or break from time to time.
-While it may be difficult to be keep all things consistent when introducing new features,
-we continue to try our best to ensure backward compatibility.
+%Taskflow 3.7.0 is the 8th release in the 3.x line!
+This release includes several new changes, such as exception support, improved scheduling algorithms,
+documentation, examples, and unit tests.
 
 @tableofcontents
 
 @section release-3-7-0_download Download
 
-To download the newest version of %Taskflow, please clone the master branch
-from %Taskflow's GitHub.
+%Taskflow 3.7.0 can be downloaded from here.
 
 @section release-3-7-0_system_requirements System Requirements
 
@@ -26,7 +22,7 @@ To use %Taskflow v3.7.0, you need a compiler that supports C++17:
 
 @li AppleClang Xcode Version at least v12.0 with -std=c++17
 @li Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
 @li Intel C++ Compiler at least v19.0.1 with -std=c++17
-@li Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 and SYCL20
+@li Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17
 
 %Taskflow works on Linux, Windows, and Mac OS X.
 
@@ -34,8 +30,6 @@
 This release introduces a new exception interface to help identify C++ errors
 in taskflow programs.
-Additionally, this release enhances the scheduling performance through integration
-of C++20 atomic-wait into scheduler, executor, and notifier.
 
 @section release-3-7-0_new_features New Features
 
diff --git a/doxygen/releases/release-3.8.0.dox b/doxygen/releases/release-3.8.0.dox
new file mode 100644
index 000000000..b80c8a73b
--- /dev/null
+++ b/doxygen/releases/release-3.8.0.dox
@@ -0,0 +1,99 @@
+namespace tf {
+
+/** @page release-3-8-0 Release 3.8.0 (2024/10/02)
+
+@tableofcontents
+
+@section release-3-8-0_summary Release Summary
+
+This release (1) enhances the scheduling performance through C++20 atomic notification
+and a bounded queue strategy, and (2) revises the semaphore model for better runtime control.
+
+@section release-3-8-0_download Download
+
+%Taskflow 3.8.0 can be downloaded from here.
+
+@section release-3-8-0_system_requirements System Requirements
+
+To use %Taskflow v3.8.0, you need a compiler that supports C++17:
+
+@li GNU C++ Compiler at least v8.4 with -std=c++17
+@li Clang C++ Compiler at least v6.0 with -std=c++17
+@li Microsoft Visual Studio at least v19.27 with /std:c++17
+@li AppleClang Xcode Version at least v12.0 with -std=c++17
+@li Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17
+@li Intel C++ Compiler at least v19.0.1 with -std=c++17
+@li Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17
+
+%Taskflow works on Linux, Windows, and Mac OS X.
+
+@attention
+Although %Taskflow supports primarily C++17, you can enable C++20 compilation
+through `-std=c++20` to achieve better performance due to new C++20 features.
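+
+As a quick illustration (a sketch for CMake-based builds; the `build` directory
+name is illustrative and not part of %Taskflow), the C++20 mode can be enabled
+with a single configuration flag:
+
+@code{.bash}
+# configure and build a CMake project with C++20 enabled
+~$ cmake -S . -B build -DCMAKE_CXX_STANDARD=20
+~$ cmake --build build
+@endcode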
+ +@section release-3-8-0_new_features New Features + +@subsection release-3-8-0_taskflow_core Taskflow Core + ++ Enhanced the core scheduling algorithm using a new bounded queue strategy ++ Enhanced the core scheduling performance using C++20 atomic notification + +@code{.bash} +# compile your taskflow program with C++20 enabled +~$ g++ -std=c++20 my_taskflow.cpp +@endcode + ++ Revised the semaphore programming model for better runtime control through tf::Runtime + +@code{.cpp} +tf::Executor executor(8); // create an executor of 8 workers +tf::Taskflow taskflow; +tf::Semaphore semaphore(1); // create a semaphore with initial count 1 +for(size_t i=0; i<1000; i++) { + taskflow.emplace([&](tf::Runtime& rt){ + rt.acquire(semaphore); + std::cout << "critical section here (one worker here only)\n"; + critical_section(); + rt.release(semaphore); + }); +} +executor.run(taskflow).wait(); +@endcode + ++ Enhanced async-tasking performance through TLS ++ Added async-task benchmark ++ Added non-blocking notifier and atomic notifier modules ++ Added tf::BoundedTaskQueue and tf::UnboundedTaskQueue ++ Added tf::Freelist module to replace the centralized overflow queue ++ Removed the redundant exception handling in object pool + +@subsection release-3-8-0_utilities Utilities + +@section release-3-8-0_bug_fixes Bug Fixes + ++ Fixed the compilation error for not finding the C++ atomic library ++ Fixed the missing tf::Runtime in asynchronous tasking ++ Fixed the non-heterogeneity of tf::Taskflow::for_each_index ++ Fixed the bug of UUID unit test in a multithreaded environment + +@section release-3-8-0_breaking_changes Breaking Changes + ++ Removed the support of object pool by default ++ Removed the support of prioritized tasking due to inconsistency with work stealing + +@section release-3-8-0_documentation Documentation + ++ Revised @ref LimitTheMaximumConcurrency ++ Removed Prioritized Tasking ++ Fixed typos in multiple pages + +@section release-3-8-0_miscellaneous_items Miscellaneous Items + +Please do not hesitate to contact @twhuang if you intend to collaborate with us +on using %Taskflow in your scientific computing projects. + +*/ + +} + + diff --git a/doxygen/releases/release-3.9.0.dox b/doxygen/releases/release-3.9.0.dox new file mode 100644 index 000000000..a24ba9634 --- /dev/null +++ b/doxygen/releases/release-3.9.0.dox @@ -0,0 +1,104 @@ +namespace tf { + +/** @page release-3-9-0 Release 3.9.0 (2025/01/02) + +@tableofcontents + +@section release-3-9-0_summary Release Summary + +This release improves scheduling performance with a decentralized work-stealing strategy +and enhances exception handling across all task types. + +@section release-3-9-0_download Download + +%Taskflow 3.9.0 can be downloaded from here. + +@section release-3-9-0_system_requirements System Requirements + +To use %Taskflow v3.9.0, you need a compiler that supports C++17: + +@li GNU C++ Compiler at least v8.4 with -std=c++17 +@li Clang C++ Compiler at least v6.0 with -std=c++17 +@li Microsoft Visual Studio at least v19.27 with /std:c++17 +@li AppleClang Xcode Version at least v12.0 with -std=c++17 +@li Nvidia CUDA Toolkit and Compiler (nvcc) at least v11.1 with -std=c++17 +@li Intel C++ Compiler at least v19.0.1 with -std=c++17 +@li Intel DPC++ Clang Compiler at least v13.0.0 with -std=c++17 + +%Taskflow works on Linux, Windows, and Mac OS X. + +@attention +Although %Taskflow supports primarily C++17, you can enable C++20 compilation +through `-std=c++20` to achieve better performance due to new C++20 features. 
+
+@section release-3-9-0_new_features New Features
+
+@subsection release-3-9-0_taskflow_core Taskflow Core
+
++ improved the core scheduling algorithm using a decentralized work-stealing strategy
+  + tf::BoundedTaskQueue to optimize per-thread work-stealing latency
+  + tf::UnboundedTaskQueue to handle overflowed tasks
++ enhanced tf::Runtime to support preemptible execution flows
++ optimized task storage by storing detached tasks in their original subflows
++ optimized the query efficiency for strong dependencies by embedding their values in node states
++ updated tf::Graph to derive from a vector of unique pointers to nodes
+  + %Graph node lifetimes are managed by std::unique_ptr
+  + Asynchronous task node lifetimes are managed by tf::Executor
++ expanded unit tests to include more exception handling scenarios
++ decoupled tf::Runtime from static task to accommodate distinct execution logic
++ removed the blocking behavior to avoid underutilized threads for the following tasks:
+  + module task ([#649](https://github.com/taskflow/taskflow/issues/649))
+  + subflow task
+  + all parallel algorithms (through preemptible async tasks)
++ removed std::bind from asynchronous tasks to ensure proper constexpr switch
++ added compile-time macros to enable specific features
+  + `TF_ENABLE_TASK_POOL` to enable the use of the task pool
++ added taskflow execution through asynchronous tasking with tf::make_module_task
+  + details can be found in @ref ModuleAlgorithm
++ added tf::WorkerInterface for users to configure the behaviors of workers
+  + details can be found in @ref ExecuteTaskflow
++ added worker interface example and unit tests
+
+@subsection release-3-9-0_utilities Utilities
+
++ added @c tf::pause to relax the CPU during a busy-spinning loop
++ added @c tf::seed to generate a random seed based on the calling time point
++ added @c tf::atomic_min to update an atomic variable with the minimum value
++ added @c tf::atomic_max to update an atomic variable with the maximum value
++ added @c TF_CPP20 and @c TF_CPP17 macros for testing C++ versions
+
+@section release-3-9-0_bug_fixes Bug Fixes
+
++ fixed AppleClang compile error in tsq.hpp ([#651](https://github.com/taskflow/taskflow/pull/651))
++ fixed wrong range in uuid test ([#632](https://github.com/taskflow/taskflow/pull/632/))
++ fixed the exception bug in tf::Subflow::join ([#602](https://github.com/taskflow/taskflow/issues/602))
++ fixed the wrong target prefix when running benchmark.py
++ fixed a bug in the join counter reset logic for scheduling condition tasks ([#652](https://github.com/taskflow/taskflow/issues/652))
+
+@section release-3-9-0_breaking_changes Breaking Changes
+
++ decoupled tf::Subflow from inheriting tf::Runtime to accommodate distinct execution logic
+  + tf::Subflow no longer supports tf::Runtime-specific features
++ removed tf::Runtime::corun_until as it duplicates tf::Executor::corun_until
++ removed tf::Runtime-based semaphore interface due to significant flaws of blocking corun ([#647](https://github.com/taskflow/taskflow/issues/647))
+  + details can be found in @ref LimitTheMaximumConcurrency
+
+@section release-3-9-0_documentation Documentation
+
++ fixed missing documentation of tf::Executor due to Doxygen bugs ([#625](https://github.com/taskflow/taskflow/pull/625))
++ fixed benchmark instance names in documentation ([#621](https://github.com/taskflow/taskflow/pull/621))
++ revised @ref ExceptionHandling
++ revised @ref AsyncTasking
++ revised @ref LimitTheMaximumConcurrency
++ added @ref ModuleAlgorithm
+
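+To make the module-task addition above concrete, the following sketch launches
+two pre-built taskflows as dependent asynchronous tasks (`tfA` and `tfB` are
+illustrative names; see @ref ModuleAlgorithm for the full interface):
+
+@code{.cpp}
+tf::Executor executor;
+tf::Taskflow tfA, tfB;  // assume both taskflows are populated elsewhere
+
+// run tfA to completion and then tfB, using module tasks on the executor
+auto a = executor.silent_dependent_async(tf::make_module_task(tfA));
+auto b = executor.silent_dependent_async(tf::make_module_task(tfB), a);
+executor.wait_for_all();
+@endcode
+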
+@section release-3-9-0_miscellaneous_items Miscellaneous Items
+
+Please do not hesitate to contact @twhuang if you intend to collaborate with us
+on using %Taskflow in your scientific computing projects.
+
+*/
+
+}
+
+
diff --git a/doxygen/releases/release-roadmap.dox b/doxygen/releases/release-roadmap.dox
index 2411bec75..8dc71d250 100644
--- a/doxygen/releases/release-roadmap.dox
+++ b/doxygen/releases/release-roadmap.dox
@@ -16,22 +16,22 @@ Each milestone releases technical items that significantly enhance
 the capability of %Taskflow.
 
-| Milestone | Release | Time of Arrival |
-| :-: | :-: | :-: |
-| Migrate the codebase to C++20 | v4.x | (under progress) |
-| Design a custom thread-creation interface | TBD | (under progress) |
-| Design a distributed tasking interface with scheduling | TBD | (under progress) |
-| Design a pipeline scheduling framework with token dependency | v3.x | (under progress) |
-| Design a dynamic task graph model | v3.6 | 2023/05/08 (done) |
-| Design a pipeline scheduling framework | v3.3 | 2022/01/03 (done) |
-| Integrate thread sanitizer into the CI | v3.3 | 2022/01/03 (done) |
-| Integrate OpenCL and SYCL to tf::syclFlow | v3.1 | 2021/04/14 (done) |
-| Integrate @cuBLAS into tf::cudaFlow | v3.0 | 2020/01/01 (done) |
-| Support building %cudaFlow through stream capture | v3.0 | 2021/01/01 (done) |
-| Support profiling large data in tfprof | v3.0 | 2021/01/01 (done) |
-| Support cancelling %Taskflow | v3.0 | 2021/01/01 (done) |
-| Support limiting maximum concurrency | v3.0 | 2021/01/01 (done) |
-| Migrate the codebase to C++17 | v3.0 | 2021/01/01 (done) |
+| Milestone | Release |
+| :-: | :-: |
+| Migrate the codebase to C++20 | v4.x |
+| Design a custom thread-creation interface | TBD |
+| Design a distributed tasking interface with scheduling | TBD |
+| Design a pipeline scheduling framework with token dependency | @ref release-3-7-0 |
+| Design a dynamic task graph model | @ref release-3-6-0 |
+| Design a pipeline scheduling framework | @ref release-3-3-0 |
+| Integrate thread sanitizer into the CI | @ref release-3-3-0 |
+| Integrate OpenCL and SYCL to tf::syclFlow | @ref release-3-1-0 |
+| Integrate @cuBLAS into tf::cudaFlow | @ref release-3-0-0 |
+| Support building %cudaFlow through stream capture | @ref release-3-0-0 |
+| Support profiling large data in tfprof | @ref release-3-0-0 |
+| Support cancelling %Taskflow | @ref release-3-0-0 |
+| Support limiting maximum concurrency | @ref release-3-0-0 |
+| Migrate the codebase to C++17 | @ref release-3-0-0 |
      Along with the project development, we expect to have multiple releases diff --git a/doxygen/releases/releases.dox b/doxygen/releases/releases.dox index ae7526d53..e2ca0ec69 100644 --- a/doxygen/releases/releases.dox +++ b/doxygen/releases/releases.dox @@ -14,6 +14,10 @@ namespace tf { All releases are available in @ProjectGitHub. + @subpage release-roadmap + + @subpage release-3-11-0 + + @subpage release-3-10-0 + + @subpage release-3-9-0 + + @subpage release-3-8-0 + @subpage release-3-7-0 + @subpage release-3-6-0 + @subpage release-3-5-0 diff --git a/doxygen/sycl_algorithms/sycl_algorithms.dox b/doxygen/sycl_algorithms/sycl_algorithms.dox deleted file mode 100644 index 27990cba4..000000000 --- a/doxygen/sycl_algorithms/sycl_algorithms.dox +++ /dev/null @@ -1,15 +0,0 @@ -namespace tf { - -/** @page syclFlowAlgorithms syclFlow Algorithms - - tf::syclFlow provides several template methods for users to - quickly express common parallel algorithms. - - + @subpage SingleTaskSYCL - + @subpage ForEachSYCL - + @subpage SYCLReduce - + @subpage ParallelTransformsSYCL - -*/ - -} diff --git a/doxygen/sycl_algorithms/sycl_for_each.dox b/doxygen/sycl_algorithms/sycl_for_each.dox deleted file mode 100644 index 4d52d86d4..000000000 --- a/doxygen/sycl_algorithms/sycl_for_each.dox +++ /dev/null @@ -1,83 +0,0 @@ -namespace tf { - -/** @page ForEachSYCL Parallel Iterations - -tf::syclFlow provides two template methods, -tf::syclFlow::for_each and tf::syclFlow::for_each_index, -for creating tasks to perform parallel iterations over a range of items. - -@tableofcontents - -@section ForEachSYCLIndexBasedParallelFor Index-based Parallel Iterations - -Index-based parallel-for performs parallel iterations over a range [first, last) with the given @c step size. -These indices must be @em integral type. -The task created by tf::syclFlow::for_each_index(I first, I last, I step, C&& callable) -represents a kernel of parallel execution -for the following loop: - -@code{.cpp} -// positive step: first, first+step, first+2*step, ... -for(auto i=first; ilast; i+=step) { - callable(i); -} -@endcode - -Each iteration @c i is independent of each other and is assigned one kernel thread -to run the callable. -The following example creates a kernel that assigns each element of @c gpu_data -to 1 over the range @c [0, 100) with step size 1. - -@code{.cpp} -taskflow.emplace_on([&](tf::syclFlow& sf){ - // ... create other gpu tasks - // assigns each element in gpu_data to 1 over the range [0, 100) with step size 1 - sf.for_each_index(0, 100, 1, [gpu_data] (int idx) { - gpu_data[idx] = 1; - }); -}, sycl_queue); -@endcode - -@section ForEachSYCLIteratorBasedParallelIterations Iterator-based Parallel Iterations - -Iterator-based parallel-for performs parallel iterations over a range specified -by two STL-styled iterators, @c first and @c last. -The task created by tf::syclFlow::for_each(I first, I last, C&& callable) represents -a parallel execution of the following loop: - -@code{.cpp} -for(auto i=first; i[gpu_data, gpu_data + 1000)
      . - -@code{.cpp} -taskflow.emplace_on([&](tf::syclFlow& cf){ - // ... create gpu tasks - // assigns each element to 1 over the range [gpu_data, gpu_data + 1000) - cf.for_each(gpu_data, gpu_data + 1000, [] (int& item) { - item = 1; - }); -}, sycl_queue); -@endcode - -Each iteration is independent of each other and is assigned one kernel thread -to run the callable. - -*/ -} - - - - - - diff --git a/doxygen/sycl_algorithms/sycl_reduce.dox b/doxygen/sycl_algorithms/sycl_reduce.dox deleted file mode 100644 index 7cfcafe99..000000000 --- a/doxygen/sycl_algorithms/sycl_reduce.dox +++ /dev/null @@ -1,97 +0,0 @@ -namespace tf { - -/** @page SYCLReduce Parallel Reduction - -tf::syclFlow provides two template methods, -tf::syclFlow::reduce and tf::syclFlow::uninitialized_reduce, -for creating tasks to perform parallel reductions over a range of items. - -@tableofcontents - -@section SYCLReduceItemsWithAnInitialValue Reduce Items with an Initial Value - -The reduction task created by -tf::syclFlow::reduce(I first, I last, T* result, C&& bop) performs -parallel reduction over a range of elements specified by [first, last) -using the binary operator @c bop and stores the reduced result in @c result. -It represents the parallel execution of the following reduction loop -on a SYCL device: - -@code{.cpp} -while (first != last) { - *result = op(*result, *first++); -} -@endcode - -The variable @c result participates in the reduction loop and must be initialized -with an initial value. -The following code performs a parallel reduction to sum all the numbers in -the given range with an initial value @c 1000: - -@code{.cpp} -const size_t N = 1000000; - -int* soln = sycl::malloc_shared(1); // solution -int* data = sycl::malloc_shared(N); // data - -std::for_each(data, data+N, [](int& v){ d = 1; }); -*soln = 1000; - -// create a syclflow to perform parallel reduction on a SYCL device -sycl::queue queue; -tf::syclFlow syclflow(queue); -syclflow.reduce(data, data+N, soln, [] (int a, int b) { return a + b; }); -syclflow.offload(); - -assert(sol == N + 1000); -@endcode - -@section SYCLReduceItemsWithoutAnInitialValue Reduce Items without an Initial Value - -You can use tf::syclFlow::uninitialized_reduce to perform parallel reduction -without any initial value. -This method represents a parallel execution of the following reduction loop -on a SYCL device that does not assum any initial value to reduce. - -@code{.cpp} -*result = *first++; // no initial values participate in the reduction loop -while (first != last) { - *result = op(*result, *first++); -} -@endcode - -The variable @c result is overwritten with the reduced value -and no initial values participate in the reduction loop. 
-The following code performs a parallel reduction to sum all the numbers in -the given range without any initial value: - -@code{.cpp} -const size_t N = 1000000; - -int* soln = sycl::malloc_shared(1); // solution -int* data = sycl::malloc_shared(N); // data - -std::for_each(data, data+N, [](int& v){ d = 1; }); -*soln = 1000; // no effect - -// create a syclflow to perform parallel reduction on a SYCL device -sycl::queue queue; -tf::syclFlow syclflow(queue); -syclflow.uninitialized_reduce( - data, data+N, soln, [] (int a, int b) { return a + b; } -); -syclflow.offload(); - -assert(sol == N); -@endcode - - - -*/ -} - - - - - - diff --git a/doxygen/sycl_algorithms/sycl_single_task.dox b/doxygen/sycl_algorithms/sycl_single_task.dox deleted file mode 100644 index f825d5fa1..000000000 --- a/doxygen/sycl_algorithms/sycl_single_task.dox +++ /dev/null @@ -1,46 +0,0 @@ -namespace tf { - -/** @page SingleTaskSYCL Single %Task - -tf::syclFlow provides a template method, tf::syclFlow::single_task, -for creating a task to run the -given callable using a single kernel thread. - -@tableofcontents - -@section SingleTaskSYCLSingleTask Run a Task with a Single Thread - -You can create a task to run a kernel function just once, i.e., -using one GPU thread. -This is handy when you want to set up a single or a few global variables -that do not need multiple threads and will be used by multiple -kernels afterwards. -The following example creates a single-task kernel that sets -@c gpu_variable to 1. - -@code{.cpp} -sycl::queue queue; -int* gpu_variable = sycl::malloc_shared(1, queue); - -tf::Task = taskflow.emplace_on([&] (tf::syclFlow& sf) { - // create a single task to set the gpu_variable to 1 - tf::syclTask set_var = sf.single_task( - [gpu_variable] () { *gpu_variable = 1; } - ); - // create one kernel task that needs access to gpu_variable - tf::syclTask kernel1 = sf.parallel_for( - sycl::range<1>(N), [=] (sycl::id<1> id) { data1[id] *= gpu_variable; } - ); - set_par.precede(kernel1); -}, queue); -@endcode - - -*/ -} - - - - - - diff --git a/doxygen/sycl_algorithms/sycl_transform.dox b/doxygen/sycl_algorithms/sycl_transform.dox deleted file mode 100644 index 00dacc99b..000000000 --- a/doxygen/sycl_algorithms/sycl_transform.dox +++ /dev/null @@ -1,56 +0,0 @@ -namespace tf { - -/** @page ParallelTransformsSYCL Parallel Transforms - -tf::syclFlow provides a template method, tf::syclFlow::transform, -for creating a task to perform parallel transforms by -applying the given function to a range of item -and stores the transformed result in another range. - -@tableofcontents - -@section IteratorBasedParallelTransformSYCL Iterator-based Parallel Transforms - -Iterator-based parallel-transform applies the given transform function to a range of items and store the result in another range specified -by two iterators, @c first and @c last. -The two iterators are typically two raw pointers to the -first element and the next to the last element in the range in GPU memory space. -The task created by tf::syclFlow::transform(I first, I last, C&& callable, S... srcs) -represents a kernel of parallel execution -for the following loop: - -@code{.cpp} -while (first != last) { - *first++ = callable(*src1++, *src2++, *src3++, ...); -} -@endcode - -The two iterators, @c first and @c last, are typically two raw pointers to the -first element and the next to the last element in the range. 
-The following example creates a @c transform kernel that assigns each element,
-starting from @c gpu_data to gpu_data + 1000,
-to the sum of the corresponding elements
-at @c gpu_data_x, @c gpu_data_y, and @c gpu_data_z.
-
-@code{.cpp}
-taskflow.emplace_on([](tf::syclFlow& sf){
-  // gpu_data[i] = gpu_data_x[i] + gpu_data_y[i] + gpu_data_z[i]
-  tf::syclTask task = sf.transform(
-    gpu_data, gpu_data + 1000,
-    [] (int xi, int yi, int zi) { return xi + yi + zi; },
-    gpu_data_x, gpu_data_y, gpu_data_z
-  );
-}, sycl_queue);
-@endcode
-
-Each iteration is independent of each other and is assigned one kernel thread
-to run the callable.
-
-*/
-}
-
-
-
-
-
-
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 6ab272fb8..90ee712a3 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -2,9 +2,9 @@ list(APPEND TF_EXAMPLES
   simple
   attach_data
   async
-  subflow_async
+  async_module
+  runtime_async
   dependent_async
-  dependent_async_algorithm
   observer
   subflow
   fibonacci
@@ -15,11 +15,10 @@ list(APPEND TF_EXAMPLES
   while_loop
   if_else
   nested_if_else
-  priority
   visualization
   parallel_for
   parallel_sort
-  reduce
+  parallel_reduce
   inclusive_scan
   exclusive_scan
   pipeline
@@ -37,12 +36,15 @@ list(APPEND TF_EXAMPLES
   limited_concurrency
   cancel
   exception
+  subflow_exception
+  worker_interface
+  task_visitor
 )
 
 foreach(example IN LISTS TF_EXAMPLES)
   add_executable(${example} ${example}.cpp)
   target_link_libraries(
-    ${example} ${PROJECT_NAME} tf::default_settings
+    ${example} ${PROJECT_NAME} ${ATOMIC_LIBRARY} tf::default_settings
   )
   # set emcc options
   if (CMAKE_SYSTEM_NAME STREQUAL Emscripten)
diff --git a/examples/async.cpp b/examples/async.cpp
index 54dfb7f5c..babd4dea2 100644
--- a/examples/async.cpp
+++ b/examples/async.cpp
@@ -8,13 +8,22 @@ int main() {
 
   // create asynchronous tasks from the executor
   // (using executor as a thread pool)
-  std::future<int> fu = executor.async([](){
-    std::cout << "async task 1 returns 1\n";
+  std::future<int> fu1 = executor.async([](){
+    std::cout << "async task returns 1\n";
     return 1;
   });
 
   executor.silent_async([](){  // silent async task doesn't return any future object
-    std::cout << "async task 2 does not return (silent)\n";
+    std::cout << "silent async does not return\n";
+  });
+
+  // create async tasks with runtime
+  std::future<void> fu2 = executor.async([](tf::Runtime& rt){
+    printf("async task with a runtime: %p\n", &rt);
+  });
+
+  executor.silent_async([](tf::Runtime& rt){
+    printf("silent async task with a runtime: %p\n", &rt);
   });
 
-  executor.wait_for_all();  // wait for the two async tasks to finish
+  executor.wait_for_all();  // wait for all async tasks to finish
 
@@ -25,15 +34,15 @@ int main() {
 
   std::atomic<int> counter {0};
 
-  taskflow.emplace([&](tf::Subflow& sf){
+  taskflow.emplace([&](tf::Runtime& rt){
     for(int i=0; i<100; i++) {
-      sf.silent_async([&](){ counter.fetch_add(1, std::memory_order_relaxed); });
+      rt.silent_async([&](){ counter.fetch_add(1, std::memory_order_relaxed); });
     }
-    sf.join();
+    rt.corun();
 
-    // when subflow joins, all spawned tasks from the subflow will finish
+    // when corun returns, all async tasks spawned from the runtime will finish
     if(counter == 100) {
-      std::cout << "async tasks spawned from the subflow all finish\n";
+      std::cout << "async tasks spawned from the runtime all finish\n";
     }
     else {
       throw std::runtime_error("this should not happen");
@@ -41,14 +50,9 @@
   });
 
   executor.run(taskflow).wait();
-  
+
   return 0;
 }
-
-
-
-
-
diff --git a/examples/async_module.cpp b/examples/async_module.cpp
new file mode 100644
index 000000000..73cd84f00
--- /dev/null
+++ b/examples/async_module.cpp
@@ -0,0 +1,37 @@
+// This program demonstrates how to launch taskflows using asynchronous
tasking. + +#include +#include + +int main() { + + tf::Executor executor; + + tf::Taskflow A; + tf::Taskflow B; + tf::Taskflow C; + tf::Taskflow D; + + A.emplace([](){ printf("Taskflow A\n"); }); + B.emplace([](){ printf("Taskflow B\n"); }); + C.emplace([](){ printf("Taskflow C\n"); }); + D.emplace([](){ printf("Taskflow D\n"); }); + + // launch the four taskflows using async + printf("launching four taskflows using async ...\n"); + executor.async(tf::make_module_task(A)); + executor.async(tf::make_module_task(B)); + executor.async(tf::make_module_task(C)); + executor.async(tf::make_module_task(D)); + executor.wait_for_all(); + + // launch four taskflows with dependencies + printf("launching four taskflows using dependent async ...\n"); + auto TA = executor.silent_dependent_async(tf::make_module_task(A)); + auto TB = executor.silent_dependent_async(tf::make_module_task(B), TA); + auto TC = executor.silent_dependent_async(tf::make_module_task(C), TB); + auto [TD, FD] = executor.dependent_async(tf::make_module_task(D), TC); + FD.get(); + + return 0; +} diff --git a/examples/corun.cpp b/examples/corun.cpp index 124219c68..a5b2b1bb6 100644 --- a/examples/corun.cpp +++ b/examples/corun.cpp @@ -1,5 +1,5 @@ -// This example demonstrates how to use the corun -// method in the executor. +// This example demonstrates how to use the corun method from a running worker +// of an executor to avoid deadlock. #include int main(){ diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt index 3a448a7a4..c8b392e31 100644 --- a/examples/cuda/CMakeLists.txt +++ b/examples/cuda/CMakeLists.txt @@ -7,20 +7,20 @@ list(APPEND TF_CUDA_EXAMPLES cuda_saxpy cuda_matmul cuda_knn - cuda_capturer - cuda_reduce - cuda_scan - cuda_merge - cuda_sort - cuda_transform - cuda_rebind - cuda_find + #cuda_capturer + #cuda_reduce + #cuda_scan + #cuda_merge + #cuda_sort + #cuda_transform + cuda_saxpy_update + #cuda_find ) foreach(cuda_example IN LISTS TF_CUDA_EXAMPLES) add_executable(${cuda_example} ${cuda_example}.cu) target_link_libraries(${cuda_example} - ${PROJECT_NAME} Threads::Threads tf::default_settings + ${PROJECT_NAME} ${ATOMIC_LIBRARY} Threads::Threads tf::default_settings ) # avoid cmake 3.18+ warning diff --git a/examples/cuda/cuda_capturer.cu b/examples/cuda/cuda_capturer.cu index b9d64ae5e..951dc3a25 100644 --- a/examples/cuda/cuda_capturer.cu +++ b/examples/cuda/cuda_capturer.cu @@ -35,7 +35,8 @@ int main() { // execute the cudaflow capturer std::cout << "running cudaflow capturer ...\n"; tf::cudaStream stream; - cf.run(stream); + auto exec = cf.instantiate(); + exec.run(stream); stream.synchronize(); // inspect the result diff --git a/examples/cuda/cuda_knn.cu b/examples/cuda/cuda_knn.cu index 78ec3ff92..85a476566 100644 --- a/examples/cuda/cuda_knn.cu +++ b/examples/cuda/cuda_knn.cu @@ -275,31 +275,34 @@ std::pair, std::vector> gpu_predicate( auto kmeans = taskflow.emplace([&](){ - tf::cudaFlow cf; + tf::cudaGraph cg; - auto zero_c = cf.zero(d_c, K).name("zero_c"); - auto zero_sx = cf.zero(d_sx, K).name("zero_sx"); - auto zero_sy = cf.zero(d_sy, K).name("zero_sy"); + auto zero_c = cg.zero(d_c, K); + auto zero_sx = cg.zero(d_sx, K); + auto zero_sy = cg.zero(d_sy, K); - auto cluster = cf.kernel( + auto cluster = cg.kernel( (N+512-1) / 512, 512, 0, assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c - ).name("cluster"); + ); - auto new_centroid = cf.kernel( + auto new_centroid = cg.kernel( 1, K, 0, compute_new_means, d_mx, d_my, d_sx, d_sy, d_c - ).name("new_centroid"); + ); 
cluster.precede(new_centroid) .succeed(zero_c, zero_sx, zero_sy); // Repeat the execution for M times tf::cudaStream stream; + tf::cudaGraphExec exec(cg); for(int i=0; i -#include - -int main() { - - size_t N = 10000; - - auto data = tf::cuda_malloc_shared(N); - - tf::cudaFlowCapturer cudaflow; - tf::cudaStream stream; - - // set data to -1 - for(size_t i=0; i #include @@ -17,82 +17,52 @@ int main() { const unsigned N = 1<<20; - tf::Taskflow taskflow ("saxpy-flow"); - tf::Executor executor; - std::vector hx, hy; float* dx {nullptr}; float* dy {nullptr}; // allocate x - auto allocate_x = taskflow.emplace([&]() { - std::cout << "allocating host x and device x ...\n"; - hx.resize(N, 1.0f); - cudaMalloc(&dx, N*sizeof(float)); - }).name("allocate_x"); + hx.resize(N, 1.0f); + cudaMalloc(&dx, N*sizeof(float)); // allocate y - auto allocate_y = taskflow.emplace([&]() { - std::cout << "allocating host y and device y ...\n"; - hy.resize(N, 2.0f); - cudaMalloc(&dy, N*sizeof(float)); - }).name("allocate_y"); + hy.resize(N, 2.0f); + cudaMalloc(&dy, N*sizeof(float)); - // saxpy cudaFlow - auto cudaflow = taskflow.emplace([&]() { - - std::cout << "running cudaflow ...\n"; - - tf::cudaFlow cf; - auto h2d_x = cf.copy(dx, hx.data(), N).name("h2d_x"); - auto h2d_y = cf.copy(dy, hy.data(), N).name("h2d_y"); - auto d2h_x = cf.copy(hx.data(), dx, N).name("d2h_x"); - auto d2h_y = cf.copy(hy.data(), dy, N).name("d2h_y"); - auto kernel = cf.kernel((N+255)/256, 256, 0, saxpy, N, 2.0f, dx, dy) - .name("saxpy"); - kernel.succeed(h2d_x, h2d_y) - .precede(d2h_x, d2h_y); - - std::cout << "launching cudaflow ...\n"; - tf::cudaStream stream; - cf.run(stream); - stream.synchronize(); - - // visualize this cudaflow - cf.dump(std::cout); - - }).name("saxpy"); - - cudaflow.succeed(allocate_x, allocate_y); + // saxpy cudaGraph + tf::cudaGraph cg; + auto h2d_x = cg.copy(dx, hx.data(), N); + auto h2d_y = cg.copy(dy, hy.data(), N); + auto d2h_x = cg.copy(hx.data(), dx, N); + auto d2h_y = cg.copy(hy.data(), dy, N); + auto kernel = cg.kernel((N+255)/256, 256, 0, saxpy, N, 2.0f, dx, dy); + kernel.succeed(h2d_x, h2d_y) + .precede(d2h_x, d2h_y); + + tf::cudaStream stream; + tf::cudaGraphExec exec(cg); + + stream.run(exec) + .synchronize(); + + // visualize this cudaflow + cg.dump(std::cout); // Add a verification task - auto verifier = taskflow.emplace([&](){ - float max_error = 0.0f; - for (size_t i = 0; i < N; i++) { - max_error = std::max(max_error, abs(hx[i]-1.0f)); - max_error = std::max(max_error, abs(hy[i]-4.0f)); - } - std::cout << "saxpy finished with max error: " << max_error << '\n'; - }).succeed(cudaflow).name("verify"); + float max_error = 0.0f; + for (size_t i = 0; i < N; i++) { + max_error = std::max(max_error, abs(hx[i]-1.0f)); + max_error = std::max(max_error, abs(hy[i]-4.0f)); + } + std::cout << "saxpy finished with max error: " << max_error << '\n'; // free memory - auto deallocate_x = taskflow.emplace([&](){ - std::cout << "deallocating device x ...\n"; - cudaFree(dx); - }).name("deallocate_x"); - - auto deallocate_y = taskflow.emplace([&](){ - std::cout << "deallocating device y ...\n"; - cudaFree(dy); - }).name("deallocate_y"); - - verifier.precede(deallocate_x, deallocate_y); - - executor.run(taskflow).wait(); + cudaFree(dx); + cudaFree(dy); - std::cout << "dumping the taskflow ...\n"; - taskflow.dump(std::cout); + tf::cudaGraph cg2(std::move(cg)); + tf::cudaGraphExec exec2(std::move(exec)); return 0; } diff --git a/examples/cuda/cuda_saxpy_update.cu b/examples/cuda/cuda_saxpy_update.cu new file mode 100644 
index 000000000..77299beb3
--- /dev/null
+++ b/examples/cuda/cuda_saxpy_update.cu
@@ -0,0 +1,86 @@
+// This program performs a simple single-precision A*X + Y (saxpy) operation
+// using cudaGraph and showcases how to update its kernel parameters.
+
+#include <taskflow/taskflow.hpp>
+#include <taskflow/cuda/cudaflow.hpp>
+
+// Kernel: saxpy
+__global__ void saxpy(int n, float a, float *x, float *y) {
+  int i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i < n) {
+    y[i] = a*x[i] + y[i];
+  }
+}
+
+// Function: main
+int main() {
+
+  const unsigned N = 1<<20;
+
+  std::vector<float> hx, hy;
+
+  float* dx {nullptr};
+  float* dy {nullptr};
+
+  // allocate x
+  hx.resize(N, 1.0f);
+  cudaMalloc(&dx, N*sizeof(float));
+
+  // allocate y
+  hy.resize(N, 2.0f);
+  cudaMalloc(&dy, N*sizeof(float));
+
+  // saxpy cudaGraph: y[i] = 2*1 + 2
+  tf::cudaGraph cg;
+  auto h2d_x  = cg.copy(dx, hx.data(), N);
+  auto h2d_y  = cg.copy(dy, hy.data(), N);
+  auto d2h_x  = cg.copy(hx.data(), dx, N);
+  auto d2h_y  = cg.copy(hy.data(), dy, N);
+  auto kernel = cg.kernel((N+255)/256, 256, 0, saxpy, N, 2.0f, dx, dy);
+  kernel.succeed(h2d_x, h2d_y)
+        .precede(d2h_x, d2h_y);
+
+  tf::cudaStream stream;
+  tf::cudaGraphExec exec(cg);
+  stream.run(exec)
+        .synchronize();
+
+  // visualize this cudaGraph
+  cg.dump(std::cout);
+
+  // verify x[i] = 1, y[i] = 4
+  float max_error = 0.0f;
+  for (size_t i = 0; i < N; i++) {
+    max_error = std::max(max_error, abs(hx[i]-1.0f));
+    max_error = std::max(max_error, abs(hy[i]-4.0f));
+  }
+  std::cout << "saxpy finished with max error: " << max_error << '\n';
+
+  // now update the parameters: y[i] = 3*1 + 4
+  exec.copy(h2d_x, dy, hy.data(), N);  // dy[i] = 4
+  exec.copy(h2d_y, dx, hx.data(), N);  // dx[i] = 1
+  exec.kernel(kernel, (N+255)/256, 256, 0, saxpy, N, 3.0f, dx, dy);
+  exec.copy(d2h_x, hy.data(), dy, N);  // hy[i] = 7
+  exec.copy(d2h_y, hx.data(), dx, N);  // hx[i] = 1
+
+  stream.run(exec)
+        .synchronize();
+
+  // visualize this cudaGraph
+  cg.dump(std::cout);
+
+  // verify x[i] = 1, y[i] = 7
+  max_error = 0.0f;
+  for (size_t i = 0; i < N; i++) {
+    max_error = std::max(max_error, abs(hx[i]-1.0f));
+    max_error = std::max(max_error, abs(hy[i]-7.0f));
+  }
+  std::cout << "updated saxpy finished with max error: " << max_error << '\n';
+
+  // free memory
+  cudaFree(dx);
+  cudaFree(dy);
+
+  return 0;
+}
+
diff --git a/examples/dependent_async_algorithm.cpp b/examples/dependent_async_algorithm.cpp
index acc869f62..016bfe4a6 100644
--- a/examples/dependent_async_algorithm.cpp
+++ b/examples/dependent_async_algorithm.cpp
@@ -1,6 +1,6 @@
 /**
   This program demonstrates how to use dependent async tasks to create
-  dependent algorithm tasks.
+  algorithm tasks.
 */
 
 #include <taskflow/taskflow.hpp>
diff --git a/examples/fibonacci.cpp b/examples/fibonacci.cpp
index 173302a6f..7e49494b1 100644
--- a/examples/fibonacci.cpp
+++ b/examples/fibonacci.cpp
@@ -1,21 +1,37 @@
+// This example demonstrates how to use Taskflow's subflow and runtime tasking features
+// to create recursive parallelism, using the famous Fibonacci recursion as an example.
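Why does the rewritten Fibonacci example below call rt.corun() instead of block-waiting? A worker that blocks on work scheduled in the same executor can deadlock, while corun re-enters the scheduling loop. A minimal sketch of that hazard, assuming only the tf::Executor::corun API documented later in this diff (a single worker makes the problem obvious):

@code{.cpp}
#include <taskflow/taskflow.hpp>

int main() {
  tf::Executor executor(1);  // one worker makes the hazard obvious
  tf::Taskflow inner, outer;
  inner.emplace([](){ std::printf("inner task\n"); });
  outer.emplace([&](){
    // executor.run(inner).wait(); // would block the only worker: deadlock
    executor.corun(inner);         // the worker joins the scheduling loop instead
  });
  executor.run(outer).wait();
  return 0;
}
@endcode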
#include -int spawn(int n, tf::Subflow& sbf) { - if (n < 2) return n; - int res1, res2; +tf::Executor& get_executor() { + static tf::Executor executor; + return executor; +} + +size_t spawn_async(size_t N, tf::Runtime& rt) { + + if (N < 2) { + return N; + } + + size_t res1, res2; - // compute f(n-1) - sbf.emplace([&res1, n] (tf::Subflow& sbf_n_1) { res1 = spawn(n - 1, sbf_n_1); } ) - .name(std::to_string(n-1)); + rt.silent_async([N, &res1](tf::Runtime& rt1){ res1 = spawn_async(N-1, rt1); }); + + // tail optimization + res2 = spawn_async(N-2, rt); - // compute f(n-2) - sbf.emplace([&res2, n] (tf::Subflow& sbf_n_2) { res2 = spawn(n - 2, sbf_n_2); } ) - .name(std::to_string(n-2)); + // use corun to avoid blocking the worker from waiting the two children tasks to finish + rt.corun(); - sbf.join(); return res1 + res2; } +size_t fibonacci_async(size_t N) { + size_t res; + get_executor().async([N, &res](tf::Runtime& rt){ res = spawn_async(N, rt); }).get(); + return res; +} + int main(int argc, char* argv[]) { if(argc != 2) { @@ -23,26 +39,15 @@ int main(int argc, char* argv[]) { std::exit(EXIT_FAILURE); } - int N = std::atoi(argv[1]); - - if(N < 0) { - throw std::runtime_error("N must be non-negative"); - } - - int res; // result - - tf::Executor executor; - tf::Taskflow taskflow("fibonacci"); - - taskflow.emplace([&res, N] (tf::Subflow& sbf) { - res = spawn(N, sbf); - }).name(std::to_string(N)); - - executor.run(taskflow).wait(); + size_t N = std::atoi(argv[1]); - //taskflow.dump(std::cout); + auto tbeg = std::chrono::steady_clock::now(); + printf("fib[%zu] = %zu\n", N, fibonacci_async(N)); + auto tend = std::chrono::steady_clock::now(); - std::cout << "Fib[" << N << "]: " << res << std::endl; + std::cout << "elapsed time: " + << std::chrono::duration_cast(tend-tbeg).count() + << " ms\n"; return 0; } diff --git a/examples/limited_concurrency.cpp b/examples/limited_concurrency.cpp index 182af92b6..99d0d30c6 100644 --- a/examples/limited_concurrency.cpp +++ b/examples/limited_concurrency.cpp @@ -34,4 +34,3 @@ int main() { return 0; } - diff --git a/examples/parallel_for.cpp b/examples/parallel_for.cpp index 61709a015..7bbde1e35 100644 --- a/examples/parallel_for.cpp +++ b/examples/parallel_for.cpp @@ -16,27 +16,34 @@ void for_each(int N) { taskflow.for_each(range.begin(), range.end(), [&] (int i) { printf("for_each on container item: %d\n", i); - }); + }, tf::StaticPartitioner()); executor.run(taskflow).get(); - - taskflow.dump(std::cout); } -// Procedure: for_each_index -void for_each_index(int N) { +// Procedure: for_each_by_index +void for_each_by_index(int N) { tf::Executor executor; tf::Taskflow taskflow; - // [0, N) with step size 2 + // [0, N) with a step size of 2 taskflow.for_each_index(0, N, 2, [] (int i) { printf("for_each_index on index: %d\n", i); }); - executor.run(taskflow).get(); + executor.run(taskflow).wait(); + + // [0, N) with a step size of 2 using tf::IndexRange + tf::IndexRange range(0, N, 2); + + taskflow.for_each_by_index(range, [](tf::IndexRange subrange) { + for(int i=subrange.begin(); i #include -#define MAX_DATA_SIZE 40000000 - struct Data { int a {::rand()}; int b {::rand()}; @@ -15,13 +13,13 @@ struct Data { // Procedure: reduce // This procedure demonstrates -void reduce() { +void reduce(size_t N) { std::cout << "Benchmark: reduce" << std::endl; std::vector data; - data.reserve(MAX_DATA_SIZE); - for(int i=0; i data(MAX_DATA_SIZE); + std::vector data(N); // sequential method auto sbeg = std::chrono::steady_clock::now(); @@ -100,21 +98,59 @@ void transform_reduce() { 
   assert(tmin == smin);
 }
 
+void reduce_by_index(size_t N) {
+
+  std::cout << "Benchmark: reduce_by_index" << std::endl;
+
+  tf::Executor executor;
+  tf::Taskflow taskflow;
+
+  std::vector<double> data(N);
+  double res = 1.0;
+
+  auto tbeg = std::chrono::steady_clock::now();
+  taskflow.reduce_by_index(
+    tf::IndexRange<size_t>(0, N, 1),
+    // final result
+    res,
+    // local reducer
+    [&](tf::IndexRange<size_t> subrange, std::optional<double> running_total) {
+      double residual = running_total ? *running_total : 0.0;
+      for(size_t i=subrange.begin(); i<subrange.end(); i+=subrange.step_size()) {
+        residual += data[i];
+      }
+      return residual;
+    },
+    // global reducer
+    std::plus<double>()
+  );
+  executor.run(taskflow).wait();
+  auto tend = std::chrono::steady_clock::now();
+  std::cout << "[taskflow] reduce_by_index "
+            << std::chrono::duration_cast<std::chrono::microseconds>(tend - tbeg).count()
+            << " us\n";
+}
+
 // ----------------------------------------------------------------------------
 
 // Function: main
 int main(int argc, char* argv[]) {
 
-  if(argc != 2) {
-    std::cerr << "usage: ./reduce [reduce|transform_reduce]" << std::endl;
+  if(argc != 3) {
+    std::cerr << "usage: ./reduce [reduce|transform_reduce|reduce_by_index] N" << std::endl;
     std::exit(EXIT_FAILURE);
   }
 
   if(std::strcmp(argv[1], "reduce") == 0) {
-    reduce();
+    reduce(std::stoul(argv[2]));
   }
   else if(std::strcmp(argv[1], "transform_reduce") == 0) {
-    transform_reduce();
+    transform_reduce(std::stoul(argv[2]));
+  }
+  else if(std::strcmp(argv[1], "reduce_by_index") == 0) {
+    reduce_by_index(std::stoul(argv[2]));
   }
   else {
     std::cerr << "invalid method " << argv[1] << std::endl;
diff --git a/examples/priority.cpp b/examples/priority.cpp
deleted file mode 100644
index b90cc36d3..000000000
--- a/examples/priority.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// This program demonstrates how to set priority to a task.
-//
-// Currently, Taskflow supports only three priority levels:
-//   + tf::TaskPriority::HIGH   (numerical value = 0)
-//   + tf::TaskPriority::NORMAL (numerical value = 1)
-//   + tf::TaskPriority::LOW    (numerical value = 2)
-//
-// Priority-based execution is non-preemptive. Once a task
-// has started to execute, it will execute to completion,
-// even if a higher priority task has been spawned or enqueued.
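To make the reducer contract of the reduce_by_index benchmark above concrete: the local reducer folds each subrange into a running total (empty on the first call of a partition), and the global reducer combines partial results with the initial value of res. A sequential sketch of that computation, with illustrative values only:

@code{.cpp}
#include <cstddef>
#include <functional>
#include <vector>

int main() {
  const std::size_t N = 1000;
  std::vector<double> data(N, 1.0);
  double res = 1.0;  // the initial value participates in the final result
  // local reducer over a single partition (running_total is empty at first)
  double partial = 0.0;
  for (std::size_t i = 0; i < N; ++i) {
    partial += data[i];
  }
  // global reducer folds the partial result into res: res == 1.0 + N
  res = std::plus<double>()(res, partial);
  return 0;
}
@endcode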
- -#include - -int main() { - - // create an executor of only one worker to enable - // deterministic behavior - tf::Executor executor(1); - - tf::Taskflow taskflow; - - int counter {0}; - - // Here we create five tasks and print thier execution - // orders which should align with assigned priorities - auto [A, B, C, D, E] = taskflow.emplace( - [] () { }, - [&] () { - std::cout << "Task B: " << counter++ << '\n'; // 0 - }, - [&] () { - std::cout << "Task C: " << counter++ << '\n'; // 2 - }, - [&] () { - std::cout << "Task D: " << counter++ << '\n'; // 1 - }, - [] () { } - ); - - A.precede(B, C, D); - E.succeed(B, C, D); - - // By default, all tasks are of tf::TaskPriority::HIGH - B.priority(tf::TaskPriority::HIGH); - C.priority(tf::TaskPriority::LOW); - D.priority(tf::TaskPriority::NORMAL); - - assert(B.priority() == tf::TaskPriority::HIGH); - assert(C.priority() == tf::TaskPriority::LOW); - assert(D.priority() == tf::TaskPriority::NORMAL); - - // we should see B, D, and C in their priority order - executor.run(taskflow).wait(); -} - diff --git a/examples/subflow_async.cpp b/examples/runtime_async.cpp similarity index 89% rename from examples/subflow_async.cpp rename to examples/runtime_async.cpp index 70b72ab43..c9860e010 100644 --- a/examples/subflow_async.cpp +++ b/examples/runtime_async.cpp @@ -9,18 +9,18 @@ int main() { std::atomic counter{0}; - taskflow.emplace([&](tf::Subflow& sf){ + taskflow.emplace([&](tf::Runtime& rt){ for(int i=0; i<10; i++) { // Here, we use "silent_async" instead of "async" because we do // not care the return value. The method "silent_async" gives us // less overhead compared to "async". // The 10 asynchronous tasks run concurrently. - sf.silent_async([&](){ + rt.silent_async([&](){ std::cout << "async task from the subflow\n"; counter.fetch_add(1, std::memory_order_relaxed); }); } - sf.join(); + rt.corun(); std::cout << counter << " = 10\n"; }); diff --git a/examples/simple.cpp b/examples/simple.cpp index 363f52123..33bde7c56 100644 --- a/examples/simple.cpp +++ b/examples/simple.cpp @@ -24,10 +24,19 @@ int main(){ []() { std::cout << "TaskD\n"; } ); + A.name("A"); + B.name("B"); + C.name("C"); + D.name("D"); + A.precede(B, C); // A runs before B and C D.succeed(B, C); // D runs after B and C executor.run(taskflow).wait(); + + // dump the taskflow graph into a .dot format + taskflow.dump(std::cout); return 0; } + diff --git a/examples/subflow.cpp b/examples/subflow.cpp index bef0a7d0a..271a9b25e 100644 --- a/examples/subflow.cpp +++ b/examples/subflow.cpp @@ -1,47 +1,29 @@ -// This example demonstrates how to use Taskflow to create -// dynamic workload during execution. -// -// We first create four tasks A, B, C, and D. During the execution -// of B, it uses flow builder to creates another three tasks -// B1, B2, and B3, and adds dependencies from B1 and B2 to B3. -// -// We use dispatch and get to wait until the graph finished. -// Do so is difference from "wait_for_all" which will clean up the -// finished graphs. After the graph finished, we dump the topology -// for inspection. -// -// Usage: ./subflow detach|join -// - +/** + This example demonstrates how to use Taskflow to create a subflow during the + execution of a task. + + We first create four tasks: A, B, C, and D, where task A runs before B and C, + and task D runs after B and C. During the execution of B, it spawns another subflow + graph of three tasks: B1, B2, and B3, where B3 runs after B1 and B2. + Upon completion of the subflow, it joins its parent task B. 
+ + By default, subflows are automatically cleaned up when they finish to avoid memory explosion. + In this example, since we would like to inspect the spawned subflow, + we disable this behavior by calling `tf::Subflow::retain(true)`. + + Note that we must run the subflow once for it to be created. +*/ #include -const auto usage = "usage: ./subflow detach|join"; - -int main(int argc, char* argv[]) { - - if(argc != 2) { - std::cerr << usage << std::endl; - std::exit(EXIT_FAILURE); - } - - std::string opt(argv[1]); +int main() { - if(opt != "detach" && opt != "join") { - std::cerr << usage << std::endl; - std::exit(EXIT_FAILURE); - } - - auto detached = (opt == "detach") ? true : false; - - // Create a taskflow graph with three regular tasks and one subflow task. + // Create a taskflow graph with three static tasks and one subflow task. tf::Executor executor(4); - tf::Taskflow taskflow("Dynamic Tasking Demo"); + tf::Taskflow taskflow("Subflow Demo"); - // Task A auto A = taskflow.emplace([] () { std::cout << "TaskA\n"; }); auto B = taskflow.emplace( - // Task B - [cap=std::vector{1,2,3,4,5,6,7,8}, detached] (tf::Subflow& subflow) { + [cap=std::vector{1,2,3,4,5,6,7,8}] (tf::Subflow& subflow) { std::cout << "TaskB is spawning B1, B2, and B3 ...\n"; auto B1 = subflow.emplace([&]() { @@ -61,9 +43,9 @@ int main(int argc, char* argv[]) { B1.precede(B3); B2.precede(B3); - - // detach or join the subflow (by default the subflow join at B) - if(detached) subflow.detach(); + + // retain the subflow for visualization purpose + subflow.retain(true); } ); @@ -79,7 +61,7 @@ int main(int argc, char* argv[]) { B.precede(D); // D runs after B C.precede(D); // D runs after C - executor.run(taskflow).get(); // block until finished + executor.run_n(taskflow, 3).get(); // block until finished // examine the graph taskflow.dump(std::cout); diff --git a/examples/subflow_exception.cpp b/examples/subflow_exception.cpp new file mode 100644 index 000000000..5c6624ed3 --- /dev/null +++ b/examples/subflow_exception.cpp @@ -0,0 +1,32 @@ +// This program demonstrates the exception in subflow. 
+ +#include + +int main() { + + tf::Executor executor; + tf::Taskflow taskflow; + + taskflow.emplace([](tf::Subflow& sf) { + tf::Task A = sf.emplace([]() { + std::cout << "Task A\n"; + throw std::runtime_error("exception on A"); + }); + tf::Task B = sf.emplace([]() { + std::cout << "Task B\n"; + }); + A.precede(B); + sf.join(); + }); + + try + { + executor.run(taskflow).get(); + } + catch (const std::runtime_error& re) + { + std::cout << "exception thrown from running the taskflow: " << re.what() << '\n'; + } + + return 0; +} diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt deleted file mode 100644 index 0a1bb6c16..000000000 --- a/examples/sycl/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -list(APPEND TF_SYCL_EXAMPLES - sycl_device - sycl_ndrange - sycl_saxpy - sycl_vector_add - sycl_atomic - sycl_matmul - sycl_reduce -) - -foreach(sycl_example IN LISTS TF_SYCL_EXAMPLES) - add_executable(${sycl_example} ${sycl_example}.cpp) - - #add_sycl_to_target(TARGET ${sycl_example} SOURCES ${sycl_example}.cpp) - - target_compile_options(${sycl_example} PRIVATE ${TF_SYCL_OPTIONS}) - target_link_options(${sycl_example} PRIVATE ${TF_SYCL_OPTIONS}) - target_link_libraries(${sycl_example} - ${PROJECT_NAME} Threads::Threads tf::default_settings - ) -endforeach() diff --git a/examples/sycl/sycl_atomic.cpp b/examples/sycl/sycl_atomic.cpp deleted file mode 100644 index 551ef820f..000000000 --- a/examples/sycl/sycl_atomic.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// This program demonstrates how to create a simple vector-add -// application using syclFlow and unified shared memory (USM). - -#include - -constexpr size_t N = 10000; - -int main() { - - // create a standalone scylFlow - sycl::queue queue; - - tf::syclFlow syclflow(queue); - - // allocate a shared memory and initialize the data - auto data = sycl::malloc_shared(N, queue); - - for(size_t i=0; i(N), [=](sycl::id<1> id) { - - auto ref = sycl::atomic_ref< - int, - sycl::memory_order_relaxed, - sycl::memory_scope::device, - sycl::access::address_space::global_space - >{data[0]}; - - ref.fetch_add(data[id]); - } - ); - - // run the syclflow - syclflow.offload(); - - // create a deallocate task that checks the result and frees the memory - if(data[0] != (N-1)*N/2) { - std::cout << data[0] << '\n'; - throw std::runtime_error("incorrect result"); - } - - std::cout << "correct result\n"; - - // deallocates the memory - sycl::free(data, queue); - - - return 0; -} - - diff --git a/examples/sycl/sycl_device.cpp b/examples/sycl/sycl_device.cpp deleted file mode 100644 index ec7db94f9..000000000 --- a/examples/sycl/sycl_device.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// This program pulls out all platforms and devices using SYCL. 
- -#include - -int main() { - - std::vector platforms = sycl::platform::get_platforms(); - - // looping over platforms - for (const auto& platform : platforms) { - - std::cout << "Platform : " - << platform.get_info() << '\n' - << "is_host : " - << platform.is_host() << '\n' - << "version : " - << platform.get_info() << '\n' - << "vendor : " - << platform.get_info() << '\n' - << "profile : " - << platform.get_info() << '\n'; - //<< "extensions :" - //<< platform.get_info() << '\n'; - - // getting the list of devices from the platform - std::vector devices = platform.get_devices(); - - // looping over devices - for (const auto& device : devices) { - - std::cout << " Device : " - << device.get_info() << '\n' - << " vendor : " - << device.get_info() << '\n' - << " version : " - << device.get_info() << '\n' - << " is_host : " << device.is_host() << '\n' - << " is_cpu : " << device.is_cpu() << '\n' - << " is_gpu : " << device.is_gpu() << '\n' - << " is_accelerator : " << device.is_accelerator() << '\n' - << " max_work_group_size: " - << device.get_info() << '\n' - << " local_mem_size : " - << device.get_info() << '\n'; - - // submitting a kernel to the sycl device - auto queue = sycl::queue(device); - queue.submit([](sycl::handler& handler){ - handler.single_task([](){}); - }); - } - - std::cout << std::endl; - } - - return 0; -} diff --git a/examples/sycl/sycl_matmul.cpp b/examples/sycl/sycl_matmul.cpp deleted file mode 100644 index fb00a8ac7..000000000 --- a/examples/sycl/sycl_matmul.cpp +++ /dev/null @@ -1,188 +0,0 @@ -// The example shows how to use syclFlow to multiply two 2D matrices. - -#include -#include - -// Matrix multiplication using GPU -auto gpu(int M, int N, int K) { - - std::vector ha, hb, hc; - int *da, *db, *dc; - - tf::Executor executor; - tf::Taskflow taskflow("MatrixMultiplication"); - - sycl::queue queue; - - // allocate the host and device storage for a - auto allocate_a = taskflow.emplace([&](){ - ha.resize(M*N, M+N); - da = sycl::malloc_device(M*N, queue); - }).name("allocate_a"); - - // allocate the host and device storage for b - auto allocate_b = taskflow.emplace([&](){ - hb.resize(N*K, N+K); - db = sycl::malloc_device(N*K, queue); - }).name("allocate_b"); - - // allocate the host and device storage for c - auto allocate_c = taskflow.emplace([&](){ - hc.resize(M*K); - dc = sycl::malloc_device(M*K, queue); - }).name("allocate_c"); - - // create a syclFlow to run the matrix multiplication - auto syclFlow = taskflow.emplace_on([&](tf::syclFlow& sf){ - - // copy data to da, db, and dc - auto copy_da = sf.copy(da, ha.data(), M*N).name("H2D_a"); - auto copy_db = sf.copy(db, hb.data(), N*K).name("H2D_b"); - auto copy_hc = sf.copy(hc.data(), dc, M*K).name("D2H_c"); - - auto _M = (M % 16 == 0) ? M : (M + 16 - M % 16); - auto _K = (K % 16 == 0) ? K : (K + 16 - K % 16); - - auto kmatmul = sf.parallel_for( - sycl::nd_range<2>{sycl::range<2>(_M, _K ), sycl::range<2>(16, 16)}, - [=](sycl::nd_item<2> item) { - int row = item.get_global_id(0); - int col = item.get_global_id(1); - if(row < M && col < K) { - int sum = 0; - for(int n = 0; n < N; n++) { - sum += da[row * N + n] * db[n * K + col]; - } - dc[row * K + col] = sum; - } - } - ).name("matmul"); - - // It is also possible to just use range and let the runtime decide the - // partition of groups, but the result is less efficient. 
- // - //auto kmatmul = sf.parallel_for( - // sycl::range<2>(M, K), - // [=](sycl::id<2> id) { - // int row = id[0]; - // int col = id[1]; - // int sum = 0; - // for(int n = 0; n < N; n++) { - // sum += da[row * N + n] * db[n * K + col]; - // } - // dc[row * K + col] = sum; - // } - //).name("matmul"); - - kmatmul.succeed(copy_da, copy_db) - .precede(copy_hc); - - }, queue).name("syclFlow"); - - auto free = taskflow.emplace([&](){ - sycl::free(da, queue); - sycl::free(db, queue); - sycl::free(dc, queue); - }).name("free"); - - syclFlow.succeed(allocate_a, allocate_b, allocate_c) - .precede(free); - - executor.run(taskflow).wait(); - - // You may uncomment the line below to dump the task graph - //taskflow.dump(std::cout); - - return hc; -} - -// Matrix multiplication using CPU -auto cpu(int M, int N, int K) { - - std::vector a, b, c; - - tf::Executor executor; - tf::Taskflow taskflow; - - auto ha = taskflow.emplace([&](){ - a.resize(M*N, M+N); - }).name("allocate_a"); - - auto hb = taskflow.emplace([&](){ - b.resize(N*K, N+K); - }).name("allocate_b"); - - auto hc = taskflow.emplace([&](){ - c.resize(M*K, 0); - }).name("allocate_c"); - - auto pf = taskflow.for_each_index(0, M, 1, [&] (int m) { - for(int k=0; k(gend-gbeg).count() - << " ms\n"; - - // matrix multiplication using cpu - std::cout << "running cpu matrix multiplication ... "; - auto cbeg = std::chrono::steady_clock::now(); - auto cres = cpu(M, N, K); - auto cend = std::chrono::steady_clock::now(); - std::cout << "completed with " - << std::chrono::duration_cast(cend-cbeg).count() - << " ms\n"; - - // verify the result - int64_t error = 0; - std::cout << "verifying results ... "; - for(int i=0; i - -int main() { - - size_t N = 10000; - - sycl::queue queue; - - auto data = sycl::malloc_shared(N, queue); - - tf::syclFlow syclflow(queue); - - // fill data with -1 - std::cout << "filling data with -1 ...\n"; - - tf::syclTask task = syclflow.fill(data, -1, N); - syclflow.offload(); - - for(size_t i=0; i -#include - -int main(int argc, char* argv[]) { - - if(argc != 2) { - std::cerr << "usage: ./sycl_reduce num_items\n"; - std::exit(EXIT_FAILURE); - } - - size_t N = std::atoi(argv[1]); - - sycl::queue queue; - - auto data = sycl::malloc_shared(N, queue); - auto res1 = sycl::malloc_shared(1, queue); - auto res2 = sycl::malloc_shared(1, queue); - auto hres = 0; - - // initialize the data - for(size_t i=0; i -#include - -constexpr size_t N = 1000000; - -int main() { - - tf::Executor executor; - tf::Taskflow taskflow("saxpy example"); - - sycl::queue queue; - - // allocate shared memory - auto X = sycl::malloc_shared(N, queue); - auto Y = sycl::malloc_shared(N, queue); - - // create a syclFlow to perform the saxpy operation - taskflow.emplace_on([&](tf::syclFlow& sf){ - - tf::syclTask fillX = sf.fill(X, 1.0f, N).name("fillX"); - tf::syclTask fillY = sf.fill(Y, 2.0f, N).name("fillY"); - - tf::syclTask saxpy = sf.parallel_for(sycl::range<1>(N), - [=] (sycl::id<1> id) { - X[id] = 3.0f * X[id] + Y[id]; - } - ).name("saxpy"); - - saxpy.succeed(fillX, fillY); - - }, queue).name("syclFlow"); - - // dump the graph without detailed syclFlow connections - taskflow.dump(std::cout); - - // run the taskflow - executor.run(taskflow).wait(); - - // dump the graph with all syclFlow details (after executed) - taskflow.dump(std::cout); - - // verify the result - for(size_t i=0; i= 1e-4) { - throw std::runtime_error("incorrect saxpy result (expected 5.0f)"); - } - } - - std::cout << "correct saxpy result\n"; - - // free the memory - sycl::free(X, queue); - 
sycl::free(Y, queue); - - return 0; -} - - - - diff --git a/examples/sycl/sycl_transform.cpp b/examples/sycl/sycl_transform.cpp deleted file mode 100644 index f2c278754..000000000 --- a/examples/sycl/sycl_transform.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// This program demonstrates how to performs a parallel transform -// using syclFlow. - -#include - -int main(int argc, char* argv[]) { - - if(argc != 2) { - std::cerr << "usage: ./sycl_transform num_items\n"; - std::exit(EXIT_FAILURE); - } - - size_t N = std::atoi(argv[1]); - - sycl::queue queue; - - auto data = sycl::malloc_shared(N, queue); - auto src1 = sycl::malloc_shared(N, queue); - auto src2 = sycl::malloc_shared(N, queue); - auto src3 = sycl::malloc_shared(N, queue); - - // initialize the data - for(size_t i=0; i - -constexpr size_t N = 10000000; - -/*int main() { - - tf::Executor executor; - tf::Taskflow taskflow; - - sycl::queue queue; - - int* data {nullptr}; - - // create an allocate task to allocate a shared memory - tf::Task allocate = taskflow.emplace( - [&](){ data = sycl::malloc_shared(N, queue); } - ); - - // create a syclFlow task to add 2 to each element of the vector - tf::Task syclFlow = taskflow.emplace_on([&](tf::syclFlow& sf){ - - tf::syclTask fill = sf.fill(data, 100, N); - - tf::syclTask plus = sf.parallel_for( - sycl::range<1>(N), [=](sycl::id<1> id) { data[id] += 2; } - ); - - fill.precede(plus); - - }, queue); - - // create a deallocate task that checks the result and frees the memory - tf::Task deallocate = taskflow.emplace([&](){ - - for(size_t i=0; i data; - - sycl::queue Q{}; // Select any device for this queue - - std::cout << "Selected device is: " << - - Q.get_device().get_info() << "\n"; - - sycl::buffer A{ sycl::range<1>(size) }; - sycl::buffer B{ sycl::range<1>(size) }; - sycl::buffer C{ data }; - - Q.submit([&](sycl::handler& h) { - auto acc = A.get_access(h); - h.parallel_for(size, [=](auto& idx) { - acc[idx] = 1000; - }); - }); - - Q.submit([&](sycl::handler& h) { - auto acc = B.get_access(h); - h.parallel_for(size, [=](auto& idx) { - acc[idx] = 4000; - }); - }); - - Q.submit([&](sycl::handler& h) { - auto Aacc = A.get_access(h); - auto Bacc = B.get_access(h); - auto Cacc = C.get_access(h); - h.parallel_for(size , [=](auto&idx){ - Cacc[idx] = Aacc[idx] + Bacc[idx]; - }); - }); - - sycl::accessor acc = B.get_access(); - - for(int i=0; i + +int main() { + + // Create a taskflow graph with three static tasks and one subflow task. 
+  tf::Taskflow taskflow("visitor");
+  tf::Executor executor;
+
+  auto A = taskflow.emplace([]() { std::cout << "TaskA\n"; });
+  auto B = taskflow.emplace([](tf::Subflow& subflow) {
+    std::cout << "TaskB is spawning B1, B2, and B3 ...\n";
+    auto B1 = subflow.emplace([&](){ printf("  Subtask B1\n"); }).name("B1");
+    auto B2 = subflow.emplace([&](){ printf("  Subtask B2\n"); }).name("B2");
+    auto B3 = subflow.emplace([&](){ printf("  Subtask B3\n"); }).name("B3");
+    B1.precede(B3);
+    B2.precede(B3);
+    subflow.retain(true); // retains the subflow
+  });
+
+  auto C = taskflow.emplace([] () { std::cout << "TaskC\n"; });
+  auto D = taskflow.emplace([] () { std::cout << "TaskD\n"; });
+  A.name("A");
+  B.name("B");
+  C.name("C");
+  D.name("D");
+
+  A.precede(B);  // B runs after A
+  A.precede(C);  // C runs after A
+  B.precede(D);  // D runs after B
+  C.precede(D);  // D runs after C
+
+  executor.run(taskflow).wait();
+
+  // examine the graph
+  taskflow.dump(std::cout);
+
+  // traverse all tasks in the taskflow
+  taskflow.for_each_task([](tf::Task task){
+    std::cout << "task " << task.name() << " [type=" << tf::to_string(task.type()) << "]\n";
+    // traverse its successors
+    task.for_each_successor([](tf::Task successor) {
+      std::cout << "  -> successor task " << successor.name() << '\n';
+    });
+    // traverse its predecessors
+    task.for_each_predecessor([](tf::Task predecessor) {
+      std::cout << "  <- predecessor task " << predecessor.name() << '\n';
+    });
+
+    // traverse the subflow (in our example, task B)
+    task.for_each_subflow_task([](tf::Task stask){
+      std::cout << "  subflow task " << stask.name() << '\n';
+      // traverse its successors
+      stask.for_each_successor([](tf::Task successor) {
+        std::cout << "    -> successor task " << successor.name() << '\n';
+      });
+      // traverse its predecessors
+      stask.for_each_predecessor([](tf::Task predecessor) {
+        std::cout << "    <- predecessor task " << predecessor.name() << '\n';
+      });
+    });
+  });
+
+  return 0;
+}
+
+
+
diff --git a/examples/worker_interface.cpp b/examples/worker_interface.cpp
new file mode 100644
index 000000000..648801381
--- /dev/null
+++ b/examples/worker_interface.cpp
@@ -0,0 +1,72 @@
+// This program demonstrates how to change the worker behavior
+// upon the creation of an executor.
+
+#include <taskflow/taskflow.hpp>
+
+// ----------------------------------------------------------------------------
+// Affinity
+// ----------------------------------------------------------------------------
+#if defined(__linux__)
+  #include <pthread.h>
+  #include <sched.h>
+#elif defined(_WIN32)
+  #include <windows.h>
+#elif defined(__APPLE__)
+  #include <mach/mach.h>
+  #include <mach/thread_policy.h>
+#endif
+
+// affine the given thread to a specific core
+bool affine(std::thread& thread, size_t core_id) {
+#if defined(__linux__)
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(core_id, &cpuset);
+  pthread_t native_handle = thread.native_handle();
+  return pthread_setaffinity_np(native_handle, sizeof(cpu_set_t), &cpuset) == 0;
+#elif defined(_WIN32)
+  return SetThreadAffinityMask(thread.native_handle(), 1ULL << core_id) != 0;
+#elif defined(__APPLE__)
+  thread_port_t native_handle = pthread_mach_thread_np(thread.native_handle());
+  thread_affinity_policy_data_t policy = {static_cast<integer_t>(core_id)};
+  return thread_policy_set(
+    native_handle, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1
+  ) == KERN_SUCCESS;
+#else
+  // Unsupported platform
+  return false;
+#endif
+}
+
+// ----------------------------------------------------------------------------
+
+class CustomWorkerBehavior : public tf::WorkerInterface {
+
+  public:
+
+  // called before the worker enters the scheduling loop
+  void scheduler_prologue(tf::Worker& w) override {
+    printf("worker %zu prepares to enter the work-stealing loop\n", w.id());
+
+    // now affine the worker to the CPU core whose id equals the worker's id
+    if(affine(w.thread(), w.id())) {
+      printf("successfully affined worker %zu to CPU core %zu\n", w.id(), w.id());
+    }
+    else {
+      printf("failed to affine worker %zu to CPU core %zu\n", w.id(), w.id());
+    }
+  }
+
+  // called after the worker leaves the scheduling loop
+  void scheduler_epilogue(tf::Worker& w, std::exception_ptr) override {
+    printf("worker %zu left the work-stealing loop\n", w.id());
+  }
+};
+
+int main() {
+  tf::Executor executor(4, tf::make_worker_interface<CustomWorkerBehavior>());
+  return 0;
+}
+
+
diff --git a/sandbox/executor/executor-dl.hpp b/sandbox/executor/executor-dl.hpp
new file mode 100644
index 000000000..c8cac36ad
--- /dev/null
+++ b/sandbox/executor/executor-dl.hpp
@@ -0,0 +1,2518 @@
+#pragma once
+
+#include "observer.hpp"
+#include "taskflow.hpp"
+#include "async_task.hpp"
+
+/**
+@file executor.hpp
+@brief executor include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Executor Definition
+// ----------------------------------------------------------------------------
+
+/** @class Executor
+
+@brief class to create an executor for running a taskflow graph
+
+An executor manages a set of worker threads to run one or multiple taskflows
+using an efficient work-stealing scheduling algorithm.
+ +@code{.cpp} +// Declare an executor and a taskflow +tf::Executor executor; +tf::Taskflow taskflow; + +// Add three tasks into the taskflow +tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; }); +tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; }); +tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; }); + +// Build precedence between tasks +A.precede(B, C); + +tf::Future fu = executor.run(taskflow); +fu.wait(); // block until the execution completes + +executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait(); +executor.run_n(taskflow, 4); +executor.wait_for_all(); // block until all associated executions finish +executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait(); +executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; }); +@endcode + +All the @c run methods are @em thread-safe. You can submit multiple +taskflows at the same time to an executor from different threads. +*/ +class Executor { + + friend class FlowBuilder; + friend class Subflow; + friend class Runtime; + + public: + + /** + @brief constructs the executor with @c N worker threads + + @param N the number of workers (default std::thread::hardware_concurrency) + + The constructor spawns @c N worker threads to run tasks in a + work-stealing loop. The number of workers must be greater than zero + or an exception will be thrown. + By default, the number of worker threads is equal to the maximum + hardware concurrency returned by std::thread::hardware_concurrency. + */ + explicit Executor(size_t N = std::thread::hardware_concurrency()); + + /** + @brief destructs the executor + + The destructor calls Executor::wait_for_all to wait for all submitted + taskflows to complete and then notifies all worker threads to stop + and join these threads. + */ + ~Executor(); + + /** + @brief runs a taskflow once + + @param taskflow a tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run(taskflow); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future run(Taskflow& taskflow); + + /** + @brief runs a moved taskflow once + + @param taskflow a moved tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run(std::move(taskflow)); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future run(Taskflow&& taskflow); + + /** + @brief runs a taskflow once and invoke a callback upon completion + + @param taskflow a tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. 
+ + @code{.cpp} + tf::Future future = executor.run(taskflow, [](){ std::cout << "done"; }); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run(Taskflow& taskflow, C&& callable); + + /** + @brief runs a moved taskflow once and invoke a callback upon completion + + @param taskflow a moved tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run( + std::move(taskflow), [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run(Taskflow&& taskflow, C&& callable); + + /** + @brief runs a taskflow for @c N times + + @param taskflow a tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_n(taskflow, 2); // run taskflow 2 times + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future run_n(Taskflow& taskflow, size_t N); + + /** + @brief runs a moved taskflow for @c N times + + @param taskflow a moved tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_n( + std::move(taskflow), 2 // run the moved taskflow 2 times + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future run_n(Taskflow&& taskflow, size_t N); + + /** + @brief runs a taskflow for @c N times and then invokes a callback + + @param taskflow a tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + taskflow, 2, [](){ std::cout << "done"; } // runs taskflow 2 times and invoke + // the lambda to print "done" + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. 
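+
+  For instance, the lifetime requirement above can be satisfied by moving the
+  taskflow into the executor (a minimal sketch using only the overloads
+  documented in this header):
+
+  @code{.cpp}
+  tf::Executor executor;
+  {
+    tf::Taskflow taskflow;
+    taskflow.emplace([](){ std::cout << "task\n"; });
+    // BAD: taskflow may still be running when it goes out of scope
+    // executor.run_n(taskflow, 2);
+    // OK: move ownership into the executor so it outlives this scope
+    executor.run_n(std::move(taskflow), 2);
+  }
+  executor.wait_for_all();
+  @endcode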
+ */ + template + tf::Future run_n(Taskflow& taskflow, size_t N, C&& callable); + + /** + @brief runs a moved taskflow for @c N times and then invokes a callback + + @param taskflow a moved tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_n( + // run the moved taskflow 2 times and invoke the lambda to print "done" + std::move(taskflow), 2, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_n(Taskflow&& taskflow, size_t N, C&& callable); + + /** + @brief runs a taskflow multiple times until the predicate becomes true + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_until( + taskflow, [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred); + + /** + @brief runs a moved taskflow and keeps running it + until the predicate becomes true + + @param taskflow a moved tf::Taskflow object + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_until( + std::move(taskflow), [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_until(Taskflow&& taskflow, P&& pred); + + /** + @brief runs a taskflow multiple times until the predicate becomes true and + then invokes the callback + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_until( + taskflow, [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. 
+
+  @attention
+  The executor does not own the given taskflow. It is your responsibility to
+  ensure the taskflow remains alive during its execution.
+  */
+  template <typename P, typename C>
+  tf::Future<void> run_until(Taskflow& taskflow, P&& pred, C&& callable);
+
+  /**
+  @brief runs a moved taskflow and keeps running
+         it until the predicate becomes true and then invokes the callback
+
+  @param taskflow a moved tf::Taskflow
+  @param pred a boolean predicate to return @c true for stop
+  @param callable a callable object to be invoked after this run completes
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes a moved taskflow multiple times until
+  the predicate returns @c true and then invokes the given callable when
+  the execution completes.
+  This member function returns a tf::Future object that
+  eventually holds the result of the execution.
+  The executor will take care of the lifetime of the moved taskflow.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run_until(
+    std::move(taskflow),
+    [](){ return rand()%10 == 0; }, [](){ std::cout << "done"; }
+  );
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template <typename P, typename C>
+  tf::Future<void> run_until(Taskflow&& taskflow, P&& pred, C&& callable);
+
+  /**
+  @brief runs a target graph and waits until it completes using
+         an internal worker of this executor
+
+  @tparam T target type which has `tf::Graph& T::graph()` defined
+  @param target the target task graph object
+
+  The method runs a target graph which has `tf::Graph& T::graph()` defined
+  and waits until the execution completes.
+  Unlike the typical flow of calling `tf::Executor::run` series
+  plus waiting on the result, this method must be called by an internal
+  worker of this executor. The caller worker will participate in
+  the work-stealing loop of the scheduler, thereby avoiding potential
+  deadlock caused by blocked waiting.
+
+  @code{.cpp}
+  tf::Executor executor(2);
+  tf::Taskflow taskflow;
+  std::array<tf::Taskflow, 1000> others;
+
+  std::atomic<size_t> counter{0};
+
+  for(size_t n=0; n<1000; n++) {
+    for(size_t i=0; i<1000; i++) {
+      others[n].emplace([&](){ counter++; });
+    }
+    taskflow.emplace([&executor, &tf=others[n]](){
+      executor.corun(tf);
+      //executor.run(tf).wait();  <- blocking the worker without doing anything
+      //                             will introduce deadlock
+    });
+  }
+  executor.run(taskflow).wait();
+  @endcode
+
+  The method is thread-safe as long as the target is not concurrently
+  run by two or more threads.
+
+  @attention
+  You must call tf::Executor::corun from a worker of the calling executor
+  or an exception will be thrown.
+  */
+  template <typename T>
+  void corun(T& target);
+
+  /**
+  @brief keeps running the work-stealing loop until the predicate becomes true
+
+  @tparam P predicate type
+  @param predicate a boolean predicate to indicate when to stop the loop
+
+  The method keeps the caller worker running in the work-stealing loop
+  until the stop predicate becomes true.
+
+  @code{.cpp}
+  taskflow.emplace([&](){
+    std::future<void> fu = std::async([](){
+      std::this_thread::sleep_for(std::chrono::seconds(100));
+    });
+    executor.corun_until([&](){
+      return fu.wait_for(std::chrono::seconds(0)) == std::future_status::ready;
+    });
+  });
+  @endcode
+
+  @attention
+  You must call tf::Executor::corun_until from a worker of the calling executor
+  or an exception will be thrown.
+  */
+  template <typename P>
+  void corun_until(P&& predicate);
+
+  /**
+  @brief waits for all tasks to complete
+
+  This member function waits until all submitted tasks
+  (e.g., taskflows, asynchronous tasks) finish.
+ + @code{.cpp} + executor.run(taskflow1); + executor.run_n(taskflow2, 10); + executor.run_n(taskflow3, 100); + executor.wait_for_all(); // wait until the above submitted taskflows finish + @endcode + */ + void wait_for_all(); + + /** + @brief queries the number of worker threads + + Each worker represents one unique thread spawned by an executor + upon its construction time. + + @code{.cpp} + tf::Executor executor(4); + std::cout << executor.num_workers(); // 4 + @endcode + */ + size_t num_workers() const noexcept; + + /** + @brief queries the number of running topologies at the time of this call + + When a taskflow is submitted to an executor, a topology is created to store + runtime metadata of the running taskflow. + When the execution of the submitted taskflow finishes, + its corresponding topology will be removed from the executor. + + @code{.cpp} + executor.run(taskflow); + std::cout << executor.num_topologies(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_topologies() const; + + /** + @brief queries the number of running taskflows with moved ownership + + @code{.cpp} + executor.run(std::move(taskflow)); + std::cout << executor.num_taskflows(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_taskflows() const; + + /** + @brief queries the id of the caller thread in this executor + + Each worker has an unique id in the range of @c 0 to @c N-1 associated with + its parent executor. + If the caller thread does not belong to the executor, @c -1 is returned. + + @code{.cpp} + tf::Executor executor(4); // 4 workers in the executor + executor.this_worker_id(); // -1 (main thread is not a worker) + + taskflow.emplace([&](){ + std::cout << executor.this_worker_id(); // 0, 1, 2, or 3 + }); + executor.run(taskflow); + @endcode + */ + int this_worker_id() const; + + // -------------------------------------------------------------------------- + // Observer methods + // -------------------------------------------------------------------------- + + /** + @brief constructs an observer to inspect the activities of worker threads + + @tparam Observer observer type derived from tf::ObserverInterface + @tparam ArgsT argument parameter pack + + @param args arguments to forward to the constructor of the observer + + @return a shared pointer to the created observer + + Each executor manages a list of observers with shared ownership with callers. + For each of these observers, the two member functions, + tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit + will be called before and after the execution of a task. + + This member function is not thread-safe. + */ + template + std::shared_ptr make_observer(ArgsT&&... args); + + /** + @brief removes an observer from the executor + + This member function is not thread-safe. 
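+
+  A minimal sketch of a custom observer (the hypothetical @c MyObserver below
+  simply logs task entry and exit; the callback signatures follow
+  tf::ObserverInterface):
+
+  @code{.cpp}
+  struct MyObserver : public tf::ObserverInterface {
+    void set_up(size_t num_workers) override {
+      std::cout << "observing " << num_workers << " workers\n";
+    }
+    void on_entry(tf::WorkerView w, tf::TaskView tv) override {
+      std::cout << "worker " << w.id() << " enters " << tv.name() << '\n';
+    }
+    void on_exit(tf::WorkerView w, tf::TaskView tv) override {
+      std::cout << "worker " << w.id() << " leaves " << tv.name() << '\n';
+    }
+  };
+
+  auto observer = executor.make_observer<MyObserver>();
+  @endcode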
+ */ + template + void remove_observer(std::shared_ptr observer); + + /** + @brief queries the number of observers + */ + size_t num_observers() const noexcept; + + // -------------------------------------------------------------------------- + // Async Task Methods + // -------------------------------------------------------------------------- + + /** + @brief creates a parameterized asynchronous task to run the given function + + @tparam P task parameter type + @tparam F callable type + + @param params task parameters + @param func callable object + + @return a @std_future that will hold the result of the execution + + The method creates a parameterized asynchronous task + to run the given function and return a @std_future object + that eventually will hold the result of the execution. + + @code{.cpp} + std::future future = executor.async("name", [](){ + std::cout << "create an asynchronous task with a name and returns 1\n"; + return 1; + }); + future.get(); + @endcode + + This member function is thread-safe. + */ + template + auto async(P&& params, F&& func); + + /** + @brief runs a given function asynchronously + + @tparam F callable type + + @param func callable object + + @return a @std_future that will hold the result of the execution + + The method creates an asynchronous task to run the given function + and return a @std_future object that eventually will hold the result + of the return value. + + @code{.cpp} + std::future future = executor.async([](){ + std::cout << "create an asynchronous task and returns 1\n"; + return 1; + }); + future.get(); + @endcode + + This member function is thread-safe. + */ + template + auto async(F&& func); + + /** + @brief similar to tf::Executor::async but does not return a future object + + @tparam F callable type + + @param params task parameters + @param func callable object + + The method creates a parameterized asynchronous task + to run the given function without returning any @std_future object. + This member function is more efficient than tf::Executor::async + and is encouraged to use when applications do not need a @std_future to acquire + the result or synchronize the execution. + + @code{.cpp} + executor.silent_async("name", [](){ + std::cout << "create an asynchronous task with a name and no return\n"; + }); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template + void silent_async(P&& params, F&& func); + + /** + @brief similar to tf::Executor::async but does not return a future object + + @tparam F callable type + + @param func callable object + + The method creates an asynchronous task + to run the given function without returning any @std_future object. + This member function is more efficient than tf::Executor::async + and is encouraged to use when applications do not need a @std_future to acquire + the result or synchronize the execution. + + @code{.cpp} + executor.silent_async([](){ + std::cout << "create an asynchronous task with no return\n"; + }); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. 
+ */ + template + void silent_async(F&& func); + + // -------------------------------------------------------------------------- + // Silent Dependent Async Methods + // -------------------------------------------------------------------------- + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }); + executor.silent_dependent_async([](){ printf("C runs after A and B\n"); }, A, B); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template ...>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(F&& func, Tasks&&... tasks); + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param params task parameters + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); }); + executor.silent_dependent_async( + "C", [](){ printf("C runs after A and B\n"); }, A, B + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template && all_same_v...>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(P&& params, F&& func, Tasks&&... tasks); + + /** + @brief runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + + @code{.cpp} + std::array array { + executor.silent_dependent_async([](){ printf("A\n"); }), + executor.silent_dependent_async([](){ printf("B\n"); }) + }; + executor.silent_dependent_async( + [](){ printf("C runs after A and B\n"); }, array.begin(), array.end() + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. 
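+
+  The range-based overload is particularly handy when the number of
+  dependents is only known at runtime, for instance when the handles are
+  accumulated in a container (a small sketch):
+
+  @code{.cpp}
+  std::vector<tf::AsyncTask> deps;
+  for(int i=0; i<4; i++) {
+    deps.push_back(executor.silent_dependent_async([i](){ printf("%d\n", i); }));
+  }
+  executor.silent_dependent_async(
+    [](){ printf("runs last\n"); }, deps.begin(), deps.end()
+  );
+  executor.wait_for_all();
+  @endcode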
+  */
+  template <typename F, typename I,
+    std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+  >
+  tf::AsyncTask silent_dependent_async(F&& func, I first, I last);
+
+  /**
+  @brief runs the given function asynchronously
+         when the given range of dependents finish
+
+  @tparam F callable type
+  @tparam I iterator type
+
+  @param params task parameters
+  @param func callable object
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+
+  @return a tf::AsyncTask handle
+
+  This member function is more efficient than tf::Executor::dependent_async
+  and is encouraged when you do not want a @std_future to
+  acquire the result or synchronize the execution.
+  The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+  Assigned task names will appear in the observers of the executor.
+
+  @code{.cpp}
+  std::array<tf::AsyncTask, 2> array {
+    executor.silent_dependent_async("A", [](){ printf("A\n"); }),
+    executor.silent_dependent_async("B", [](){ printf("B\n"); })
+  };
+  executor.silent_dependent_async(
+    "C", [](){ printf("C runs after A and B\n"); }, array.begin(), array.end()
+  );
+  executor.wait_for_all();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template <typename P, typename F, typename I,
+    std::enable_if_t<is_task_params_v<P> && !std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+  >
+  tf::AsyncTask silent_dependent_async(P&& params, F&& func, I first, I last);
+
+  // --------------------------------------------------------------------------
+  // Dependent Async Methods
+  // --------------------------------------------------------------------------
+
+  /**
+  @brief runs the given function asynchronously
+         when the given dependents finish
+
+  @tparam F callable type
+  @tparam Tasks task types convertible to tf::AsyncTask
+
+  @param func callable object
+  @param tasks asynchronous tasks on which this execution depends
+
+  @return a pair of a tf::AsyncTask handle and
+          a @std_future that holds the result of the execution
+
+  The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+  Task @c C returns a pair of its tf::AsyncTask handle and a std::future
+  that eventually will hold the result of the execution.
+
+  @code{.cpp}
+  tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); });
+  tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); });
+  auto [C, fuC] = executor.dependent_async(
+    [](){
+      printf("C runs after A and B\n");
+      return 1;
+    },
+    A, B
+  );
+  fuC.get();  // C finishes, which in turn means both A and B finish
+  @endcode
+
+  You can mix the use of tf::AsyncTask handles
+  returned by Executor::dependent_async and Executor::silent_dependent_async
+  when specifying task dependencies.
+
+  This member function is thread-safe.
+  */
+  template <typename F, typename... Tasks,
+    std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr
+  >
+  auto dependent_async(F&& func, Tasks&&... tasks);
+
+  /**
+  @brief runs the given function asynchronously
+         when the given dependents finish
+
+  @tparam P task parameters type
+  @tparam F callable type
+  @tparam Tasks task types convertible to tf::AsyncTask
+
+  @param params task parameters
+  @param func callable object
+  @param tasks asynchronous tasks on which this execution depends
+
+  @return a pair of a tf::AsyncTask handle and
+          a @std_future that holds the result of the execution
+
+  The example below creates three named asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+  Task @c C returns a pair of its tf::AsyncTask handle and a std::future
+  that eventually will hold the result of the execution.
+  Assigned task names will appear in the observers of the executor.
+
+  @code{.cpp}
+  tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); });
+  tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); });
+  auto [C, fuC] = executor.dependent_async(
+    "C",
+    [](){
+      printf("C runs after A and B\n");
+      return 1;
+    },
+    A, B
+  );
+  assert(fuC.get()==1);  // C finishes, which in turn means both A and B finish
+  @endcode
+
+  You can mix the use of tf::AsyncTask handles
+  returned by Executor::dependent_async and Executor::silent_dependent_async
+  when specifying task dependencies.
+
+  This member function is thread-safe.
+  */
+  template <typename P, typename F, typename... Tasks,
+    std::enable_if_t<is_task_params_v<P> && all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr
+  >
+  auto dependent_async(P&& params, F&& func, Tasks&&... tasks);
+
+  /**
+  @brief runs the given function asynchronously
+         when the given range of dependents finish
+
+  @tparam F callable type
+  @tparam I iterator type
+
+  @param func callable object
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+
+  @return a pair of a tf::AsyncTask handle and
+          a @std_future that holds the result of the execution
+
+  The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+  Task @c C returns a pair of its tf::AsyncTask handle and a std::future
+  that eventually will hold the result of the execution.
+
+  @code{.cpp}
+  std::array<tf::AsyncTask, 2> array {
+    executor.silent_dependent_async([](){ printf("A\n"); }),
+    executor.silent_dependent_async([](){ printf("B\n"); })
+  };
+  auto [C, fuC] = executor.dependent_async(
+    [](){
+      printf("C runs after A and B\n");
+      return 1;
+    },
+    array.begin(), array.end()
+  );
+  assert(fuC.get()==1);  // C finishes, which in turn means both A and B finish
+  @endcode
+
+  You can mix the use of tf::AsyncTask handles
+  returned by Executor::dependent_async and Executor::silent_dependent_async
+  when specifying task dependencies.
+
+  This member function is thread-safe.
+  */
+  template <typename F, typename I,
+    std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+  >
+  auto dependent_async(F&& func, I first, I last);
+
+  /**
+  @brief runs the given function asynchronously
+         when the given range of dependents finish
+
+  @tparam P task parameters type
+  @tparam F callable type
+  @tparam I iterator type
+
+  @param params task parameters
+  @param func callable object
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+
+  @return a pair of a tf::AsyncTask handle and
+          a @std_future that holds the result of the execution
+
+  The example below creates three named asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+  Task @c C returns a pair of its tf::AsyncTask handle and a std::future
+  that eventually will hold the result of the execution.
+  Assigned task names will appear in the observers of the executor.
+
+  @code{.cpp}
+  std::array<tf::AsyncTask, 2> array {
+    executor.silent_dependent_async("A", [](){ printf("A\n"); }),
+    executor.silent_dependent_async("B", [](){ printf("B\n"); })
+  };
+  auto [C, fuC] = executor.dependent_async(
+    "C",
+    [](){
+      printf("C runs after A and B\n");
+      return 1;
+    },
+    array.begin(), array.end()
+  );
+  assert(fuC.get()==1);  // C finishes, which in turn means both A and B finish
+  @endcode
+
+  You can mix the use of tf::AsyncTask handles
+  returned by Executor::dependent_async and Executor::silent_dependent_async
+  when specifying task dependencies.
+
+  This member function is thread-safe.
+  */
+  template <typename P, typename F, typename I,
+    std::enable_if_t<is_task_params_v<P> && !std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+  >
+  auto dependent_async(P&& params, F&& func, I first, I last);
+
+  private:
+
+  const size_t _MAX_STEALS;
+
+  std::mutex _wsq_mutex;
+  std::mutex _taskflows_mutex;
+
+  std::vector<std::thread> _threads;
+  std::vector<Worker> _workers;
+
+#ifdef __cpp_lib_atomic_wait
+  std::atomic<size_t> _num_topologies {0};
+  std::atomic_flag _all_spawned = ATOMIC_FLAG_INIT;
+
+  std::atomic_flag _done = ATOMIC_FLAG_INIT;
+  std::atomic<uint64_t> _state {0ull};
+  static const uint64_t _EPOCH_INC{1ull << 32};
+  static const uint64_t _NUM_WAITERS_MASK{(1ull << 32) - 1};
+  static const uint64_t _NUM_WAITERS_INC{1ull};
+#else
+  std::condition_variable _topology_cv;
+  std::mutex _topology_mutex;
+  size_t _num_topologies {0};
+  Notifier _notifier;
+  std::atomic<bool> _done {0};
+#endif
+
+  std::unordered_map<std::thread::id, size_t> _wids;
+  std::list<Taskflow> _taskflows;
+
+  TaskQueue<Node*> _wsq;
+
+  std::unordered_set<std::shared_ptr<ObserverInterface>> _observers;
+
+  Worker* _this_worker();
+
+  bool _wait_for_task(Worker&, Node*&);
+  bool _invoke_module_task_internal(Worker&, Node*);
+
+  void _observer_prologue(Worker&, Node*);
+  void _observer_epilogue(Worker&, Node*);
+  void _spawn(size_t);
+  void _exploit_task(Worker&, Node*&);
+  void _explore_task(Worker&, Node*&);
+  void _schedule(Worker&, Node*);
+  void _schedule(Node*);
+  void _schedule(Worker&, const SmallVector<Node*>&);
+  void _schedule(const SmallVector<Node*>&);
+  void _set_up_topology(Worker*, Topology*);
+  void _set_up_graph(Graph&, Node*, Topology*, int, SmallVector<Node*>&);
+  void _tear_down_topology(Worker&, Topology*);
+  void _tear_down_async(Node*);
+  void _tear_down_dependent_async(Worker&, Node*);
+  void _tear_down_invoke(Worker&, Node*);
+  void _increment_topology();
+  void _decrement_topology();
+  void _invoke(Worker&, Node*);
+  void _invoke_static_task(Worker&, Node*);
+  void _invoke_subflow_task(Worker&, Node*);
+  void _detach_subflow_task(Worker&, Node*, Graph&);
+  void _invoke_condition_task(Worker&, Node*, SmallVector<int>&);
+  void _invoke_multi_condition_task(Worker&, Node*, SmallVector<int>&);
+  void _invoke_module_task(Worker&, Node*);
+  void _invoke_async_task(Worker&, Node*);
+  void _invoke_dependent_async_task(Worker&, Node*);
+  void _process_async_dependent(Node*, tf::AsyncTask&, size_t&);
+  void _process_exception(Worker&, Node*);
+  void _schedule_async_task(Node*);
+  void _corun_graph(Worker&, Node*, Graph&);
+
+  template <typename P>
+  void _corun_until(Worker&, P&&);
+};
+
+// Constructor
+inline Executor::Executor(size_t N) :
+  _MAX_STEALS {((N+1) << 1)},
+  _threads    {N},
+  _workers    {N}
+#ifndef __cpp_lib_atomic_wait
+  ,_notifier  {N}
+#endif
+{
+
+  if(N == 0) {
+    TF_THROW("executor must define at least one worker");
+  }
+
+  _spawn(N);
+
+  // initialize the default observer if requested
+  if(has_env(TF_ENABLE_PROFILER)) {
+    TFProfManager::get()._manage(make_observer<TFProfObserver>());
+  }
+}
+
+// Destructor
+inline Executor::~Executor() {
+
+  // wait for all topologies to complete
+  wait_for_all();
+
+  // shut down the
scheduler + +#ifdef __cpp_lib_atomic_wait + _done.test_and_set(std::memory_order_relaxed); + _state.fetch_add(_EPOCH_INC, std::memory_order_release); + _state.notify_all(); +#else + _done = true; + _notifier.notify(true); +#endif + + for(auto& t : _threads) { + t.join(); + } +} + +// Function: num_workers +inline size_t Executor::num_workers() const noexcept { + return _workers.size(); +} + +// Function: num_topologies +inline size_t Executor::num_topologies() const { +#ifdef __cpp_lib_atomic_wait + return _num_topologies.load(std::memory_order_relaxed); +#else + return _num_topologies; +#endif +} + +// Function: num_taskflows +inline size_t Executor::num_taskflows() const { + return _taskflows.size(); +} + +// Function: _this_worker +inline Worker* Executor::_this_worker() { + auto itr = _wids.find(std::this_thread::get_id()); + return itr == _wids.end() ? nullptr : &_workers[itr->second]; +} + +// Function: this_worker_id +inline int Executor::this_worker_id() const { + auto i = _wids.find(std::this_thread::get_id()); + return i == _wids.end() ? -1 : static_cast(_workers[i->second]._id); +} + +// Procedure: _spawn +inline void Executor::_spawn(size_t N) { + +#ifdef __cpp_lib_atomic_wait +#else + std::mutex mutex; + std::condition_variable cond; + size_t n=0; +#endif + + for(size_t id=0; id lock(mutex); + cond.wait(lock, [&](){ return n==N; }); +#endif +} + +// Function: _corun_until +template +void Executor::_corun_until(Worker& w, P&& stop_predicate) { + + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + exploit: + + while(!stop_predicate()) { + + //exploit: + + if(auto t = w._wsq.pop(); t) { + _invoke(w, t); + } + else { + size_t num_steals = 0; + + explore: + + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + _invoke(w, t); + goto exploit; + } + else if(!stop_predicate()) { + if(num_steals++ > _MAX_STEALS) { + std::this_thread::yield(); + } + w._vtm = rdvtm(w._rdgen); + goto explore; + } + else { + break; + } + } + } +} + +// Function: _explore_task +inline void Executor::_explore_task(Worker& w, Node*& t) { + + //assert(_workers[w].wsq.empty()); + //assert(!t); + + size_t num_steals = 0; + size_t num_yields = 0; + + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + // Here, we write do-while to make the worker steal at once + // from the assigned victim. + do { + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + break; + } + + if(num_steals++ > _MAX_STEALS) { + std::this_thread::yield(); + if(num_yields++ > 100) { + break; + } + } + + w._vtm = rdvtm(w._rdgen); + } +#ifdef __cpp_lib_atomic_wait + // the _DONE can be checked later in wait_for_task? + while(!_done.test(std::memory_order_relaxed)); +#else + while(!_done); +#endif + +} + +// Procedure: _exploit_task +inline void Executor::_exploit_task(Worker& w, Node*& t) { + while(t) { + _invoke(w, t); + t = w._wsq.pop(); + } +} + +// Function: _wait_for_task +inline bool Executor::_wait_for_task(Worker& worker, Node*& t) { + + explore_task: + + _explore_task(worker, t); + + if(t) { + return true; + } + + // The last thief who successfully stole a task will wake up + // another thief worker to avoid starvation. +// if(t) { +//#ifdef __cpp_lib_atomic_wait +// +//#else +// _notifier.notify(false); +//#endif +// return true; +// } + +#ifdef __cpp_lib_atomic_wait + for(uint64_t cur_state = _state.load(std::memory_order_acquire);;) { + + uint64_t new_state = cur_state + _NUM_WAITERS_INC; + + // TODO: CAS with relaxed?? 
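+    // The CAS below registers this worker as a waiter by bumping the
+    // waiter count kept in the low 32 bits of _state; the high 32 bits
+    // form an epoch that notifiers bump on every wake-up. After
+    // registering, we re-check the shutdown flag and all task queues
+    // before blocking on _state.wait, so a notification issued in
+    // between advances the epoch and makes the wait return immediately.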
+    if(_state.compare_exchange_weak(cur_state, new_state, std::memory_order_acquire)) {
+
+      if(_done.test(std::memory_order_relaxed)) {
+        _state.fetch_sub(_NUM_WAITERS_INC, std::memory_order_relaxed);
+        //_state.fetch_add(_EPOCH_INC, std::memory_order_release);
+        //_state.notify_all();
+        return false;
+      }
+
+      if(!_wsq.empty()) {
+        worker._vtm = worker._id;
+        _state.fetch_sub(_NUM_WAITERS_INC, std::memory_order_relaxed);
+        goto explore_task;
+      }
+
+      // We need to use index-based scanning to avoid a data race
+      // with _spawn which may initialize a worker at the same time.
+      for(size_t vtm=0; vtm<_workers.size(); vtm++) {
+        if(!_workers[vtm]._wsq.empty()) {
+          worker._vtm = vtm;
+          _state.fetch_sub(_NUM_WAITERS_INC, std::memory_order_relaxed);
+          goto explore_task;
+        }
+      }
+
+      _state.wait(new_state, std::memory_order_relaxed);
+      _state.fetch_sub(_NUM_WAITERS_INC, std::memory_order_relaxed);
+      goto explore_task;
+    }
+  }
+#else
+  // ---- 2PC guard ----
+  _notifier.prepare_wait(worker._waiter);
+
+  if(!_wsq.empty()) {
+    _notifier.cancel_wait(worker._waiter);
+    worker._vtm = worker._id;
+    goto explore_task;
+  }
+
+  if(_done) {
+    _notifier.cancel_wait(worker._waiter);
+    _notifier.notify(true);
+    return false;
+  }
+
+  // We need to use index-based scanning to avoid a data race
+  // with _spawn which may initialize a worker at the same time.
+  for(size_t vtm=0; vtm<_workers.size(); vtm++) {
+    if(!_workers[vtm]._wsq.empty()) {
+      _notifier.cancel_wait(worker._waiter);
+      worker._vtm = vtm;
+      goto explore_task;
+    }
+  }
+
+  // Now I really need to relinquish myself to others
+  _notifier.commit_wait(worker._waiter);
+  goto explore_task;
+#endif
+
+}
+
+// Function: make_observer
+template <typename Observer, typename... ArgsT>
+std::shared_ptr<Observer> Executor::make_observer(ArgsT&&... args) {
+
+  static_assert(
+    std::is_base_of_v<ObserverInterface, Observer>,
+    "Observer must be derived from ObserverInterface"
+  );
+
+  // use a local variable to mimic the constructor
+  auto ptr = std::make_shared<Observer>(std::forward<ArgsT>(args)...);
+
+  ptr->set_up(_workers.size());
+
+  _observers.emplace(std::static_pointer_cast<ObserverInterface>(ptr));
+
+  return ptr;
+}
+
+// Procedure: remove_observer
+template <typename Observer>
+void Executor::remove_observer(std::shared_ptr<Observer> ptr) {
+
+  static_assert(
+    std::is_base_of_v<ObserverInterface, Observer>,
+    "Observer must be derived from ObserverInterface"
+  );
+
+  _observers.erase(std::static_pointer_cast<ObserverInterface>(ptr));
+}
+
+// Function: num_observers
+inline size_t Executor::num_observers() const noexcept {
+  return _observers.size();
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Worker& worker, Node* node) {
+
+  // We need to fetch p before the release such that the read
+  // operation is synchronized properly with other threads to
+  // avoid a data race.
+  auto p = node->_priority;
+
+  node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+  // caller is a worker to this pool - starting at v3.5 we do not use
+  // any complicated notification mechanism as the experimental result
+  // has shown no significant advantage.
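+  // The fast path below pushes to the caller's own queue and wakes at
+  // most one sleeping worker, and only when the waiter count encoded in
+  // the low 32 bits of _state is nonzero; the common no-waiter case thus
+  // skips the fetch_add and notification entirely.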
+  if(worker._executor == this) {
+    worker._wsq.push(node, p);
+#ifdef __cpp_lib_atomic_wait
+    // we load the state first as load is much faster than fetch_add
+    if((_state.load(std::memory_order_acquire) & _NUM_WAITERS_MASK) != 0) {
+      _state.fetch_add(_EPOCH_INC, std::memory_order_release);
+      _state.notify_one();
+    }
+#else
+    _notifier.notify(false);
+#endif
+    return;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    _wsq.push(node, p);
+  }
+#ifdef __cpp_lib_atomic_wait
+  // we load the state first as load is much faster than fetch_add
+  if((_state.load(std::memory_order_acquire) & _NUM_WAITERS_MASK) != 0) {
+    _state.fetch_add(_EPOCH_INC, std::memory_order_release);
+    _state.notify_one();
+  }
+#else
+  _notifier.notify(false);
+#endif
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Node* node) {
+
+  // We need to fetch p before the release such that the read
+  // operation is synchronized properly with other threads to
+  // avoid a data race.
+  auto p = node->_priority;
+
+  node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    _wsq.push(node, p);
+  }
+
+#ifdef __cpp_lib_atomic_wait
+  // we load the state first as load is much faster than fetch_add
+  if((_state.load(std::memory_order_acquire) & _NUM_WAITERS_MASK) != 0) {
+    _state.fetch_add(_EPOCH_INC, std::memory_order_release);
+    _state.notify_one();
+  }
+#else
+  _notifier.notify(false);
+#endif
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Worker& worker, const SmallVector<Node*>& nodes) {
+
+  // We need to catch the node count to avoid accessing the nodes
+  // vector while the parent topology is removed!
+  const auto num_nodes = nodes.size();
+
+  if(num_nodes == 0) {
+    return;
+  }
+
+  // caller is a worker to this pool - starting at v3.5 we do not use
+  // any complicated notification mechanism as the experimental result
+  // has shown no significant advantage.
+  if(worker._executor == this) {
+    for(size_t i=0; i<num_nodes; ++i) {
+      auto p = nodes[i]->_priority;
+      nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release);
+      worker._wsq.push(nodes[i], p);
+#ifdef __cpp_lib_atomic_wait
+      // we load the state first as load is much faster than fetch_add
+      if((_state.load(std::memory_order_acquire) & _NUM_WAITERS_MASK) != 0) {
+        _state.fetch_add(_EPOCH_INC, std::memory_order_release);
+        _state.notify_one();
+      }
+#else
+      _notifier.notify(false);
+#endif
+    }
+    return;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    for(size_t k=0; k<num_nodes; ++k) {
+      auto p = nodes[k]->_priority;
+      nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
+      _wsq.push(nodes[k], p);
+    }
+  }
+#ifdef __cpp_lib_atomic_wait
+  uint64_t num_waiters = (_state.fetch_add(_EPOCH_INC, std::memory_order_release) & _NUM_WAITERS_MASK);
+  if(num_nodes < num_waiters) {
+    for(uint64_t k = 0; k < num_waiters - num_nodes; ++k) {
+      _state.notify_one();
+    }
+  }
+  else {
+    _state.notify_all();
+  }
+#else
+  _notifier.notify_n(num_nodes);
+#endif
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(const SmallVector<Node*>& nodes) {
+
+  // parent topology may be removed!
+  const auto num_nodes = nodes.size();
+
+  if(num_nodes == 0) {
+    return;
+  }
+
+  // We need to fetch p before the release such that the read
+  // operation is synchronized properly with other threads to
+  // avoid a data race.
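+  // After publishing the whole batch under the queue lock, a single epoch
+  // bump is performed and registered waiters are then woken either through
+  // individual notify_one calls or one notify_all, depending on how many
+  // waiters exist relative to the number of pushed nodes.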
+ { + std::lock_guard lock(_wsq_mutex); + for(size_t k=0; k_priority; + nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release); + _wsq.push(nodes[k], p); + } + } + +#ifdef __cpp_lib_atomic_wait + uint64_t num_waiters = (_state.fetch_add(_EPOCH_INC, std::memory_order_release) & _NUM_WAITERS_MASK); + if(num_nodes < num_waiters) { + for(uint64_t k = 0; k < num_waiters - num_nodes; ++k) { + _state.notify_one(); + } + } + else { + _state.notify_all(); + } +#else + _notifier.notify_n(num_nodes); +#endif +} + +// Procedure: _invoke +inline void Executor::_invoke(Worker& worker, Node* node) { + + // synchronize all outstanding memory operations caused by reordering + while(!(node->_state.load(std::memory_order_acquire) & Node::READY)); + + begin_invoke: + + SmallVector conds; + + // no need to do other things if the topology is cancelled + if(node->_is_cancelled()) { + _tear_down_invoke(worker, node); + return; + } + + // if acquiring semaphore(s) exists, acquire them first + if(node->_semaphores && !node->_semaphores->to_acquire.empty()) { + SmallVector nodes; + if(!node->_acquire_all(nodes)) { + _schedule(worker, nodes); + return; + } + node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release); + } + + // condition task + //int cond = -1; + + // switch is faster than nested if-else due to jump table + switch(node->_handle.index()) { + // static task + case Node::STATIC:{ + _invoke_static_task(worker, node); + } + break; + + // subflow task + case Node::SUBFLOW: { + _invoke_subflow_task(worker, node); + } + break; + + // condition task + case Node::CONDITION: { + _invoke_condition_task(worker, node, conds); + } + break; + + // multi-condition task + case Node::MULTI_CONDITION: { + _invoke_multi_condition_task(worker, node, conds); + } + break; + + // module task + case Node::MODULE: { + _invoke_module_task(worker, node); + } + break; + + // async task + case Node::ASYNC: { + _invoke_async_task(worker, node); + _tear_down_async(node); + return ; + } + break; + + // dependent async task + case Node::DEPENDENT_ASYNC: { + _invoke_dependent_async_task(worker, node); + _tear_down_dependent_async(worker, node); + if(worker._cache) { + node = worker._cache; + goto begin_invoke; + } + return; + } + break; + + // monostate (placeholder) + default: + break; + } + + //invoke_successors: + + // if releasing semaphores exist, release them + if(node->_semaphores && !node->_semaphores->to_release.empty()) { + _schedule(worker, node->_release_all()); + } + + // Reset the join counter to support the cyclic control flow. + // + We must do this before scheduling the successors to avoid race + // condition on _dependents. + // + We must use fetch_add instead of direct assigning + // because the user-space call on "invoke" may explicitly schedule + // this task again (e.g., pipeline) which can access the join_counter. + if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) { + node->_join_counter.fetch_add(node->num_strong_dependents(), std::memory_order_relaxed); + } + else { + node->_join_counter.fetch_add(node->num_dependents(), std::memory_order_relaxed); + } + + // acquire the parent flow counter + auto& j = (node->_parent) ? 
node->_parent->_join_counter :
+           node->_topology->_join_counter;
+
+  // Here, we want to cache the latest successor with the highest priority
+  worker._cache = nullptr;
+  auto max_p = static_cast<unsigned>(TaskPriority::MAX);
+
+  // Invoke the task based on the corresponding type
+  switch(node->_handle.index()) {
+
+    // condition and multi-condition tasks
+    case Node::CONDITION:
+    case Node::MULTI_CONDITION: {
+      for(auto cond : conds) {
+        if(cond >= 0 && static_cast<size_t>(cond) < node->_successors.size()) {
+          auto s = node->_successors[cond];
+          // zeroing the join counter for invariant
+          s->_join_counter.store(0, std::memory_order_relaxed);
+          j.fetch_add(1, std::memory_order_relaxed);
+          if(s->_priority <= max_p) {
+            if(worker._cache) {
+              _schedule(worker, worker._cache);
+            }
+            worker._cache = s;
+            max_p = s->_priority;
+          }
+          else {
+            _schedule(worker, s);
+          }
+        }
+      }
+    }
+    break;
+
+    // non-condition task
+    default: {
+      for(size_t i=0; i<node->_successors.size(); ++i) {
+        //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) {
+        if(auto s = node->_successors[i];
+           s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+          j.fetch_add(1, std::memory_order_relaxed);
+          if(s->_priority <= max_p) {
+            if(worker._cache) {
+              _schedule(worker, worker._cache);
+            }
+            worker._cache = s;
+            max_p = s->_priority;
+          }
+          else {
+            _schedule(worker, s);
+          }
+        }
+      }
+    }
+    break;
+  }
+
+  // tear down the invocation
+  _tear_down_invoke(worker, node);
+
+  // perform tail recursion elimination for the right-most child to reduce
+  // the number of expensive pop/push operations through the task queue
+  if(worker._cache) {
+    node = worker._cache;
+    //node->_state.fetch_or(Node::READY, std::memory_order_release);
+    goto begin_invoke;
+  }
+}
+
+// Procedure: _tear_down_invoke
+inline void Executor::_tear_down_invoke(Worker& worker, Node* node) {
+  // we must check the parent first before subtracting the join counter,
+  // or it can introduce a data race
+  if(auto parent = node->_parent; parent == nullptr) {
+    if(node->_topology->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+      _tear_down_topology(worker, node->_topology);
+    }
+  }
+  // Here we assume the parent is in a busy loop (e.g., corun) waiting for
+  // its join counter to become 0.
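+  // A release store suffices for the decrement below: the parent's corun
+  // loop reads the counter with std::memory_order_acquire, which pairs
+  // with this release to order the child's writes before the join returns.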
+ else { + //parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel); + parent->_join_counter.fetch_sub(1, std::memory_order_release); + } + //// module task + //else { + // auto id = parent->_handle.index(); + // if(parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { + // if(id == Node::MODULE) { + // return parent; + // } + // } + //} + //return nullptr; +} + +// Procedure: _observer_prologue +inline void Executor::_observer_prologue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_entry(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _observer_epilogue +inline void Executor::_observer_epilogue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_exit(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _process_exception +inline void Executor::_process_exception(Worker&, Node* node) { + + constexpr static auto flag = Topology::EXCEPTION | Topology::CANCELLED; + + // if the node has a parent, we store the exception in its parent + if(auto parent = node->_parent; parent) { + if ((parent->_state.fetch_or(Node::EXCEPTION, std::memory_order_relaxed) & Node::EXCEPTION) == 0) { + parent->_exception_ptr = std::current_exception(); + } + // TODO if the node has a topology, cancel it to enable early stop + //if(auto tpg = node->_topology; tpg) { + // tpg->_state.fetch_or(Topology::CANCELLED, std::memory_order_relaxed); + //} + } + // multiple tasks may throw, so we only take the first thrown exception + else if(auto tpg = node->_topology; tpg && + ((tpg->_state.fetch_or(flag, std::memory_order_relaxed) & Topology::EXCEPTION) == 0) + ) { + tpg->_exception_ptr = std::current_exception(); + } + // TODO: skip the exception that is not associated with any taskflows +} + +// Procedure: _invoke_static_task +inline void Executor::_invoke_static_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + std::get_if<1>(&work)->operator()(rt); + node->_process_exception(); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_subflow_task +inline void Executor::_invoke_subflow_task(Worker& w, Node* node) { + _observer_prologue(w, node); + TF_EXECUTOR_EXCEPTION_HANDLER(w, node, { + auto handle = std::get_if(&node->_handle); + handle->subgraph._clear(); + Subflow sf(*this, w, node, handle->subgraph); + handle->work(sf); + if(sf._joinable) { + _corun_graph(w, node, handle->subgraph); + } + node->_process_exception(); + }); + _observer_epilogue(w, node); +} + +// Procedure: _detach_subflow_task +inline void Executor::_detach_subflow_task(Worker& w, Node* p, Graph& g) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) { + return; + } + + SmallVector src; + _set_up_graph(g, nullptr, p->_topology, Node::DETACHED, src); + + { + std::lock_guard lock(p->_topology->_taskflow._mutex); + p->_topology->_taskflow._graph._merge(std::move(g)); + } + + p->_topology->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); + _schedule(w, src); +} + +// Procedure: _corun_graph +inline void Executor::_corun_graph(Worker& w, Node* p, Graph& g) { + + // assert(p); + + // graph is empty and has no async tasks (subflow) + if(g.empty() && 
p->_join_counter.load(std::memory_order_acquire) == 0) { + return; + } + + SmallVector src; + + _set_up_graph(g, p, p->_topology, 0, src); + p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); + + _schedule(w, src); + + _corun_until(w, [p] () -> bool { + return p->_join_counter.load(std::memory_order_acquire) == 0; } + ); +} + +// Procedure: _invoke_condition_task +inline void Executor::_invoke_condition_task( + Worker& worker, Node* node, SmallVector& conds +) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + conds = { std::get_if<0>(&work)->operator()() }; + break; + + case 1: + Runtime rt(*this, worker, node); + conds = { std::get_if<1>(&work)->operator()(rt) }; + node->_process_exception(); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_multi_condition_task +inline void Executor::_invoke_multi_condition_task( + Worker& worker, Node* node, SmallVector& conds +) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + conds = std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + conds = std::get_if<1>(&work)->operator()(rt); + node->_process_exception(); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_module_task +inline void Executor::_invoke_module_task(Worker& w, Node* node) { + _observer_prologue(w, node); + TF_EXECUTOR_EXCEPTION_HANDLER(w, node, { + _corun_graph(w, node, std::get_if(&node->_handle)->graph); + node->_process_exception(); + }); + _observer_epilogue(w, node); +} + +//// Function: _invoke_module_task_internal +//inline bool Executor::_invoke_module_task_internal(Worker& w, Node* p) { +// +// // acquire the underlying graph +// auto& g = std::get_if(&p->_handle)->graph; +// +// // no need to do anything if the graph is empty +// if(g.empty()) { +// return false; +// } +// +// SmallVector src; +// _set_up_graph(g, p, p->_topology, 0, src); +// p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); +// +// _schedule(w, src); +// return true; +//} + +// Procedure: _invoke_async_task +inline void Executor::_invoke_async_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + std::get_if<1>(&work)->operator()(rt); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_dependent_async_task +inline void Executor::_invoke_dependent_async_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + std::get_if<1>(&work)->operator()(rt); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Function: run +inline tf::Future Executor::run(Taskflow& f) { + return run_n(f, 1, [](){}); +} + +// Function: run +inline tf::Future Executor::run(Taskflow&& f) { + return run_n(std::move(f), 1, [](){}); +} + +// Function: run +template +tf::Future Executor::run(Taskflow& f, C&& c) { + return run_n(f, 1, 
std::forward(c)); +} + +// Function: run +template +tf::Future Executor::run(Taskflow&& f, C&& c) { + return run_n(std::move(f), 1, std::forward(c)); +} + +// Function: run_n +inline tf::Future Executor::run_n(Taskflow& f, size_t repeat) { + return run_n(f, repeat, [](){}); +} + +// Function: run_n +inline tf::Future Executor::run_n(Taskflow&& f, size_t repeat) { + return run_n(std::move(f), repeat, [](){}); +} + +// Function: run_n +template +tf::Future Executor::run_n(Taskflow& f, size_t repeat, C&& c) { + return run_until( + f, [repeat]() mutable { return repeat-- == 0; }, std::forward(c) + ); +} + +// Function: run_n +template +tf::Future Executor::run_n(Taskflow&& f, size_t repeat, C&& c) { + return run_until( + std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward(c) + ); +} + +// Function: run_until +template +tf::Future Executor::run_until(Taskflow& f, P&& pred) { + return run_until(f, std::forward
<P>
      (pred), [](){}); +} + +// Function: run_until +template +tf::Future Executor::run_until(Taskflow&& f, P&& pred) { + return run_until(std::move(f), std::forward
<P>(pred), [](){});
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& p, C&& c) {
+
+  _increment_topology();
+
+  // Need to check emptiness under the lock since a subflow task may
+  // define detached blocks that modify the taskflow at the same time
+  bool empty;
+  {
+    std::lock_guard<std::mutex> lock(f._mutex);
+    empty = f.empty();
+  }
+
+  // No need to create a real topology; just return a dummy future
+  if(empty || p()) {
+    c();
+    std::promise<void> promise;
+    promise.set_value();
+    _decrement_topology();
+    return tf::Future<void>(promise.get_future());
+  }
+
+  // create a topology for this run
+  auto t = std::make_shared<Topology>(f, std::forward
<P>
      (p), std::forward(c)); + + // need to create future before the topology got torn down quickly + tf::Future future(t->_promise.get_future(), t); + + // modifying topology needs to be protected under the lock + { + std::lock_guard lock(f._mutex); + f._topologies.push(t); + if(f._topologies.size() == 1) { + _set_up_topology(_this_worker(), t.get()); + } + } + + return future; +} + +// Function: run_until +template +tf::Future Executor::run_until(Taskflow&& f, P&& pred, C&& c) { + + std::list::iterator itr; + + { + std::scoped_lock lock(_taskflows_mutex); + itr = _taskflows.emplace(_taskflows.end(), std::move(f)); + itr->_satellite = itr; + } + + return run_until(*itr, std::forward
<P>
      (pred), std::forward(c)); +} + +// Function: corun +template +void Executor::corun(T& target) { + + auto w = _this_worker(); + + if(w == nullptr) { + TF_THROW("corun must be called by a worker of the executor"); + } + + Node parent; // auxiliary parent + _corun_graph(*w, &parent, target.graph()); + parent._process_exception(); +} + +// Function: corun_until +template +void Executor::corun_until(P&& predicate) { + + auto w = _this_worker(); + + if(w == nullptr) { + TF_THROW("corun_until must be called by a worker of the executor"); + } + + _corun_until(*w, std::forward
<P>
      (predicate)); + + // TODO: exception? +} + +// Procedure: _increment_topology +inline void Executor::_increment_topology() { +#ifdef __cpp_lib_atomic_wait + _num_topologies.fetch_add(1, std::memory_order_relaxed); +#else + std::lock_guard lock(_topology_mutex); + ++_num_topologies; +#endif +} + +// Procedure: _decrement_topology +inline void Executor::_decrement_topology() { +#ifdef __cpp_lib_atomic_wait + if(_num_topologies.fetch_sub(1, std::memory_order_acq_rel) == 1) { + _num_topologies.notify_all(); + } +#else + std::lock_guard lock(_topology_mutex); + if(--_num_topologies == 0) { + _topology_cv.notify_all(); + } +#endif +} + +// Procedure: wait_for_all +inline void Executor::wait_for_all() { +#ifdef __cpp_lib_atomic_wait + size_t n = _num_topologies.load(std::memory_order_acquire); + while(n != 0) { + _num_topologies.wait(n, std::memory_order_acquire); + n = _num_topologies.load(std::memory_order_acquire); + } +#else + std::unique_lock lock(_topology_mutex); + _topology_cv.wait(lock, [&](){ return _num_topologies == 0; }); +#endif +} + +// Function: _set_up_topology +inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) { + + // ---- under taskflow lock ---- + + tpg->_sources.clear(); + tpg->_taskflow._graph._clear_detached(); + _set_up_graph(tpg->_taskflow._graph, nullptr, tpg, 0, tpg->_sources); + tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); + + if(worker) { + _schedule(*worker, tpg->_sources); + } + else { + _schedule(tpg->_sources); + } +} + +// Function: _set_up_graph +inline void Executor::_set_up_graph( + Graph& g, Node* parent, Topology* tpg, int state, SmallVector& src +) { + for(auto node : g._nodes) { + node->_topology = tpg; + node->_parent = parent; + node->_state.store(state, std::memory_order_relaxed); + if(node->num_dependents() == 0) { + src.push_back(node); + } + node->_set_up_join_counter(); + node->_exception_ptr = nullptr; + } +} + +// Function: _tear_down_topology +inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) { + + auto &f = tpg->_taskflow; + + //assert(&tpg == &(f._topologies.front())); + + // case 1: we still need to run the topology again + if(!tpg->_exception_ptr && !tpg->cancelled() && !tpg->_pred()) { + //assert(tpg->_join_counter == 0); + std::lock_guard lock(f._mutex); + tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); + _schedule(worker, tpg->_sources); + } + // case 2: the final run of this topology + else { + + // TODO: if the topology is cancelled, need to release all semaphores + if(tpg->_call != nullptr) { + tpg->_call(); + } + + // If there is another run (interleave between lock) + if(std::unique_lock lock(f._mutex); f._topologies.size()>1) { + //assert(tpg->_join_counter == 0); + + // Set the promise + tpg->_promise.set_value(); + f._topologies.pop(); + tpg = f._topologies.front().get(); + + // decrement the topology but since this is not the last we don't notify + _decrement_topology(); + + // set up topology needs to be under the lock or it can + // introduce memory order error with pop + _set_up_topology(&worker, tpg); + } + else { + //assert(f._topologies.size() == 1); + + auto fetched_tpg {std::move(f._topologies.front())}; + f._topologies.pop(); + auto satellite {f._satellite}; + + lock.unlock(); + + // Soon after we carry out the promise, there is no longer any guarantee + // for the lifetime of the associated taskflow. 
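+      // In particular, a caller blocked on the returned future may destroy
+      // the taskflow as soon as the promise is fulfilled; the code below
+      // therefore only touches executor-owned state (the satellite list).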
+ fetched_tpg->_carry_out_promise(); + + _decrement_topology(); + + // remove the taskflow if it is managed by the executor + // TODO: in the future, we may need to synchronize on wait + // (which means the following code should the moved before set_value) + if(satellite) { + std::scoped_lock satellite_lock(_taskflows_mutex); + _taskflows.erase(*satellite); + } + } + } +} + +// ############################################################################ +// Forward Declaration: Subflow +// ############################################################################ + +inline void Subflow::join() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow not joinable"); + } + + // only the parent worker can join the subflow + _executor._corun_graph(_worker, _parent, _graph); + + // if any exception is caught from subflow tasks, rethrow it + _parent->_process_exception(); + + _joinable = false; +} + +inline void Subflow::detach() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow already joined or detached"); + } + + // only the parent worker can detach the subflow + _executor._detach_subflow_task(_worker, _parent, _graph); + _joinable = false; +} + +// ############################################################################ +// Forward Declaration: Runtime +// ############################################################################ + +// Procedure: schedule +inline void Runtime::schedule(Task task) { + + auto node = task._node; + // need to keep the invariant: when scheduling a task, the task must have + // zero dependency (join counter is 0) + // or we can encounter bug when inserting a nested flow (e.g., module task) + node->_join_counter.store(0, std::memory_order_relaxed); + + auto& j = node->_parent ? node->_parent->_join_counter : + node->_topology->_join_counter; + j.fetch_add(1, std::memory_order_relaxed); + _executor._schedule(_worker, node); +} + +// Procedure: corun +template +void Runtime::corun(T&& target) { + _executor._corun_graph(_worker, _parent, target.graph()); + _parent->_process_exception(); +} + +// Procedure: corun_until +template +void Runtime::corun_until(P&& predicate) { + _executor._corun_until(_worker, std::forward
<P>
      (predicate)); + // TODO: exception? +} + +// Function: corun_all +inline void Runtime::corun_all() { + _executor._corun_until(_worker, [this] () -> bool { + return _parent->_join_counter.load(std::memory_order_acquire) == 0; + }); + _parent->_process_exception(); +} + +// Destructor +inline Runtime::~Runtime() { + _executor._corun_until(_worker, [this] () -> bool { + return _parent->_join_counter.load(std::memory_order_acquire) == 0; + }); +} + +// ------------------------------------ +// Runtime::silent_async series +// ------------------------------------ + +// Function: _silent_async +template +void Runtime::_silent_async(Worker& w, P&& params, F&& f) { + + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); + + auto node = node_pool.animate( + std::forward
<P>
      (params), _parent->_topology, _parent, 0, + std::in_place_type_t{}, std::forward(f) + ); + + _executor._schedule(w, node); +} + +// Function: silent_async +template +void Runtime::silent_async(F&& f) { + _silent_async(*_executor._this_worker(), DefaultTaskParams{}, std::forward(f)); +} + +// Function: silent_async +template +void Runtime::silent_async(P&& params, F&& f) { + _silent_async(*_executor._this_worker(), std::forward
<P>
      (params), std::forward(f)); +} + +// Function: silent_async_unchecked +template +void Runtime::silent_async_unchecked(F&& f) { + _silent_async(_worker, DefaultTaskParams{}, std::forward(f)); +} + +// Function: silent_async_unchecked +template +void Runtime::silent_async_unchecked(P&& params, F&& f) { + _silent_async(_worker, std::forward
<P>
      (params), std::forward(f)); +} + +// ------------------------------------ +// Runtime::async series +// ------------------------------------ + +// Function: _async +template +auto Runtime::_async(Worker& w, P&& params, F&& f) { + + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); + + using R = std::invoke_result_t>; + + std::packaged_task p(std::forward(f)); + auto fu{p.get_future()}; + + auto node = node_pool.animate( + std::forward
<P>
      (params), _parent->_topology, _parent, 0, + std::in_place_type_t{}, + [p=make_moc(std::move(p))] () mutable { p.object(); } + ); + + _executor._schedule(w, node); + + return fu; +} + +// Function: async +template +auto Runtime::async(F&& f) { + return _async(*_executor._this_worker(), DefaultTaskParams{}, std::forward(f)); +} + +// Function: async +template +auto Runtime::async(P&& params, F&& f) { + return _async(*_executor._this_worker(), std::forward
<P>
      (params), std::forward(f)); +} + + + +} // end of namespace tf ----------------------------------------------------- + + + + + + diff --git a/taskflow/core/executor-module-opt.hpp b/sandbox/executor/executor-module-opt.hpp similarity index 99% rename from taskflow/core/executor-module-opt.hpp rename to sandbox/executor/executor-module-opt.hpp index 0e2b1ee6f..842fc3261 100644 --- a/taskflow/core/executor-module-opt.hpp +++ b/sandbox/executor/executor-module-opt.hpp @@ -1023,7 +1023,7 @@ inline bool Executor::_wait_for_task(Worker& worker, Node*& t) { } } - // Now I really need to relinguish my self to others + // Now I really need to relinquish my self to others _notifier.commit_wait(worker._waiter); return true; diff --git a/sandbox/executor/executor-no-waiter.hpp b/sandbox/executor/executor-no-waiter.hpp new file mode 100644 index 000000000..08e336a66 --- /dev/null +++ b/sandbox/executor/executor-no-waiter.hpp @@ -0,0 +1,2492 @@ +#pragma once + +#include "observer.hpp" +#include "taskflow.hpp" +#include "async_task.hpp" + +/** +@file executor.hpp +@brief executor include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Executor Definition +// ---------------------------------------------------------------------------- + +/** @class Executor + +@brief class to create an executor for running a taskflow graph + +An executor manages a set of worker threads to run one or multiple taskflows +using an efficient work-stealing scheduling algorithm. + +@code{.cpp} +// Declare an executor and a taskflow +tf::Executor executor; +tf::Taskflow taskflow; + +// Add three tasks into the taskflow +tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; }); +tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; }); +tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; }); + +// Build precedence between tasks +A.precede(B, C); + +tf::Future fu = executor.run(taskflow); +fu.wait(); // block until the execution completes + +executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait(); +executor.run_n(taskflow, 4); +executor.wait_for_all(); // block until all associated executions finish +executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait(); +executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; }); +@endcode + +All the @c run methods are @em thread-safe. You can submit multiple +taskflows at the same time to an executor from different threads. +*/ +class Executor { + + friend class FlowBuilder; + friend class Subflow; + friend class Runtime; + + public: + + /** + @brief constructs the executor with @c N worker threads + + @param N the number of workers (default std::thread::hardware_concurrency) + + The constructor spawns @c N worker threads to run tasks in a + work-stealing loop. The number of workers must be greater than zero + or an exception will be thrown. + By default, the number of worker threads is equal to the maximum + hardware concurrency returned by std::thread::hardware_concurrency. + */ + explicit Executor(size_t N = std::thread::hardware_concurrency()); + + /** + @brief destructs the executor + + The destructor calls Executor::wait_for_all to wait for all submitted + taskflows to complete and then notifies all worker threads to stop + and join these threads. 
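+
+  For example, the following sketch blocks at the closing brace until the
+  submitted taskflow completes:
+
+  @code{.cpp}
+  {
+    tf::Executor executor;
+    executor.run(taskflow);
+  }  // destructor waits for the run to finish, then joins all workers
+  @endcode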
+ */ + ~Executor(); + + /** + @brief runs a taskflow once + + @param taskflow a tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run(taskflow); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future run(Taskflow& taskflow); + + /** + @brief runs a moved taskflow once + + @param taskflow a moved tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run(std::move(taskflow)); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future run(Taskflow&& taskflow); + + /** + @brief runs a taskflow once and invoke a callback upon completion + + @param taskflow a tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run(taskflow, [](){ std::cout << "done"; }); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run(Taskflow& taskflow, C&& callable); + + /** + @brief runs a moved taskflow once and invoke a callback upon completion + + @param taskflow a moved tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run( + std::move(taskflow), [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run(Taskflow&& taskflow, C&& callable); + + /** + @brief runs a taskflow for @c N times + + @param taskflow a tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_n(taskflow, 2); // run taskflow 2 times + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. 
It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future run_n(Taskflow& taskflow, size_t N); + + /** + @brief runs a moved taskflow for @c N times + + @param taskflow a moved tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_n( + std::move(taskflow), 2 // run the moved taskflow 2 times + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future run_n(Taskflow&& taskflow, size_t N); + + /** + @brief runs a taskflow for @c N times and then invokes a callback + + @param taskflow a tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + taskflow, 2, [](){ std::cout << "done"; } // runs taskflow 2 times and invoke + // the lambda to print "done" + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_n(Taskflow& taskflow, size_t N, C&& callable); + + /** + @brief runs a moved taskflow for @c N times and then invokes a callback + + @param taskflow a moved tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_n( + // run the moved taskflow 2 times and invoke the lambda to print "done" + std::move(taskflow), 2, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_n(Taskflow&& taskflow, size_t N, C&& callable); + + /** + @brief runs a taskflow multiple times until the predicate becomes true + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_until( + taskflow, [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. 
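+
+  As a point of reference, tf::Executor::run_n is expressed through this
+  method with a stateful predicate (a sketch of the equivalence):
+
+  @code{.cpp}
+  // behaves like executor.run_n(taskflow, 2)
+  executor.run_until(taskflow, [repeat=2]() mutable { return repeat-- == 0; });
+  @endcode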
+ */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred); + + /** + @brief runs a moved taskflow and keeps running it + until the predicate becomes true + + @param taskflow a moved tf::Taskflow object + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_until( + std::move(taskflow), [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_until(Taskflow&& taskflow, P&& pred); + + /** + @brief runs a taskflow multiple times until the predicate becomes true and + then invokes the callback + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_until( + taskflow, [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred, C&& callable); + + /** + @brief runs a moved taskflow and keeps running + it until the predicate becomes true and then invokes the callback + + @param taskflow a moved tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_until( + std::move(taskflow), + [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_until(Taskflow&& taskflow, P&& pred, C&& callable); + + /** + @brief runs a target graph and waits until it completes using + an internal worker of this executor + + @tparam T target type which has `tf::Graph& T::graph()` defined + @param target the target task graph object + + The method runs a target graph which has `tf::Graph& T::graph()` defined + and waits until the execution completes. + Unlike the typical flow of calling `tf::Executor::run` series + plus waiting on the result, this method must be called by an internal + worker of this executor. 
The caller worker will participate in + the work-stealing loop of the scheduler, thereby avoiding potential + deadlock caused by blocked waiting. + + @code{.cpp} + tf::Executor executor(2); + tf::Taskflow taskflow; + std::array others; + + std::atomic counter{0}; + + for(size_t n=0; n<1000; n++) { + for(size_t i=0; i<1000; i++) { + others[n].emplace([&](){ counter++; }); + } + taskflow.emplace([&executor, &tf=others[n]](){ + executor.corun(tf); + //executor.run(tf).wait(); <- blocking the worker without doing anything + // will introduce deadlock + }); + } + executor.run(taskflow).wait(); + @endcode + + The method is thread-safe as long as the target is not concurrently + ran by two or more threads. + + @attention + You must call tf::Executor::corun from a worker of the calling executor + or an exception will be thrown. + */ + template + void corun(T& target); + + /** + @brief keeps running the work-stealing loop until the predicate becomes true + + @tparam P predicate type + @param predicate a boolean predicate to indicate when to stop the loop + + The method keeps the caller worker running in the work-stealing loop + until the stop predicate becomes true. + + @code{.cpp} + taskflow.emplace([&](){ + std::future fu = std::async([](){ std::sleep(100s); }); + executor.corun_until([](){ + return fu.wait_for(std::chrono::seconds(0)) == future_status::ready; + }); + }); + @endcode + + @attention + You must call tf::Executor::corun_until from a worker of the calling executor + or an exception will be thrown. + */ + template + void corun_until(P&& predicate); + + /** + @brief waits for all tasks to complete + + This member function waits until all submitted tasks + (e.g., taskflows, asynchronous tasks) to finish. + + @code{.cpp} + executor.run(taskflow1); + executor.run_n(taskflow2, 10); + executor.run_n(taskflow3, 100); + executor.wait_for_all(); // wait until the above submitted taskflows finish + @endcode + */ + void wait_for_all(); + + /** + @brief queries the number of worker threads + + Each worker represents one unique thread spawned by an executor + upon its construction time. + + @code{.cpp} + tf::Executor executor(4); + std::cout << executor.num_workers(); // 4 + @endcode + */ + size_t num_workers() const noexcept; + + /** + @brief queries the number of running topologies at the time of this call + + When a taskflow is submitted to an executor, a topology is created to store + runtime metadata of the running taskflow. + When the execution of the submitted taskflow finishes, + its corresponding topology will be removed from the executor. + + @code{.cpp} + executor.run(taskflow); + std::cout << executor.num_topologies(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_topologies() const; + + /** + @brief queries the number of running taskflows with moved ownership + + @code{.cpp} + executor.run(std::move(taskflow)); + std::cout << executor.num_taskflows(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_taskflows() const; + + /** + @brief queries the id of the caller thread in this executor + + Each worker has an unique id in the range of @c 0 to @c N-1 associated with + its parent executor. + If the caller thread does not belong to the executor, @c -1 is returned. 
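+  The id can serve, for instance, as an index into a per-worker scratch
+  buffer of size tf::Executor::num_workers, avoiding synchronization, since
+  each worker id is unique within its executor. The example below prints
+  the ids observed from the main thread and from within tasks.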
+ + @code{.cpp} + tf::Executor executor(4); // 4 workers in the executor + executor.this_worker_id(); // -1 (main thread is not a worker) + + taskflow.emplace([&](){ + std::cout << executor.this_worker_id(); // 0, 1, 2, or 3 + }); + executor.run(taskflow); + @endcode + */ + int this_worker_id() const; + + // -------------------------------------------------------------------------- + // Observer methods + // -------------------------------------------------------------------------- + + /** + @brief constructs an observer to inspect the activities of worker threads + + @tparam Observer observer type derived from tf::ObserverInterface + @tparam ArgsT argument parameter pack + + @param args arguments to forward to the constructor of the observer + + @return a shared pointer to the created observer + + Each executor manages a list of observers with shared ownership with callers. + For each of these observers, the two member functions, + tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit + will be called before and after the execution of a task. + + This member function is not thread-safe. + */ + template + std::shared_ptr make_observer(ArgsT&&... args); + + /** + @brief removes an observer from the executor + + This member function is not thread-safe. + */ + template + void remove_observer(std::shared_ptr observer); + + /** + @brief queries the number of observers + */ + size_t num_observers() const noexcept; + + // -------------------------------------------------------------------------- + // Async Task Methods + // -------------------------------------------------------------------------- + + /** + @brief creates a parameterized asynchronous task to run the given function + + @tparam P task parameter type + @tparam F callable type + + @param params task parameters + @param func callable object + + @return a @std_future that will hold the result of the execution + + The method creates a parameterized asynchronous task + to run the given function and return a @std_future object + that eventually will hold the result of the execution. + + @code{.cpp} + std::future future = executor.async("name", [](){ + std::cout << "create an asynchronous task with a name and returns 1\n"; + return 1; + }); + future.get(); + @endcode + + This member function is thread-safe. + */ + template + auto async(P&& params, F&& func); + + /** + @brief runs a given function asynchronously + + @tparam F callable type + + @param func callable object + + @return a @std_future that will hold the result of the execution + + The method creates an asynchronous task to run the given function + and return a @std_future object that eventually will hold the result + of the return value. + + @code{.cpp} + std::future future = executor.async([](){ + std::cout << "create an asynchronous task and returns 1\n"; + return 1; + }); + future.get(); + @endcode + + This member function is thread-safe. + */ + template + auto async(F&& func); + + /** + @brief similar to tf::Executor::async but does not return a future object + + @tparam F callable type + + @param params task parameters + @param func callable object + + The method creates a parameterized asynchronous task + to run the given function without returning any @std_future object. + This member function is more efficient than tf::Executor::async + and is encouraged to use when applications do not need a @std_future to acquire + the result or synchronize the execution. 
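+  Since no @std_future is returned, completion must be synchronized
+  explicitly, for example through tf::Executor::wait_for_all as the
+  example below does.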
+ + @code{.cpp} + executor.silent_async("name", [](){ + std::cout << "create an asynchronous task with a name and no return\n"; + }); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template + void silent_async(P&& params, F&& func); + + /** + @brief similar to tf::Executor::async but does not return a future object + + @tparam F callable type + + @param func callable object + + The method creates an asynchronous task + to run the given function without returning any @std_future object. + This member function is more efficient than tf::Executor::async + and is encouraged to use when applications do not need a @std_future to acquire + the result or synchronize the execution. + + @code{.cpp} + executor.silent_async([](){ + std::cout << "create an asynchronous task with no return\n"; + }); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template + void silent_async(F&& func); + + // -------------------------------------------------------------------------- + // Silent Dependent Async Methods + // -------------------------------------------------------------------------- + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }); + executor.silent_dependent_async([](){ printf("C runs after A and B\n"); }, A, B); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template ...>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(F&& func, Tasks&&... tasks); + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param params task parameters + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); }); + executor.silent_dependent_async( + "C", [](){ printf("C runs after A and B\n"); }, A, B + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template && all_same_v...>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(P&& params, F&& func, Tasks&&... 
tasks); + + /** + @brief runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + + @code{.cpp} + std::array array { + executor.silent_dependent_async([](){ printf("A\n"); }), + executor.silent_dependent_async([](){ printf("B\n"); }) + }; + executor.silent_dependent_async( + [](){ printf("C runs after A and B\n"); }, array.begin(), array.end() + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template , AsyncTask>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(F&& func, I first, I last); + + /** + @brief runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param params tasks parameters + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + std::array array { + executor.silent_dependent_async("A", [](){ printf("A\n"); }), + executor.silent_dependent_async("B", [](){ printf("B\n"); }) + }; + executor.silent_dependent_async( + "C", [](){ printf("C runs after A and B\n"); }, array.begin(), array.end() + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template && !std::is_same_v, AsyncTask>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(P&& params, F&& func, I first, I last); + + // -------------------------------------------------------------------------- + // Dependent Async Methods + // -------------------------------------------------------------------------- + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. 
+ + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }); + auto [C, fuC] = executor.dependent_async( + [](){ + printf("C runs after A and B\n"); + return 1; + }, + A, B + ); + fuC.get(); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. + */ + template ...>, void>* = nullptr + > + auto dependent_async(F&& func, Tasks&&... tasks); + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam P task parameters type + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param params task parameters + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three named asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); }); + auto [C, fuC] = executor.dependent_async( + "C", + [](){ + printf("C runs after A and B\n"); + return 1; + }, + A, B + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. + */ + template && all_same_v...>, void>* = nullptr + > + auto dependent_async(P&& params, F&& func, Tasks&&... tasks); + + /** + @brief runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. + + @code{.cpp} + std::array array { + executor.silent_dependent_async([](){ printf("A\n"); }), + executor.silent_dependent_async([](){ printf("B\n"); }) + }; + auto [C, fuC] = executor.dependent_async( + [](){ + printf("C runs after A and B\n"); + return 1; + }, + array.begin(), array.end() + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. 
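+
+  Because the dependents are passed as an iterator range, the number of
+  predecessors need not be known at compile time. A minimal sketch that
+  gathers the handles in a std::vector first:
+
+  @code{.cpp}
+  std::vector<tf::AsyncTask> deps;
+  for(int i=0; i<4; ++i) {
+    deps.push_back(executor.silent_dependent_async([i](){ printf("%d\n", i); }));
+  }
+  auto [task, fut] = executor.dependent_async(
+    [](){ return 1; }, deps.begin(), deps.end()
+  );
+  assert(fut.get() == 1);  // all four predecessors have finished here
+  @endcode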
+ */ + template , AsyncTask>, void>* = nullptr + > + auto dependent_async(F&& func, I first, I last); + + /** + @brief runs the given function asynchronously + when the given range of dependents finish + + @tparam P task parameters type + @tparam F callable type + @tparam I iterator type + + @param params task parameters + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three named asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + std::array array { + executor.silent_dependent_async("A", [](){ printf("A\n"); }), + executor.silent_dependent_async("B", [](){ printf("B\n"); }) + }; + auto [C, fuC] = executor.dependent_async( + "C", + [](){ + printf("C runs after A and B\n"); + return 1; + }, + array.begin(), array.end() + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. + */ + template && !std::is_same_v, AsyncTask>, void>* = nullptr + > + auto dependent_async(P&& params, F&& func, I first, I last); + + private: + + const size_t _MAX_STEALS; + + std::mutex _wsq_mutex; + std::mutex _taskflows_mutex; + + std::vector _threads; + std::vector _workers; + +#ifdef __cpp_lib_atomic_wait + std::atomic _num_topologies {0}; + std::atomic_flag _all_spawned = ATOMIC_FLAG_INIT; + + std::atomic_flag _done = ATOMIC_FLAG_INIT; + std::atomic _state {0ull}; + static const uint64_t _EPOCH_INC = 1; + //static const uint64_t _EPOCH_INC{1ull << 32}; + //static const uint64_t _NUM_WAITERS_MASK{(1ull << 32) - 1}; + //static const uint64_t _NUM_WAITERS_INC{1ull}; +#else + std::condition_variable _topology_cv; + std::mutex _topology_mutex; + size_t _num_topologies {0}; + Notifier _notifier; + std::atomic _done {0}; +#endif + + std::unordered_map _wids; + std::list _taskflows; + + TaskQueue _wsq; + + std::unordered_set> _observers; + + Worker* _this_worker(); + + bool _wait_for_task(Worker&, Node*&); + bool _invoke_module_task_internal(Worker&, Node*); + + void _observer_prologue(Worker&, Node*); + void _observer_epilogue(Worker&, Node*); + void _spawn(size_t); + void _exploit_task(Worker&, Node*&); + void _explore_task(Worker&, Node*&); + void _schedule(Worker&, Node*); + void _schedule(Node*); + void _schedule(Worker&, const SmallVector&); + void _schedule(const SmallVector&); + void _set_up_topology(Worker*, Topology*); + void _set_up_graph(Graph&, Node*, Topology*, int, SmallVector&); + void _tear_down_topology(Worker&, Topology*); + void _tear_down_async(Node*); + void _tear_down_dependent_async(Worker&, Node*); + void _tear_down_invoke(Worker&, Node*); + void _increment_topology(); + void _decrement_topology(); + void _invoke(Worker&, Node*); + void _invoke_static_task(Worker&, Node*); + void _invoke_subflow_task(Worker&, Node*); + void _detach_subflow_task(Worker&, Node*, Graph&); + void _invoke_condition_task(Worker&, Node*, SmallVector&); + void _invoke_multi_condition_task(Worker&, Node*, 
SmallVector&); + void _invoke_module_task(Worker&, Node*); + void _invoke_async_task(Worker&, Node*); + void _invoke_dependent_async_task(Worker&, Node*); + void _process_async_dependent(Node*, tf::AsyncTask&, size_t&); + void _process_exception(Worker&, Node*); + void _schedule_async_task(Node*); + void _corun_graph(Worker&, Node*, Graph&); + + template + void _corun_until(Worker&, P&&); +}; + +// Constructor +inline Executor::Executor(size_t N) : + _MAX_STEALS {((N+1) << 1)}, + _threads {N}, + _workers {N} +#ifndef __cpp_lib_atomic_wait + ,_notifier {N} +#endif +{ + + if(N == 0) { + TF_THROW("executor must define at least one worker"); + } + + _spawn(N); + + // initialize the default observer if requested + if(has_env(TF_ENABLE_PROFILER)) { + TFProfManager::get()._manage(make_observer()); + } +} + +// Destructor +inline Executor::~Executor() { + + // wait for all topologies to complete + wait_for_all(); + + // shut down the scheduler + +#ifdef __cpp_lib_atomic_wait + _done.test_and_set(std::memory_order_relaxed); + for(size_t i=0; i<_workers.size(); i++) { + _state.fetch_add(_EPOCH_INC, std::memory_order_release); + _state.notify_one(); + } +#else + _done = true; + _notifier.notify(true); +#endif + + for(auto& t : _threads) { + t.join(); + } +} + +// Function: num_workers +inline size_t Executor::num_workers() const noexcept { + return _workers.size(); +} + +// Function: num_topologies +inline size_t Executor::num_topologies() const { +#ifdef __cpp_lib_atomic_wait + return _num_topologies.load(std::memory_order_relaxed); +#else + return _num_topologies; +#endif +} + +// Function: num_taskflows +inline size_t Executor::num_taskflows() const { + return _taskflows.size(); +} + +// Function: _this_worker +inline Worker* Executor::_this_worker() { + auto itr = _wids.find(std::this_thread::get_id()); + return itr == _wids.end() ? nullptr : &_workers[itr->second]; +} + +// Function: this_worker_id +inline int Executor::this_worker_id() const { + auto i = _wids.find(std::this_thread::get_id()); + return i == _wids.end() ? -1 : static_cast(_workers[i->second]._id); +} + +// Procedure: _spawn +inline void Executor::_spawn(size_t N) { + +#ifdef __cpp_lib_atomic_wait +#else + std::mutex mutex; + std::condition_variable cond; + size_t n=0; +#endif + + for(size_t id=0; id lock(mutex); + cond.wait(lock, [&](){ return n==N; }); +#endif +} + +// Function: _corun_until +template +void Executor::_corun_until(Worker& w, P&& stop_predicate) { + + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + exploit: + + while(!stop_predicate()) { + + //exploit: + + if(auto t = w._wsq.pop(); t) { + _invoke(w, t); + } + else { + size_t num_steals = 0; + + explore: + + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + _invoke(w, t); + goto exploit; + } + else if(!stop_predicate()) { + if(num_steals++ > _MAX_STEALS) { + std::this_thread::yield(); + } + w._vtm = rdvtm(w._rdgen); + goto explore; + } + else { + break; + } + } + } +} + +// Function: _explore_task +inline void Executor::_explore_task(Worker& w, Node*& t) { + + //assert(_workers[w].wsq.empty()); + //assert(!t); + + size_t num_steals = 0; + size_t num_yields = 0; + + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + // Here, we write do-while to make the worker steal at once + // from the assigned victim. + do { + t = (w._id == w._vtm) ? 
_wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + break; + } + + if(num_steals++ > _MAX_STEALS) { + std::this_thread::yield(); + if(num_yields++ > 100) { + break; + } + } + + w._vtm = rdvtm(w._rdgen); + } +#ifdef __cpp_lib_atomic_wait + // the _DONE can be checked later in wait_for_task? + while(!_done.test(std::memory_order_relaxed)); +#else + while(!_done); +#endif + +} + +// Procedure: _exploit_task +inline void Executor::_exploit_task(Worker& w, Node*& t) { + while(t) { + _invoke(w, t); + t = w._wsq.pop(); + } +} + +// Function: _wait_for_task +inline bool Executor::_wait_for_task(Worker& worker, Node*& t) { + + explore_task: + + _explore_task(worker, t); + + if(t) { + return true; + } + + // The last thief who successfully stole a task will wake up + // another thief worker to avoid starvation. +// if(t) { +//#ifdef __cpp_lib_atomic_wait +// +//#else +// _notifier.notify(false); +//#endif +// return true; +// } + +#ifdef __cpp_lib_atomic_wait + + uint64_t cur_state = _state.load(std::memory_order_acquire); + + if(_done.test(std::memory_order_relaxed)) { + return false; + } + + if(!_wsq.empty()) { + worker._vtm = worker._id; + goto explore_task; + } + + // We need to use index-based scanning to avoid data race + // with _spawn which may initialize a worker at the same time. + for(size_t vtm=0; vtm<_workers.size(); vtm++) { + if(!_workers[vtm]._wsq.empty()) { + worker._vtm = vtm; + goto explore_task; + } + } + + _state.wait(cur_state, std::memory_order_acquire); + goto explore_task; +#else + // ---- 2PC guard ---- + _notifier.prepare_wait(worker._waiter); + + if(!_wsq.empty()) { + _notifier.cancel_wait(worker._waiter); + worker._vtm = worker._id; + goto explore_task; + } + + if(_done) { + _notifier.cancel_wait(worker._waiter); + _notifier.notify(true); + return false; + } + + // We need to use index-based scanning to avoid data race + // with _spawn which may initialize a worker at the same time. + for(size_t vtm=0; vtm<_workers.size(); vtm++) { + if(!_workers[vtm]._wsq.empty()) { + _notifier.cancel_wait(worker._waiter); + worker._vtm = vtm; + goto explore_task; + } + } + + // Now I really need to relinquish my self to others + _notifier.commit_wait(worker._waiter); + goto explore_task; +#endif + +} + +// Function: make_observer +template +std::shared_ptr Executor::make_observer(ArgsT&&... args) { + + static_assert( + std::is_base_of_v, + "Observer must be derived from ObserverInterface" + ); + + // use a local variable to mimic the constructor + auto ptr = std::make_shared(std::forward(args)...); + + ptr->set_up(_workers.size()); + + _observers.emplace(std::static_pointer_cast(ptr)); + + return ptr; +} + +// Procedure: remove_observer +template +void Executor::remove_observer(std::shared_ptr ptr) { + + static_assert( + std::is_base_of_v, + "Observer must be derived from ObserverInterface" + ); + + _observers.erase(std::static_pointer_cast(ptr)); +} + +// Function: num_observers +inline size_t Executor::num_observers() const noexcept { + return _observers.size(); +} + +// Procedure: _schedule +inline void Executor::_schedule(Worker& worker, Node* node) { + + // We need to fetch p before the release such that the read + // operation is synchronized properly with other thread to + // void data race. + auto p = node->_priority; + + node->_state.fetch_or(Node::READY, std::memory_order_release); + + // caller is a worker to this pool - starting at v3.5 we do not use + // any complicated notification mechanism as the experimental result + // has shown no significant advantage. 
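+  // Fast path: a worker owned by this executor pushes the task into its
+  // own local queue. Slow path (below): an external caller pushes into
+  // the executor-wide queue under _wsq_mutex. In both cases one waiter
+  // is notified so an idle worker can pick the task up.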
+ if(worker._executor == this) { + worker._wsq.push(node, p); +#ifdef __cpp_lib_atomic_wait + // we load the state first as load is much faster than fetch_add + _state.fetch_add(_EPOCH_INC, std::memory_order_release); + _state.notify_one(); +#else + _notifier.notify(false); +#endif + return; + } + + { + std::lock_guard lock(_wsq_mutex); + _wsq.push(node, p); + } +#ifdef __cpp_lib_atomic_wait + // we load the state first as load is much faster than fetch_add + _state.fetch_add(_EPOCH_INC, std::memory_order_release); + _state.notify_one(); +#else + _notifier.notify(false); +#endif +} + +// Procedure: _schedule +inline void Executor::_schedule(Node* node) { + + // We need to fetch p before the release such that the read + // operation is synchronized properly with other thread to + // void data race. + auto p = node->_priority; + + node->_state.fetch_or(Node::READY, std::memory_order_release); + + { + std::lock_guard lock(_wsq_mutex); + _wsq.push(node, p); + } + +#ifdef __cpp_lib_atomic_wait + // we load the state first as load is much faster than fetch_add + _state.fetch_add(_EPOCH_INC, std::memory_order_release); + _state.notify_one(); +#else + _notifier.notify(false); +#endif +} + +// Procedure: _schedule +inline void Executor::_schedule(Worker& worker, const SmallVector& nodes) { + + // We need to cacth the node count to avoid accessing the nodes + // vector while the parent topology is removed! + const auto num_nodes = nodes.size(); + + if(num_nodes == 0) { + return; + } + + // caller is a worker to this pool - starting at v3.5 we do not use + // any complicated notification mechanism as the experimental result + // has shown no significant advantage. + if(worker._executor == this) { + for(size_t i=0; i_priority; + nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release); + worker._wsq.push(nodes[i], p); +#ifdef __cpp_lib_atomic_wait + _state.fetch_add(_EPOCH_INC, std::memory_order_release); + _state.notify_one(); +#else + _notifier.notify(false); +#endif + } + return; + } + + { + std::lock_guard lock(_wsq_mutex); + for(size_t k=0; k_priority; + nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release); + _wsq.push(nodes[k], p); + } + } +#ifdef __cpp_lib_atomic_wait + size_t n = std::min(num_nodes, _workers.size()); + for(size_t i=0; i& nodes) { + + // parent topology may be removed! + const auto num_nodes = nodes.size(); + + if(num_nodes == 0) { + return; + } + + // We need to fetch p before the release such that the read + // operation is synchronized properly with other thread to + // void data race. 
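+  // That is: read _priority first, then publish Node::READY with release
+  // semantics, so any thread that later observes READY also sees the
+  // matching priority value.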
+ { + std::lock_guard lock(_wsq_mutex); + for(size_t k=0; k_priority; + nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release); + _wsq.push(nodes[k], p); + } + } + +#ifdef __cpp_lib_atomic_wait + size_t n = std::min(num_nodes, _workers.size()); + for(size_t i=0; i_state.load(std::memory_order_acquire) & Node::READY)); + + begin_invoke: + + SmallVector conds; + + // no need to do other things if the topology is cancelled + if(node->_is_cancelled()) { + _tear_down_invoke(worker, node); + return; + } + + // if acquiring semaphore(s) exists, acquire them first + if(node->_semaphores && !node->_semaphores->to_acquire.empty()) { + SmallVector nodes; + if(!node->_acquire_all(nodes)) { + _schedule(worker, nodes); + return; + } + node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release); + } + + // condition task + //int cond = -1; + + // switch is faster than nested if-else due to jump table + switch(node->_handle.index()) { + // static task + case Node::STATIC:{ + _invoke_static_task(worker, node); + } + break; + + // subflow task + case Node::SUBFLOW: { + _invoke_subflow_task(worker, node); + } + break; + + // condition task + case Node::CONDITION: { + _invoke_condition_task(worker, node, conds); + } + break; + + // multi-condition task + case Node::MULTI_CONDITION: { + _invoke_multi_condition_task(worker, node, conds); + } + break; + + // module task + case Node::MODULE: { + _invoke_module_task(worker, node); + } + break; + + // async task + case Node::ASYNC: { + _invoke_async_task(worker, node); + _tear_down_async(node); + return ; + } + break; + + // dependent async task + case Node::DEPENDENT_ASYNC: { + _invoke_dependent_async_task(worker, node); + _tear_down_dependent_async(worker, node); + if(worker._cache) { + node = worker._cache; + goto begin_invoke; + } + return; + } + break; + + // monostate (placeholder) + default: + break; + } + + //invoke_successors: + + // if releasing semaphores exist, release them + if(node->_semaphores && !node->_semaphores->to_release.empty()) { + _schedule(worker, node->_release_all()); + } + + // Reset the join counter to support the cyclic control flow. + // + We must do this before scheduling the successors to avoid race + // condition on _dependents. + // + We must use fetch_add instead of direct assigning + // because the user-space call on "invoke" may explicitly schedule + // this task again (e.g., pipeline) which can access the join_counter. + if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) { + node->_join_counter.fetch_add(node->num_strong_dependents(), std::memory_order_relaxed); + } + else { + node->_join_counter.fetch_add(node->num_dependents(), std::memory_order_relaxed); + } + + // acquire the parent flow counter + auto& j = (node->_parent) ? 
node->_parent->_join_counter : + node->_topology->_join_counter; + + // Here, we want to cache the latest successor with the highest priority + worker._cache = nullptr; + auto max_p = static_cast(TaskPriority::MAX); + + // Invoke the task based on the corresponding type + switch(node->_handle.index()) { + + // condition and multi-condition tasks + case Node::CONDITION: + case Node::MULTI_CONDITION: { + for(auto cond : conds) { + if(cond >= 0 && static_cast(cond) < node->_successors.size()) { + auto s = node->_successors[cond]; + // zeroing the join counter for invariant + s->_join_counter.store(0, std::memory_order_relaxed); + j.fetch_add(1, std::memory_order_relaxed); + if(s->_priority <= max_p) { + if(worker._cache) { + _schedule(worker, worker._cache); + } + worker._cache = s; + max_p = s->_priority; + } + else { + _schedule(worker, s); + } + } + } + } + break; + + // non-condition task + default: { + for(size_t i=0; i_successors.size(); ++i) { + //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) { + if(auto s = node->_successors[i]; + s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { + j.fetch_add(1, std::memory_order_relaxed); + if(s->_priority <= max_p) { + if(worker._cache) { + _schedule(worker, worker._cache); + } + worker._cache = s; + max_p = s->_priority; + } + else { + _schedule(worker, s); + } + } + } + } + break; + } + + // tear_down the invoke + _tear_down_invoke(worker, node); + + // perform tail recursion elimination for the right-most child to reduce + // the number of expensive pop/push operations through the task queue + if(worker._cache) { + node = worker._cache; + //node->_state.fetch_or(Node::READY, std::memory_order_release); + goto begin_invoke; + } +} + +// Procedure: _tear_down_invoke +inline void Executor::_tear_down_invoke(Worker& worker, Node* node) { + // we must check parent first before subtracting the join counter, + // or it can introduce data race + if(auto parent = node->_parent; parent == nullptr) { + if(node->_topology->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { + _tear_down_topology(worker, node->_topology); + } + } + // Here we asssume the parent is in a busy loop (e.g., corun) waiting for + // its join counter to become 0. 
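+  // The release decrement below pairs with the acquire load in the
+  // parent's corun loop (see _corun_graph), so the parent observes all
+  // writes of this child task before it stops waiting.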
+ else { + //parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel); + parent->_join_counter.fetch_sub(1, std::memory_order_release); + } + //// module task + //else { + // auto id = parent->_handle.index(); + // if(parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { + // if(id == Node::MODULE) { + // return parent; + // } + // } + //} + //return nullptr; +} + +// Procedure: _observer_prologue +inline void Executor::_observer_prologue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_entry(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _observer_epilogue +inline void Executor::_observer_epilogue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_exit(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _process_exception +inline void Executor::_process_exception(Worker&, Node* node) { + + constexpr static auto flag = Topology::EXCEPTION | Topology::CANCELLED; + + // if the node has a parent, we store the exception in its parent + if(auto parent = node->_parent; parent) { + if ((parent->_state.fetch_or(Node::EXCEPTION, std::memory_order_relaxed) & Node::EXCEPTION) == 0) { + parent->_exception_ptr = std::current_exception(); + } + // TODO if the node has a topology, cancel it to enable early stop + //if(auto tpg = node->_topology; tpg) { + // tpg->_state.fetch_or(Topology::CANCELLED, std::memory_order_relaxed); + //} + } + // multiple tasks may throw, so we only take the first thrown exception + else if(auto tpg = node->_topology; tpg && + ((tpg->_state.fetch_or(flag, std::memory_order_relaxed) & Topology::EXCEPTION) == 0) + ) { + tpg->_exception_ptr = std::current_exception(); + } + // TODO: skip the exception that is not associated with any taskflows +} + +// Procedure: _invoke_static_task +inline void Executor::_invoke_static_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + std::get_if<1>(&work)->operator()(rt); + node->_process_exception(); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_subflow_task +inline void Executor::_invoke_subflow_task(Worker& w, Node* node) { + _observer_prologue(w, node); + TF_EXECUTOR_EXCEPTION_HANDLER(w, node, { + auto handle = std::get_if(&node->_handle); + handle->subgraph._clear(); + Subflow sf(*this, w, node, handle->subgraph); + handle->work(sf); + if(sf._joinable) { + _corun_graph(w, node, handle->subgraph); + } + node->_process_exception(); + }); + _observer_epilogue(w, node); +} + +// Procedure: _detach_subflow_task +inline void Executor::_detach_subflow_task(Worker& w, Node* p, Graph& g) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) { + return; + } + + SmallVector src; + _set_up_graph(g, nullptr, p->_topology, Node::DETACHED, src); + + { + std::lock_guard lock(p->_topology->_taskflow._mutex); + p->_topology->_taskflow._graph._merge(std::move(g)); + } + + p->_topology->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); + _schedule(w, src); +} + +// Procedure: _corun_graph +inline void Executor::_corun_graph(Worker& w, Node* p, Graph& g) { + + // assert(p); + + // graph is empty and has no async tasks (subflow) + if(g.empty() && 
p->_join_counter.load(std::memory_order_acquire) == 0) { + return; + } + + SmallVector src; + + _set_up_graph(g, p, p->_topology, 0, src); + p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); + + _schedule(w, src); + + _corun_until(w, [p] () -> bool { + return p->_join_counter.load(std::memory_order_acquire) == 0; } + ); +} + +// Procedure: _invoke_condition_task +inline void Executor::_invoke_condition_task( + Worker& worker, Node* node, SmallVector& conds +) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + conds = { std::get_if<0>(&work)->operator()() }; + break; + + case 1: + Runtime rt(*this, worker, node); + conds = { std::get_if<1>(&work)->operator()(rt) }; + node->_process_exception(); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_multi_condition_task +inline void Executor::_invoke_multi_condition_task( + Worker& worker, Node* node, SmallVector& conds +) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + conds = std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + conds = std::get_if<1>(&work)->operator()(rt); + node->_process_exception(); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_module_task +inline void Executor::_invoke_module_task(Worker& w, Node* node) { + _observer_prologue(w, node); + TF_EXECUTOR_EXCEPTION_HANDLER(w, node, { + _corun_graph(w, node, std::get_if(&node->_handle)->graph); + node->_process_exception(); + }); + _observer_epilogue(w, node); +} + +//// Function: _invoke_module_task_internal +//inline bool Executor::_invoke_module_task_internal(Worker& w, Node* p) { +// +// // acquire the underlying graph +// auto& g = std::get_if(&p->_handle)->graph; +// +// // no need to do anything if the graph is empty +// if(g.empty()) { +// return false; +// } +// +// SmallVector src; +// _set_up_graph(g, p, p->_topology, 0, src); +// p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); +// +// _schedule(w, src); +// return true; +//} + +// Procedure: _invoke_async_task +inline void Executor::_invoke_async_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + std::get_if<1>(&work)->operator()(rt); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_dependent_async_task +inline void Executor::_invoke_dependent_async_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + std::get_if<1>(&work)->operator()(rt); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Function: run +inline tf::Future Executor::run(Taskflow& f) { + return run_n(f, 1, [](){}); +} + +// Function: run +inline tf::Future Executor::run(Taskflow&& f) { + return run_n(std::move(f), 1, [](){}); +} + +// Function: run +template +tf::Future Executor::run(Taskflow& f, C&& c) { + return run_n(f, 1, 
std::forward<C>(c));
+}
+
+// Function: run
+template <typename C>
+tf::Future<void> Executor::run(Taskflow&& f, C&& c) {
+  return run_n(std::move(f), 1, std::forward<C>(c));
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat) {
+  return run_n(f, repeat, [](){});
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat) {
+  return run_n(std::move(f), repeat, [](){});
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat, C&& c) {
+  return run_until(
+    f, [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat, C&& c) {
+  return run_until(
+    std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
+
+// Function: run_until
+template <typename P>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& pred) {
+  return run_until(f, std::forward<P>
(pred), [](){});
+}
+
+// Function: run_until
+template <typename P>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred) {
+  return run_until(std::move(f), std::forward<P>
(pred), [](){});
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& p, C&& c) {
+
+  _increment_topology();
+
+  // Need to check the emptiness under the lock since a subflow task may
+  // define detached blocks that modify the taskflow at the same time
+  bool empty;
+  {
+    std::lock_guard lock(f._mutex);
+    empty = f.empty();
+  }
+
+  // No need to create a real topology; just return a dummy future
+  if(empty || p()) {
+    c();
+    std::promise<void> promise;
+    promise.set_value();
+    _decrement_topology();
+    return tf::Future<void>(promise.get_future());
+  }
+
+  // create a topology for this run
+  auto t = std::make_shared<Topology>(f, std::forward<P>
(p), std::forward<C>(c));
+
+  // need to create the future before the topology gets torn down quickly
+  tf::Future<void> future(t->_promise.get_future(), t);
+
+  // modifying the topology needs to be protected under the lock
+  {
+    std::lock_guard lock(f._mutex);
+    f._topologies.push(t);
+    if(f._topologies.size() == 1) {
+      _set_up_topology(_this_worker(), t.get());
+    }
+  }
+
+  return future;
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred, C&& c) {
+
+  std::list<Taskflow>::iterator itr;
+
+  {
+    std::scoped_lock lock(_taskflows_mutex);
+    itr = _taskflows.emplace(_taskflows.end(), std::move(f));
+    itr->_satellite = itr;
+  }
+
+  return run_until(*itr, std::forward<P>
(pred), std::forward<C>(c));
+}
+
+// Function: corun
+template <typename T>
+void Executor::corun(T& target) {
+
+  auto w = _this_worker();
+
+  if(w == nullptr) {
+    TF_THROW("corun must be called by a worker of the executor");
+  }
+
+  Node parent;  // auxiliary parent
+  _corun_graph(*w, &parent, target.graph());
+  parent._process_exception();
+}
+
+// Function: corun_until
+template <typename P>
+void Executor::corun_until(P&& predicate) {
+
+  auto w = _this_worker();
+
+  if(w == nullptr) {
+    TF_THROW("corun_until must be called by a worker of the executor");
+  }
+
+  _corun_until(*w, std::forward<P>
      (predicate)); + + // TODO: exception? +} + +// Procedure: _increment_topology +inline void Executor::_increment_topology() { +#ifdef __cpp_lib_atomic_wait + _num_topologies.fetch_add(1, std::memory_order_relaxed); +#else + std::lock_guard lock(_topology_mutex); + ++_num_topologies; +#endif +} + +// Procedure: _decrement_topology +inline void Executor::_decrement_topology() { +#ifdef __cpp_lib_atomic_wait + if(_num_topologies.fetch_sub(1, std::memory_order_acq_rel) == 1) { + _num_topologies.notify_all(); + } +#else + std::lock_guard lock(_topology_mutex); + if(--_num_topologies == 0) { + _topology_cv.notify_all(); + } +#endif +} + +// Procedure: wait_for_all +inline void Executor::wait_for_all() { +#ifdef __cpp_lib_atomic_wait + size_t n = _num_topologies.load(std::memory_order_acquire); + while(n != 0) { + _num_topologies.wait(n, std::memory_order_acquire); + n = _num_topologies.load(std::memory_order_acquire); + } +#else + std::unique_lock lock(_topology_mutex); + _topology_cv.wait(lock, [&](){ return _num_topologies == 0; }); +#endif +} + +// Function: _set_up_topology +inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) { + + // ---- under taskflow lock ---- + + tpg->_sources.clear(); + tpg->_taskflow._graph._clear_detached(); + _set_up_graph(tpg->_taskflow._graph, nullptr, tpg, 0, tpg->_sources); + tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); + + if(worker) { + _schedule(*worker, tpg->_sources); + } + else { + _schedule(tpg->_sources); + } +} + +// Function: _set_up_graph +inline void Executor::_set_up_graph( + Graph& g, Node* parent, Topology* tpg, int state, SmallVector& src +) { + for(auto node : g._nodes) { + node->_topology = tpg; + node->_parent = parent; + node->_state.store(state, std::memory_order_relaxed); + if(node->num_dependents() == 0) { + src.push_back(node); + } + node->_set_up_join_counter(); + node->_exception_ptr = nullptr; + } +} + +// Function: _tear_down_topology +inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) { + + auto &f = tpg->_taskflow; + + //assert(&tpg == &(f._topologies.front())); + + // case 1: we still need to run the topology again + if(!tpg->_exception_ptr && !tpg->cancelled() && !tpg->_pred()) { + //assert(tpg->_join_counter == 0); + std::lock_guard lock(f._mutex); + tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); + _schedule(worker, tpg->_sources); + } + // case 2: the final run of this topology + else { + + // TODO: if the topology is cancelled, need to release all semaphores + if(tpg->_call != nullptr) { + tpg->_call(); + } + + // If there is another run (interleave between lock) + if(std::unique_lock lock(f._mutex); f._topologies.size()>1) { + //assert(tpg->_join_counter == 0); + + // Set the promise + tpg->_promise.set_value(); + f._topologies.pop(); + tpg = f._topologies.front().get(); + + // decrement the topology but since this is not the last we don't notify + _decrement_topology(); + + // set up topology needs to be under the lock or it can + // introduce memory order error with pop + _set_up_topology(&worker, tpg); + } + else { + //assert(f._topologies.size() == 1); + + auto fetched_tpg {std::move(f._topologies.front())}; + f._topologies.pop(); + auto satellite {f._satellite}; + + lock.unlock(); + + // Soon after we carry out the promise, there is no longer any guarantee + // for the lifetime of the associated taskflow. 
+ fetched_tpg->_carry_out_promise(); + + _decrement_topology(); + + // remove the taskflow if it is managed by the executor + // TODO: in the future, we may need to synchronize on wait + // (which means the following code should the moved before set_value) + if(satellite) { + std::scoped_lock satellite_lock(_taskflows_mutex); + _taskflows.erase(*satellite); + } + } + } +} + +// ############################################################################ +// Forward Declaration: Subflow +// ############################################################################ + +inline void Subflow::join() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow not joinable"); + } + + // only the parent worker can join the subflow + _executor._corun_graph(_worker, _parent, _graph); + + // if any exception is caught from subflow tasks, rethrow it + _parent->_process_exception(); + + _joinable = false; +} + +inline void Subflow::detach() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow already joined or detached"); + } + + // only the parent worker can detach the subflow + _executor._detach_subflow_task(_worker, _parent, _graph); + _joinable = false; +} + +// ############################################################################ +// Forward Declaration: Runtime +// ############################################################################ + +// Procedure: schedule +inline void Runtime::schedule(Task task) { + + auto node = task._node; + // need to keep the invariant: when scheduling a task, the task must have + // zero dependency (join counter is 0) + // or we can encounter bug when inserting a nested flow (e.g., module task) + node->_join_counter.store(0, std::memory_order_relaxed); + + auto& j = node->_parent ? node->_parent->_join_counter : + node->_topology->_join_counter; + j.fetch_add(1, std::memory_order_relaxed); + _executor._schedule(_worker, node); +} + +// Procedure: corun +template +void Runtime::corun(T&& target) { + _executor._corun_graph(_worker, _parent, target.graph()); + _parent->_process_exception(); +} + +// Procedure: corun_until +template +void Runtime::corun_until(P&& predicate) { + _executor._corun_until(_worker, std::forward

<P>(predicate));
+  // TODO: exception?
+}
+
+// Function: corun_all
+inline void Runtime::corun_all() {
+  _executor._corun_until(_worker, [this] () -> bool {
+    return _parent->_join_counter.load(std::memory_order_acquire) == 0;
+  });
+  _parent->_process_exception();
+}
+
+// Destructor
+inline Runtime::~Runtime() {
+  _executor._corun_until(_worker, [this] () -> bool {
+    return _parent->_join_counter.load(std::memory_order_acquire) == 0;
+  });
+}
+
+// ------------------------------------
+// Runtime::silent_async series
+// ------------------------------------
+
+// Function: _silent_async
+template <typename P, typename F>
+void Runtime::_silent_async(Worker& w, P&& params, F&& f) {
+
+  _parent->_join_counter.fetch_add(1, std::memory_order_relaxed);
+
+  auto node = node_pool.animate(
+    std::forward<P>
(params), _parent->_topology, _parent, 0,
+    std::in_place_type_t<Node::Async>{}, std::forward<F>(f)
+  );
+
+  _executor._schedule(w, node);
+}
+
+// Function: silent_async
+template <typename F>
+void Runtime::silent_async(F&& f) {
+  _silent_async(*_executor._this_worker(), DefaultTaskParams{}, std::forward<F>(f));
+}
+
+// Function: silent_async
+template <typename P, typename F>
+void Runtime::silent_async(P&& params, F&& f) {
+  _silent_async(*_executor._this_worker(), std::forward<P>
(params), std::forward<F>(f));
+}
+
+// Function: silent_async_unchecked
+template <typename F>
+void Runtime::silent_async_unchecked(F&& f) {
+  _silent_async(_worker, DefaultTaskParams{}, std::forward<F>(f));
+}
+
+// Function: silent_async_unchecked
+template <typename P, typename F>
+void Runtime::silent_async_unchecked(P&& params, F&& f) {
+  _silent_async(_worker, std::forward<P>
(params), std::forward<F>(f));
+}
+
+// ------------------------------------
+// Runtime::async series
+// ------------------------------------
+
+// Function: _async
+template <typename P, typename F>
+auto Runtime::_async(Worker& w, P&& params, F&& f) {
+
+  _parent->_join_counter.fetch_add(1, std::memory_order_relaxed);
+
+  using R = std::invoke_result_t<std::decay_t<F>>;
+
+  std::packaged_task<R()> p(std::forward<F>(f));
+  auto fu{p.get_future()};
+
+  auto node = node_pool.animate(
+    std::forward<P>
(params), _parent->_topology, _parent, 0,
+    std::in_place_type_t<Node::Async>{},
+    [p=make_moc(std::move(p))] () mutable { p.object(); }
+  );
+
+  _executor._schedule(w, node);
+
+  return fu;
+}
+
+// Function: async
+template <typename F>
+auto Runtime::async(F&& f) {
+  return _async(*_executor._this_worker(), DefaultTaskParams{}, std::forward<F>(f));
+}
+
+// Function: async
+template <typename P, typename F>
+auto Runtime::async(P&& params, F&& f) {
+  return _async(*_executor._this_worker(), std::forward<P>
      (params), std::forward(f)); +} + + + +} // end of namespace tf ----------------------------------------------------- + + + + + + diff --git a/sandbox/executor/executor-tw.hpp b/sandbox/executor/executor-tw.hpp new file mode 100644 index 000000000..73ae0613c --- /dev/null +++ b/sandbox/executor/executor-tw.hpp @@ -0,0 +1,2499 @@ +#pragma once + +#include "observer.hpp" +#include "taskflow.hpp" +#include "async_task.hpp" + +/** +@file executor.hpp +@brief executor include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Executor Definition +// ---------------------------------------------------------------------------- + +/** @class Executor + +@brief class to create an executor for running a taskflow graph + +An executor manages a set of worker threads to run one or multiple taskflows +using an efficient work-stealing scheduling algorithm. + +@code{.cpp} +// Declare an executor and a taskflow +tf::Executor executor; +tf::Taskflow taskflow; + +// Add three tasks into the taskflow +tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; }); +tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; }); +tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; }); + +// Build precedence between tasks +A.precede(B, C); + +tf::Future fu = executor.run(taskflow); +fu.wait(); // block until the execution completes + +executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait(); +executor.run_n(taskflow, 4); +executor.wait_for_all(); // block until all associated executions finish +executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait(); +executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; }); +@endcode + +All the @c run methods are @em thread-safe. You can submit multiple +taskflows at the same time to an executor from different threads. +*/ +class Executor { + + friend class FlowBuilder; + friend class Subflow; + friend class Runtime; + + public: + + /** + @brief constructs the executor with @c N worker threads + + @param N the number of workers (default std::thread::hardware_concurrency) + + The constructor spawns @c N worker threads to run tasks in a + work-stealing loop. The number of workers must be greater than zero + or an exception will be thrown. + By default, the number of worker threads is equal to the maximum + hardware concurrency returned by std::thread::hardware_concurrency. + */ + explicit Executor(size_t N = std::thread::hardware_concurrency()); + + /** + @brief destructs the executor + + The destructor calls Executor::wait_for_all to wait for all submitted + taskflows to complete and then notifies all worker threads to stop + and join these threads. + */ + ~Executor(); + + /** + @brief runs a taskflow once + + @param taskflow a tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run(taskflow); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. 
+
+    /**
+    @brief runs a moved taskflow once
+
+    @param taskflow a moved tf::Taskflow object
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow once and returns a
+    tf::Future object that eventually holds the result of the execution.
+    The executor will take care of the lifetime of the moved taskflow.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run(std::move(taskflow));
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    tf::Future<void> run(Taskflow&& taskflow);
+
+    /**
+    @brief runs a taskflow once and invokes a callback upon completion
+
+    @param taskflow a tf::Taskflow object
+    @param callable a callable object to be invoked after this run
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes the given taskflow once and invokes the given
+    callable when the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run(taskflow, [](){ std::cout << "done"; });
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+
+    @attention
+    The executor does not own the given taskflow. It is your responsibility to
+    ensure the taskflow remains alive during its execution.
+    */
+    template <typename C>
+    tf::Future<void> run(Taskflow& taskflow, C&& callable);
+
+    /**
+    @brief runs a moved taskflow once and invokes a callback upon completion
+
+    @param taskflow a moved tf::Taskflow object
+    @param callable a callable object to be invoked after this run
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow once and invokes the given
+    callable when the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+    The executor will take care of the lifetime of the moved taskflow.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run(
+      std::move(taskflow), [](){ std::cout << "done"; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename C>
+    tf::Future<void> run(Taskflow&& taskflow, C&& callable);
+
+    /**
+    @brief runs a taskflow for @c N times
+
+    @param taskflow a tf::Taskflow object
+    @param N number of runs
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes the given taskflow @c N times and returns a
+    tf::Future object that eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_n(taskflow, 2);  // run taskflow 2 times
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+
+    @attention
+    The executor does not own the given taskflow. It is your responsibility to
+    ensure the taskflow remains alive during its execution.
+    */
+    tf::Future<void> run_n(Taskflow& taskflow, size_t N);
+
+    /**
+    @brief runs a moved taskflow for @c N times
+
+    @param taskflow a moved tf::Taskflow object
+    @param N number of runs
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow @c N times and returns a
+    tf::Future object that eventually holds the result of the execution.
+    The executor will take care of the lifetime of the moved taskflow.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_n(
+      std::move(taskflow), 2  // run the moved taskflow 2 times
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    tf::Future<void> run_n(Taskflow&& taskflow, size_t N);
+
+    /**
+    @brief runs a taskflow for @c N times and then invokes a callback
+
+    @param taskflow a tf::Taskflow
+    @param N number of runs
+    @param callable a callable object to be invoked after this run
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes the given taskflow @c N times and invokes
+    the given callable when the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_n(
+      taskflow, 2, [](){ std::cout << "done"; }  // runs taskflow 2 times and invokes
+                                                 // the lambda to print "done"
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+
+    @attention
+    The executor does not own the given taskflow. It is your responsibility to
+    ensure the taskflow remains alive during its execution.
+    */
+    template <typename C>
+    tf::Future<void> run_n(Taskflow& taskflow, size_t N, C&& callable);
+
+    /**
+    @brief runs a moved taskflow for @c N times and then invokes a callback
+
+    @param taskflow a moved tf::Taskflow
+    @param N number of runs
+    @param callable a callable object to be invoked after this run
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow @c N times and invokes the
+    given callable when the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_n(
+      // run the moved taskflow 2 times and invoke the lambda to print "done"
+      std::move(taskflow), 2, [](){ std::cout << "done"; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename C>
+    tf::Future<void> run_n(Taskflow&& taskflow, size_t N, C&& callable);
+
+    /**
+    @brief runs a taskflow multiple times until the predicate becomes true
+
+    @param taskflow a tf::Taskflow
+    @param pred a boolean predicate to return @c true for stop
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes the given taskflow multiple times until
+    the predicate returns @c true.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_until(
+      taskflow, [](){ return rand()%10 == 0; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+
+    @attention
+    The executor does not own the given taskflow. It is your responsibility to
+    ensure the taskflow remains alive during its execution.
+    */
+    template <typename P>
+    tf::Future<void> run_until(Taskflow& taskflow, P&& pred);
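+    // An illustrative sketch of a stateful predicate (not part of the
+    // original header): the callable is copied into the run, so a mutable
+    // counter makes run_until behave like run_n (here, three iterations).
+    //
+    //   executor.run_until(taskflow, [n=3] () mutable { return n-- == 0; }).wait();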
+
+    /**
+    @brief runs a moved taskflow and keeps running it
+    until the predicate becomes true
+
+    @param taskflow a moved tf::Taskflow object
+    @param pred a boolean predicate to return @c true for stop
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow multiple times until
+    the predicate returns @c true.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+    The executor will take care of the lifetime of the moved taskflow.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_until(
+      std::move(taskflow), [](){ return rand()%10 == 0; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename P>
+    tf::Future<void> run_until(Taskflow&& taskflow, P&& pred);
+
+    /**
+    @brief runs a taskflow multiple times until the predicate becomes true and
+    then invokes the callback
+
+    @param taskflow a tf::Taskflow
+    @param pred a boolean predicate to return @c true for stop
+    @param callable a callable object to be invoked after this run completes
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes the given taskflow multiple times until
+    the predicate returns @c true and then invokes the given callable when
+    the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_until(
+      taskflow, [](){ return rand()%10 == 0; }, [](){ std::cout << "done"; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+
+    @attention
+    The executor does not own the given taskflow. It is your responsibility to
+    ensure the taskflow remains alive during its execution.
+    */
+    template <typename P, typename C>
+    tf::Future<void> run_until(Taskflow& taskflow, P&& pred, C&& callable);
+
+    /**
+    @brief runs a moved taskflow and keeps running
+    it until the predicate becomes true and then invokes the callback
+
+    @param taskflow a moved tf::Taskflow
+    @param pred a boolean predicate to return @c true for stop
+    @param callable a callable object to be invoked after this run completes
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow multiple times until
+    the predicate returns @c true and then invokes the given callable when
+    the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+    The executor will take care of the lifetime of the moved taskflow.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_until(
+      std::move(taskflow),
+      [](){ return rand()%10 == 0; }, [](){ std::cout << "done"; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename P, typename C>
+    tf::Future<void> run_until(Taskflow&& taskflow, P&& pred, C&& callable);
+
+    /**
+    @brief runs a target graph and waits until it completes using
+    an internal worker of this executor
+
+    @tparam T target type which has `tf::Graph& T::graph()` defined
+    @param target the target task graph object
+
+    The method runs a target graph which has `tf::Graph& T::graph()` defined
+    and waits until the execution completes.
+    Unlike the typical flow of calling the `tf::Executor::run` series
+    plus waiting on the result, this method must be called by an internal
+    worker of this executor. The caller worker will participate in
+    the work-stealing loop of the scheduler, thereby avoiding potential
+    deadlock caused by blocked waiting.
+
+    @code{.cpp}
+    tf::Executor executor(2);
+    tf::Taskflow taskflow;
+    std::array<tf::Taskflow, 1000> others;
+
+    std::atomic<size_t> counter{0};
+
+    for(size_t n=0; n<1000; n++) {
+      for(size_t i=0; i<1000; i++) {
+        others[n].emplace([&](){ counter++; });
+      }
+      taskflow.emplace([&executor, &tf=others[n]](){
+        executor.corun(tf);
+        //executor.run(tf).wait();  <- blocking the worker without doing anything
+        //                             will introduce deadlock
+      });
+    }
+    executor.run(taskflow).wait();
+    @endcode
+
+    The method is thread-safe as long as the target is not concurrently
+    run by two or more threads.
+
+    @attention
+    You must call tf::Executor::corun from a worker of the calling executor
+    or an exception will be thrown.
+    */
+    template <typename T>
+    void corun(T& target);
+
+    /**
+    @brief keeps running the work-stealing loop until the predicate becomes true
+
+    @tparam P predicate type
+    @param predicate a boolean predicate to indicate when to stop the loop
+
+    The method keeps the caller worker running in the work-stealing loop
+    until the stop predicate becomes true.
+
+    @code{.cpp}
+    taskflow.emplace([&](){
+      std::future<void> fu = std::async([](){
+        std::this_thread::sleep_for(std::chrono::seconds(100));
+      });
+      executor.corun_until([&](){
+        return fu.wait_for(std::chrono::seconds(0)) == std::future_status::ready;
+      });
+    });
+    @endcode
+
+    @attention
+    You must call tf::Executor::corun_until from a worker of the calling
+    executor or an exception will be thrown.
+    */
+    template <typename P>
+    void corun_until(P&& predicate);
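+    // An illustrative sketch of corun_until (not part of the original
+    // header): a worker cooperatively polls a flag set by another thread,
+    // stealing and running tasks instead of blocking while it waits.
+    //
+    //   std::atomic<bool> ready{false};
+    //   taskflow.emplace([&](){
+    //     executor.corun_until([&](){
+    //       return ready.load(std::memory_order_acquire);
+    //     });
+    //   });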
+
+    /**
+    @brief waits for all tasks to complete
+
+    This member function waits until all submitted tasks
+    (e.g., taskflows, asynchronous tasks) finish.
+
+    @code{.cpp}
+    executor.run(taskflow1);
+    executor.run_n(taskflow2, 10);
+    executor.run_n(taskflow3, 100);
+    executor.wait_for_all();  // wait until the above submitted taskflows finish
+    @endcode
+    */
+    void wait_for_all();
+
+    /**
+    @brief queries the number of worker threads
+
+    Each worker represents one unique thread spawned by an executor
+    upon its construction time.
+
+    @code{.cpp}
+    tf::Executor executor(4);
+    std::cout << executor.num_workers();  // 4
+    @endcode
+    */
+    size_t num_workers() const noexcept;
+
+    /**
+    @brief queries the number of running topologies at the time of this call
+
+    When a taskflow is submitted to an executor, a topology is created to store
+    runtime metadata of the running taskflow.
+    When the execution of the submitted taskflow finishes,
+    its corresponding topology will be removed from the executor.
+
+    @code{.cpp}
+    executor.run(taskflow);
+    std::cout << executor.num_topologies();  // 0 or 1 (taskflow still running)
+    @endcode
+    */
+    size_t num_topologies() const;
+
+    /**
+    @brief queries the number of running taskflows with moved ownership
+
+    @code{.cpp}
+    executor.run(std::move(taskflow));
+    std::cout << executor.num_taskflows();  // 0 or 1 (taskflow still running)
+    @endcode
+    */
+    size_t num_taskflows() const;
+
+    /**
+    @brief queries the id of the caller thread in this executor
+
+    Each worker has a unique id in the range of @c 0 to @c N-1 associated with
+    its parent executor.
+    If the caller thread does not belong to the executor, @c -1 is returned.
+
+    @code{.cpp}
+    tf::Executor executor(4);   // 4 workers in the executor
+    executor.this_worker_id();  // -1 (main thread is not a worker)
+
+    taskflow.emplace([&](){
+      std::cout << executor.this_worker_id();  // 0, 1, 2, or 3
+    });
+    executor.run(taskflow);
+    @endcode
+    */
+    int this_worker_id() const;
+
+    // --------------------------------------------------------------------------
+    // Observer methods
+    // --------------------------------------------------------------------------
+
+    /**
+    @brief constructs an observer to inspect the activities of worker threads
+
+    @tparam Observer observer type derived from tf::ObserverInterface
+    @tparam ArgsT argument parameter pack
+
+    @param args arguments to forward to the constructor of the observer
+
+    @return a shared pointer to the created observer
+
+    Each executor manages a list of observers with shared ownership with callers.
+    For each of these observers, the two member functions,
+    tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit
+    will be called before and after the execution of a task.
+
+    This member function is not thread-safe.
+    */
+    template <typename Observer, typename... ArgsT>
+    std::shared_ptr<Observer> make_observer(ArgsT&&... args);
+
+    /**
+    @brief removes an observer from the executor
+
+    This member function is not thread-safe.
+    */
+    template <typename Observer>
+    void remove_observer(std::shared_ptr<Observer> observer);
+
+    /**
+    @brief queries the number of observers
+    */
+    size_t num_observers() const noexcept;
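+    // An illustrative sketch of a custom observer (not part of the original
+    // header; MyObserver is a hypothetical name): derive from
+    // tf::ObserverInterface and attach it via make_observer.
+    //
+    //   struct MyObserver : public tf::ObserverInterface {
+    //     void set_up(size_t num_workers) override final {}
+    //     void on_entry(tf::WorkerView w, tf::TaskView tv) override final {
+    //       std::cout << "worker " << w.id() << " runs " << tv.name() << '\n';
+    //     }
+    //     void on_exit(tf::WorkerView w, tf::TaskView tv) override final {}
+    //   };
+    //   auto obs = executor.make_observer<MyObserver>();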
+
+    // --------------------------------------------------------------------------
+    // Async Task Methods
+    // --------------------------------------------------------------------------
+
+    /**
+    @brief creates a parameterized asynchronous task to run the given function
+
+    @tparam P task parameter type
+    @tparam F callable type
+
+    @param params task parameters
+    @param func callable object
+
+    @return a @std_future that will hold the result of the execution
+
+    The method creates a parameterized asynchronous task
+    to run the given function and returns a @std_future object
+    that eventually will hold the result of the execution.
+
+    @code{.cpp}
+    std::future<int> future = executor.async("name", [](){
+      std::cout << "create an asynchronous task with a name and returns 1\n";
+      return 1;
+    });
+    future.get();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename P, typename F>
+    auto async(P&& params, F&& func);
+
+    /**
+    @brief runs a given function asynchronously
+
+    @tparam F callable type
+
+    @param func callable object
+
+    @return a @std_future that will hold the result of the execution
+
+    The method creates an asynchronous task to run the given function
+    and returns a @std_future object that eventually will hold the result
+    of the return value.
+
+    @code{.cpp}
+    std::future<int> future = executor.async([](){
+      std::cout << "create an asynchronous task and returns 1\n";
+      return 1;
+    });
+    future.get();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename F>
+    auto async(F&& func);
+
+    /**
+    @brief similar to tf::Executor::async but does not return a future object
+
+    @tparam P task parameter type
+    @tparam F callable type
+
+    @param params task parameters
+    @param func callable object
+
+    The method creates a parameterized asynchronous task
+    to run the given function without returning any @std_future object.
+    This member function is more efficient than tf::Executor::async
+    and is recommended when applications do not need a @std_future to acquire
+    the result or synchronize the execution.
+
+    @code{.cpp}
+    executor.silent_async("name", [](){
+      std::cout << "create an asynchronous task with a name and no return\n";
+    });
+    executor.wait_for_all();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename P, typename F>
+    void silent_async(P&& params, F&& func);
+
+    /**
+    @brief similar to tf::Executor::async but does not return a future object
+
+    @tparam F callable type
+
+    @param func callable object
+
+    The method creates an asynchronous task
+    to run the given function without returning any @std_future object.
+    This member function is more efficient than tf::Executor::async
+    and is recommended when applications do not need a @std_future to acquire
+    the result or synchronize the execution.
+
+    @code{.cpp}
+    executor.silent_async([](){
+      std::cout << "create an asynchronous task with no return\n";
+    });
+    executor.wait_for_all();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename F>
+    void silent_async(F&& func);
+
+    // --------------------------------------------------------------------------
+    // Silent Dependent Async Methods
+    // --------------------------------------------------------------------------
+
+    /**
+    @brief runs the given function asynchronously
+    when the given dependents finish
+
+    @tparam F callable type
+    @tparam Tasks task types convertible to tf::AsyncTask
+
+    @param func callable object
+    @param tasks asynchronous tasks on which this execution depends
+
+    @return a tf::AsyncTask handle
+
+    This member function is more efficient than tf::Executor::dependent_async
+    and is recommended when you do not want a @std_future to
+    acquire the result or synchronize the execution.
+    The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+    in which task @c C runs after task @c A and task @c B.
+
+    @code{.cpp}
+    tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); });
+    tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); });
+    executor.silent_dependent_async([](){ printf("C runs after A and B\n"); }, A, B);
+    executor.wait_for_all();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename F, typename... Tasks,
+      std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr
+    >
+    tf::AsyncTask silent_dependent_async(F&& func, Tasks&&... tasks);
+
+    /**
+    @brief runs the given function asynchronously
+    when the given dependents finish
+
+    @tparam P task parameters type
+    @tparam F callable type
+    @tparam Tasks task types convertible to tf::AsyncTask
+
+    @param params task parameters
+    @param func callable object
+    @param tasks asynchronous tasks on which this execution depends
+
+    @return a tf::AsyncTask handle
+
+    This member function is more efficient than tf::Executor::dependent_async
+    and is recommended when you do not want a @std_future to
+    acquire the result or synchronize the execution.
+    The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+    in which task @c C runs after task @c A and task @c B.
+    Assigned task names will appear in the observers of the executor.
+
+    @code{.cpp}
+    tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); });
+    tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); });
+    executor.silent_dependent_async(
+      "C", [](){ printf("C runs after A and B\n"); }, A, B
+    );
+    executor.wait_for_all();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename P, typename F, typename... Tasks,
+      std::enable_if_t<is_task_params_v<P> && all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr
+    >
+    tf::AsyncTask silent_dependent_async(P&& params, F&& func, Tasks&&... tasks);
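+    // An illustrative sketch of a diamond dependency built from
+    // silent_dependent_async handles (not part of the original header):
+    // A precedes B and C; D joins them.
+    //
+    //   tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); });
+    //   tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }, A);
+    //   tf::AsyncTask C = executor.silent_dependent_async([](){ printf("C\n"); }, A);
+    //   executor.silent_dependent_async([](){ printf("D\n"); }, B, C);
+    //   executor.wait_for_all();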
+
+    /**
+    @brief runs the given function asynchronously
+    when the given range of dependents finish
+
+    @tparam F callable type
+    @tparam I iterator type
+
+    @param func callable object
+    @param first iterator to the beginning (inclusive)
+    @param last iterator to the end (exclusive)
+
+    @return a tf::AsyncTask handle
+
+    This member function is more efficient than tf::Executor::dependent_async
+    and is recommended when you do not want a @std_future to
+    acquire the result or synchronize the execution.
+    The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+    in which task @c C runs after task @c A and task @c B.
+
+    @code{.cpp}
+    std::array<tf::AsyncTask, 2> array {
+      executor.silent_dependent_async([](){ printf("A\n"); }),
+      executor.silent_dependent_async([](){ printf("B\n"); })
+    };
+    executor.silent_dependent_async(
+      [](){ printf("C runs after A and B\n"); }, array.begin(), array.end()
+    );
+    executor.wait_for_all();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename F, typename I,
+      std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+    >
+    tf::AsyncTask silent_dependent_async(F&& func, I first, I last);
+
+    /**
+    @brief runs the given function asynchronously
+    when the given range of dependents finish
+
+    @tparam P task parameters type
+    @tparam F callable type
+    @tparam I iterator type
+
+    @param params task parameters
+    @param func callable object
+    @param first iterator to the beginning (inclusive)
+    @param last iterator to the end (exclusive)
+
+    @return a tf::AsyncTask handle
+
+    This member function is more efficient than tf::Executor::dependent_async
+    and is recommended when you do not want a @std_future to
+    acquire the result or synchronize the execution.
+    The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+    in which task @c C runs after task @c A and task @c B.
+    Assigned task names will appear in the observers of the executor.
+
+    @code{.cpp}
+    std::array<tf::AsyncTask, 2> array {
+      executor.silent_dependent_async("A", [](){ printf("A\n"); }),
+      executor.silent_dependent_async("B", [](){ printf("B\n"); })
+    };
+    executor.silent_dependent_async(
+      "C", [](){ printf("C runs after A and B\n"); }, array.begin(), array.end()
+    );
+    executor.wait_for_all();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename P, typename F, typename I,
+      std::enable_if_t<is_task_params_v<P> && !std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+    >
+    tf::AsyncTask silent_dependent_async(P&& params, F&& func, I first, I last);
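+    // The range overloads accept any iterator range over tf::AsyncTask
+    // handles, not just std::array (illustrative sketch, not part of the
+    // original header):
+    //
+    //   std::vector<tf::AsyncTask> deps;
+    //   for(int i=0; i<4; i++) {
+    //     deps.push_back(executor.silent_dependent_async([i](){ printf("%d\n", i); }));
+    //   }
+    //   executor.silent_dependent_async(
+    //     [](){ printf("all done\n"); }, deps.begin(), deps.end()
+    //   );
+    //   executor.wait_for_all();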
+
+    // --------------------------------------------------------------------------
+    // Dependent Async Methods
+    // --------------------------------------------------------------------------
+
+    /**
+    @brief runs the given function asynchronously
+    when the given dependents finish
+
+    @tparam F callable type
+    @tparam Tasks task types convertible to tf::AsyncTask
+
+    @param func callable object
+    @param tasks asynchronous tasks on which this execution depends
+
+    @return a pair of a tf::AsyncTask handle and
+            a @std_future that holds the result of the execution
+
+    The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+    in which task @c C runs after task @c A and task @c B.
+    Task @c C returns a pair of its tf::AsyncTask handle and a std::future
+    that eventually will hold the result of the execution.
+
+    @code{.cpp}
+    tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); });
+    tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); });
+    auto [C, fuC] = executor.dependent_async(
+      [](){
+        printf("C runs after A and B\n");
+        return 1;
+      },
+      A, B
+    );
+    fuC.get();  // C finishes, which in turn means both A and B finish
+    @endcode
+
+    You can mix the use of tf::AsyncTask handles
+    returned by Executor::dependent_async and Executor::silent_dependent_async
+    when specifying task dependencies.
+
+    This member function is thread-safe.
+    */
+    template <typename F, typename... Tasks,
+      std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr
+    >
+    auto dependent_async(F&& func, Tasks&&... tasks);
+
+    /**
+    @brief runs the given function asynchronously
+    when the given dependents finish
+
+    @tparam P task parameters type
+    @tparam F callable type
+    @tparam Tasks task types convertible to tf::AsyncTask
+
+    @param params task parameters
+    @param func callable object
+    @param tasks asynchronous tasks on which this execution depends
+
+    @return a pair of a tf::AsyncTask handle and
+            a @std_future that holds the result of the execution
+
+    The example below creates three named asynchronous tasks, @c A, @c B, and @c C,
+    in which task @c C runs after task @c A and task @c B.
+    Task @c C returns a pair of its tf::AsyncTask handle and a std::future
+    that eventually will hold the result of the execution.
+    Assigned task names will appear in the observers of the executor.
+
+    @code{.cpp}
+    tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); });
+    tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); });
+    auto [C, fuC] = executor.dependent_async(
+      "C",
+      [](){
+        printf("C runs after A and B\n");
+        return 1;
+      },
+      A, B
+    );
+    assert(fuC.get()==1);  // C finishes, which in turn means both A and B finish
+    @endcode
+
+    You can mix the use of tf::AsyncTask handles
+    returned by Executor::dependent_async and Executor::silent_dependent_async
+    when specifying task dependencies.
+
+    This member function is thread-safe.
+    */
+    template <typename P, typename F, typename... Tasks,
+      std::enable_if_t<is_task_params_v<P> && all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr
+    >
+    auto dependent_async(P&& params, F&& func, Tasks&&... tasks);
+
+    /**
+    @brief runs the given function asynchronously
+    when the given range of dependents finish
+
+    @tparam F callable type
+    @tparam I iterator type
+
+    @param func callable object
+    @param first iterator to the beginning (inclusive)
+    @param last iterator to the end (exclusive)
+
+    @return a pair of a tf::AsyncTask handle and
+            a @std_future that holds the result of the execution
+
+    The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+    in which task @c C runs after task @c A and task @c B.
+    Task @c C returns a pair of its tf::AsyncTask handle and a std::future
+    that eventually will hold the result of the execution.
+
+    @code{.cpp}
+    std::array<tf::AsyncTask, 2> array {
+      executor.silent_dependent_async([](){ printf("A\n"); }),
+      executor.silent_dependent_async([](){ printf("B\n"); })
+    };
+    auto [C, fuC] = executor.dependent_async(
+      [](){
+        printf("C runs after A and B\n");
+        return 1;
+      },
+      array.begin(), array.end()
+    );
+    assert(fuC.get()==1);  // C finishes, which in turn means both A and B finish
+    @endcode
+
+    You can mix the use of tf::AsyncTask handles
+    returned by Executor::dependent_async and Executor::silent_dependent_async
+    when specifying task dependencies.
+
+    This member function is thread-safe.
+    */
+    template <typename F, typename I,
+      std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+    >
+    auto dependent_async(F&& func, I first, I last);
+
+    /**
+    @brief runs the given function asynchronously
+    when the given range of dependents finish
+
+    @tparam P task parameters type
+    @tparam F callable type
+    @tparam I iterator type
+
+    @param params task parameters
+    @param func callable object
+    @param first iterator to the beginning (inclusive)
+    @param last iterator to the end (exclusive)
+
+    @return a pair of a tf::AsyncTask handle and
+            a @std_future that holds the result of the execution
+
+    The example below creates three named asynchronous tasks, @c A, @c B, and @c C,
+    in which task @c C runs after task @c A and task @c B.
+    Task @c C returns a pair of its tf::AsyncTask handle and a std::future
+    that eventually will hold the result of the execution.
+    Assigned task names will appear in the observers of the executor.
+
+    @code{.cpp}
+    std::array<tf::AsyncTask, 2> array {
+      executor.silent_dependent_async("A", [](){ printf("A\n"); }),
+      executor.silent_dependent_async("B", [](){ printf("B\n"); })
+    };
+    auto [C, fuC] = executor.dependent_async(
+      "C",
+      [](){
+        printf("C runs after A and B\n");
+        return 1;
+      },
+      array.begin(), array.end()
+    );
+    assert(fuC.get()==1);  // C finishes, which in turn means both A and B finish
+    @endcode
+
+    You can mix the use of tf::AsyncTask handles
+    returned by Executor::dependent_async and Executor::silent_dependent_async
+    when specifying task dependencies.
+
+    This member function is thread-safe.
+    */
+    template <typename P, typename F, typename I,
+      std::enable_if_t<is_task_params_v<P> && !std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+    >
+    auto dependent_async(P&& params, F&& func, I first, I last);
+
+  private:
+
+    const size_t _MAX_STEALS;
+
+    std::mutex _wsq_mutex;
+    std::mutex _taskflows_mutex;
+
+    std::vector<std::thread> _threads;
+    std::vector<Worker> _workers;
+
+#ifdef __cpp_lib_atomic_wait
+    std::atomic<size_t> _num_topologies {0};
+    std::atomic_flag _all_spawned = ATOMIC_FLAG_INIT;
+
+    std::atomic_flag _done = ATOMIC_FLAG_INIT;
+    std::atomic<uint64_t> _state = 0ull;
+#else
+    std::condition_variable _topology_cv;
+    std::mutex _topology_mutex;
+    size_t _num_topologies {0};
+    Notifier _notifier;
+    std::atomic<bool> _done {0};
+#endif
+
+    std::unordered_map<std::thread::id, size_t> _wids;
+    std::list<Taskflow> _taskflows;
+
+    TaskQueue<Node*> _wsq;
+
+    std::unordered_set<std::shared_ptr<ObserverInterface>> _observers;
+
+    Worker* _this_worker();
+
+    bool _wait_for_task(Worker&, Node*&);
+    bool _invoke_module_task_internal(Worker&, Node*);
+
+    void _observer_prologue(Worker&, Node*);
+    void _observer_epilogue(Worker&, Node*);
+    void _spawn(size_t);
+    void _exploit_task(Worker&, Node*&);
+    void _explore_task(Worker&, Node*&);
+    void _schedule(Worker&, Node*);
+    void _schedule(Node*);
+    void _schedule(Worker&, const SmallVector<Node*>&);
+    void _schedule(const SmallVector<Node*>&);
+    void _set_up_topology(Worker*, Topology*);
+    void _set_up_graph(Graph&, Node*, Topology*, int, SmallVector<Node*>&);
+    void _tear_down_topology(Worker&, Topology*);
+    void _tear_down_async(Node*);
+    void _tear_down_dependent_async(Worker&, Node*);
+    void _tear_down_invoke(Worker&, Node*);
+    void _increment_topology();
+    void _decrement_topology();
+    void _invoke(Worker&, Node*);
+    void _invoke_static_task(Worker&, Node*);
+    void _invoke_subflow_task(Worker&, Node*);
+    void _detach_subflow_task(Worker&, Node*, Graph&);
+    void _invoke_condition_task(Worker&, Node*, SmallVector<int>&);
+    void _invoke_multi_condition_task(Worker&, Node*, SmallVector<int>&);
+    void _invoke_module_task(Worker&, Node*);
+    void _invoke_async_task(Worker&, Node*);
+    void _invoke_dependent_async_task(Worker&, Node*);
+    void _process_async_dependent(Node*, tf::AsyncTask&, size_t&);
+    void _process_exception(Worker&, Node*);
+    void _schedule_async_task(Node*);
+    void _corun_graph(Worker&, Node*, Graph&);
+
+    template <typename P>
+    void _corun_until(Worker&, P&&);
+};
+
+// Constructor
+inline Executor::Executor(size_t N) :
+  _MAX_STEALS {((N+1) << 1)},
+  _threads    {N},
+  _workers    {N}
+#ifndef __cpp_lib_atomic_wait
+  ,_notifier  {N}
+#endif
+{
+
+  if(N == 0) {
+    TF_THROW("executor must define at least one worker");
+  }
+
+  _spawn(N);
+
+  // initialize the default observer if requested
+  if(has_env(TF_ENABLE_PROFILER)) {
+    TFProfManager::get()._manage(make_observer<TFProfObserver>());
+  }
+}
+
+// Destructor
+inline Executor::~Executor() {
+
+  // wait for all topologies to complete
+  wait_for_all();
+
+  // shut down the scheduler
+#ifdef __cpp_lib_atomic_wait
+  _done.test_and_set(std::memory_order_relaxed);
+  _state.fetch_add(1, std::memory_order_release);
+  _state.notify_all();
+#else
+  _done = true;
+  _notifier.notify(true);
+#endif
+
+  for(auto& t : _threads) {
+    t.join();
+  }
+}
+
+// Function: num_workers
+inline size_t Executor::num_workers() const noexcept {
+  return _workers.size();
+}
+
+// Function: num_topologies
+inline size_t Executor::num_topologies() const {
+#ifdef __cpp_lib_atomic_wait
+  return _num_topologies.load(std::memory_order_relaxed);
+#else
+  return _num_topologies;
+#endif
+}
+
+// Function: num_taskflows
+inline size_t Executor::num_taskflows() const {
+  return _taskflows.size();
+}
+
+// Function: _this_worker
+inline Worker* Executor::_this_worker() {
+  auto itr = _wids.find(std::this_thread::get_id());
+  return itr == _wids.end() ? nullptr : &_workers[itr->second];
+}
+
+// Function: this_worker_id
+inline int Executor::this_worker_id() const {
+  auto i = _wids.find(std::this_thread::get_id());
+  return i == _wids.end() ? -1 : static_cast<int>(_workers[i->second]._id);
+}
+
+// Procedure: _spawn
+inline void Executor::_spawn(size_t N) {
+
+#ifdef __cpp_lib_atomic_wait
+#else
+  std::mutex mutex;
+  std::condition_variable cond;
+  size_t n=0;
+#endif
+
+  for(size_t id=0; id<N; id++) {
+    // ...
+  }
+
+#ifndef __cpp_lib_atomic_wait
+  std::unique_lock<std::mutex> lock(mutex);
+  cond.wait(lock, [&](){ return n==N; });
+#endif
+}
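+
+// An illustrative distillation of the exploit/explore pattern implemented
+// below (not part of the scheduler): a worker first drains its own queue
+// (exploit) and only then probes randomly chosen victims (explore),
+// yielding after too many failed steals.
+//
+//   while(running) {
+//     while(Node* t = my_queue.pop()) run(t);       // exploit local work
+//     Node* t = queue_of(random_victim()).steal();  // explore others
+//     if(t) run(t); else std::this_thread::yield();
+//   }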
+
+// Function: _corun_until
+template <typename P>
+void Executor::_corun_until(Worker& w, P&& stop_predicate) {
+
+  std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1);
+
+  exploit:
+
+  while(!stop_predicate()) {
+
+    //exploit:
+
+    if(auto t = w._wsq.pop(); t) {
+      _invoke(w, t);
+    }
+    else {
+      size_t num_steals = 0;
+
+      explore:
+
+      t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal();
+
+      if(t) {
+        _invoke(w, t);
+        goto exploit;
+      }
+      else if(!stop_predicate()) {
+        if(num_steals++ > _MAX_STEALS) {
+          std::this_thread::yield();
+        }
+        w._vtm = rdvtm(w._rdgen);
+        goto explore;
+      }
+      else {
+        break;
+      }
+    }
+  }
+}
+
+// Function: _explore_task
+inline void Executor::_explore_task(Worker& w, Node*& t) {
+
+  //assert(_workers[w].wsq.empty());
+  //assert(!t);
+
+  size_t num_steals = 0;
+  size_t num_yields = 0;
+
+  std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1);
+
+  // Here, we write do-while to make the worker steal at once
+  // from the assigned victim.
+  do {
+    t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal();
+
+    if(t) {
+      break;
+    }
+
+    if(num_steals++ > _MAX_STEALS) {
+      std::this_thread::yield();
+      if(num_yields++ > 100) {
+        break;
+      }
+    }
+
+    w._vtm = rdvtm(w._rdgen);
+  }
+#ifdef __cpp_lib_atomic_wait
+  // the _done flag can be checked later in wait_for_task
+  while(!_done.test(std::memory_order_relaxed));
+#else
+  while(!_done);
+#endif
+
+}
+
+// Procedure: _exploit_task
+inline void Executor::_exploit_task(Worker& w, Node*& t) {
+  while(t) {
+    _invoke(w, t);
+    t = w._wsq.pop();
+  }
+}
+
+// Function: _wait_for_task
+inline bool Executor::_wait_for_task(Worker& worker, Node*& t) {
+
+  explore_task:
+
+  _explore_task(worker, t);
+
+  if(t) {
+    return true;
+  }
+
+  // The last thief who successfully stole a task will wake up
+  // another thief worker to avoid starvation.
+//  if(t) {
+//#ifdef __cpp_lib_atomic_wait
+//
+//#else
+//    _notifier.notify(false);
+//#endif
+//    return true;
+//  }
+
+#ifdef __cpp_lib_atomic_wait
+
+  uint64_t new_state = _state.load(std::memory_order_acquire);
+
+  if(_done.test(std::memory_order_relaxed)) {
+    return false;
+  }
+
+  if(!_wsq.empty()) {
+    worker._vtm = worker._id;
+    goto explore_task;
+  }
+
+  for(size_t vtm=0; vtm<_workers.size(); vtm++) {
+    if(!_workers[vtm]._wsq.empty() ||
+       _workers[vtm]._has_task.exchange(false, std::memory_order_acquire) == true) {
+      worker._vtm = vtm;
+      goto explore_task;
+    }
+  }
+
+  _state.wait(new_state, std::memory_order_acquire);
+  goto explore_task;
+
+#else
+  // ---- 2PC guard ----
+  _notifier.prepare_wait(worker._waiter);
+
+  if(!_wsq.empty()) {
+    _notifier.cancel_wait(worker._waiter);
+    worker._vtm = worker._id;
+    goto explore_task;
+  }
+
+  if(_done) {
+    _notifier.cancel_wait(worker._waiter);
+    _notifier.notify(true);
+    return false;
+  }
+
+  // We need to use index-based scanning to avoid data race
+  // with _spawn which may initialize a worker at the same time.
+  for(size_t vtm=0; vtm<_workers.size(); vtm++) {
+    if(!_workers[vtm]._wsq.empty()) {
+      _notifier.cancel_wait(worker._waiter);
+      worker._vtm = vtm;
+      goto explore_task;
+    }
+  }
+
+  // Now I really need to relinquish myself to others
+  _notifier.commit_wait(worker._waiter);
+  goto explore_task;
+#endif
+
+}
+
+// Function: make_observer
+template <typename Observer, typename... ArgsT>
+std::shared_ptr<Observer> Executor::make_observer(ArgsT&&... args) {
+
+  static_assert(
+    std::is_base_of_v<ObserverInterface, Observer>,
+    "Observer must be derived from ObserverInterface"
+  );
+
+  // use a local variable to mimic the constructor
+  auto ptr = std::make_shared<Observer>(std::forward<ArgsT>(args)...);
+
+  ptr->set_up(_workers.size());
+
+  _observers.emplace(std::static_pointer_cast<ObserverInterface>(ptr));
+
+  return ptr;
+}
+
+// Procedure: remove_observer
+template <typename Observer>
+void Executor::remove_observer(std::shared_ptr<Observer> ptr) {
+
+  static_assert(
+    std::is_base_of_v<ObserverInterface, Observer>,
+    "Observer must be derived from ObserverInterface"
+  );
+
+  _observers.erase(std::static_pointer_cast<ObserverInterface>(ptr));
+}
+
+// Function: num_observers
+inline size_t Executor::num_observers() const noexcept {
+  return _observers.size();
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Worker& worker, Node* node) {
+
+  // We need to fetch p before the release such that the read
+  // operation is synchronized properly with other threads to
+  // avoid data race.
+  auto p = node->_priority;
+
+  node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+  // caller is a worker of this pool - starting at v3.5 we do not use
+  // any complicated notification mechanism as the experimental result
+  // has shown no significant advantage.
+  if(worker._executor == this) {
+    worker._wsq.push(node, p);
+#ifdef __cpp_lib_atomic_wait
+    // we exchange the flag first as it is much cheaper than fetch_add
+    if(worker._has_task.exchange(true, std::memory_order_release) == false) {
+      _state.fetch_add(1, std::memory_order_release);
+      _state.notify_one();
+    }
+#else
+    _notifier.notify(false);
+#endif
+    return;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    _wsq.push(node, p);
+  }
+#ifdef __cpp_lib_atomic_wait
+  _state.fetch_add(1, std::memory_order_release);
+  _state.notify_one();
+#else
+  _notifier.notify(false);
+#endif
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Node* node) {
+
+  // We need to fetch p before the release such that the read
+  // operation is synchronized properly with other threads to
+  // avoid data race.
+  auto p = node->_priority;
+
+  node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    _wsq.push(node, p);
+  }
+
+#ifdef __cpp_lib_atomic_wait
+  _state.fetch_add(1, std::memory_order_release);
+  _state.notify_one();
+#else
+  _notifier.notify(false);
+#endif
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Worker& worker, const SmallVector<Node*>& nodes) {
+
+  // We need to capture the node count to avoid accessing the nodes
+  // vector while the parent topology is removed!
+  const auto num_nodes = nodes.size();
+
+  if(num_nodes == 0) {
+    return;
+  }
+
+  // caller is a worker of this pool - starting at v3.5 we do not use
+  // any complicated notification mechanism as the experimental result
+  // has shown no significant advantage.
+  if(worker._executor == this) {
+    for(size_t i=0; i<num_nodes; i++) {
+      // We need to fetch p before the release such that the read
+      // operation is synchronized properly with other threads to
+      // avoid data race.
+      auto p = nodes[i]->_priority;
+      nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release);
+      worker._wsq.push(nodes[i], p);
+#ifdef __cpp_lib_atomic_wait
+      if(worker._has_task.exchange(true, std::memory_order_release) == false) {
+        _state.fetch_add(1, std::memory_order_release);
+        _state.notify_one();
+      }
+#else
+      _notifier.notify(false);
+#endif
+    }
+    return;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    for(size_t k=0; k<num_nodes; k++) {
+      auto p = nodes[k]->_priority;
+      nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
+      _wsq.push(nodes[k], p);
+    }
+  }
+#ifdef __cpp_lib_atomic_wait
+  _state.fetch_add(1, std::memory_order_release);
+  if(num_nodes < _workers.size()) {
+    for(size_t i=0; i<num_nodes; i++) {
+      _state.notify_one();
+    }
+  }
+  else {
+    _state.notify_all();
+  }
+#else
+  _notifier.notify_n(num_nodes);
+#endif
+}
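+
+// Note on the batch notification above and below (illustrative rationale,
+// not from the original header): when fewer tasks than workers are
+// scheduled, one wakeup per task avoids a thundering herd; once the batch
+// saturates the pool, a single notify_all is cheaper.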
+
+// Procedure: _schedule
+inline void Executor::_schedule(const SmallVector<Node*>& nodes) {
+
+  // parent topology may be removed!
+  const auto num_nodes = nodes.size();
+
+  if(num_nodes == 0) {
+    return;
+  }
+
+  // We need to fetch p before the release such that the read
+  // operation is synchronized properly with other threads to
+  // avoid data race.
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    for(size_t k=0; k<num_nodes; k++) {
+      auto p = nodes[k]->_priority;
+      nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
+      _wsq.push(nodes[k], p);
+    }
+  }
+
+#ifdef __cpp_lib_atomic_wait
+  _state.fetch_add(1, std::memory_order_release);
+  if(num_nodes < _workers.size()) {
+    for(size_t i=0; i<num_nodes; i++) {
+      _state.notify_one();
+    }
+  }
+  else {
+    _state.notify_all();
+  }
+#else
+  _notifier.notify_n(num_nodes);
+#endif
+}
+
+// Procedure: _invoke
+inline void Executor::_invoke(Worker& worker, Node* node) {
+
+  // synchronize all outstanding memory operations caused by reordering
+  while(!(node->_state.load(std::memory_order_acquire) & Node::READY));
+
+  begin_invoke:
+
+  SmallVector<int> conds;
+
+  // no need to do other things if the topology is cancelled
+  if(node->_is_cancelled()) {
+    _tear_down_invoke(worker, node);
+    return;
+  }
+
+  // if acquiring semaphore(s) exists, acquire them first
+  if(node->_semaphores && !node->_semaphores->to_acquire.empty()) {
+    SmallVector<Node*> nodes;
+    if(!node->_acquire_all(nodes)) {
+      _schedule(worker, nodes);
+      return;
+    }
+    node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release);
+  }
+
+  // condition task
+  //int cond = -1;
+
+  // switch is faster than nested if-else due to jump table
+  switch(node->_handle.index()) {
+    // static task
+    case Node::STATIC:{
+      _invoke_static_task(worker, node);
+    }
+    break;
+
+    // subflow task
+    case Node::SUBFLOW: {
+      _invoke_subflow_task(worker, node);
+    }
+    break;
+
+    // condition task
+    case Node::CONDITION: {
+      _invoke_condition_task(worker, node, conds);
+    }
+    break;
+
+    // multi-condition task
+    case Node::MULTI_CONDITION: {
+      _invoke_multi_condition_task(worker, node, conds);
+    }
+    break;
+
+    // module task
+    case Node::MODULE: {
+      _invoke_module_task(worker, node);
+    }
+    break;
+
+    // async task
+    case Node::ASYNC: {
+      _invoke_async_task(worker, node);
+      _tear_down_async(node);
+      return;
+    }
+    break;
+
+    // dependent async task
+    case Node::DEPENDENT_ASYNC: {
+      _invoke_dependent_async_task(worker, node);
+      _tear_down_dependent_async(worker, node);
+      if(worker._cache) {
+        node = worker._cache;
+        goto begin_invoke;
+      }
+      return;
+    }
+    break;
+
+    // monostate (placeholder)
+    default:
+    break;
+  }
+
+  //invoke_successors:
+
+  // if releasing semaphores exist, release them
+  if(node->_semaphores && !node->_semaphores->to_release.empty()) {
+    _schedule(worker, node->_release_all());
+  }
+
+  // Reset the join counter to support the cyclic control flow.
+  // + We must do this before scheduling the successors to avoid race
+  //   condition on _dependents.
+  // + We must use fetch_add instead of direct assigning
+  //   because the user-space call on "invoke" may explicitly schedule
+  //   this task again (e.g., pipeline) which can access the join_counter.
+  if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) {
+    node->_join_counter.fetch_add(node->num_strong_dependents(), std::memory_order_relaxed);
+  }
+  else {
+    node->_join_counter.fetch_add(node->num_dependents(), std::memory_order_relaxed);
+  }
+
+  // acquire the parent flow counter
+  auto& j = (node->_parent) ? node->_parent->_join_counter :
+                              node->_topology->_join_counter;
+
+  // Here, we want to cache the latest successor with the highest priority
+  worker._cache = nullptr;
+  auto max_p = static_cast<unsigned>(TaskPriority::MAX);
+
+  // Invoke the task based on the corresponding type
+  switch(node->_handle.index()) {
+
+    // condition and multi-condition tasks
+    case Node::CONDITION:
+    case Node::MULTI_CONDITION: {
+      for(auto cond : conds) {
+        if(cond >= 0 && static_cast<size_t>(cond) < node->_successors.size()) {
+          auto s = node->_successors[cond];
+          // zeroing the join counter for invariant
+          s->_join_counter.store(0, std::memory_order_relaxed);
+          j.fetch_add(1, std::memory_order_relaxed);
+          if(s->_priority <= max_p) {
+            if(worker._cache) {
+              _schedule(worker, worker._cache);
+            }
+            worker._cache = s;
+            max_p = s->_priority;
+          }
+          else {
+            _schedule(worker, s);
+          }
+        }
+      }
+    }
+    break;
+
+    // non-condition task
+    default: {
+      for(size_t i=0; i<node->_successors.size(); ++i) {
+        //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) {
+        if(auto s = node->_successors[i];
+           s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+          j.fetch_add(1, std::memory_order_relaxed);
+          if(s->_priority <= max_p) {
+            if(worker._cache) {
+              _schedule(worker, worker._cache);
+            }
+            worker._cache = s;
+            max_p = s->_priority;
+          }
+          else {
+            _schedule(worker, s);
+          }
+        }
+      }
+    }
+    break;
+  }
+
+  // tear_down the invoke
+  _tear_down_invoke(worker, node);
+
+  // perform tail recursion elimination for the right-most child to reduce
+  // the number of expensive pop/push operations through the task queue
+  if(worker._cache) {
+    node = worker._cache;
+    //node->_state.fetch_or(Node::READY, std::memory_order_release);
+    goto begin_invoke;
+  }
+}
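+
+// An illustrative distillation of the tail-recursion elimination above
+// (not part of the scheduler): instead of pushing the last ready successor
+// and immediately popping it back, the worker keeps one successor in its
+// cache and loops on it directly.
+//
+//   while(node) {
+//     execute(node);
+//     node = pick_one_ready_successor_and_push_the_rest(node);  // worker._cache
+//   }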
+
+// Procedure: _tear_down_invoke
+inline void Executor::_tear_down_invoke(Worker& worker, Node* node) {
+  // we must check parent first before subtracting the join counter,
+  // or it can introduce data race
+  if(auto parent = node->_parent; parent == nullptr) {
+    if(node->_topology->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+      _tear_down_topology(worker, node->_topology);
+    }
+  }
+  // Here we assume the parent is in a busy loop (e.g., corun) waiting for
+  // its join counter to become 0.
+  else {
+    //parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel);
+    parent->_join_counter.fetch_sub(1, std::memory_order_release);
+  }
+  //// module task
+  //else {
+  //  auto id = parent->_handle.index();
+  //  if(parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+  //    if(id == Node::MODULE) {
+  //      return parent;
+  //    }
+  //  }
+  //}
+  //return nullptr;
+}
+
+// Procedure: _observer_prologue
+inline void Executor::_observer_prologue(Worker& worker, Node* node) {
+  for(auto& observer : _observers) {
+    observer->on_entry(WorkerView(worker), TaskView(*node));
+  }
+}
+
+// Procedure: _observer_epilogue
+inline void Executor::_observer_epilogue(Worker& worker, Node* node) {
+  for(auto& observer : _observers) {
+    observer->on_exit(WorkerView(worker), TaskView(*node));
+  }
+}
+
+// Procedure: _process_exception
+inline void Executor::_process_exception(Worker&, Node* node) {
+
+  constexpr static auto flag = Topology::EXCEPTION | Topology::CANCELLED;
+
+  // if the node has a parent, we store the exception in its parent
+  if(auto parent = node->_parent; parent) {
+    if((parent->_state.fetch_or(Node::EXCEPTION, std::memory_order_relaxed) & Node::EXCEPTION) == 0) {
+      parent->_exception_ptr = std::current_exception();
+    }
+    // TODO: if the node has a topology, cancel it to enable early stop
+    //if(auto tpg = node->_topology; tpg) {
+    //  tpg->_state.fetch_or(Topology::CANCELLED, std::memory_order_relaxed);
+    //}
+  }
+  // multiple tasks may throw, so we only take the first thrown exception
+  else if(auto tpg = node->_topology; tpg &&
+    ((tpg->_state.fetch_or(flag, std::memory_order_relaxed) & Topology::EXCEPTION) == 0)
+  ) {
+    tpg->_exception_ptr = std::current_exception();
+  }
+  // TODO: skip the exception that is not associated with any taskflows
+}
+
+// Procedure: _invoke_static_task
+inline void Executor::_invoke_static_task(Worker& worker, Node* node) {
+  _observer_prologue(worker, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, {
+    auto& work = std::get_if<Node::Static>(&node->_handle)->work;
+    switch(work.index()) {
+      case 0:
+        std::get_if<0>(&work)->operator()();
+      break;
+
+      case 1: {
+        Runtime rt(*this, worker, node);
+        std::get_if<1>(&work)->operator()(rt);
+        node->_process_exception();
+      }
+      break;
+    }
+  });
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_subflow_task
+inline void Executor::_invoke_subflow_task(Worker& w, Node* node) {
+  _observer_prologue(w, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(w, node, {
+    auto handle = std::get_if<Node::Subflow>(&node->_handle);
+    handle->subgraph._clear();
+    Subflow sf(*this, w, node, handle->subgraph);
+    handle->work(sf);
+    if(sf._joinable) {
+      _corun_graph(w, node, handle->subgraph);
+    }
+    node->_process_exception();
+  });
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _detach_subflow_task
+inline void Executor::_detach_subflow_task(Worker& w, Node* p, Graph& g) {
+
+  // graph is empty and has no async tasks
+  if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) {
+    return;
+  }
+
+  SmallVector<Node*> src;
+  _set_up_graph(g, nullptr, p->_topology, Node::DETACHED, src);
+
+  {
+    std::lock_guard<std::mutex> lock(p->_topology->_taskflow._mutex);
+    p->_topology->_taskflow._graph._merge(std::move(g));
+  }
+
+  p->_topology->_join_counter.fetch_add(src.size(), std::memory_order_relaxed);
+  _schedule(w, src);
+}
+
+// Procedure: _corun_graph
+inline void Executor::_corun_graph(Worker& w, Node* p, Graph& g) {
+
+  // assert(p);
+
+  // graph is empty and has no async tasks (subflow)
+  if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) {
+    return;
+  }
+
+  SmallVector<Node*> src;
+
+  _set_up_graph(g, p, p->_topology, 0, src);
+  p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed);
+
+  _schedule(w, src);
+
+  _corun_until(w, [p] () -> bool {
+    return p->_join_counter.load(std::memory_order_acquire) == 0; }
+  );
+}
+
+// Procedure: _invoke_condition_task
+inline void Executor::_invoke_condition_task(
+  Worker& worker, Node* node, SmallVector<int>& conds
+) {
+  _observer_prologue(worker, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, {
+    auto& work = std::get_if<Node::Condition>(&node->_handle)->work;
+    switch(work.index()) {
+      case 0:
+        conds = { std::get_if<0>(&work)->operator()() };
+      break;
+
+      case 1: {
+        Runtime rt(*this, worker, node);
+        conds = { std::get_if<1>(&work)->operator()(rt) };
+        node->_process_exception();
+      }
+      break;
+    }
+  });
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_multi_condition_task
+inline void Executor::_invoke_multi_condition_task(
+  Worker& worker, Node* node, SmallVector<int>& conds
+) {
+  _observer_prologue(worker, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, {
+    auto& work = std::get_if<Node::MultiCondition>(&node->_handle)->work;
+    switch(work.index()) {
+      case 0:
+        conds = std::get_if<0>(&work)->operator()();
+      break;
+
+      case 1: {
+        Runtime rt(*this, worker, node);
+        conds = std::get_if<1>(&work)->operator()(rt);
+        node->_process_exception();
+      }
+      break;
+    }
+  });
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_module_task
+inline void Executor::_invoke_module_task(Worker& w, Node* node) {
+  _observer_prologue(w, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(w, node, {
+    _corun_graph(w, node, std::get_if<Node::Module>(&node->_handle)->graph);
+    node->_process_exception();
+  });
+  _observer_epilogue(w, node);
+}
+
+//// Function: _invoke_module_task_internal
+//inline bool Executor::_invoke_module_task_internal(Worker& w, Node* p) {
+//
+//  // acquire the underlying graph
+//  auto& g = std::get_if<Node::Module>(&p->_handle)->graph;
+//
+//  // no need to do anything if the graph is empty
+//  if(g.empty()) {
+//    return false;
+//  }
+//
+//  SmallVector<Node*> src;
+//  _set_up_graph(g, p, p->_topology, 0, src);
+//  p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed);
+//
+//  _schedule(w, src);
+//  return true;
+//}
+
+// Procedure: _invoke_async_task
+inline void Executor::_invoke_async_task(Worker& worker, Node* node) {
+  _observer_prologue(worker, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, {
+    auto& work = std::get_if<Node::Async>(&node->_handle)->work;
+    switch(work.index()) {
+      case 0:
+        std::get_if<0>(&work)->operator()();
+      break;
+
+      case 1: {
+        Runtime rt(*this, worker, node);
+        std::get_if<1>(&work)->operator()(rt);
+      }
+      break;
+    }
+  });
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_dependent_async_task
+inline void Executor::_invoke_dependent_async_task(Worker& worker, Node* node) {
+  _observer_prologue(worker, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, {
+    auto& work = std::get_if<Node::DependentAsync>(&node->_handle)->work;
+    switch(work.index()) {
+      case 0:
+        std::get_if<0>(&work)->operator()();
+      break;
+
+      case 1: {
+        Runtime rt(*this, worker, node);
+        std::get_if<1>(&work)->operator()(rt);
+      }
+      break;
+    }
+  });
+  _observer_epilogue(worker, node);
+}
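+
+// A summary of the run family defined below (illustrative, derived from the
+// forwarding functions themselves): run(f) forwards to run_n(f, 1), and
+// run_n(f, N) forwards to run_until with a counting predicate, so run_until
+// is the single entry point that creates a topology.
+//
+//   executor.run(taskflow);       // == run_n(taskflow, 1)
+//   executor.run_n(taskflow, 5);  // == run_until(taskflow,
+//                                 //      [n=5]() mutable { return n-- == 0; })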
+
+// Function: run
+inline tf::Future<void> Executor::run(Taskflow& f) {
+  return run_n(f, 1, [](){});
+}
+
+// Function: run
+inline tf::Future<void> Executor::run(Taskflow&& f) {
+  return run_n(std::move(f), 1, [](){});
+}
+
+// Function: run
+template <typename C>
+tf::Future<void> Executor::run(Taskflow& f, C&& c) {
+  return run_n(f, 1, std::forward<C>(c));
+}
+
+// Function: run
+template <typename C>
+tf::Future<void> Executor::run(Taskflow&& f, C&& c) {
+  return run_n(std::move(f), 1, std::forward<C>(c));
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat) {
+  return run_n(f, repeat, [](){});
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat) {
+  return run_n(std::move(f), repeat, [](){});
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat, C&& c) {
+  return run_until(
+    f, [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat, C&& c) {
+  return run_until(
+    std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
+
+// Function: run_until
+template <typename P>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& pred) {
+  return run_until(f, std::forward<P>(pred), [](){});
+}
+
+// Function: run_until
+template <typename P>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred) {
+  return run_until(std::move(f), std::forward<P>(pred), [](){});
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& p, C&& c) {
+
+  _increment_topology();
+
+  // Need to check emptiness under the lock since a subflow task may
+  // define detached blocks that modify the taskflow at the same time
+  bool empty;
+  {
+    std::lock_guard<std::mutex> lock(f._mutex);
+    empty = f.empty();
+  }
+
+  // No need to create a real topology; just return a dummy future
+  if(empty || p()) {
+    c();
+    std::promise<void> promise;
+    promise.set_value();
+    _decrement_topology();
+    return tf::Future<void>(promise.get_future());
+  }
+
+  // create a topology for this run
+  auto t = std::make_shared<Topology>(f, std::forward<P>(p), std::forward<C>(c));
+
+  // need to create future before the topology got torn down quickly
+  tf::Future<void> future(t->_promise.get_future(), t);
+
+  // modifying topology needs to be protected under the lock
+  {
+    std::lock_guard<std::mutex> lock(f._mutex);
+    f._topologies.push(t);
+    if(f._topologies.size() == 1) {
+      _set_up_topology(_this_worker(), t.get());
+    }
+  }
+
+  return future;
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred, C&& c) {
+
+  std::list<Taskflow>::iterator itr;
+
+  {
+    std::scoped_lock<std::mutex> lock(_taskflows_mutex);
+    itr = _taskflows.emplace(_taskflows.end(), std::move(f));
+    itr->_satellite = itr;
+  }
+
+  return run_until(*itr, std::forward<P>(pred), std::forward<C>(c));
+}
+
+// Function: corun
+template <typename T>
+void Executor::corun(T& target) {
+
+  auto w = _this_worker();
+
+  if(w == nullptr) {
+    TF_THROW("corun must be called by a worker of the executor");
+  }
+
+  Node parent;  // auxiliary parent
+  _corun_graph(*w, &parent, target.graph());
+  parent._process_exception();
+}
+
+// Function: corun_until
+template <typename P>
+void Executor::corun_until(P&& predicate) {
+
+  auto w = _this_worker();
+
+  if(w == nullptr) {
+    TF_THROW("corun_until must be called by a worker of the executor");
+  }
+
+  _corun_until(*w, std::forward<P>(predicate));
+
+  // TODO: exception?
+}
+
+// Procedure: _increment_topology
+inline void Executor::_increment_topology() {
+#ifdef __cpp_lib_atomic_wait
+  _num_topologies.fetch_add(1, std::memory_order_relaxed);
+#else
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  ++_num_topologies;
+#endif
+}
+
+// Procedure: _decrement_topology
+inline void Executor::_decrement_topology() {
+#ifdef __cpp_lib_atomic_wait
+  if(_num_topologies.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+    _num_topologies.notify_all();
+  }
+#else
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  if(--_num_topologies == 0) {
+    _topology_cv.notify_all();
+  }
+#endif
+}
+
+// Procedure: wait_for_all
+inline void Executor::wait_for_all() {
+#ifdef __cpp_lib_atomic_wait
+  size_t n = _num_topologies.load(std::memory_order_acquire);
+  while(n != 0) {
+    _num_topologies.wait(n, std::memory_order_acquire);
+    n = _num_topologies.load(std::memory_order_acquire);
+  }
+#else
+  std::unique_lock<std::mutex> lock(_topology_mutex);
+  _topology_cv.wait(lock, [&](){ return _num_topologies == 0; });
+#endif
+}
+
+// Function: _set_up_topology
+inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) {
+
+  // ---- under taskflow lock ----
+
+  tpg->_sources.clear();
+  tpg->_taskflow._graph._clear_detached();
+  _set_up_graph(tpg->_taskflow._graph, nullptr, tpg, 0, tpg->_sources);
+  tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed);
+
+  if(worker) {
+    _schedule(*worker, tpg->_sources);
+  }
+  else {
+    _schedule(tpg->_sources);
+  }
+}
+
+// Function: _set_up_graph
+inline void Executor::_set_up_graph(
+  Graph& g, Node* parent, Topology* tpg, int state, SmallVector<Node*>& src
+) {
+  for(auto node : g._nodes) {
+    node->_topology = tpg;
+    node->_parent = parent;
+    node->_state.store(state, std::memory_order_relaxed);
+    if(node->num_dependents() == 0) {
+      src.push_back(node);
+    }
+    node->_set_up_join_counter();
+    node->_exception_ptr = nullptr;
+  }
+}
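+
+// A summary of _tear_down_topology below (illustrative, derived from the
+// code): if the stop predicate is still false and no exception or
+// cancellation occurred, the same topology is reseeded from its sources;
+// otherwise the promise is fulfilled and, for moved taskflows, the
+// satellite entry is erased from the executor.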
+ fetched_tpg->_carry_out_promise(); + + _decrement_topology(); + + // remove the taskflow if it is managed by the executor + // TODO: in the future, we may need to synchronize on wait + // (which means the following code should be moved before set_value) + if(satellite) { + std::scoped_lock satellite_lock(_taskflows_mutex); + _taskflows.erase(*satellite); + } + } + } +} + +// ############################################################################ +// Forward Declaration: Subflow +// ############################################################################ + +inline void Subflow::join() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow not joinable"); + } + + // only the parent worker can join the subflow + _executor._corun_graph(_worker, _parent, _graph); + + // if any exception is caught from subflow tasks, rethrow it + _parent->_process_exception(); + + _joinable = false; +} + +inline void Subflow::detach() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow already joined or detached"); + } + + // only the parent worker can detach the subflow + _executor._detach_subflow_task(_worker, _parent, _graph); + _joinable = false; +} + +// ############################################################################ +// Forward Declaration: Runtime +// ############################################################################ + +// Procedure: schedule +inline void Runtime::schedule(Task task) { + + auto node = task._node; + // need to keep the invariant: when scheduling a task, the task must have + // zero dependency (join counter is 0), + // or we can encounter a bug when inserting a nested flow (e.g., module task) + node->_join_counter.store(0, std::memory_order_relaxed); + + auto& j = node->_parent ? node->_parent->_join_counter : + node->_topology->_join_counter; + j.fetch_add(1, std::memory_order_relaxed); + _executor._schedule(_worker, node); +} + +// Procedure: corun +template <typename T> +void Runtime::corun(T&& target) { + _executor._corun_graph(_worker, _parent, target.graph()); + _parent->_process_exception(); +} + +// Procedure: corun_until +template <typename P> +void Runtime::corun_until(P&& predicate) { + _executor._corun_until(_worker, std::forward<P>

      (predicate)); + // TODO: exception? +} + +// Function: corun_all +inline void Runtime::corun_all() { + _executor._corun_until(_worker, [this] () -> bool { + return _parent->_join_counter.load(std::memory_order_acquire) == 0; + }); + _parent->_process_exception(); +} + +// Destructor +inline Runtime::~Runtime() { + _executor._corun_until(_worker, [this] () -> bool { + return _parent->_join_counter.load(std::memory_order_acquire) == 0; + }); +} + +// ------------------------------------ +// Runtime::silent_async series +// ------------------------------------ + +// Function: _silent_async +template <typename P, typename F> +void Runtime::_silent_async(Worker& w, P&& params, F&& f) { + + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); + + auto node = node_pool.animate( + std::forward<P>

      (params), _parent->_topology, _parent, 0, + std::in_place_type_t<Node::Async>{}, std::forward<F>(f) + ); + + _executor._schedule(w, node); +} + +// Function: silent_async +template <typename F> +void Runtime::silent_async(F&& f) { + _silent_async(*_executor._this_worker(), DefaultTaskParams{}, std::forward<F>(f)); +} + +// Function: silent_async +template <typename P, typename F> +void Runtime::silent_async(P&& params, F&& f) { + _silent_async(*_executor._this_worker(), std::forward<P>

      (params), std::forward<F>(f)); +} + +// Function: silent_async_unchecked +template <typename F> +void Runtime::silent_async_unchecked(F&& f) { + _silent_async(_worker, DefaultTaskParams{}, std::forward<F>(f)); +} + +// Function: silent_async_unchecked +template <typename P, typename F> +void Runtime::silent_async_unchecked(P&& params, F&& f) { + _silent_async(_worker, std::forward<P>

      (params), std::forward<F>(f)); +} + +// ------------------------------------ +// Runtime::async series +// ------------------------------------ + +// Function: _async +template <typename P, typename F> +auto Runtime::_async(Worker& w, P&& params, F&& f) { + + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); + + using R = std::invoke_result_t<std::decay_t<F>>; + + std::packaged_task<R()> p(std::forward<F>(f)); + auto fu{p.get_future()}; + + auto node = node_pool.animate( + std::forward<P>

      (params), _parent->_topology, _parent, 0, + std::in_place_type_t<Node::Async>{}, + [p=make_moc(std::move(p))] () mutable { p.object(); } + ); + + _executor._schedule(w, node); + + return fu; +} + +// Function: async +template <typename F> +auto Runtime::async(F&& f) { + return _async(*_executor._this_worker(), DefaultTaskParams{}, std::forward<F>(f)); +} + +// Function: async +template <typename P, typename F> +auto Runtime::async(P&& params, F&& f) { + return _async(*_executor._this_worker(), std::forward<P>

      (params), std::forward<F>(f)); +} + + + +} // end of namespace tf ----------------------------------------------------- + + + + + + diff --git a/sandbox/run.sh b/sandbox/run.sh new file mode 100755 index 000000000..050951e45 --- /dev/null +++ b/sandbox/run.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# x: TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE +# y: TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE + +for((x=6; x<=12; x=x+1)) do + for((y=6; y<=12; y=y+1)) do + cmake ../ -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_STANDARD=20 -DCMAKE_CXX_FLAGS="-DTF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE=$x -DTF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE=$y -DTF_ENABLE_ATOMIC_NOTIFIER=1" &> /dev/null; + + #echo "Compiling y=$y ..."; + make -j 16 &> /dev/null; + + #echo "Testing y=$y ..."; + make test &> /dev/null; + + for((i=0;i<20;i=i+1)) do + make test | grep "Total" | grep -oP '\d+(\.\d+)?' >> result-$x-$y ; + done + done +done diff --git a/sandbox/sum.sh b/sandbox/sum.sh new file mode 100755 index 000000000..42c3b6e6c --- /dev/null +++ b/sandbox/sum.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# x: TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE +# y: TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE + +for ((x=6; x<=12; x=x+1)) do + for ((y=6; y<=12; y=y+1)) do + filename="result-$x-$y" + + if [[ -f "$filename" ]]; then + # Read the numbers from the file + numbers=$(cat "$filename") + + # Calculate the sum + sum=$(echo "$numbers" | awk '{sum+=$1} END {print sum}') + + # Calculate the mean + count=$(echo "$numbers" | wc -l) + mean=$(echo "$sum / $count" | bc -l) + + # Calculate the standard deviation + stddev=$(echo "$numbers" | awk -v mean="$mean" '{sum+=($1-mean)*($1-mean)} END {print sqrt(sum/NR)}') + + # Calculate the min and max + min=$(echo "$numbers" | sort -n | head -n 1) + max=$(echo "$numbers" | sort -n | tail -n 1) + + # Output the results + printf "%s %.2f %.2f %.2f %.2f %.2f\n" "$filename" "$sum" "$mean" "$stddev" "$min" "$max" + else + echo "File: $filename does not exist." + fi + done +done diff --git a/sandbox/utility/serializer.hpp b/sandbox/utility/serializer.hpp index aab00f23f..5ede84a27 100644 --- a/sandbox/utility/serializer.hpp +++ b/sandbox/utility/serializer.hpp @@ -1126,7 +1126,7 @@ SizeType Deserializer::_load(T&& t) { return t.load(*this); } -} // ned of namespace tf ----------------------------------------------------- +} // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/algorithm/algorithm.hpp b/taskflow/algorithm/algorithm.hpp new file mode 100644 index 000000000..63eb6a900 --- /dev/null +++ b/taskflow/algorithm/algorithm.hpp @@ -0,0 +1,14 @@ +#pragma once + +namespace tf { + +class Algorithm { + + public: + + template <typename T> + static auto make_module_task(T&&); + +}; + +} // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/algorithm/critical.hpp b/taskflow/algorithm/critical.hpp deleted file mode 100644 index c781d2827..000000000 --- a/taskflow/algorithm/critical.hpp +++ /dev/null @@ -1,78 +0,0 @@ -#pragma once - -#include "../core/task.hpp" - -/** -@file critical.hpp -@brief critical include file -*/ - -namespace tf { - -// ---------------------------------------------------------------------------- -// CriticalSection -// ---------------------------------------------------------------------------- - -/** -@class CriticalSection - -@brief class to create a critical region of limited workers to run tasks - -tf::CriticalSection is a warpper over tf::Semaphore and is specialized for -limiting the maximum concurrency over a set of tasks. 
-A critical section starts with an initial count representing that limit. -When a task is added to the critical section, -the task acquires and releases the semaphore internal to the critical section. -This design avoids explicit call of tf::Task::acquire and tf::Task::release. -The following example creates a critical section of one worker and adds -the five tasks to the critical section. - -@code{.cpp} -tf::Executor executor(8); // create an executor of 8 workers -tf::Taskflow taskflow; - -// create a critical section of 1 worker -tf::CriticalSection critical_section(1); - -tf::Task A = taskflow.emplace([](){ std::cout << "A" << std::endl; }); -tf::Task B = taskflow.emplace([](){ std::cout << "B" << std::endl; }); -tf::Task C = taskflow.emplace([](){ std::cout << "C" << std::endl; }); -tf::Task D = taskflow.emplace([](){ std::cout << "D" << std::endl; }); -tf::Task E = taskflow.emplace([](){ std::cout << "E" << std::endl; }); - -critical_section.add(A, B, C, D, E); - -executor.run(taskflow).wait(); -@endcode - -*/ -class CriticalSection : public Semaphore { - - public: - - /** - @brief constructs a critical region of a limited number of workers - */ - explicit CriticalSection(size_t max_workers = 1); - - /** - @brief adds a task into the critical region - */ - template <typename... Tasks> - void add(Tasks...tasks); -}; - -inline CriticalSection::CriticalSection(size_t max_workers) : - Semaphore {max_workers} { -} - -template <typename... Tasks> -void CriticalSection::add(Tasks... tasks) { - (tasks.acquire(*this), ...); - (tasks.release(*this), ...); -} - - -} // end of namespace tf. --------------------------------------------------- - - diff --git a/taskflow/algorithm/data_pipeline.hpp b/taskflow/algorithm/data_pipeline.hpp index 03935480b..4bbf00c36 100644 --- a/taskflow/algorithm/data_pipeline.hpp +++ b/taskflow/algorithm/data_pipeline.hpp @@ -189,7 +189,7 @@ using a module task in a taskflow. The only difference is that tf::DataPipeline provides a data abstraction for users to quickly express dataflow in a pipeline. The following example creates a data-parallel pipeline of three stages -that generate dataflow from `void` to `int`, `std::string`, `float`, and `void`. +that generate dataflow from `void` to `int`, `std::string`, and `void`. @code{.cpp} #include <taskflow/taskflow.hpp> @@ -197,7 +197,7 @@ 
int main() { - // data flow => void -> int -> std::string -> float -> void + // data flow => void -> int -> std::string -> void tf::Taskflow taskflow("pipeline"); tf::Executor executor; @@ -237,7 +237,7 @@ int main() { The pipeline schedules five tokens over four parallel lines in a circular fashion, as depicted below: -@code{.shell-session} +@code{.bash} o -> o -> o | | | v v v diff --git a/taskflow/algorithm/find.hpp b/taskflow/algorithm/find.hpp index cb3d080c2..1f07bb4da 100644 --- a/taskflow/algorithm/find.hpp +++ b/taskflow/algorithm/find.hpp @@ -1,68 +1,15 @@ #pragma once -#include "launch.hpp" +#include "../taskflow.hpp" namespace tf { -namespace detail { - -// Function: find_if_loop -template <typename Iterator, typename Predicate> -bool find_if_loop( - std::atomic<size_t>& offset, - Iterator& beg, - size_t& prev_e, - size_t curr_b, - size_t curr_e, - Predicate predicate -) { - // early prune - if(offset.load(std::memory_order_relaxed) < curr_b) { - return true; - } - std::advance(beg, curr_b - prev_e); - for(size_t x = curr_b; x<curr_e; x++) { - if(predicate(*beg++)) { - atomic_min(offset, x); - return true; - } - } - prev_e = curr_e; - return false; -} - -// Function: find_if_not_loop -template <typename Iterator, typename Predicate> -bool find_if_not_loop( - std::atomic<size_t>& offset, - Iterator& beg, - size_t& prev_e, - size_t curr_b, - size_t curr_e, - Predicate predicate -) { - - // early prune - if(offset.load(std::memory_order_relaxed) < curr_b) { - return true; - } - std::advance(beg, curr_b - prev_e); - for(size_t x = curr_b; x<curr_e; x++) { - if(!predicate(*beg++)) { - atomic_min(offset, x); - return true; - } - } - prev_e = curr_e; - return false; -} - -} // end of namespace detail - // Function: make_find_if_task template <typename B, typename E, typename T, typename UOP, typename P = DefaultPartitioner> auto make_find_if_task(B first, E last, T& result, UOP predicate, P part = P()) { + using namespace std::string_literals; + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; using E_t = std::decay_t<unwrap_ref_decay_t<E>>; @@ -77,60 +24,71 @@ auto make_find_if_task(B first, E last, T& result, UOP predicate, P part = P()) // only myself - no need to spawn another graph if(W <= 1 || N <= part.chunk_size()) { - launch_loop(part, [&](){ - result = std::find_if(beg, end, predicate); - }); + part([=, &result]() mutable { result = std::find_if(beg, end, predicate); })(); return; } + + PreemptionGuard preemption_guard(rt); + // use no more workers than the iteration count if(N < W) { W = N; } - - std::atomic<size_t> offset(N); + + auto mutex = std::make_shared<std::mutex>(); + const auto origin = beg; + result = std::next(origin, N); // static partitioner if constexpr(part.type() == PartitionerType::STATIC) { - - size_t chunk_size; - - for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { - chunk_size = part.adjusted_chunk_size(N, W, w); - launch_loop(W, w, rt, part, - [N, W, curr_b, chunk_size, beg, &predicate, &offset, &part] () mutable { - part.loop_until(N, W, curr_b, chunk_size, - [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { - return detail::find_if_loop( - offset, beg, prev_e, part_b, part_e, predicate + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w) { + size_t chunk_size = part.adjusted_chunk_size(N, W, w); + auto task = part([=, &result] () mutable { + part.loop_until(N, W, curr_b, chunk_size, + [=, &result, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + std::advance(beg, part_b - prev_e); + for(size_t x=part_b; x<part_e; x++) { + if(predicate(*beg++)) { + std::lock_guard<std::mutex> lock(*mutex); + if(size_t offset = std::distance(origin, result); x < offset) { + result = std::next(origin, x); + } + return true; + } } - ); - } - ); + prev_e = part_e; + return false; + } + ); + }); + (++w == W || (curr_b += chunk_size) >= N) ? task() : rt.silent_async(task); } - - rt.corun_all(); } // dynamic partitioner else { - std::atomic<size_t> next(0); - launch_loop(N, W, rt, next, part, - [N, W, beg, &predicate, &offset, &next, &part] () mutable { - part.loop_until(N, W, next, - [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { - return detail::find_if_loop( - offset, beg, prev_e, curr_b, curr_e, predicate - ); + auto next = std::make_shared<std::atomic<size_t>>(0); + for(size_t w=0; w<W; ++w) { + auto task = part([=, &result] () mutable { + part.loop_until(N, W, *next, + [=, &result, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + std::advance(beg, part_b - prev_e); + for(size_t x=part_b; x<part_e; x++) { + if(predicate(*beg++)) { + std::lock_guard<std::mutex> lock(*mutex); + if(size_t offset = std::distance(origin, result); x < offset) { + result = std::next(origin, x); + } + return true; + } + } + prev_e = part_e; + return false; } ); - } - ); + }); + (++w == W) ? 
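+ // (dispatch note: the last spawned chunk runs inline on the calling worker, + // while every earlier chunk is offloaded through rt.silent_async)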
task() : rt.silent_async(task); + } } - - // update the result iterator by the offset - result = std::next(beg, offset.load(std::memory_order_relaxed)); }; } @@ -138,6 +96,8 @@ auto make_find_if_task(B first, E last, T& result, UOP predicate, P part = P()) template auto make_find_if_not_task(B first, E last, T& result, UOP predicate, P part = P()) { + using namespace std::string_literals; + using B_t = std::decay_t>; using E_t = std::decay_t>; @@ -152,66 +112,78 @@ auto make_find_if_not_task(B first, E last, T& result, UOP predicate, P part = P // only myself - no need to spawn another graph if(W <= 1 || N <= part.chunk_size()) { - launch_loop(part, [&](){ - result = std::find_if_not(beg, end, predicate); - }); + part([=, &result] () mutable { result = std::find_if_not(beg, end, predicate); })(); return; } + PreemptionGuard preemption_guard(rt); + if(N < W) { W = N; } - std::atomic offset(N); + auto mutex = std::make_shared(); + const auto origin = beg; + result = std::next(origin, N); // static partitioner if constexpr(part.type() == PartitionerType::STATIC) { - - size_t chunk_size; - - for(size_t w=0, curr_b=0; w lock(*mutex); + if(size_t offset = std::distance(origin, result); x < offset) { + result = std::next(origin, x); + } + return true; + } } - ); - } - ); + prev_e = part_e; + return false; + } + ); + }); + (++w == W || (curr_b += chunk_size) >= N) ? task() : rt.silent_async(task); } - - rt.corun_all(); } // dynamic partitioner else { - std::atomic next(0); - launch_loop(N, W, rt, next, part, - [N, W, beg, &predicate, &offset, &next, &part] () mutable { - part.loop_until(N, W, next, - [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { - return detail::find_if_not_loop( - offset, beg, prev_e, curr_b, curr_e, predicate - ); + auto next = std::make_shared>(0); + for(size_t w=0; w lock(*mutex); + if(size_t offset = std::distance(origin, result); x < offset) { + result = std::next(origin, x); + } + return true; + } + } + prev_e = part_e; + return false; } ); - } - ); + }); + (++w == W) ? 
task() : rt.silent_async(task); + } } - - // update the result iterator by the offset - result = std::next(beg, offset.load(std::memory_order_relaxed)); }; } // Function: make_min_element_task template auto make_min_element_task(B first, E last, T& result, C comp, P part = P()) { + + using namespace std::string_literals; using B_t = std::decay_t>; using E_t = std::decay_t>; @@ -227,17 +199,17 @@ auto make_min_element_task(B first, E last, T& result, C comp, P part = P()) { // only myself - no need to spawn another graph if(W <= 1 || N <= part.chunk_size()) { - launch_loop(part, [&](){ - result = std::min_element(beg, end, comp); - }); + part([=, &result] () mutable { result = std::min_element(beg, end, comp); })(); return; } + PreemptionGuard preemption_guard(rt); + if(N < W) { W = N; } - - std::mutex mutex; + + auto mutex = std::make_shared(); // initialize the result to the first element result = beg++; @@ -246,20 +218,17 @@ auto make_min_element_task(B first, E last, T& result, C comp, P part = P()) { // static partitioner if constexpr(part.type() == PartitionerType::STATIC) { - size_t chunk_size; - - for(size_t w=0, curr_b=0; w lock(mutex); + std::lock_guard lock(*mutex); if(comp(*beg, *result)) { result = beg; } @@ -272,7 +241,7 @@ auto make_min_element_task(B first, E last, T& result, C comp, P part = P()) { // loop reduce part.loop(N, W, curr_b, chunk_size, - [&, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable { + [=, &smallest, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable { if(part_b > prev_e) { std::advance(beg, part_b - prev_e); @@ -291,21 +260,24 @@ auto make_min_element_task(B first, E last, T& result, C comp, P part = P()) { ); // final reduce - std::lock_guard lock(mutex); + std::lock_guard lock(*mutex); if(comp(*smallest, *result)) { result = smallest; } }); + + (++w == W || (curr_b += chunk_size) >= N) ? task() : rt.silent_async(task); } - rt.corun_all(); } // dynamic partitioner else { - std::atomic next(0); - launch_loop(N, W, rt, next, part, - [beg, N, W, &next, &comp, &mutex, &result, &part] () mutable { + auto next = std::make_shared>(0); + + for(size_t w=0; wfetch_add(2, std::memory_order_relaxed); if(s0 >= N) { return; @@ -314,7 +286,7 @@ auto make_min_element_task(B first, E last, T& result, C comp, P part = P()) { std::advance(beg, s0); if(N - s0 == 1) { - std::lock_guard lock(mutex); + std::lock_guard lock(*mutex); if(comp(*beg, *result)) { result = beg; } @@ -327,8 +299,8 @@ auto make_min_element_task(B first, E last, T& result, C comp, P part = P()) { T smallest = comp(*beg1, *beg2) ? beg1 : beg2; // loop reduce - part.loop(N, W, next, - [&, prev_e=s0+2](size_t part_b, size_t part_e) mutable { + part.loop(N, W, *next, + [=, &smallest, prev_e=s0+2](size_t part_b, size_t part_e) mutable { std::advance(beg, part_b - prev_e); for(size_t x=part_b; x lock(mutex); + std::lock_guard lock(*mutex); if(comp(*smallest, *result)) { result = smallest; } - } - ); + }); + (++w == W) ? 
task() : rt.silent_async(task); + } } }; } @@ -353,6 +326,8 @@ auto make_min_element_task(B first, E last, T& result, C comp, P part = P()) { // Function: make_max_element_task template auto make_max_element_task(B first, E last, T& result, C comp, P part = P()) { + + using namespace std::string_literals; using B_t = std::decay_t>; using E_t = std::decay_t>; @@ -368,17 +343,17 @@ auto make_max_element_task(B first, E last, T& result, C comp, P part = P()) { // only myself - no need to spawn another graph if(W <= 1 || N <= part.chunk_size()) { - launch_loop(part, [&](){ - result = std::max_element(beg, end, comp); - }); + part([=, &result] () mutable { result = std::max_element(beg, end, comp); })(); return; } + PreemptionGuard preemption_guard(rt); + if(N < W) { W = N; } - std::mutex mutex; + auto mutex = std::make_shared(); // initialize the result to the first element result = beg++; @@ -387,20 +362,18 @@ auto make_max_element_task(B first, E last, T& result, C comp, P part = P()) { // static partitioner if constexpr(part.type() == PartitionerType::STATIC) { - size_t chunk_size; - - for(size_t w=0, curr_b=0; w lock(mutex); + std::lock_guard lock(*mutex); if(comp(*result, *beg)) { result = beg; } @@ -413,7 +386,7 @@ auto make_max_element_task(B first, E last, T& result, C comp, P part = P()) { // loop reduce part.loop(N, W, curr_b, chunk_size, - [&, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable { + [=, &largest, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable { if(part_b > prev_e) { std::advance(beg, part_b - prev_e); @@ -432,21 +405,23 @@ auto make_max_element_task(B first, E last, T& result, C comp, P part = P()) { ); // final reduce - std::lock_guard lock(mutex); + std::lock_guard lock(*mutex); if(comp(*result, *largest)) { result = largest; } }); + (++w == W || (curr_b += chunk_size) >= N) ? task() : rt.silent_async(task); } - rt.corun_all(); } // dynamic partitioner else { - std::atomic next(0); - launch_loop(N, W, rt, next, part, - [beg, N, W, &next, &comp, &mutex, &result, &part] () mutable { + auto next = std::make_shared>(0); + + for(size_t w=0; wfetch_add(2, std::memory_order_relaxed); if(s0 >= N) { return; @@ -455,7 +430,7 @@ auto make_max_element_task(B first, E last, T& result, C comp, P part = P()) { std::advance(beg, s0); if(N - s0 == 1) { - std::lock_guard lock(mutex); + std::lock_guard lock(*mutex); if(comp(*result, *beg)) { result = beg; } @@ -468,8 +443,8 @@ auto make_max_element_task(B first, E last, T& result, C comp, P part = P()) { T largest = comp(*beg1, *beg2) ? beg2 : beg1; // loop reduce - part.loop(N, W, next, - [&, prev_e=s0+2](size_t part_b, size_t part_e) mutable { + part.loop(N, W, *next, + [=, &largest, prev_e=s0+2](size_t part_b, size_t part_e) mutable { std::advance(beg, part_b - prev_e); for(size_t x=part_b; x lock(mutex); + std::lock_guard lock(*mutex); if(comp(*result, *largest)) { result = largest; } - } - ); + }); + (++w == W) ? 
task() : rt.silent_async(task); + } } }; } - // Function: find_if template Task tf::FlowBuilder::find_if(B first, E last, T& result, UOP predicate, P part) { diff --git a/taskflow/algorithm/for_each.hpp b/taskflow/algorithm/for_each.hpp index 8c98e84ea..aa28434a7 100644 --- a/taskflow/algorithm/for_each.hpp +++ b/taskflow/algorithm/for_each.hpp @@ -1,13 +1,13 @@ #pragma once -#include "launch.hpp" +#include "../taskflow.hpp" namespace tf { // Function: make_for_each_task template auto make_for_each_task(B b, E e, C c, P part = P()) { - + using B_t = std::decay_t>; using E_t = std::decay_t>; @@ -20,26 +20,26 @@ auto make_for_each_task(B b, E e, C c, P part = P()) { size_t W = rt.executor().num_workers(); size_t N = std::distance(beg, end); - // only myself - no need to spawn another graph + // the workload is sequentially doable if(W <= 1 || N <= part.chunk_size()) { - launch_loop(part, [&](){ - std::for_each(beg, end, c); - }); + part([=]() mutable { std::for_each(beg, end, c); })(); return; } - + + PreemptionGuard preemption_guard(rt); + + // use no more workers than the iteration count if(N < W) { W = N; } // static partitioner if constexpr(part.type() == PartitionerType::STATIC) { - size_t chunk_size; - for(size_t w=0, curr_b=0; w= N) ? task() : rt.silent_async(task); } - - rt.corun_all(); } // dynamic partitioner else { - std::atomic next(0); - launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { - part.loop(N, W, next, - [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { - std::advance(beg, part_b - prev_e); - for(size_t x = part_b; x>(0); + for(size_t w=0; w auto make_for_each_index_task(B b, E e, S s, C c, P part = P()){ - + using B_t = std::decay_t>; using E_t = std::decay_t>; using S_t = std::decay_t>; @@ -86,7 +89,7 @@ auto make_for_each_index_task(B b, E e, S s, C c, P part = P()){ S_t inc = s; // nothing to be done if the range is invalid - if(is_range_invalid(beg, end, inc)) { + if(is_index_range_invalid(beg, end, inc)) { return; } @@ -95,57 +98,114 @@ auto make_for_each_index_task(B b, E e, S s, C c, P part = P()){ // only myself - no need to spawn another graph if(W <= 1 || N <= part.chunk_size()) { - launch_loop(part, [&](){ + part([=]() mutable { for(size_t x=0; x(part_b) * inc + beg; - for(size_t x=part_b; x(part_b) * inc + beg; + for(size_t x=part_b; x= N) ? task() : rt.silent_async(task); } - - rt.corun_all(); } // dynamic partitioner else { - std::atomic next(0); - launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { - part.loop(N, W, next, - [&](size_t part_b, size_t part_e) { + auto next = std::make_shared>(0); + for(size_t w=0; w(part_b) * inc + beg; for(size_t x=part_b; x +auto make_for_each_by_index_task(R range, C c, P part = P()){ + + using range_type = std::decay_t>; + + return [=] (Runtime& rt) mutable { + + // fetch the iterator values + range_type r = range; + + // nothing to be done if the range is invalid + if(is_index_range_invalid(r.begin(), r.end(), r.step_size())) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = r.size(); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + part([=]() mutable { c(r); })(); + return; + } + + PreemptionGuard preemption_guard(rt); + + if(N < W) { + W = N; + } + + // static partitioner + if constexpr(part.type() == PartitionerType::STATIC) { + for(size_t w=0, curr_b=0; w= N) ? 
task() : rt.silent_async(task); + } + } + // dynamic partitioner + else { + auto next = std::make_shared<std::atomic<size_t>>(0); + for(size_t w=0; w<W; ++w) { + auto task = part([=]() mutable { + part.loop(N, W, *next, [=](size_t part_b, size_t part_e) mutable { + c(r.discrete_domain(part_b, part_e)); + }); + }); + (++w == W) ? task() : rt.silent_async(task); + } + } + }; +} @@ -155,9 +215,9 @@ Task FlowBuilder::for_each(B beg, E end, C c, P part) { ); } -// ---------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------------------ // for_each_index -// ---------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------------------ // Function: for_each_index template <typename B, typename E, typename S, typename C, typename P> @@ -167,6 +227,13 @@ Task FlowBuilder::for_each_index(B beg, E end, S inc, C c, P part){ ); } +// Function: for_each_by_index +template <typename R, typename C, typename P> +Task FlowBuilder::for_each_by_index(R range, C c, P part){ + return emplace( + make_for_each_by_index_task(range, c, part) + ); +} -} // end of namespace tf ----------------------------------------------------- +} // end of namespace tf ------------------------------------------------------------------------- diff --git a/taskflow/algorithm/launch.hpp b/taskflow/algorithm/launch.hpp deleted file mode 100644 index 527fb2fe0..000000000 --- a/taskflow/algorithm/launch.hpp +++ /dev/null @@ -1,76 +0,0 @@ -#pragma once - -#include <atomic> -#include "../core/async.hpp" - -namespace tf { - -// Function: launch_loop -template <typename P, typename Loop> -TF_FORCE_INLINE void launch_loop(P part, Loop loop) { - - constexpr bool is_default_wrapper_v = std::is_same_v< - typename std::decay_t<P>

      ::closure_wrapper_type, DefaultClosureWrapper - >; - - if constexpr(is_default_wrapper_v) { - loop(); - } - else { - std::invoke(part.closure_wrapper(), loop); - } -} - -// Function: launch_loop -template <typename P, typename Loop> -TF_FORCE_INLINE void launch_loop( - size_t N, - size_t W, - Runtime& rt, - std::atomic<size_t>& next, - P part, - Loop loop -) { - - //static_assert(std::is_lvalue_reference_v, ""); - - using namespace std::string_literals; - - for(size_t w=0; w<W; w++) { - auto r = N - next.load(std::memory_order_relaxed); - // no more loop work to do - finished by previous async tasks - if(!r) { - break; - } - // tail optimization - if(r <= part.chunk_size() || w == W-1) { - launch_loop(part, loop); - break; - } - else { - rt.silent_async_unchecked([=](){ launch_loop(part, loop); }); - } - } -} - -// Function: launch_loop -template <typename P, typename Loop> -TF_FORCE_INLINE void launch_loop( - size_t W, - size_t w, - Runtime& rt, - P part, - Loop loop -) { - using namespace std::string_literals; - if(w == W-1) { - launch_loop(part, loop); - } - else { - rt.silent_async_unchecked([=](){ launch_loop(part, loop); }); - } -} - -} // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/algorithm/module.hpp b/taskflow/algorithm/module.hpp new file mode 100644 index 000000000..03ec3bd78 --- /dev/null +++ b/taskflow/algorithm/module.hpp @@ -0,0 +1,81 @@ +#pragma once + +#include "../taskflow.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- + +/** +@private +*/ +template <typename T> +auto Algorithm::make_module_task(T&& target) { + return [&target=std::forward<T>(target)](tf::Runtime& rt){ + auto& graph = target.graph(); + if(graph.empty()) { + return; + } + PreemptionGuard preemption_guard(rt); + rt._executor._schedule_graph_with_parent( + rt._worker, graph.begin(), graph.end(), rt._parent + ); + }; +} + +// ---------------------------------------------------------------------------- + +/** + * @brief creates a module task using the given target + * + * @tparam T Type of the target object, which must define the method `tf::Graph& graph()`. + * @param target The target object used to create the module task. + * @return module task that can be used by %Taskflow or asynchronous tasking. + * + * + * This example demonstrates how to create and launch multiple taskflows in parallel + * using asynchronous tasking: + * + * @code{.cpp} + * tf::Executor executor; + * + * tf::Taskflow A; + * tf::Taskflow B; + * tf::Taskflow C; + * tf::Taskflow D; + * + * A.emplace([](){ printf("Taskflow A\n"); }); + * B.emplace([](){ printf("Taskflow B\n"); }); + * C.emplace([](){ printf("Taskflow C\n"); }); + * D.emplace([](){ printf("Taskflow D\n"); }); + * + * // launch the four taskflows using asynchronous tasking + * executor.async(tf::make_module_task(A)); + * executor.async(tf::make_module_task(B)); + * executor.async(tf::make_module_task(C)); + * executor.async(tf::make_module_task(D)); + * executor.wait_for_all(); + * @endcode + * + * The module task maker, tf::make_module_task, is basically the same as tf::Taskflow::composed_of + * but provides a more generic interface that can be used beyond %Taskflow. + * For instance, the following two approaches achieve the same functionality. + * + * @code{.cpp} + * // approach 1: composition using composed_of + * tf::Task m1 = taskflow1.composed_of(taskflow2); + * + * // approach 2: composition using make_module_task + * tf::Task m1 = taskflow1.emplace(tf::make_module_task(taskflow2)); + * @endcode + * + * @attention + * Users are responsible for ensuring that the given target remains valid throughout its execution. + * The executor does not assume ownership of the target object. 
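+ * + * As a usage sketch (the taskflow `inner` below is an illustrative name, and it + * must outlive the run), the future returned from asynchronous tasking can also + * be waited on directly instead of calling wait_for_all: + * + * @code{.cpp} + * tf::Executor executor; + * tf::Taskflow inner; + * inner.emplace([](){ printf("inner taskflow\n"); }); + * + * auto fu = executor.async(tf::make_module_task(inner)); + * fu.get(); // block until the module task (and thus `inner`) finishes + * @endcode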
+ */ +template <typename T> +auto make_module_task(T&& target) { + return Algorithm::make_module_task(std::forward<T>(target)); +} + +} // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/algorithm/partitioner.hpp b/taskflow/algorithm/partitioner.hpp index 04406f834..d69c23eb6 100644 --- a/taskflow/algorithm/partitioner.hpp +++ b/taskflow/algorithm/partitioner.hpp @@ -44,12 +44,11 @@ enum class PartitionerType : int { //}; /** -@struct DefaultClosureWrapper +@class DefaultClosureWrapper -@brief default closure wrapper that simplies runs the given closure as is +@brief class to create a default closure wrapper */ -struct DefaultClosureWrapper { -}; +class DefaultClosureWrapper {}; /** @private @@ -89,7 +88,7 @@ tf::GuidedPartitioner or tf::DynamicPartitioner can outperform tf::StaticPartiti In most situations, tf::GuidedPartitioner can deliver decent performance and is thus used as our default partitioner. -@note +@attention Giving the partition size of 0 lets the %Taskflow runtime automatically determine the partition size for the given partitioner. @@ -121,7 +120,7 @@ taskflow.for_each_index(0, 100, 1, executor.run(taskflow).wait(); @endcode -@note +@attention The default closure wrapper (tf::DefaultClosureWrapper) does nothing but invoke the partitioned task (closure). @@ -131,6 +130,11 @@ class PartitionerBase : public IsPartitioner { public: + /** + @brief indicating if the given closure wrapper is a default wrapper (i.e., empty) + */ + constexpr static bool is_default_wrapper_v = std::is_same_v<C, DefaultClosureWrapper>; + /** @brief the closure type */ @@ -169,12 +173,31 @@ class PartitionerBase : public IsPartitioner { */ const C& closure_wrapper() const { return _closure_wrapper; } + /** + @brief acquire a mutable access to the closure wrapper object + */ + C& closure_wrapper() { return _closure_wrapper; } + /** @brief modify the closure wrapper object */ template <typename F> void closure_wrapper(F&& fn) { _closure_wrapper = std::forward<F>(fn); } + /** + @brief wraps the given callable with the associated closure wrapper + */ + template <typename F> + TF_FORCE_INLINE decltype(auto) operator () (F&& callable) { + if constexpr(is_default_wrapper_v) { + return std::forward<F>(callable); + } + else { + // closure wrapper is stateful - capture it by reference + return [this, c=std::forward<F>(callable)]() mutable { _closure_wrapper(c); }; + } + } + protected: /** @@ -197,7 +220,7 @@ class PartitionerBase : public IsPartitioner { @tparam C closure wrapper type (default tf::DefaultClosureWrapper) -@brief class to construct a guided partitioner for scheduling parallel algorithms +@brief class to create a guided partitioner for scheduling parallel algorithms The size of a partition is proportional to the number of unassigned iterations divided by the number of workers, @@ -290,7 +313,7 @@ class GuidedPartitioner : public PartitionerBase<C> { if(curr_b >= N) { return; } - func(curr_b, std::min(curr_b + chunk_size, N)); + func(curr_b, (std::min)(curr_b + chunk_size, N)); } break; } @@ -301,7 +324,7 @@ class GuidedPartitioner : public PartitionerBase<C> { q = chunk_size; } //size_t curr_e = (q <= r) ? 
curr_b + q : N; - size_t curr_e = std::min(curr_b + q, N); + size_t curr_e = (std::min)(curr_b + q, N); if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed, std::memory_order_relaxed)) { func(curr_b, curr_e); @@ -338,7 +361,7 @@ class GuidedPartitioner : public PartitionerBase { if(curr_b >= N) { return; } - if(func(curr_b, std::min(curr_b + chunk_size, N))) { + if(func(curr_b, (std::min)(curr_b + chunk_size, N))) { return; } } @@ -351,7 +374,7 @@ class GuidedPartitioner : public PartitionerBase { q = chunk_size; } //size_t curr_e = (q <= r) ? curr_b + q : N; - size_t curr_e = std::min(curr_b + q, N); + size_t curr_e = (std::min)(curr_b + q, N); if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed, std::memory_order_relaxed)) { if(func(curr_b, curr_e)) { @@ -372,7 +395,7 @@ class GuidedPartitioner : public PartitionerBase { /** @class DynamicPartitioner -@brief class to construct a dynamic partitioner for scheduling parallel algorithms +@brief class to create a dynamic partitioner for scheduling parallel algorithms @tparam C closure wrapper type (default tf::DefaultClosureWrapper) @@ -453,7 +476,7 @@ class DynamicPartitioner : public PartitionerBase { size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); while(curr_b < N) { - func(curr_b, std::min(curr_b + chunk_size, N)); + func(curr_b, (std::min)(curr_b + chunk_size, N)); curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); } } @@ -472,7 +495,7 @@ class DynamicPartitioner : public PartitionerBase { size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); while(curr_b < N) { - if(func(curr_b, std::min(curr_b + chunk_size, N))) { + if(func(curr_b, (std::min)(curr_b + chunk_size, N))) { return; } curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); @@ -585,7 +608,7 @@ class StaticPartitioner : public PartitionerBase { ) { size_t stride = W * chunk_size; while(curr_b < N) { - size_t curr_e = std::min(curr_b + chunk_size, N); + size_t curr_e = (std::min)(curr_b + chunk_size, N); func(curr_b, curr_e); curr_b += stride; } @@ -602,7 +625,7 @@ class StaticPartitioner : public PartitionerBase { ) { size_t stride = W * chunk_size; while(curr_b < N) { - size_t curr_e = std::min(curr_b + chunk_size, N); + size_t curr_e = (std::min)(curr_b + chunk_size, N); if(func(curr_b, curr_e)) { return; } @@ -719,8 +742,8 @@ class RandomPartitioner : public PartitionerBase { std::swap(b1, b2); } - b1 = std::max(b1, size_t{1}); - b2 = std::max(b2, b1 + 1); + b1 = (std::max)(b1, size_t{1}); + b2 = (std::max)(b2, b1 + 1); return {b1, b2}; } @@ -748,7 +771,7 @@ class RandomPartitioner : public PartitionerBase { size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); while(curr_b < N) { - func(curr_b, std::min(curr_b + chunk_size, N)); + func(curr_b, (std::min)(curr_b + chunk_size, N)); chunk_size = dist(engine); curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); } @@ -773,7 +796,7 @@ class RandomPartitioner : public PartitionerBase { size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); while(curr_b < N) { - if(func(curr_b, std::min(curr_b + chunk_size, N))){ + if(func(curr_b, (std::min)(curr_b + chunk_size, N))){ return; } chunk_size = dist(engine); @@ -784,14 +807,14 @@ class RandomPartitioner : public PartitionerBase { private: float _alpha {0.01f}; - float _beta {0.5f}; + float _beta {0.50f}; }; /** @brief default partitioner set to tf::GuidedPartitioner -Guided partitioner can achieve decent performance for most parallel 
algorithms, -especially for those with irregular and unbalanced workload per iteration. +The guided partitioning algorithm can achieve stable and decent performance +for most parallel algorithms. */ using DefaultPartitioner = GuidedPartitioner<>; diff --git a/taskflow/algorithm/pipeline.hpp b/taskflow/algorithm/pipeline.hpp index 79689d087..3a5b470ba 100644 --- a/taskflow/algorithm/pipeline.hpp +++ b/taskflow/algorithm/pipeline.hpp @@ -377,7 +377,7 @@ executor.run(taskflow).wait(); The above example creates a pipeline graph that schedules five tokens over four parallel lines in a circular fashion, as depicted below: -@code{.shell-session} +@code{.bash} o -> o -> o | | | v v v @@ -1032,7 +1032,7 @@ The above example creates a pipeline graph that schedules five tokens over four parallel lines in a circular fashion, first going through three serial pipes and then five serial pipes: -@code{.shell-session} +@code{.bash} # initial construction of three serial pipes o -> o -> o | | | diff --git a/taskflow/algorithm/reduce.hpp b/taskflow/algorithm/reduce.hpp index b280934df..2eab24a66 100644 --- a/taskflow/algorithm/reduce.hpp +++ b/taskflow/algorithm/reduce.hpp @@ -1,17 +1,19 @@ #pragma once -#include "launch.hpp" +#include "../taskflow.hpp" namespace tf { // Function: make_reduce_task template <typename B, typename E, typename T, typename O, typename P = DefaultPartitioner> auto make_reduce_task(B b, E e, T& init, O bop, P part = P()) { + + using namespace std::string_literals; using B_t = std::decay_t<unwrap_ref_decay_t<B>>; using E_t = std::decay_t<unwrap_ref_decay_t<E>>; - return [=, &r=init] (Runtime& rt) mutable { + return [=, &init] (Runtime& rt) mutable { // fetch the iterator values B_t beg = b; @@ -22,36 +24,34 @@ auto make_reduce_task(B b, E e, T& init, O bop, P part = P()) { // only myself - no need to spawn another graph if(W <= 1 || N <= part.chunk_size()) { - launch_loop(part, [&](){ - for(; beg!=end; r = bop(r, *beg++)); - }); + part([=, &init] () mutable { for(; beg!=end; init = bop(init, *beg++)); })(); return; } + + PreemptionGuard preemption_guard(rt); if(N < W) { W = N; } - std::mutex mtx; + auto mutex = std::make_shared<std::mutex>(); // static partitioner if constexpr(part.type() == PartitionerType::STATIC) { - size_t chunk_size; - - for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { - chunk_size = part.adjusted_chunk_size(N, W, w); - launch_loop(W, w, rt, part, - [beg, curr_b, chunk_size, N, W, &bop, &mtx, &r, &part] () mutable { + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w) { + size_t chunk_size = part.adjusted_chunk_size(N, W, w); + auto task = part([=, &init] () mutable { std::advance(beg, curr_b); if(N - curr_b == 1) { - std::lock_guard<std::mutex> lock(mtx); - r = bop(r, *beg); + std::lock_guard<std::mutex> lock(*mutex); + init = bop(init, *beg); return; } @@ -61,7 +61,7 @@ auto make_reduce_task(B b, E e, T& init, O bop, P part = P()) { // loop reduce part.loop(N, W, curr_b, chunk_size, - [&, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable { + [=, &sum, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable { if(part_b > prev_e) { std::advance(beg, part_b - prev_e); @@ -78,51 +78,57 @@ auto make_reduce_task(B b, E e, T& init, O bop, P part = P()) { ); // final reduce - std::lock_guard<std::mutex> lock(mtx); - r = bop(r, sum); + std::lock_guard<std::mutex> lock(*mutex); + init = bop(init, sum); }); + + (++w == W || (curr_b += chunk_size) >= N) ? 
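+ // (note the side effect in this test: curr_b advances by chunk_size, so the + // loop also stops once all N iterations have been claimed)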
task() : rt.silent_async(task); } - rt.corun_all(); } // dynamic partitioner else { - std::atomic next(0); - launch_loop(N, W, rt, next, part, [=, &bop, &mtx, &next, &r, &part] () mutable { - // pre-reduce - size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + auto next = std::make_shared>(0); + + for(size_t w=0; w= N) { - return; - } + auto task = part([=, &init] () mutable { + // pre-reduce + size_t s0 = next->fetch_add(2, std::memory_order_relaxed); - std::advance(beg, s0); + if(s0 >= N) { + return; + } - if(N - s0 == 1) { - std::lock_guard lock(mtx); - r = bop(r, *beg); - return; - } + std::advance(beg, s0); - auto beg1 = beg++; - auto beg2 = beg++; + if(N - s0 == 1) { + std::lock_guard lock(*mutex); + init = bop(init, *beg); + return; + } - T sum = bop(*beg1, *beg2); - - // loop reduce - part.loop(N, W, next, - [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { - std::advance(beg, curr_b - prev_e); - for(size_t x=curr_b; x lock(mtx); - r = bop(r, sum); - }); + ); + + // final reduce + std::lock_guard lock(*mutex); + init = bop(init, sum); + }); + (++w == W) ? task() : rt.silent_async(task); + } } }; } @@ -134,10 +140,11 @@ template < > auto make_transform_reduce_task(B b, E e, T& init, BOP bop, UOP uop, P part = P()) { + using namespace std::string_literals; using B_t = std::decay_t>; using E_t = std::decay_t>; - return [=, &r=init] (Runtime& rt) mutable { + return [=, &init] (Runtime& rt) mutable { // fetch the iterator values B_t beg = b; @@ -148,33 +155,32 @@ auto make_transform_reduce_task(B b, E e, T& init, BOP bop, UOP uop, P part = P( // only myself - no need to spawn another graph if(W <= 1 || N <= part.chunk_size()) { - launch_loop(part, [&](){ - for(; beg!=end; r = bop(std::move(r), uop(*beg++))); - }); + part([=, &init] () mutable { for(; beg!=end; init = bop(std::move(init), uop(*beg++))); })(); return; } + + PreemptionGuard preemption_guard(rt); if(N < W) { W = N; } - std::mutex mtx; + auto mutex = std::make_shared(); // static partitioner if constexpr(part.type() == PartitionerType::STATIC) { - - size_t chunk_size; - for(size_t w=0, curr_b=0; w lock(mtx); - r = bop(std::move(r), uop(*beg)); + std::lock_guard lock(*mutex); + init = bop(std::move(init), uop(*beg)); return; } @@ -186,7 +192,7 @@ auto make_transform_reduce_task(B b, E e, T& init, BOP bop, UOP uop, P part = P( // loop reduce part.loop(N, W, curr_b, chunk_size, - [&, prev_e=curr_b+(chunk_size == 1 ? 1 : 2)] + [=, &sum, prev_e=curr_b+(chunk_size == 1 ? 1 : 2)] (size_t part_b, size_t part_e) mutable { if(part_b > prev_e) { std::advance(beg, part_b - prev_e); @@ -202,53 +208,56 @@ auto make_transform_reduce_task(B b, E e, T& init, BOP bop, UOP uop, P part = P( ); // final reduce - std::lock_guard lock(mtx); - r = bop(std::move(r), std::move(sum)); + std::lock_guard lock(*mutex); + init = bop(std::move(init), std::move(sum)); }); + + (++w == W || (curr_b += chunk_size) >= N) ? 
task() : rt.silent_async(task); } - - rt.corun_all(); } // dynamic partitioner else { - std::atomic next(0); - - launch_loop(N, W, rt, next, part, [=, &bop, &uop, &mtx, &next, &r, &part] () mutable { - // pre-reduce - size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + auto next = std::make_shared>(0); + for(size_t w=0; w= N) { - return; - } + // pre-reduce + size_t s0 = next->fetch_add(2, std::memory_order_relaxed); - std::advance(beg, s0); + if(s0 >= N) { + return; + } - if(N - s0 == 1) { - std::lock_guard lock(mtx); - r = bop(std::move(r), uop(*beg)); - return; - } + std::advance(beg, s0); - auto beg1 = beg++; - auto beg2 = beg++; + if(N - s0 == 1) { + std::lock_guard lock(*mutex); + init = bop(std::move(init), uop(*beg)); + return; + } - T sum = bop(uop(*beg1), uop(*beg2)); - - // loop reduce - part.loop(N, W, next, - [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { - std::advance(beg, curr_b - prev_e); - for(size_t x=curr_b; x lock(mtx); - r = bop(std::move(r), std::move(sum)); - }); + ); + + // final reduce + std::lock_guard lock(*mutex); + init = bop(std::move(init), std::move(sum)); + }); + (++w == W) ? task() : rt.silent_async(task); + } } }; } @@ -262,6 +271,8 @@ template < auto make_transform_reduce_task( B1 b1, E1 e1, B2 b2, T& init, BOP_R bop_r, BOP_T bop_t, P part = P() ) { + + using namespace std::string_literals; using B1_t = std::decay_t>; using E1_t = std::decay_t>; @@ -279,33 +290,31 @@ auto make_transform_reduce_task( // only myself - no need to spawn another graph if(W <= 1 || N <= part.chunk_size()) { - launch_loop(part, [&](){ - for(; beg1!=end1; r = bop_r(std::move(r), bop_t(*beg1++, *beg2++))); - }); + part([=, &r] () mutable { for(; beg1!=end1; r = bop_r(std::move(r), bop_t(*beg1++, *beg2++))); })(); return; } + + PreemptionGuard preemption_guard(rt); if(N < W) { W = N; } - std::mutex mtx; + auto mutex = std::make_shared(); // static partitioner if constexpr(part.type() == PartitionerType::STATIC) { - - size_t chunk_size; - for(size_t w=0, curr_b=0; w lock(mtx); + std::lock_guard lock(*mutex); r = bop_r(std::move(r), bop_t(*beg1, *beg2)); return; } @@ -315,7 +324,7 @@ auto make_transform_reduce_task( // loop reduce part.loop(N, W, curr_b, chunk_size, - [&, prev_e=curr_b+(chunk_size == 1 ? 1 : 2)] + [=, &sum, prev_e=curr_b+(chunk_size == 1 ? 1 : 2)] (size_t part_b, size_t part_e) mutable { if(part_b > prev_e) { std::advance(beg1, part_b - prev_e); @@ -332,64 +341,161 @@ auto make_transform_reduce_task( ); // final reduce - std::lock_guard lock(mtx); + std::lock_guard lock(*mutex); r = bop_r(std::move(r), std::move(sum)); - }); + }); + + (++w == W || (curr_b += chunk_size) >= N) ? 
task() : rt.silent_async(task); } - - rt.corun_all(); } // dynamic partitioner else { - std::atomic next(0); + auto next = std::make_shared>(0); - launch_loop(N, W, rt, next, part, [=, &bop_r, &bop_t, &mtx, &next, &r, &part] () mutable { - // pre-reduce - size_t s0 = next.fetch_add(2, std::memory_order_relaxed); - - if(s0 >= N) { - return; - } - - std::advance(beg1, s0); - std::advance(beg2, s0); - - if(N - s0 == 1) { - std::lock_guard lock(mtx); - r = bop_r(std::move(r), bop_t(*beg1, *beg2)); - return; - } - - auto beg11 = beg1++; - auto beg12 = beg1++; - auto beg21 = beg2++; - auto beg22 = beg2++; - - T sum = bop_r(bop_t(*beg11, *beg21), bop_t(*beg12, *beg22)); - - // loop reduce - part.loop(N, W, next, - [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { - std::advance(beg1, curr_b - prev_e); - std::advance(beg2, curr_b - prev_e); - for(size_t x=curr_b; xfetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg1, s0); + std::advance(beg2, s0); + + if(N - s0 == 1) { + std::lock_guard lock(*mutex); + r = bop_r(std::move(r), bop_t(*beg1, *beg2)); + return; } - ); + + auto beg11 = beg1++; + auto beg12 = beg1++; + auto beg21 = beg2++; + auto beg22 = beg2++; + + T sum = bop_r(bop_t(*beg11, *beg21), bop_t(*beg12, *beg22)); + + // loop reduce + part.loop(N, W, *next, + [=, &sum, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg1, curr_b - prev_e); + std::advance(beg2, curr_b - prev_e); + for(size_t x=curr_b; x lock(mtx); - r = bop_r(std::move(r), std::move(sum)); - }); + // final reduce + std::lock_guard lock(*mutex); + r = bop_r(std::move(r), std::move(sum)); + }); + + (++w == W) ? task() : rt.silent_async(task); + } } }; } -// ---------------------------------------------------------------------------- + +// Function: make_reduce_by_index_task +template +auto make_reduce_by_index_task(R range, T& init, L lop, G gop, P part = P()) { + + using range_type = std::decay_t>; + + return [=, &init] (Runtime& rt) mutable { + + // fetch the iterator values + range_type r = range; + + // nothing to be done if the range is invalid + if(is_index_range_invalid(r.begin(), r.end(), r.step_size())) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = r.size(); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + part([=, &init] () mutable { init = lop(r, std::move(init)); })(); + return; + } + + PreemptionGuard preemption_guard(rt); + + if(N < W) { + W = N; + } + + auto mutex = std::make_shared(); + + // static partitioner + if constexpr(part.type() == PartitionerType::STATIC) { + + for(size_t w=0, curr_b=0; w tmp; + + // loop reduce + part.loop(N, W, curr_b, chunk_size, [=, &tmp](size_t part_b, size_t part_e) mutable { + tmp = lop(r.discrete_domain(part_b, part_e), std::move(tmp)); + }); + + // final reduce - tmp is guaranteed to have value + // assert(tmp.has_value()); + std::lock_guard lock(*mutex); + init = gop(std::move(init), std::move(*tmp)); + }); + + (++w == W || (curr_b += chunk_size) >= N) ? 
task() : rt.silent_async(task); + } + } + }; +} + +// ------------------------------------------------------------------------------------------------ // default reduction -// ---------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------------------ // Function: reduce template <typename B, typename E, typename T, typename O, typename P> @@ -397,9 +503,9 @@ Task FlowBuilder::reduce(B beg, E end, T& init, O bop, P part) { return emplace(make_reduce_task(beg, end, init, bop, part)); } -// ---------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------------------ // default transform and reduction -// ---------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------------------ // Function: transform_reduce template <typename B, typename E, typename T, typename BOP, typename UOP, typename P> +// Function: reduce_by_index +template <typename R, typename T, typename L, typename G, typename P> +Task FlowBuilder::reduce_by_index(R range, T& init, L lop, G gop, P part) { + return emplace(make_reduce_by_index_task(range, init, lop, gop, part)); } -} // end of namespace tf ----------------------------------------------------- +} // end of namespace tf ------------------------------------------------------------------------- diff --git a/taskflow/algorithm/scan.hpp b/taskflow/algorithm/scan.hpp index c1682126b..3677ab19d 100644 --- a/taskflow/algorithm/scan.hpp +++ b/taskflow/algorithm/scan.hpp @@ -1,21 +1,132 @@ #pragma once -#include "launch.hpp" +#include "../taskflow.hpp" namespace tf { +/* + +Block-parallel scan algorithm: + +----------------------------------------------------------------- +| block 1 | block 2 | block 3 | block 4 | +----------------------------------------------------------------- + + ----------------------------- + | B1 | B2 | B3 | B4 | // scan block sum to auxiliary array + ----------------------------- + | | + v v + ----------------------------- + | B1 | B2 | B3 | B4 | // scan block sums + ----------------------------- + | + | // add scanned block sum i to all + | // values of scanned block i+1 + v +----------------------------------------------------------------- +| block 1 | block 2 | block 3 | block 4 | +----------------------------------------------------------------- + +Example OpenMP implementation for inclusive scan: + +void inclusive_scan(std::vector<int>& data) { + + int n = data.size(); + int num_threads; + + #pragma omp parallel + { + num_threads = omp_get_num_threads(); + } + + std::vector<int> partial_sums(num_threads, 0); + + // Step 1: Up-sweep + #pragma omp parallel + { + int tid = omp_get_thread_num(); + int chunk_size = (n + num_threads - 1) / num_threads; + int start = tid * chunk_size; + int end = std::min(start + chunk_size, n); + + // Compute partial sum + for (int i = start + 1; i < end; ++i) { + data[i] += data[i - 1]; + } + partial_sums[tid] = data[end - 1]; + } + + // Step 2: Propagate partial sums + for (int i = 1; i < num_threads; 
++i) { + partial_sums[i] += partial_sums[i - 1]; + } + + // Step 3: Down-sweep + #pragma omp parallel + { + int tid = omp_get_thread_num(); + int chunk_size = (n + num_threads - 1) / num_threads; + int start = tid * chunk_size; + int end = std::min(start + chunk_size, n); + + // Adjust with partial sums + if (tid > 0) { + for (int i = start; i < end; ++i) { + data[i] += partial_sums[tid - 1]; + } + } + } +} + +*/ + namespace detail { -// Function: scan_loop +template +struct ScanData { + + ScanData(size_t N, size_t c) : buf(N), counter(c) {} + + std::vector> buf; + std::atomic counter; +}; + +// down scan task +template +auto make_dscan_task( + std::shared_ptr sdata, + I d_beg, + B bop, + size_t w, + size_t block_size +) { + return [=, sdata=std::move(sdata)]() mutable { + for(size_t i=0; ibuf[w-1].data, *d_beg); + } + }; +} + +// middle scan task +template +auto make_mscan_task(std::shared_ptr sdata, B bop) { + return [=, sdata=std::move(sdata)](){ + for(size_t i=1; ibuf.size(); i++) { + sdata->buf[i].data = bop(sdata->buf[i-1].data, sdata->buf[i].data); + } + }; +} + template void scan_loop( tf::Runtime& rt, - std::atomic& counter, - BufferT& buf, - B bop, - Iterator d_beg, + std::atomic& counter, + BufferT& buf, + B bop, + Iterator d_beg, size_t W, - size_t w, + size_t w, size_t chunk_size ){ // whoever finishes the last performs global scan @@ -29,29 +140,28 @@ void scan_loop( // first worker no need to do any work if(w==0) { return; - } + } // need to do public corun because multiple workers can call this rt.executor().corun_until([&counter](){ return counter.load(std::memory_order_acquire) == 0; }); - + // block addup for(size_t i=0; i>, void>* = nullptr -> -auto make_inclusive_scan_task( - B first, E last, D d_first, BOP bop, P part = P() -) { +template +auto make_inclusive_scan_task(B first, E last, D d_first, BOP bop) { + + using namespace std::string_literals; using B_t = std::decay_t>; using E_t = std::decay_t>; @@ -74,9 +184,7 @@ auto make_inclusive_scan_task( // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { - launch_loop(part, [&](){ - std::inclusive_scan(s_beg, s_end, d_beg, bop); - }); + std::inclusive_scan(s_beg, s_end, d_beg, bop); return; } @@ -89,16 +197,12 @@ auto make_inclusive_scan_task( size_t Q = N/W; size_t R = N%W; - - //auto orig_d_beg = d_beg; - //ExecutionPolicy policy; for(size_t w=0, curr_b=0, chunk_size; w>, void>* = nullptr -> -auto make_inclusive_scan_task( - B first, E last, D d_first, BOP bop, T init, P part = P() -) { +template +auto make_inclusive_scan_task(B first, E last, D d_first, BOP bop, T init) { + using namespace std::string_literals; + using B_t = std::decay_t>; using E_t = std::decay_t>; using D_t = std::decay_t>; @@ -174,19 +256,17 @@ auto make_inclusive_scan_task( // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { - launch_loop(part, [&](){ - std::inclusive_scan(s_beg, s_end, d_beg, bop, init); - }); + std::inclusive_scan(s_beg, s_end, d_beg, bop, init); return; } if(N < W) { W = N; } - + std::vector> buf(W); std::atomic counter(0); - + // set up the initial value for the first worker buf[0].data = std::move(init); @@ -198,7 +278,7 @@ auto make_inclusive_scan_task( chunk_size = std::min(Q + (w < R), N - curr_b); // block scan - launch_loop(W, w, rt, part, [=, &rt, &bop, &buf, &counter] () mutable { + auto task = [=, &rt, &bop, &buf, &counter] () mutable { auto result = d_beg; // local scan per worker @@ -206,12 +286,14 @@ auto make_inclusive_scan_task( *d_beg++ = local = (w == 0) ? 
bop(local, *s_beg++) : *s_beg++; for(size_t i=1; i>, void>* = nullptr -> +template auto make_transform_inclusive_scan_task( - B first, E last, D d_first, BOP bop, UOP uop, P part = P() + B first, E last, D d_first, BOP bop, UOP uop ) { + using namespace std::string_literals; + using B_t = std::decay_t>; using E_t = std::decay_t>; using D_t = std::decay_t>; @@ -255,28 +337,26 @@ auto make_transform_inclusive_scan_task( // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { - launch_loop(part, [&](){ - std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop); - }); + std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop); return; } - + if(N < W) { W = N; - } - + } + std::vector> buf(W); std::atomic counter(0); size_t Q = N/W; size_t R = N%W; - + for(size_t w=0, curr_b=0, chunk_size; w>, void>* = nullptr -> +template auto make_transform_inclusive_scan_task( - B first, E last, D d_first, BOP bop, UOP uop, T init, P part = P() + B first, E last, D d_first, BOP bop, UOP uop, T init ) { + using namespace std::string_literals; + using B_t = std::decay_t>; using E_t = std::decay_t>; using D_t = std::decay_t>; @@ -329,19 +411,16 @@ auto make_transform_inclusive_scan_task( // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { - launch_loop(part, [&](){ - std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop, init); - }); + std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop, init); return; } if(N < W) { W = N; } - std::vector> buf(W); std::atomic counter(0); - + // set up the initial value for the first worker buf[0].data = std::move(init); @@ -353,7 +432,7 @@ auto make_transform_inclusive_scan_task( chunk_size = std::min(Q + (w < R), N - curr_b); // block scan - launch_loop(W, w, rt, part, [=, &rt, &bop, &uop, &buf, &counter] () mutable { + auto task = [=, &rt, &bop, &uop, &buf, &counter] () mutable { auto result = d_beg; // local scan per worker @@ -361,20 +440,20 @@ auto make_transform_inclusive_scan_task( *d_beg++ = local = (w == 0) ? 
bop(local, uop(*s_beg++)) : uop(*s_beg++); for(size_t i=1; i +template auto make_exclusive_scan_task( - B first, E last, D d_first, T init, BOP bop, P part = P() + B first, E last, D d_first, T init, BOP bop ) { + + using namespace std::string_literals; using B_t = std::decay_t>; using E_t = std::decay_t>; @@ -409,9 +490,7 @@ auto make_exclusive_scan_task( // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { - launch_loop(part, [&](){ - std::exclusive_scan(s_beg, s_end, d_beg, init, bop); - }); + std::exclusive_scan(s_beg, s_end, d_beg, init, bop); return; } @@ -424,22 +503,21 @@ auto make_exclusive_scan_task( size_t Q = N/W; size_t R = N%W; - + // fetch the init value auto s_beg_temp = s_beg; for(size_t w=0, curr_b=0, chunk_size; w +template auto make_transform_exclusive_scan_task( - B first, E last, D d_first, T init, BOP bop, UOP uop, P part = P() + B first, E last, D d_first, T init, BOP bop, UOP uop ) { + + using namespace std::string_literals; using B_t = std::decay_t>; using E_t = std::decay_t>; @@ -497,37 +577,34 @@ auto make_transform_exclusive_scan_task( // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { - launch_loop(part, [&](){ - std::transform_exclusive_scan(s_beg, s_end, d_beg, init, bop, uop); - }); + std::transform_exclusive_scan(s_beg, s_end, d_beg, init, bop, uop); return; } if(N < W) { W = N; } - + std::vector> buf(W); std::atomic counter(0); - + size_t Q = N/W; size_t R = N%W; - // fetch the init value auto s_beg_temp = s_beg; for(size_t w=0, curr_b=0, chunk_size; w>, void>* -> -Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, P part) { - return emplace(make_inclusive_scan_task(first, last, d_first, bop, part)); +template +Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop) { + return emplace(make_inclusive_scan_task(first, last, d_first, bop)); } // Function: inclusive_scan -template >, void>* -> -Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, T init, P part) { - return emplace(make_inclusive_scan_task(first, last, d_first, bop, init, part)); +template +Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, T init) { + return emplace(make_inclusive_scan_task(first, last, d_first, bop, init)); } // ---------------------------------------------------------------------------- @@ -580,26 +653,22 @@ Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, T init, P // ---------------------------------------------------------------------------- // Function: transform_inclusive_scan -template >, void>* -> +template Task FlowBuilder::transform_inclusive_scan( - B first, E last, D d_first, BOP bop, UOP uop, P part + B first, E last, D d_first, BOP bop, UOP uop ) { return emplace(make_transform_inclusive_scan_task( - first, last, d_first, bop, uop, part + first, last, d_first, bop, uop )); } // Function: transform_inclusive_scan -template >, void>* -> +template Task FlowBuilder::transform_inclusive_scan( - B first, E last, D d_first, BOP bop, UOP uop, T init, P part + B first, E last, D d_first, BOP bop, UOP uop, T init ) { return emplace(make_transform_inclusive_scan_task( - first, last, d_first, bop, uop, init, part + first, last, d_first, bop, uop, init )); } @@ -608,11 +677,9 @@ Task FlowBuilder::transform_inclusive_scan( // ---------------------------------------------------------------------------- // Function: exclusive_scan -template -Task FlowBuilder::exclusive_scan(B first, E last, D d_first, T init, BOP bop, P part) { - return 
emplace(make_exclusive_scan_task( - first, last, d_first, init, bop, part - )); +template +Task FlowBuilder::exclusive_scan(B first, E last, D d_first, T init, BOP bop) { + return emplace(make_exclusive_scan_task(first, last, d_first, init, bop)); } // ---------------------------------------------------------------------------- @@ -620,14 +687,15 @@ Task FlowBuilder::exclusive_scan(B first, E last, D d_first, T init, BOP bop, P // ---------------------------------------------------------------------------- // Function: transform_exclusive_scan -template +template Task FlowBuilder::transform_exclusive_scan( - B first, E last, D d_first, T init, BOP bop, UOP uop, P part + B first, E last, D d_first, T init, BOP bop, UOP uop ) { return emplace(make_transform_exclusive_scan_task( - first, last, d_first, init, bop, uop, part + first, last, d_first, init, bop, uop )); } } // end of namespace tf ----------------------------------------------------- + diff --git a/taskflow/algorithm/sort.hpp b/taskflow/algorithm/sort.hpp index 4460f8f4a..d30577972 100644 --- a/taskflow/algorithm/sort.hpp +++ b/taskflow/algorithm/sort.hpp @@ -1,6 +1,6 @@ #pragma once -#include "../core/async.hpp" +#include "../taskflow.hpp" namespace tf::detail { @@ -39,7 +39,8 @@ inline T* align_cacheline(T* p) { #else std::size_t ip = reinterpret_cast(p); #endif - ip = (ip + cacheline_size - 1) & -cacheline_size; + //ip = (ip + cacheline_size - 1) & -cacheline_size; + ip = (ip + cacheline_size - 1) & ~(cacheline_size - 1); return reinterpret_cast(ip); } @@ -224,7 +225,7 @@ std::pair partition_right_branchless(Iter begin, Iter end, Compare c // Fill the offset blocks. if (left_split >= block_size) { - for (size_t i = 0; i < block_size;) { + for (unsigned char i = 0; i < block_size;) { offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; @@ -235,13 +236,13 @@ std::pair partition_right_branchless(Iter begin, Iter end, Compare c offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; } } else { - for (size_t i = 0; i < left_split;) { + for (unsigned char i = 0; i < left_split;) { offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; } } if (right_split >= block_size) { - for (size_t i = 0; i < block_size;) { + for (unsigned char i = 0; i < block_size;) { offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); @@ -252,7 +253,7 @@ std::pair partition_right_branchless(Iter begin, Iter end, Compare c offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); } } else { - for (size_t i = 0; i < right_split;) { + for (unsigned char i = 0; i < right_split;) { offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); } } @@ -387,7 +388,7 @@ RandItr partition_left(RandItr begin, RandItr end, Compare comp) { template void parallel_pdqsort( - tf::Runtime& rt, + Runtime& rt, Iter begin, Iter end, Compare comp, int bad_allowed, bool leftmost = true ) { @@ -512,13 +513,12 @@ void parallel_pdqsort( // Sort the left partition first using recursion and // do tail recursion elimination for the right-hand partition. 
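(Illustrative aside, not part of this patch) The control flow here is the classic
tail-recursion elimination for divide-and-conquer sorts: recurse asynchronously on
the left partition and iterate in place on the right one. A minimal standalone
sketch, where `my_partition` is a hypothetical stand-in for pdqsort's pivot step
and std::async stands in for rt.silent_async:

@code{.cpp}
#include <algorithm>
#include <future>

// hypothetical helper: place a pivot and return its final position
template <typename It, typename Cmp>
It my_partition(It first, It last, Cmp cmp) {
  It pivot = last - 1;
  It mid = std::partition(first, pivot, [&](const auto& v){ return cmp(v, *pivot); });
  std::iter_swap(mid, pivot);
  return mid;
}

template <typename It, typename Cmp>
void quicksort_tre(It first, It last, Cmp cmp) {
  while(last - first > 1) {
    It pivot = my_partition(first, last, cmp);
    // left half: recurse asynchronously (the patch uses rt.silent_async here)
    auto fu = std::async(std::launch::async, [=](){ quicksort_tre(first, pivot, cmp); });
    // right half: loop instead of recursing (tail recursion elimination)
    first = pivot + 1;
    fu.wait();  // a work-stealing scheduler would join lazily instead
  }
}
@endcode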
- rt.silent_async( - [&rt, begin, pivot_pos, comp, bad_allowed, leftmost] () mutable { - parallel_pdqsort( - rt, begin, pivot_pos, comp, bad_allowed, leftmost - ); - } - ); + // here we need to copy runtime so it stays alive during the sort recursion + rt.silent_async([=] () mutable { + parallel_pdqsort( + rt, begin, pivot_pos, comp, bad_allowed, leftmost + ); + }); begin = pivot_pos + 1; leftmost = false; } @@ -530,7 +530,7 @@ void parallel_pdqsort( // 3-way quick sort template -void parallel_3wqsort(tf::Runtime& rt, RandItr first, RandItr last, C compare) { +void parallel_3wqsort(Runtime& rt, RandItr first, RandItr last, C compare) { using namespace std::string_literals; @@ -573,26 +573,15 @@ void parallel_3wqsort(tf::Runtime& rt, RandItr first, RandItr last, C compare) { } if(l - first > 1 && is_swapped_l) { - //rt.emplace([&](tf::Runtime& rtl) mutable { - // parallel_3wqsort(rtl, first, l-1, compare); - //}); - rt.silent_async([&rt, first, l, &compare] () mutable { + rt.silent_async([=] () mutable { parallel_3wqsort(rt, first, l-1, compare); }); } if(last - r > 1 && is_swapped_r) { - //rt.emplace([&](tf::Runtime& rtr) mutable { - // parallel_3wqsort(rtr, r+1, last, compare); - //}); - //rt.silent_async([&rt, r, last, &compare] () mutable { - // parallel_3wqsort(rt, r+1, last, compare); - //}); first = r+1; goto sort_partition; } - - //rt.join(); } } // end of namespace tf::detail --------------------------------------------- @@ -601,7 +590,7 @@ namespace tf { // Function: make_sort_task template -TF_FORCE_INLINE auto make_sort_task(B b, E e, C cmp) { +auto make_sort_task(B b, E e, C cmp) { return [b, e, cmp] (Runtime& rt) mutable { @@ -625,18 +614,18 @@ TF_FORCE_INLINE auto make_sort_task(B b, E e, C cmp) { return; } - //parallel_3wqsort(rt, beg, end-1, cmp); + PreemptionGuard preemption_guard(rt); + + //detail::parallel_3wqsort(rt, beg, end-1, cmp); detail::parallel_pdqsort> && std::is_arithmetic_v::value_type> - >(rt, beg, end, cmp, log2(end - beg)); - - rt.corun_all(); + >(rt, beg, end, cmp, log2(size_t(end - beg))); }; } template -TF_FORCE_INLINE auto make_sort_task(B beg, E end) { +auto make_sort_task(B beg, E end) { using value_type = std::decay_t())>; return make_sort_task(beg, end, std::less{}); } diff --git a/taskflow/algorithm/transform.hpp b/taskflow/algorithm/transform.hpp index b155f658b..1e8ef8e2a 100644 --- a/taskflow/algorithm/transform.hpp +++ b/taskflow/algorithm/transform.hpp @@ -1,6 +1,6 @@ #pragma once -#include "launch.hpp" +#include "../taskflow.hpp" namespace tf { @@ -10,6 +10,8 @@ template < std::enable_if_t>, void>* = nullptr > auto make_transform_task(B first1, E last1, O d_first, C c, P part = P()) { + + using namespace std::string_literals; using B_t = std::decay_t>; using E_t = std::decay_t>; @@ -27,51 +29,49 @@ auto make_transform_task(B first1, E last1, O d_first, C c, P part = P()) { // only myself - no need to spawn another graph if(W <= 1 || N <= part.chunk_size()) { - launch_loop(part, [&](){ - std::transform(beg, end, d_beg, c); - }); + part([=]() mutable { std::transform(beg, end, d_beg, c); })(); return; } + PreemptionGuard preemption_guard(rt); + if(N < W) { W = N; } // static partitioner if constexpr(part.type() == PartitionerType::STATIC) { - size_t chunk_size; - for(size_t w=0, curr_b=0; w= N) ? 
task() : rt.silent_async(task); } - rt.corun_all(); } // dynamic partitioner else { - std::atomic next(0); - launch_loop(N, W, rt, next, part, [=, &next, &part] () mutable { - part.loop(N, W, next, - [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + auto next = std::make_shared>(0); + for(size_t w=0; w>, void>* = nullptr > auto make_transform_task(B1 first1, E1 last1, B2 first2, O d_first, C c, P part = P()) { + + using namespace std::string_literals; using B1_t = std::decay_t>; using E1_t = std::decay_t>; @@ -101,11 +103,11 @@ auto make_transform_task(B1 first1, E1 last1, B2 first2, O d_first, C c, P part // only myself - no need to spawn another graph if(W <= 1 || N <= part.chunk_size()) { - launch_loop(part, [&](){ - std::transform(beg1, end1, beg2, d_beg, c); - }); + part([=]() mutable { std::transform(beg1, end1, beg2, d_beg, c); })(); return; } + + PreemptionGuard preemption_guard(rt); if(N < W) { W = N; @@ -113,31 +115,28 @@ auto make_transform_task(B1 first1, E1 last1, B2 first2, O d_first, C c, P part // static partitioner if constexpr(part.type() == PartitionerType::STATIC) { - size_t chunk_size; - for(size_t w=0, curr_b=0; w= N) ? task() : rt.silent_async(task); } - rt.corun_all(); } // dynamic partitioner else { - std::atomic next(0); - launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { - part.loop(N, W, next, - [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + auto next = std::make_shared>(0); + for(size_t w=0; w_parent; parent == nullptr) { + _decrement_topology(); + } + // from runtime + else { + auto state = parent->_nstate; + if(parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { + if(state & NSTATE::PREEMPTED) { + _update_cache(worker, cache, parent); + } + } + } + recycle(node); +} + // ---------------------------------------------------------------------------- // Async // ---------------------------------------------------------------------------- @@ -13,24 +42,8 @@ namespace tf { // Function: async template auto Executor::async(P&& params, F&& f) { - _increment_topology(); - - using R = std::invoke_result_t>; - - std::packaged_task p(std::forward(f)); - auto fu{p.get_future()}; - - auto node = node_pool.animate( - std::forward
<P>
      (params), nullptr, nullptr, 0, - // handle - std::in_place_type_t{}, - [p=make_moc(std::move(p))]() mutable { p.object(); } - ); - - _schedule_async_task(node); - - return fu; + return _async(std::forward
<P>
      (params), std::forward(f), nullptr, nullptr); } // Function: async @@ -39,6 +52,53 @@ auto Executor::async(F&& f) { return async(DefaultTaskParams{}, std::forward(f)); } +// Function: _async +template +auto Executor::_async(P&& params, F&& f, Topology* tpg, Node* parent) { + + // async task with runtime: [] (tf::Runtime&) -> void {} + if constexpr (is_runtime_task_v) { + + std::promise p; + auto fu{p.get_future()}; + + _schedule_async_task(animate( + NSTATE::NONE, ESTATE::ANCHORED, std::forward
<P>
      (params), tpg, parent, 0, + std::in_place_type_t{}, + [p=MoC{std::move(p)}, f=std::forward(f)](Runtime& rt, bool reentered) mutable { + if(!reentered) { + f(rt); + } + else { + auto& eptr = rt._parent->_exception_ptr; + eptr ? p.object.set_exception(eptr) : p.object.set_value(); + } + } + )); + return fu; + } + // async task with closure: [] () -> auto { return ... } + else if constexpr (std::is_invocable_v){ + using R = std::invoke_result_t; + std::packaged_task p(std::forward(f)); + auto fu{p.get_future()}; + _schedule_async_task(animate( + NSTATE::NONE, ESTATE::NONE, std::forward
<P>
      (params), tpg, parent, 0, + std::in_place_type_t{}, + [p=make_moc(std::move(p))]() mutable { p.object(); } + )); + return fu; + } + else { + static_assert(dependent_false_v, + "invalid async target - must be one of the following types:\n\ + (1) [] (tf::Runtime&) -> void {}\n\ + (2) [] () -> auto { ... return ... }\n" + ); + } +} + + // ---------------------------------------------------------------------------- // Silent Async // ---------------------------------------------------------------------------- @@ -46,16 +106,8 @@ auto Executor::async(F&& f) { // Function: silent_async template void Executor::silent_async(P&& params, F&& f) { - _increment_topology(); - - auto node = node_pool.animate( - std::forward
<P>
      (params), nullptr, nullptr, 0, - // handle - std::in_place_type_t{}, std::forward(f) - ); - - _schedule_async_task(node); + _silent_async(std::forward
<P>
      (params), std::forward(f), nullptr, nullptr); } // Function: silent_async @@ -64,31 +116,24 @@ void Executor::silent_async(F&& f) { silent_async(DefaultTaskParams{}, std::forward(f)); } -// ---------------------------------------------------------------------------- -// Async Helper Methods -// ---------------------------------------------------------------------------- - -// Procedure: _schedule_async_task -inline void Executor::_schedule_async_task(Node* node) { - if(auto w = _this_worker(); w) { - _schedule(*w, node); - } - else{ - _schedule(node); - } -} - -// Procedure: _tear_down_async -inline void Executor::_tear_down_async(Node* node) { - // from runtime - if(node->_parent) { - node->_parent->_join_counter.fetch_sub(1, std::memory_order_release); +// Function: _silent_async +template +void Executor::_silent_async(P&& params, F&& f, Topology* tpg, Node* parent) { + // silent task + if constexpr (is_runtime_task_v || is_static_task_v) { + _schedule_async_task(animate( + NSTATE::NONE, ESTATE::NONE, std::forward
<P>
      (params), tpg, parent, 0, + std::in_place_type_t{}, std::forward(f) + )); } - // from executor + // invalid silent async target else { - _decrement_topology(); + static_assert(dependent_false_v, + "invalid silent_async target - must be one of the following types:\n\ + (1) [] (tf::Runtime&) -> void {}\n\ + (2) [] () -> void { ... }\n" + ); } - node_pool.recycle(node); } // ---------------------------------------------------------------------------- @@ -112,26 +157,10 @@ template (params), nullptr, nullptr, num_dependents, - std::in_place_type_t{}, std::forward(func) - )); - - if constexpr(sizeof...(Tasks) > 0) { - (_process_async_dependent(task._node, tasks, num_dependents), ...); - } - - if(num_dependents == 0) { - _schedule_async_task(task._node); - } - - return task; + std::array array = { std::forward(tasks)... }; + return silent_dependent_async( + std::forward
<P>
      (params), std::forward(func), array.begin(), array.end() + ); } // Function: silent_dependent_async @@ -152,18 +181,18 @@ tf::AsyncTask Executor::silent_dependent_async( _increment_topology(); - size_t num_dependents = std::distance(first, last); + size_t num_predecessors = std::distance(first, last); - AsyncTask task(node_pool.animate( - std::forward
<P>
      (params), nullptr, nullptr, num_dependents, + AsyncTask task(animate( + NSTATE::NONE, ESTATE::NONE, std::forward
<P>
      (params), nullptr, nullptr, num_predecessors, std::in_place_type_t{}, std::forward(func) )); - for(; first != last; first++){ - _process_async_dependent(task._node, *first, num_dependents); + for(; first != last; first++) { + _process_dependent_async(task._node, *first, num_predecessors); } - if(num_dependents == 0) { + if(num_predecessors == 0) { _schedule_async_task(task._node); } @@ -187,31 +216,10 @@ template && all_same_v...>, void>* > auto Executor::dependent_async(P&& params, F&& func, Tasks&&... tasks) { - - _increment_topology(); - - using R = std::invoke_result_t>; - - std::packaged_task p(std::forward(func)); - auto fu{p.get_future()}; - - size_t num_dependents = sizeof...(tasks); - - AsyncTask task(node_pool.animate( - std::forward
<P>
      (params), nullptr, nullptr, num_dependents, - std::in_place_type_t{}, - [p=make_moc(std::move(p))] () mutable { p.object(); } - )); - - if constexpr(sizeof...(Tasks) > 0) { - (_process_async_dependent(task._node, tasks, num_dependents), ...); - } - - if(num_dependents == 0) { - _schedule_async_task(task._node); - } - - return std::make_pair(std::move(task), std::move(fu)); + std::array array = { std::forward(tasks)... }; + return dependent_async( + std::forward
<P>
      (params), std::forward(func), array.begin(), array.end() + ); } // Function: dependent_async @@ -229,95 +237,133 @@ template >; - - std::packaged_task p(std::forward(func)); - auto fu{p.get_future()}; + // async with runtime: [] (tf::Runtime&) -> void {} + if constexpr (is_runtime_task_v) { + + std::promise p; + auto fu{p.get_future()}; + + AsyncTask task(animate( + NSTATE::NONE, ESTATE::ANCHORED, std::forward
<P>
      (params), nullptr, nullptr, num_predecessors, + std::in_place_type_t{}, + [p=MoC{std::move(p)}, f=std::forward(func)] (tf::Runtime& rt, bool reentered) mutable { + if(!reentered) { + f(rt); + } + else { + auto& eptr = rt._parent->_exception_ptr; + eptr ? p.object.set_exception(eptr) : p.object.set_value(); + } + } + )); - size_t num_dependents = std::distance(first, last); + for(; first != last; first++) { + _process_dependent_async(task._node, *first, num_predecessors); + } - AsyncTask task(node_pool.animate( - std::forward
<P>
      (params), nullptr, nullptr, num_dependents, - std::in_place_type_t{}, - [p=make_moc(std::move(p))] () mutable { p.object(); } - )); + if(num_predecessors == 0) { + _schedule_async_task(task._node); + } - for(; first != last; first++) { - _process_async_dependent(task._node, *first, num_dependents); + return std::make_pair(std::move(task), std::move(fu)); } + // async without runtime: [] () -> auto { return ... } + else if constexpr(std::is_invocable_v) { - if(num_dependents == 0) { - _schedule_async_task(task._node); - } + using R = std::invoke_result_t; + std::packaged_task p(std::forward(func)); + auto fu{p.get_future()}; - return std::make_pair(std::move(task), std::move(fu)); + AsyncTask task(animate( + NSTATE::NONE, ESTATE::NONE, std::forward
<P>
      (params), nullptr, nullptr, num_predecessors, + std::in_place_type_t{}, + [p=make_moc(std::move(p))] () mutable { p.object(); } + )); + + for(; first != last; first++) { + _process_dependent_async(task._node, *first, num_predecessors); + } + + if(num_predecessors == 0) { + _schedule_async_task(task._node); + } + + return std::make_pair(std::move(task), std::move(fu)); + } + else { + static_assert(dependent_false_v, "invalid async callable"); + } } // ---------------------------------------------------------------------------- // Dependent Async Helper Functions // ---------------------------------------------------------------------------- -// Procedure: _process_async_dependent -inline void Executor::_process_async_dependent( - Node* node, tf::AsyncTask& task, size_t& num_dependents +// Procedure: _process_dependent_async +inline void Executor::_process_dependent_async( + Node* node, tf::AsyncTask& task, size_t& num_predecessors ) { + // special case: the task is not associated with any dependent-async task + if(task.empty()) { + num_predecessors = node->_join_counter.fetch_sub(1, std::memory_order_acq_rel) - 1; + return; + } + auto& state = std::get_if(&(task._node->_handle))->state; - add_successor: + while (true) { - auto target = Node::AsyncState::UNFINISHED; - - // acquires the lock - if(state.compare_exchange_weak(target, Node::AsyncState::LOCKED, - std::memory_order_acq_rel, - std::memory_order_acquire)) { - task._node->_successors.push_back(node); - state.store(Node::AsyncState::UNFINISHED, std::memory_order_release); - } - // dep's state is FINISHED, which means dep finished its callable already - // thus decrement the node's join counter by 1 - else if (target == Node::AsyncState::FINISHED) { - num_dependents = node->_join_counter.fetch_sub(1, std::memory_order_acq_rel) - 1; - } - // another worker adding its async task to the same successors of this node - else { - goto add_successor; + auto target = ASTATE::UNFINISHED; + + // Try to acquire the lock + if (state.compare_exchange_strong(target, ASTATE::LOCKED, + std::memory_order_acq_rel, + std::memory_order_acquire)) { + task._node->_edges.push_back(node); + state.store(ASTATE::UNFINISHED, std::memory_order_release); + break; + } + + // If already finished, decrement the join counter + if (target == ASTATE::FINISHED) { + num_predecessors = node->_join_counter.fetch_sub(1, std::memory_order_acq_rel) - 1; + break; + } + + // If locked by another worker, retry } } - // Procedure: _tear_down_dependent_async -inline void Executor::_tear_down_dependent_async(Worker& worker, Node* node) { +inline void Executor::_tear_down_dependent_async(Worker& worker, Node* node, Node*& cache) { auto handle = std::get_if(&(node->_handle)); // this async task comes from Executor - auto target = Node::AsyncState::UNFINISHED; + auto target = ASTATE::UNFINISHED; - while(!handle->state.compare_exchange_weak(target, Node::AsyncState::FINISHED, + while(!handle->state.compare_exchange_weak(target, ASTATE::FINISHED, std::memory_order_acq_rel, std::memory_order_relaxed)) { - target = Node::AsyncState::UNFINISHED; + target = ASTATE::UNFINISHED; } - // spaw successors whenever their dependencies are resolved - worker._cache = nullptr; - for(size_t i=0; i_successors.size(); ++i) { - if(auto s = node->_successors[i]; + // spawn successors whenever their dependencies are resolved + for(size_t i=0; i_edges.size(); ++i) { + if(auto s = node->_edges[i]; s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1 ) { - if(worker._cache) { - _schedule(worker, 
worker._cache); - } - worker._cache = s; + _update_cache(worker, cache, s); } } // now the executor no longer needs to retain ownership if(handle->use_count.fetch_sub(1, std::memory_order_acq_rel) == 1) { - node_pool.recycle(node); + recycle(node); } _decrement_topology(); diff --git a/taskflow/core/async_task.hpp b/taskflow/core/async_task.hpp index 026e8cb1c..ea15e7d0b 100644 --- a/taskflow/core/async_task.hpp +++ b/taskflow/core/async_task.hpp @@ -14,12 +14,12 @@ namespace tf { // ---------------------------------------------------------------------------- /** -@brief class to create a dependent asynchronous task +@brief class to hold a dependent asynchronous task with shared ownership A tf::AsyncTask is a lightweight handle that retains @em shared ownership -of a dependent async task created by an executor. -This shared ownership ensures that the async task remains alive when -adding it to the dependency list of another async task, +of a dependent asynchronous (dependent-async) task created by an executor. +This shared ownership ensures that the dependent-async task remains alive when +adding it to the dependency list of another dependent-async task, thus avoiding the classical [ABA problem](https://en.wikipedia.org/wiki/ABA_problem). @code{.cpp} @@ -31,13 +31,16 @@ tf::AsyncTask A = executor.silent_dependent_async([](){}); tf::AsyncTask B = executor.silent_dependent_async([](){}, A); @endcode -Currently, tf::AsyncTask is implemented based on the logic of +tf::AsyncTask is implemented based on the logic of C++ smart pointer std::shared_ptr and is considered cheap to copy or move as long as only a handful of objects own it. When a worker completes an async task, it will remove the task from the executor, decrementing the number of shared owners by one. If that counter reaches zero, the task is destroyed. + +@note +To know more about dependent-async task, please refer to @ref DependentAsyncTasking. */ class AsyncTask { @@ -51,22 +54,22 @@ class AsyncTask { AsyncTask() = default; /** - @brief destroys the managed asynchronous task if this is the last owner + @brief destroys the managed dependent-async task if this is the last owner */ ~AsyncTask(); /** - @brief constructs an asynchronous task that shares ownership of @c rhs + @brief constructs a dependent-async task that shares ownership of @c rhs */ AsyncTask(const AsyncTask& rhs); /** - @brief move-constructs an asynchronous task from @c rhs + @brief move-constructs an dependent-async task from @c rhs */ AsyncTask(AsyncTask&& rhs); /** - @brief copy-assigns the asynchronous task from @c rhs + @brief copy-assigns the dependent-async task from @c rhs Releases the managed object of @c this and retains a new shared ownership of @c rhs. @@ -74,35 +77,82 @@ class AsyncTask { AsyncTask& operator = (const AsyncTask& rhs); /** - @brief move-assigns the asynchronous task from @c rhs + @brief move-assigns the dependent-async task from @c rhs Releases the managed object of @c this and takes over the ownership of @c rhs. */ AsyncTask& operator = (AsyncTask&& rhs); /** - @brief checks if the asynchronous task stores nothing + @brief checks if this dependent-async task is associated with any task + + An empty dependent-async task is not associated with any task created + from the executor. + + @code{.cpp} + tf::AsyncTask task; + assert(task.empty()); + @endcode */ bool empty() const; /** @brief release the managed object of @c this + + Releases the ownership of the managed task, if any. + After the call `*this` manages no task. 
+ + @code{.cpp} + tf::AsyncTask task = executor.silent_dependent_async([](){}); + assert(task.empty() == false); + task.reset(); + assert(task.empty() == true); + @endcode */ void reset(); /** - @brief obtains a hash value of this asynchronous task + @brief obtains the hashed value of this dependent-async task + + @code{.cpp} + tf::AsyncTask task = executor.silent_dependent_async([](){}); + std::cout << task.hash_value() << '\n'; + @endcode */ size_t hash_value() const; /** @brief returns the number of shared owners that are currently managing - this asynchronous task + this dependent-async task + + In a multithreaded environment, `use_count` atomically retrieves + (with `memory_order_relaxed` load) the number of tf::AsyncTask instances that manage + the current task. + + @code{.cpp} + tf::AsyncTask task; + assert(task.use_count() == 0); + @endcode */ size_t use_count() const; /** - @brief returns the boolean indicating whether the async task is done + @brief checks if this dependent-async task finishes + + In a multithreaded environment, `is_done` atomically retrieves + (with `memory_order_acquire` load) the underlying state bit that indicates + the completion of this dependent-async task. + If the dependent-async task is empty, returns `true`. + + @code{.cpp} + tf::AsyncTask task = executor.silent_dependent_async([](){}); + while(task.is_done() == false); + std::cout << "dependent-async task finishes\n"; + + task.reset(); + assert(task.is_done() == true); + @endcode + */ bool is_done() const; @@ -135,7 +185,7 @@ inline void AsyncTask::_decref() { if(_node && std::get_if(&(_node->_handle))->use_count.fetch_sub( 1, std::memory_order_acq_rel ) == 1) { - node_pool.recycle(_node); + recycle(_node); } } @@ -198,9 +248,10 @@ inline size_t AsyncTask::use_count() const { // Function: is_done inline bool AsyncTask::is_done() const { - return std::get_if(&(_node->_handle))->state.load( + return _node == nullptr ? 
true: + std::get_if(&(_node->_handle))->state.load( std::memory_order_acquire - ) == Node::AsyncState::FINISHED; + ) == ASTATE::FINISHED; } } // end of namespace tf ---------------------------------------------------- diff --git a/taskflow/core/atomic_notifier.hpp b/taskflow/core/atomic_notifier.hpp new file mode 100644 index 000000000..f9dd479ba --- /dev/null +++ b/taskflow/core/atomic_notifier.hpp @@ -0,0 +1,124 @@ +#if __cplusplus >= TF_CPP20 + +#pragma once + +#include +#include +#include +#include +#include "../utility/os.hpp" + +namespace tf { + +//----------------------------------------------------------------------------- + +class AtomicNotifier { + + friend class Executor; + + public: + + struct Waiter { + alignas (2*TF_CACHELINE_SIZE) uint32_t epoch; + }; + + AtomicNotifier(size_t N) noexcept : _state(0), _waiters(N) {} + ~AtomicNotifier() { assert((_state.load() & WAITER_MASK) == 0); } + + void notify_one() noexcept; + void notify_all() noexcept; + void notify_n(size_t n) noexcept; + void prepare_wait(Waiter*) noexcept; + void cancel_wait(Waiter*) noexcept; + void commit_wait(Waiter*) noexcept; + + size_t size() const noexcept; + size_t num_waiters() const noexcept; + + private: + + AtomicNotifier(const AtomicNotifier&) = delete; + AtomicNotifier(AtomicNotifier&&) = delete; + AtomicNotifier& operator=(const AtomicNotifier&) = delete; + AtomicNotifier& operator=(AtomicNotifier&&) = delete; + + // This requires 64-bit + static_assert(sizeof(int) == 4, "bad platform"); + static_assert(sizeof(uint32_t) == 4, "bad platform"); + static_assert(sizeof(uint64_t) == 8, "bad platform"); + static_assert(sizeof(std::atomic) == 8, "bad platform"); + + // _state stores the epoch in the most significant 32 bits and the + // waiter count in the least significant 32 bits. 
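+  // Illustrative example (not part of this patch): a state value of
+  // ((3ull << EPOCH_SHIFT) | 2) encodes epoch 3 with 2 waiters;
+  // (state >> EPOCH_SHIFT) == 3 recovers the epoch and
+  // (state & WAITER_MASK) == 2 recovers the waiter count.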
+ std::atomic _state; + std::vector _waiters; + + static constexpr uint64_t WAITER_INC {1}; + static constexpr uint64_t EPOCH_SHIFT {32}; + static constexpr uint64_t EPOCH_INC {uint64_t(1) << EPOCH_SHIFT}; + static constexpr uint64_t WAITER_MASK {EPOCH_INC - 1}; +}; + +inline size_t AtomicNotifier::size() const noexcept { + return _waiters.size(); +} + +inline size_t AtomicNotifier::num_waiters() const noexcept { + return _state.load(std::memory_order_relaxed) & WAITER_MASK; +} + +inline void AtomicNotifier::notify_one() noexcept { + std::atomic_thread_fence(std::memory_order_seq_cst); + for(uint64_t state = _state.load(std::memory_order_acquire); state & WAITER_MASK;) { + if(_state.compare_exchange_weak(state, state + EPOCH_INC, std::memory_order_acq_rel)) { + _state.notify_one(); + break; + } + } +} + +inline void AtomicNotifier::notify_all() noexcept { + std::atomic_thread_fence(std::memory_order_seq_cst); + for(uint64_t state = _state.load(std::memory_order_acquire); state & WAITER_MASK;) { + if(_state.compare_exchange_weak(state, state + EPOCH_INC, std::memory_order_acq_rel)) { + _state.notify_all(); + break; + } + } +} + +inline void AtomicNotifier::notify_n(size_t n) noexcept { + if(n >= _waiters.size()) { + notify_all(); + } + else { + for(size_t k=0; kepoch = (prev >> EPOCH_SHIFT); + std::atomic_thread_fence(std::memory_order_seq_cst); +} + +inline void AtomicNotifier::cancel_wait(Waiter*) noexcept { + _state.fetch_sub(WAITER_INC, std::memory_order_seq_cst); +} + +inline void AtomicNotifier::commit_wait(Waiter* waiter) noexcept { + uint64_t prev = _state.load(std::memory_order_acquire); + while((prev >> EPOCH_SHIFT) == waiter->epoch) { + _state.wait(prev, std::memory_order_acquire); + prev = _state.load(std::memory_order_acquire); + } + _state.fetch_sub(WAITER_INC, std::memory_order_seq_cst); +} + + + +} // namespace taskflow ------------------------------------------------------- + +#endif diff --git a/taskflow/core/declarations.hpp b/taskflow/core/declarations.hpp index 7763fab0b..84b8df2eb 100644 --- a/taskflow/core/declarations.hpp +++ b/taskflow/core/declarations.hpp @@ -5,7 +5,8 @@ namespace tf { // ---------------------------------------------------------------------------- // taskflow // ---------------------------------------------------------------------------- -class AsyncTopology; + +class Algorithm; class Node; class Graph; class FlowBuilder; @@ -44,19 +45,14 @@ class cudaFlowLinearOptimizer; class cudaFlowSequentialOptimizer; class cudaFlowRoundRobinOptimizer; -// ---------------------------------------------------------------------------- -// syclFlow -// ---------------------------------------------------------------------------- -class syclNode; -class syclGraph; -class syclTask; -class syclFlow; +template +class cudaGraphExecBase; // ---------------------------------------------------------------------------- // struct // ---------------------------------------------------------------------------- -struct TaskParams; -struct DefaultTaskParams; +class TaskParams; +class DefaultTaskParams; } // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/core/error.hpp b/taskflow/core/error.hpp index 6a68bea16..6ca8edaac 100644 --- a/taskflow/core/error.hpp +++ b/taskflow/core/error.hpp @@ -8,8 +8,50 @@ namespace tf { -// Procedure: throw_se -// Throws the system error under a given error code. 
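(Illustrative aside, not part of this patch) The AtomicNotifier above is designed
for a two-phase-commit waiting pattern: announce the intent to sleep, re-check the
wait condition, then either cancel or commit. A sketch, where `work_available()` is
a hypothetical stand-in for the caller's own re-check:

@code{.cpp}
AtomicNotifier::Waiter* waiter = /* this worker's preallocated slot */;
notifier.prepare_wait(waiter);      // bump the waiter count, snapshot the epoch
if(work_available()) {              // hypothetical re-check after announcing
  notifier.cancel_wait(waiter);     // a producer may have notified in between
}
else {
  notifier.commit_wait(waiter);     // block until the epoch advances
}
@endcode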
+// node-specific states +struct NSTATE { + + using underlying_type = int; + + constexpr static underlying_type NONE = 0x00000000; + constexpr static underlying_type CONDITIONED = 0x10000000; + constexpr static underlying_type PREEMPTED = 0x20000000; + constexpr static underlying_type RETAIN_SUBFLOW = 0x40000000; + constexpr static underlying_type JOINED_SUBFLOW = 0x80000000; + + // mask to isolate state bits - non-state bits store # weak dependents + constexpr static underlying_type MASK = 0xF0000000; +}; + +using nstate_t = NSTATE::underlying_type; + +// exception-specific states +struct ESTATE { + + using underlying_type = int; + + constexpr static underlying_type NONE = 0x00000000; + constexpr static underlying_type EXCEPTION = 0x10000000; + constexpr static underlying_type CANCELLED = 0x20000000; + constexpr static underlying_type ANCHORED = 0x40000000; +}; + +using estate_t = ESTATE::underlying_type; + +// async-specific states +struct ASTATE { + + using underlying_type = int; + + constexpr static underlying_type UNFINISHED = 0; + constexpr static underlying_type LOCKED = 1; + constexpr static underlying_type FINISHED = 2; +}; + +using astate_t = ASTATE::underlying_type; + +// Procedure: throw_re +// Throws runtime error under a given error code. template //void throw_se(const char* fname, const size_t line, Error::Code c, ArgsT&&... args) { void throw_re(const char* fname, const size_t line, ArgsT&&... args) { @@ -17,10 +59,30 @@ void throw_re(const char* fname, const size_t line, ArgsT&&... args) { oss << "[" << fname << ":" << line << "] "; //ostreamize(oss, std::forward(args)...); (oss << ... << args); +#ifdef TF_DISABLE_EXCEPTION_HANDLING + std::cerr << oss.str(); + std::terminate(); +#else throw std::runtime_error(oss.str()); +#endif } } // ------------------------------------------------------------------------ #define TF_THROW(...) tf::throw_re(__FILE__, __LINE__, __VA_ARGS__); +// ---------------------------------------------------------------------------- + +#ifdef TF_DISABLE_EXCEPTION_HANDLING + #define TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, code_block) \ + code_block; +#else + #define TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, code_block) \ + try { \ + code_block; \ + } catch(...) { \ + _process_exception(worker, node); \ + } +#endif + + diff --git a/taskflow/core/executor.hpp b/taskflow/core/executor.hpp index a9d9dc457..01c648b8e 100644 --- a/taskflow/core/executor.hpp +++ b/taskflow/core/executor.hpp @@ -3,6 +3,7 @@ #include "observer.hpp" #include "taskflow.hpp" #include "async_task.hpp" +#include "freelist.hpp" /** @file executor.hpp @@ -15,11 +16,12 @@ namespace tf { // Executor Definition // ---------------------------------------------------------------------------- -/** @class Executor +/** +@class Executor -@brief class to create an executor for running a taskflow graph +@brief class to create an executor -An executor manages a set of worker threads to run one or multiple taskflows +An tf::Executor manages a set of worker threads to run tasks using an efficient work-stealing scheduling algorithm. 
@code{.cpp} @@ -36,7 +38,7 @@ tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; }); A.precede(B, C); tf::Future fu = executor.run(taskflow); -fu.wait(); // block until the execution completes +fu.wait(); // block until the execution completes executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait(); executor.run_n(taskflow, 4); @@ -45,29 +47,50 @@ executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait(); executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; }); @endcode -All the @c run methods are @em thread-safe. You can submit multiple -taskflows at the same time to an executor from different threads. +All executor methods are @em thread-safe. +For example, you can submit multiple taskflows to an executor concurrently +from different threads, while other threads simultaneously create asynchronous tasks. + +@code{.cpp} +std::thread t1([&](){ executor.run(taskflow); }; +std::thread t2([&](){ executor.async([](){ std::cout << "async task from t2\n"; }); }); +executor.async([&](){ std::cout << "async task from the main thread\n"; }); +@endcode + +@note +To know more about tf::Executor, please refer to @ref ExecuteTaskflow. */ class Executor { friend class FlowBuilder; friend class Subflow; friend class Runtime; + friend class Algorithm; public: /** @brief constructs the executor with @c N worker threads - @param N the number of workers (default std::thread::hardware_concurrency) - + @param N number of workers (default std::thread::hardware_concurrency) + @param wix interface class instance to configure workers' behaviors + The constructor spawns @c N worker threads to run tasks in a work-stealing loop. The number of workers must be greater than zero or an exception will be thrown. By default, the number of worker threads is equal to the maximum hardware concurrency returned by std::thread::hardware_concurrency. + + Users can alter the worker behavior, such as changing thread affinity, + via deriving an instance from tf::WorkerInterface. + + @attention + An exception will be thrown if executor construction fails. */ - explicit Executor(size_t N = std::thread::hardware_concurrency()); + explicit Executor( + size_t N = std::thread::hardware_concurrency(), + std::shared_ptr wix = nullptr + ); /** @brief destructs the executor @@ -421,7 +444,7 @@ class Executor { Unlike the typical flow of calling `tf::Executor::run` series plus waiting on the result, this method must be called by an internal worker of this executor. The caller worker will participate in - the work-stealing loop of the scheduler, therby avoiding potential + the work-stealing loop of the scheduler, thereby avoiding potential deadlock caused by blocked waiting. @code{.cpp} @@ -506,6 +529,16 @@ class Executor { @endcode */ size_t num_workers() const noexcept; + + /** + @brief queries the number of workers that are currently not making any stealing attempts + */ + size_t num_waiters() const noexcept; + + /** + @brief queries the number of queues used in the work-stealing loop + */ + size_t num_queues() const noexcept; /** @brief queries the number of running topologies at the time of this call @@ -533,7 +566,7 @@ class Executor { size_t num_taskflows() const; /** - @brief queries the id of the caller thread in this executor + @brief queries the id of the caller thread within this executor Each worker has an unique id in the range of @c 0 to @c N-1 associated with its parent executor. 
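(Illustrative aside, not part of this patch) A short usage sketch of the identity
query documented above, relying only on the stated contract that worker threads
observe an id in the range [0, N) while non-worker threads observe -1:

@code{.cpp}
tf::Executor executor(4);

// the calling thread is not a worker of this executor
assert(executor.this_worker_id() == -1);

// tasks run on one of the executor's four workers
executor.silent_async([&executor](){
  int id = executor.this_worker_id();
  assert(0 <= id && id < 4);
});
executor.wait_for_all();
@endcode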
@@ -703,7 +736,7 @@ class Executor { /** @brief runs the given function asynchronously - when the given dependents finish + when the given predecessors finish @tparam F callable type @tparam Tasks task types convertible to tf::AsyncTask @@ -735,7 +768,7 @@ class Executor { /** @brief runs the given function asynchronously - when the given dependents finish + when the given predecessors finish @tparam F callable type @tparam Tasks task types convertible to tf::AsyncTask @@ -771,7 +804,7 @@ class Executor { /** @brief runs the given function asynchronously - when the given range of dependents finish + when the given range of predecessors finish @tparam F callable type @tparam I iterator type @@ -808,7 +841,7 @@ class Executor { /** @brief runs the given function asynchronously - when the given range of dependents finish + when the given range of predecessors finish @tparam F callable type @tparam I iterator type @@ -851,7 +884,7 @@ class Executor { /** @brief runs the given function asynchronously - when the given dependents finish + when the given predecessors finish @tparam F callable type @tparam Tasks task types convertible to tf::AsyncTask @@ -893,7 +926,7 @@ class Executor { /** @brief runs the given function asynchronously - when the given dependents finish + when the given predecessors finish @tparam P task parameters type @tparam F callable type @@ -939,7 +972,7 @@ class Executor { /** @brief runs the given function asynchronously - when the given range of dependents finish + when the given range of predecessors finish @tparam F callable type @tparam I iterator type @@ -984,7 +1017,7 @@ class Executor { /** @brief runs the given function asynchronously - when the given range of dependents finish + when the given range of predecessors finish @tparam P task parameters type @tparam F callable type @@ -1033,85 +1066,112 @@ class Executor { private: - const size_t _MAX_STEALS; - - std::mutex _wsq_mutex; std::mutex _taskflows_mutex; + + std::vector _workers; + DefaultNotifier _notifier; -#ifdef __cpp_lib_atomic_wait +#if __cplusplus >= TF_CPP20 std::atomic _num_topologies {0}; - std::atomic_flag _all_spawned = ATOMIC_FLAG_INIT; #else std::condition_variable _topology_cv; std::mutex _topology_mutex; size_t _num_topologies {0}; #endif - std::unordered_map _wids; - std::vector _threads; - std::vector _workers; std::list _taskflows; - Notifier _notifier; - - TaskQueue _wsq; - - std::atomic _done {0}; + Freelist _buffers; + std::shared_ptr _worker_interface; std::unordered_set> _observers; - Worker* _this_worker(); - - bool _wait_for_task(Worker&, Node*&); - bool _invoke_module_task_internal(Worker&, Node*); - + void _shutdown(); void _observer_prologue(Worker&, Node*); void _observer_epilogue(Worker&, Node*); void _spawn(size_t); void _exploit_task(Worker&, Node*&); - void _explore_task(Worker&, Node*&); + bool _explore_task(Worker&, Node*&); void _schedule(Worker&, Node*); void _schedule(Node*); - void _schedule(Worker&, const SmallVector&); - void _schedule(const SmallVector&); void _set_up_topology(Worker*, Topology*); - void _set_up_graph(Graph&, Node*, Topology*, int, SmallVector&); void _tear_down_topology(Worker&, Topology*); - void _tear_down_async(Node*); - void _tear_down_dependent_async(Worker&, Node*); - void _tear_down_invoke(Worker&, Node*); + void _tear_down_async(Worker&, Node*, Node*&); + void _tear_down_dependent_async(Worker&, Node*, Node*&); + void _tear_down_invoke(Worker&, Node*, Node*&); void _increment_topology(); void _decrement_topology(); void _invoke(Worker&, 
Node*); void _invoke_static_task(Worker&, Node*); - void _invoke_subflow_task(Worker&, Node*); - void _detach_subflow_task(Worker&, Node*, Graph&); void _invoke_condition_task(Worker&, Node*, SmallVector&); void _invoke_multi_condition_task(Worker&, Node*, SmallVector&); - void _invoke_module_task(Worker&, Node*); - void _invoke_async_task(Worker&, Node*); - void _invoke_dependent_async_task(Worker&, Node*); - void _process_async_dependent(Node*, tf::AsyncTask&, size_t&); + void _process_dependent_async(Node*, tf::AsyncTask&, size_t&); void _process_exception(Worker&, Node*); void _schedule_async_task(Node*); - void _corun_graph(Worker&, Node*, Graph&); + void _update_cache(Worker&, Node*&, Node*); + + bool _wait_for_task(Worker&, Node*&); + bool _invoke_subflow_task(Worker&, Node*); + bool _invoke_module_task(Worker&, Node*); + bool _invoke_module_task_impl(Worker&, Node*, Graph&); + bool _invoke_async_task(Worker&, Node*); + bool _invoke_dependent_async_task(Worker&, Node*); + bool _invoke_runtime_task(Worker&, Node*); + bool _invoke_runtime_task_impl(Worker&, Node*, std::function&); + bool _invoke_runtime_task_impl(Worker&, Node*, std::function&); + + template + I _set_up_graph(I, I, Topology*, Node*); template void _corun_until(Worker&, P&&); + + template + void _corun_graph(Worker&, Node*, I, I); + + template + void _schedule(Worker&, I, I); + + template + void _schedule(I, I); + + template + void _schedule_graph_with_parent(Worker&, I, I, Node*); + + template + auto _async(P&&, F&&, Topology*, Node*); + + template + void _silent_async(P&&, F&&, Topology*, Node*); + }; +#ifndef DOXYGEN_GENERATING_OUTPUT + // Constructor -inline Executor::Executor(size_t N) : - _MAX_STEALS {((N+1) << 1)}, - _threads {N}, - _workers {N}, - _notifier {N} { +inline Executor::Executor(size_t N, std::shared_ptr wix) : + _workers (N), + _notifier (N), + _buffers (N), + _worker_interface(std::move(wix)) { if(N == 0) { TF_THROW("executor must define at least one worker"); } - - _spawn(N); + + // If spawning N threads fails, shut down any created threads before + // rethrowing the exception. +#ifndef TF_DISABLE_EXCEPTION_HANDLING + try { +#endif + _spawn(N); +#ifndef TF_DISABLE_EXCEPTION_HANDLING + } + catch(...) { + _shutdown(); + std::rethrow_exception(std::current_exception()); + } +#endif // initialize the default observer if requested if(has_env(TF_ENABLE_PROFILER)) { @@ -1121,17 +1181,32 @@ inline Executor::Executor(size_t N) : // Destructor inline Executor::~Executor() { + _shutdown(); +} + +// Function: _shutdown +inline void Executor::_shutdown() { // wait for all topologies to complete wait_for_all(); // shut down the scheduler - _done = true; - - _notifier.notify(true); - - for(auto& t : _threads){ - t.join(); + for(size_t i=0; i<_workers.size(); ++i) { + #if __cplusplus >= TF_CPP20 + _workers[i]._done.test_and_set(std::memory_order_relaxed); + #else + _workers[i]._done.store(true, std::memory_order_relaxed); + #endif + } + + _notifier.notify_all(); + + // Only join the thread if it is joinable, as std::thread construction + // may fail and throw an exception. 
+ for(auto& w : _workers) { + if(w._thread.joinable()) { + w._thread.join(); + } } } @@ -1140,9 +1215,25 @@ inline size_t Executor::num_workers() const noexcept { return _workers.size(); } +// Function: num_waiters +inline size_t Executor::num_waiters() const noexcept { +#if __cplusplus >= TF_CPP20 + return _notifier.num_waiters(); +#else + // Unfortunately, nonblocking notifier does not have an easy way to return + // the number of workers that are not making stealing attempts. + return 0; +#endif +} + +// Function: num_queues +inline size_t Executor::num_queues() const noexcept { + return _workers.size() + _buffers.size(); +} + // Function: num_topologies inline size_t Executor::num_topologies() const { -#ifdef __cpp_lib_atomic_wait +#if __cplusplus >= TF_CPP20 return _num_topologies.load(std::memory_order_relaxed); #else return _num_topologies; @@ -1154,124 +1245,108 @@ inline size_t Executor::num_taskflows() const { return _taskflows.size(); } -// Function: _this_worker -inline Worker* Executor::_this_worker() { - auto itr = _wids.find(std::this_thread::get_id()); - return itr == _wids.end() ? nullptr : &_workers[itr->second]; -} - // Function: this_worker_id inline int Executor::this_worker_id() const { - auto i = _wids.find(std::this_thread::get_id()); - return i == _wids.end() ? -1 : static_cast(_workers[i->second]._id); + auto w = pt::this_worker; + return (w && w->_executor == this) ? static_cast(w->_id) : -1; } // Procedure: _spawn inline void Executor::_spawn(size_t N) { -#ifdef __cpp_lib_atomic_wait -#else - std::mutex mutex; - std::condition_variable cond; - size_t n=0; -#endif - for(size_t id=0; id( + std::hash()(std::this_thread::get_id())) + ); + + // before entering the work-stealing loop, call the scheduler prologue + if(_worker_interface) { + _worker_interface->scheduler_prologue(w); } -#endif Node* t = nullptr; - - while(1) { + std::exception_ptr ptr = nullptr; + + // must use 1 as condition instead of !done because + // the previous worker may stop while the following workers + // are still preparing for entering the scheduling loop +#ifndef TF_DISABLE_EXCEPTION_HANDLING + try { +#endif + + // worker loop + while(1) { - // execute the tasks. - _exploit_task(w, t); + // drain out the local queue + _exploit_task(w, t); - // wait for tasks - if(_wait_for_task(w, t) == false) { - break; + // steal and wait for tasks + if(_wait_for_task(w, t) == false) { + break; + } } + +#ifndef TF_DISABLE_EXCEPTION_HANDLING + } + catch(...) 
{ + ptr = std::current_exception(); + } +#endif + + // call the user-specified epilogue function + if(_worker_interface) { + _worker_interface->scheduler_epilogue(w, ptr); } }); - - // POSIX-like system can use the following to affine threads to cores - //cpu_set_t cpuset; - //CPU_ZERO(&cpuset); - //CPU_SET(id, &cpuset); - //pthread_setaffinity_np( - // _threads[id].native_handle(), sizeof(cpu_set_t), &cpuset - //); - -#ifdef __cpp_lib_atomic_wait - //_wids[_threads[id].get_id()] = id; - _wids.emplace(std::piecewise_construct, - std::forward_as_tuple(_threads[id].get_id()), std::forward_as_tuple(id) - ); -#endif - } - -#ifdef __cpp_lib_atomic_wait - _all_spawned.test_and_set(std::memory_order_release); - _all_spawned.notify_all(); -#else - std::unique_lock lock(mutex); - cond.wait(lock, [&](){ return n==N; }); -#endif + } } // Function: _corun_until template void Executor::_corun_until(Worker& w, P&& stop_predicate) { - - std::uniform_int_distribution rdvtm(0, _workers.size()-1); + const size_t MAX_STEALS = ((num_queues() + 1) << 1); + + std::uniform_int_distribution udist(0, num_queues()-1); + exploit: while(!stop_predicate()) { - - //exploit: - + + // here we don't do while-loop to drain out the local queue as it can + // potentially enter a very deep recursive corun, cuasing stack overflow if(auto t = w._wsq.pop(); t) { _invoke(w, t); } else { size_t num_steals = 0; + size_t vtm = w._vtm; explore: - t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + t = (vtm < _workers.size()) ? _workers[vtm]._wsq.steal() : + _buffers.steal(vtm - _workers.size()); if(t) { _invoke(w, t); + w._vtm = vtm; goto exploit; } else if(!stop_predicate()) { - if(num_steals++ > _MAX_STEALS) { + if(++num_steals > MAX_STEALS) { std::this_thread::yield(); } - w._vtm = rdvtm(w._rdgen); + vtm = udist(w._rdgen); goto explore; } else { @@ -1282,35 +1357,51 @@ void Executor::_corun_until(Worker& w, P&& stop_predicate) { } // Function: _explore_task -inline void Executor::_explore_task(Worker& w, Node*& t) { +inline bool Executor::_explore_task(Worker& w, Node*& t) { - //assert(_workers[w].wsq.empty()); //assert(!t); + + const size_t MAX_STEALS = ((num_queues() + 1) << 1); + std::uniform_int_distribution udist(0, num_queues()-1); size_t num_steals = 0; - size_t num_yields = 0; + size_t vtm = w._vtm; - std::uniform_int_distribution rdvtm(0, _workers.size()-1); - - // Here, we write do-while to make the worker steal at once - // from the assigned victim. - do { - t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + // Make the worker steal immediately from the assigned victim. + while(true) { + + // If the worker's victim thread is within the worker pool, steal from the worker's queue. + // Otherwise, steal from the buffer, adjusting the victim index based on the worker pool size. + t = (vtm < _workers.size()) + ? _workers[vtm]._wsq.steal() + : _buffers.steal(vtm - _workers.size()); if(t) { + w._vtm = vtm; break; } - if(num_steals++ > _MAX_STEALS) { + // Increment the steal count, and if it exceeds MAX_STEALS, yield the thread. + // If the number of *consecutive* empty steals reaches MAX_STEALS, exit the loop. 
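+      // Illustrative numbers (not part of this patch): if num_queues() were 30,
+      // MAX_STEALS would be (30 + 1) << 1 == 62; the worker starts yielding after
+      // 62 consecutive empty steals and abandons exploration once num_steals
+      // exceeds 100 + MAX_STEALS == 162.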
+ if (++num_steals > MAX_STEALS) { std::this_thread::yield(); - if(num_yields++ > 100) { + if(num_steals > 100 + MAX_STEALS) { break; } } - w._vtm = rdvtm(w._rdgen); - } while(!_done); + #if __cplusplus >= TF_CPP20 + if(w._done.test(std::memory_order_relaxed)) { + #else + if(w._done.load(std::memory_order_relaxed)) { + #endif + return false; + } + // Randomely generate a next victim. + vtm = udist(w._rdgen); //w._rdvtm(); + } + return true; } // Procedure: _exploit_task @@ -1322,47 +1413,64 @@ inline void Executor::_exploit_task(Worker& w, Node*& t) { } // Function: _wait_for_task -inline bool Executor::_wait_for_task(Worker& worker, Node*& t) { +inline bool Executor::_wait_for_task(Worker& w, Node*& t) { explore_task: - _explore_task(worker, t); + if(_explore_task(w, t) == false) { + return false; + } - // The last thief who successfully stole a task will wake up - // another thief worker to avoid starvation. + // Go exploit the task if we successfully steal one. if(t) { - _notifier.notify(false); return true; } - // ---- 2PC guard ---- - _notifier.prepare_wait(worker._waiter); - - if(!_wsq.empty()) { - _notifier.cancel_wait(worker._waiter); - worker._vtm = worker._id; - goto explore_task; + // Entering the 2PC guard as all queues should be empty after many stealing attempts. + _notifier.prepare_wait(w._waiter); + + // Condition #1: buffers should be empty + for(size_t vtm=0; vtm<_buffers.size(); ++vtm) { + if(!_buffers._buckets[vtm].queue.empty()) { + _notifier.cancel_wait(w._waiter); + w._vtm = vtm + _workers.size(); + goto explore_task; + } } - if(_done) { - _notifier.cancel_wait(worker._waiter); - _notifier.notify(true); - return false; + // Condition #2: worker queues should be empty + // Note: We need to use index-based looping to avoid data race with _spawan + // which initializes other worker data structure at the same time + for(size_t vtm=0; vtm= TF_CPP20 + if(w._done.test(std::memory_order_relaxed)) { +#else + if(w._done.load(std::memory_order_relaxed)) { +#endif + _notifier.cancel_wait(w._waiter); + return false; + } + + // Now I really need to relinquish myself to others. + _notifier.commit_wait(w._waiter); goto explore_task; } @@ -1405,140 +1513,129 @@ inline size_t Executor::num_observers() const noexcept { // Procedure: _schedule inline void Executor::_schedule(Worker& worker, Node* node) { - // We need to fetch p before the release such that the read - // operation is synchronized properly with other thread to - // void data race. - auto p = node->_priority; - - node->_state.fetch_or(Node::READY, std::memory_order_release); - - // caller is a worker to this pool - starting at v3.5 we do not use + // caller is a worker of this executor - starting at v3.5 we do not use // any complicated notification mechanism as the experimental result // has shown no significant advantage. if(worker._executor == this) { - worker._wsq.push(node, p); - _notifier.notify(false); + worker._wsq.push(node, [&](){ _buffers.push(node); }); + _notifier.notify_one(); return; } - - { - std::lock_guard lock(_wsq_mutex); - _wsq.push(node, p); - } - - _notifier.notify(false); + + // caller is not a worker of this executor - go through the centralized queue + _buffers.push(node); + _notifier.notify_one(); } // Procedure: _schedule inline void Executor::_schedule(Node* node) { - - // We need to fetch p before the release such that the read - // operation is synchronized properly with other thread to - // void data race. 
- auto p = node->_priority; - - node->_state.fetch_or(Node::READY, std::memory_order_release); - - { - std::lock_guard lock(_wsq_mutex); - _wsq.push(node, p); - } - - _notifier.notify(false); + _buffers.push(node); + _notifier.notify_one(); } // Procedure: _schedule -inline void Executor::_schedule(Worker& worker, const SmallVector& nodes) { - - // We need to cacth the node count to avoid accessing the nodes - // vector while the parent topology is removed! - const auto num_nodes = nodes.size(); +template +void Executor::_schedule(Worker& worker, I first, I last) { + size_t num_nodes = last - first; + if(num_nodes == 0) { return; } - - // caller is a worker to this pool - starting at v3.5 we do not use - // any complicated notification mechanism as the experimental result - // has shown no significant advantage. + + // NOTE: We cannot use first/last in the for-loop (e.g., for(; first != last; ++first)). + // This is because when a node v is inserted into the queue, v can run and finish + // immediately. If v is the last node in the graph, it will tear down the parent task vector + // which cause the last ++first to fail. This problem is specific to MSVC which has a stricter + // iterator implementation in std::vector than GCC/Clang. if(worker._executor == this) { - for(size_t i=0; i_priority; - nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release); - worker._wsq.push(nodes[i], p); - _notifier.notify(false); + for(size_t i=0; i lock(_wsq_mutex); - for(size_t k=0; k_priority; - nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release); - _wsq.push(nodes[k], p); - } + + // caller is not a worker of this executor - go through the centralized queue + for(size_t i=0; i& nodes) { - - // parent topology may be removed! - const auto num_nodes = nodes.size(); +template +inline void Executor::_schedule(I first, I last) { + + size_t num_nodes = last - first; if(num_nodes == 0) { return; } - // We need to fetch p before the release such that the read - // operation is synchronized properly with other thread to - // void data race. - { - std::lock_guard lock(_wsq_mutex); - for(size_t k=0; k_priority; - nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release); - _wsq.push(nodes[k], p); - } + // NOTE: We cannot use first/last in the for-loop (e.g., for(; first != last; ++first)). + // This is because when a node v is inserted into the queue, v can run and finish + // immediately. If v is the last node in the graph, it will tear down the parent task vector + // which cause the last ++first to fail. This problem is specific to MSVC which has a stricter + // iterator implementation in std::vector than GCC/Clang. 
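+  // Illustrative example (not part of this patch): the avoided iterator-based
+  // form would be
+  //
+  //   for(; first != last; ++first) { _buffers.push(*first); }
+  //
+  // where pushing the final node may let the graph finish and destroy the
+  // container that owns [first, last), making the trailing ++first invalid.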
+ for(size_t i=0; i +void Executor::_schedule_graph_with_parent(Worker& worker, I beg, I end, Node* parent) { + auto send = _set_up_graph(beg, end, parent->_topology, parent); + parent->_join_counter.fetch_add(send - beg, std::memory_order_relaxed); + _schedule(worker, beg, send); +} +TF_FORCE_INLINE void Executor::_update_cache(Worker& worker, Node*& cache, Node* node) { + if(cache) { + _schedule(worker, cache); + } + cache = node; +} + // Procedure: _invoke inline void Executor::_invoke(Worker& worker, Node* node) { - // synchronize all outstanding memory operations caused by reordering - while(!(node->_state.load(std::memory_order_acquire) & Node::READY)); + #define TF_INVOKE_CONTINUATION() \ + if (cache) { \ + node = cache; \ + goto begin_invoke; \ + } begin_invoke: + + Node* cache {nullptr}; - SmallVector conds; + // if this is the second invoke due to preemption, directly jump to invoke task + if(node->_nstate & NSTATE::PREEMPTED) { + goto invoke_task; + } - // no need to do other things if the topology is cancelled + // if the work has been cancelled, there is no need to continue if(node->_is_cancelled()) { - _tear_down_invoke(worker, node); + _tear_down_invoke(worker, node, cache); + TF_INVOKE_CONTINUATION(); return; } // if acquiring semaphore(s) exists, acquire them first if(node->_semaphores && !node->_semaphores->to_acquire.empty()) { - SmallVector nodes; - if(!node->_acquire_all(nodes)) { - _schedule(worker, nodes); + SmallVector waiters; + if(!node->_acquire_all(waiters)) { + _schedule(worker, waiters.begin(), waiters.end()); return; } - node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release); } - - // condition task - //int cond = -1; + + invoke_task: + + SmallVector conds; // switch is faster than nested if-else due to jump table switch(node->_handle.index()) { @@ -1547,10 +1644,20 @@ inline void Executor::_invoke(Worker& worker, Node* node) { _invoke_static_task(worker, node); } break; + + // runtime task + case Node::RUNTIME:{ + if(_invoke_runtime_task(worker, node)) { + return; + } + } + break; // subflow task case Node::SUBFLOW: { - _invoke_subflow_task(worker, node); + if(_invoke_subflow_task(worker, node)) { + return; + } } break; @@ -1568,26 +1675,30 @@ inline void Executor::_invoke(Worker& worker, Node* node) { // module task case Node::MODULE: { - _invoke_module_task(worker, node); + if(_invoke_module_task(worker, node)) { + return; + } } break; // async task case Node::ASYNC: { - _invoke_async_task(worker, node); - _tear_down_async(node); - return ; + if(_invoke_async_task(worker, node)) { + return; + } + _tear_down_async(worker, node, cache); + TF_INVOKE_CONTINUATION(); + return; } break; // dependent async task case Node::DEPENDENT_ASYNC: { - _invoke_dependent_async_task(worker, node); - _tear_down_dependent_async(worker, node); - if(worker._cache) { - node = worker._cache; - goto begin_invoke; + if(_invoke_dependent_async_task(worker, node)) { + return; } + _tear_down_dependent_async(worker, node, cache); + TF_INVOKE_CONTINUATION(); return; } break; @@ -1597,33 +1708,26 @@ inline void Executor::_invoke(Worker& worker, Node* node) { break; } - //invoke_successors: - // if releasing semaphores exist, release them if(node->_semaphores && !node->_semaphores->to_release.empty()) { - _schedule(worker, node->_release_all()); + SmallVector waiters; + node->_release_all(waiters); + _schedule(worker, waiters.begin(), waiters.end()); } - - // Reset the join counter to support the cyclic control flow. 
+ + // Reset the join counter with strong dependencies to support cycles. // + We must do this before scheduling the successors to avoid race - // condition on _dependents. + // condition on _predecessors. // + We must use fetch_add instead of direct assigning // because the user-space call on "invoke" may explicitly schedule // this task again (e.g., pipeline) which can access the join_counter. - if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) { - node->_join_counter.fetch_add(node->num_strong_dependents(), std::memory_order_relaxed); - } - else { - node->_join_counter.fetch_add(node->num_dependents(), std::memory_order_relaxed); - } + node->_join_counter.fetch_add( + node->num_predecessors() - (node->_nstate & ~NSTATE::MASK), std::memory_order_relaxed + ); // acquire the parent flow counter - auto& j = (node->_parent) ? node->_parent->_join_counter : - node->_topology->_join_counter; - - // Here, we want to cache the latest successor with the highest priority - worker._cache = nullptr; - auto max_p = static_cast(TaskPriority::MAX); + auto& join_counter = (node->_parent) ? node->_parent->_join_counter : + node->_topology->_join_counter; // Invoke the task based on the corresponding type switch(node->_handle.index()) { @@ -1632,21 +1736,12 @@ inline void Executor::_invoke(Worker& worker, Node* node) { case Node::CONDITION: case Node::MULTI_CONDITION: { for(auto cond : conds) { - if(cond >= 0 && static_cast(cond) < node->_successors.size()) { - auto s = node->_successors[cond]; + if(cond >= 0 && static_cast(cond) < node->_num_successors) { + auto s = node->_edges[cond]; // zeroing the join counter for invariant s->_join_counter.store(0, std::memory_order_relaxed); - j.fetch_add(1, std::memory_order_relaxed); - if(s->_priority <= max_p) { - if(worker._cache) { - _schedule(worker, worker._cache); - } - worker._cache = s; - max_p = s->_priority; - } - else { - _schedule(worker, s); - } + join_counter.fetch_add(1, std::memory_order_relaxed); + _update_cache(worker, cache, s); } } } @@ -1654,41 +1749,24 @@ inline void Executor::_invoke(Worker& worker, Node* node) { // non-condition task default: { - for(size_t i=0; i_successors.size(); ++i) { - //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) { - if(auto s = node->_successors[i]; - s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { - j.fetch_add(1, std::memory_order_relaxed); - if(s->_priority <= max_p) { - if(worker._cache) { - _schedule(worker, worker._cache); - } - worker._cache = s; - max_p = s->_priority; - } - else { - _schedule(worker, s); - } + for(size_t i=0; i_num_successors; ++i) { + if(auto s = node->_edges[i]; s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { + join_counter.fetch_add(1, std::memory_order_relaxed); + _update_cache(worker, cache, s); } } } break; } - - // tear_down the invoke - _tear_down_invoke(worker, node); - - // perform tail recursion elimination for the right-most child to reduce - // the number of expensive pop/push operations through the task queue - if(worker._cache) { - node = worker._cache; - //node->_state.fetch_or(Node::READY, std::memory_order_release); - goto begin_invoke; - } + + // clean up the node after execution + _tear_down_invoke(worker, node, cache); + TF_INVOKE_CONTINUATION(); } // Procedure: _tear_down_invoke -inline void Executor::_tear_down_invoke(Worker& worker, Node* node) { +inline void Executor::_tear_down_invoke(Worker& worker, Node* node, Node*& cache) { + // we must check parent first before subtracting the join 
counter, // or it can introduce data race if(auto parent = node->_parent; parent == nullptr) { @@ -1696,22 +1774,16 @@ inline void Executor::_tear_down_invoke(Worker& worker, Node* node) { _tear_down_topology(worker, node->_topology); } } - // Here we asssume the parent is in a busy loop (e.g., corun) waiting for - // its join counter to become 0. - else { - //parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel); - parent->_join_counter.fetch_sub(1, std::memory_order_release); + else { + // needs to fetch every data before join counter becomes zero at which + // the node may be deleted + auto state = parent->_nstate; + if(parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { + if(state & NSTATE::PREEMPTED) { + _update_cache(worker, cache, parent); + } + } } - //// module task - //else { - // auto id = parent->_handle.index(); - // if(parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { - // if(id == Node::MODULE) { - // return parent; - // } - // } - //} - //return nullptr; } // Procedure: _observer_prologue @@ -1731,103 +1803,90 @@ inline void Executor::_observer_epilogue(Worker& worker, Node* node) { // Procedure: _process_exception inline void Executor::_process_exception(Worker&, Node* node) { - constexpr static auto flag = Topology::EXCEPTION | Topology::CANCELLED; - - // if the node has a parent, we store the exception in its parent - if(auto parent = node->_parent; parent) { - if ((parent->_state.fetch_or(Node::EXCEPTION, std::memory_order_relaxed) & Node::EXCEPTION) == 0) { - parent->_exception_ptr = std::current_exception(); + constexpr static auto flag = ESTATE::EXCEPTION | ESTATE::CANCELLED; + + // find the anchor and mark the entire path with exception so recursive + // or nested tasks can be cancelled properly + // since exception can come from asynchronous task (with runtime), the node + // itself can be anchored + auto anchor = node; + while(anchor && (anchor->_estate.load(std::memory_order_relaxed) & ESTATE::ANCHORED) == 0) { + anchor->_estate.fetch_or(flag, std::memory_order_relaxed); + anchor = anchor->_parent; + } + + // the exception occurs under a blocking call (e.g., corun, join) + if(anchor) { + // multiple tasks may throw, and we only take the first thrown exception + if((anchor->_estate.fetch_or(flag, std::memory_order_relaxed) & ESTATE::EXCEPTION) == 0) { + anchor->_exception_ptr = std::current_exception(); + return; } - // TODO if the node has a topology, cancel it to enable early stop - //if(auto tpg = node->_topology; tpg) { - // tpg->_state.fetch_or(Topology::CANCELLED, std::memory_order_relaxed); - //} } - // multiple tasks may throw, so we only take the first thrown exception - else if(auto tpg = node->_topology; tpg && - ((tpg->_state.fetch_or(flag, std::memory_order_relaxed) & Topology::EXCEPTION) == 0) - ) { - tpg->_exception_ptr = std::current_exception(); + // otherwise, we simply store the exception in the topology and cancel it + else if(auto tpg = node->_topology; tpg) { + // multiple tasks may throw, and we only take the first thrown exception + if((tpg->_estate.fetch_or(flag, std::memory_order_relaxed) & ESTATE::EXCEPTION) == 0) { + tpg->_exception_ptr = std::current_exception(); + return; + } } - // TODO: skip the exception that is not associated with any taskflows + + // for now, we simply store the exception in this node; this can happen in an + // execution that does not have any external control to capture the exception, + // such as silent async task + node->_exception_ptr = std::current_exception(); 
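+ //
+ // Behavioral sketch (user-level view, assuming the corun API in this
+ // patch): an exception thrown inside a corun'ed graph is stored at the
+ // anchored blocker and rethrown from the blocking call.
+ //
+ // @code{.cpp}
+ // tf::Executor executor;
+ // tf::Taskflow a, b;
+ // b.emplace([](){ throw std::runtime_error("boom"); });
+ // a.emplace([&](){
+ //   try {
+ //     executor.corun(b);  // blocks; the anchor catches the exception
+ //   }
+ //   catch(const std::runtime_error&) {
+ //     // rethrown here by _rethrow_exception() at the anchor
+ //   }
+ // });
+ // executor.run(a).wait();
+ // @endcode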
} // Procedure: _invoke_static_task inline void Executor::_invoke_static_task(Worker& worker, Node* node) { _observer_prologue(worker, node); TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { - auto& work = std::get_if(&node->_handle)->work; - switch(work.index()) { - case 0: - std::get_if<0>(&work)->operator()(); - break; - - case 1: - Runtime rt(*this, worker, node); - std::get_if<1>(&work)->operator()(rt); - node->_process_exception(); - break; - } + std::get_if(&node->_handle)->work(); }); _observer_epilogue(worker, node); } // Procedure: _invoke_subflow_task -inline void Executor::_invoke_subflow_task(Worker& w, Node* node) { - _observer_prologue(w, node); - TF_EXECUTOR_EXCEPTION_HANDLER(w, node, { - auto handle = std::get_if(&node->_handle); - handle->subgraph._clear(); - Subflow sf(*this, w, node, handle->subgraph); - handle->work(sf); - if(sf._joinable) { - _corun_graph(w, node, handle->subgraph); - } - node->_process_exception(); - }); - _observer_epilogue(w, node); -} +inline bool Executor::_invoke_subflow_task(Worker& worker, Node* node) { + + auto& h = *std::get_if(&node->_handle); + auto& g = h.subgraph; -// Procedure: _detach_subflow_task -inline void Executor::_detach_subflow_task(Worker& w, Node* p, Graph& g) { + if((node->_nstate & NSTATE::PREEMPTED) == 0) { + + // set up the subflow + Subflow sf(*this, worker, node, g); - // graph is empty and has no async tasks - if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) { - return; - } + // invoke the subflow callable + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + h.work(sf); + }); + _observer_epilogue(worker, node); + + // spawn the subflow if it is joinable and its graph is non-empty + // implicit join is faster than Subflow::join as it does not involve corun + if(sf.joinable() && g.size()) { - SmallVector src; - _set_up_graph(g, nullptr, p->_topology, Node::DETACHED, src); + // signal the executor to preempt this node + node->_nstate |= NSTATE::PREEMPTED; - { - std::lock_guard lock(p->_topology->_taskflow._mutex); - p->_topology->_taskflow._graph._merge(std::move(g)); + // set up and schedule the graph + _schedule_graph_with_parent(worker, g.begin(), g.end(), node); + return true; + } } - - p->_topology->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); - _schedule(w, src); -} - -// Procedure: _corun_graph -inline void Executor::_corun_graph(Worker& w, Node* p, Graph& g) { - - // assert(p); - - // graph is empty and has no async tasks (subflow) - if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) { - return; + else { + node->_nstate &= ~NSTATE::PREEMPTED; } - SmallVector src; - - _set_up_graph(g, p, p->_topology, 0, src); - p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); - - _schedule(w, src); + // the subflow has finished or joined + if((node->_nstate & NSTATE::RETAIN_SUBFLOW) == 0) { + g.clear(); + } - _corun_until(w, [p] () -> bool { - return p->_join_counter.load(std::memory_order_acquire) == 0; } - ); + return false; } // Procedure: _invoke_condition_task @@ -1837,17 +1896,7 @@ inline void Executor::_invoke_condition_task( _observer_prologue(worker, node); TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { auto& work = std::get_if(&node->_handle)->work; - switch(work.index()) { - case 0: - conds = { std::get_if<0>(&work)->operator()() }; - break; - - case 1: - Runtime rt(*this, worker, node); - conds = { std::get_if<1>(&work)->operator()(rt) }; - node->_process_exception(); - break; - } + conds = { work() }; }); 
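+ //
+ // For reference, a user-level condition task returns the index of the
+ // successor to run next (a usage sketch; run_if_0 and run_if_1 are
+ // hypothetical tasks):
+ //
+ // @code{.cpp}
+ // tf::Task cond = taskflow.emplace([](){ return 1; });
+ // cond.precede(run_if_0, run_if_1);  // returning 1 selects run_if_1
+ // @endcode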
_observer_epilogue(worker, node); } @@ -1858,87 +1907,98 @@ inline void Executor::_invoke_multi_condition_task( ) { _observer_prologue(worker, node); TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { - auto& work = std::get_if(&node->_handle)->work; - switch(work.index()) { - case 0: - conds = std::get_if<0>(&work)->operator()(); - break; - - case 1: - Runtime rt(*this, worker, node); - conds = std::get_if<1>(&work)->operator()(rt); - node->_process_exception(); - break; - } + conds = std::get_if(&node->_handle)->work(); }); _observer_epilogue(worker, node); } // Procedure: _invoke_module_task -inline void Executor::_invoke_module_task(Worker& w, Node* node) { - _observer_prologue(w, node); - TF_EXECUTOR_EXCEPTION_HANDLER(w, node, { - _corun_graph(w, node, std::get_if(&node->_handle)->graph); - node->_process_exception(); - }); - _observer_epilogue(w, node); +inline bool Executor::_invoke_module_task(Worker& w, Node* node) { + return _invoke_module_task_impl(w, node, std::get_if(&node->_handle)->graph); +} + +// Procedure: _invoke_module_task_impl +inline bool Executor::_invoke_module_task_impl(Worker& w, Node* node, Graph& graph) { + + // No need to do anything for empty graph + if(graph.empty()) { + return false; + } + + // first entry - not spawned yet + if((node->_nstate & NSTATE::PREEMPTED) == 0) { + // signal the executor to preempt this node + node->_nstate |= NSTATE::PREEMPTED; + _schedule_graph_with_parent(w, graph.begin(), graph.end(), node); + return true; + } + + // second entry - already spawned + node->_nstate &= ~NSTATE::PREEMPTED; + + return false; } -//// Function: _invoke_module_task_internal -//inline bool Executor::_invoke_module_task_internal(Worker& w, Node* p) { -// -// // acquire the underlying graph -// auto& g = std::get_if(&p->_handle)->graph; -// -// // no need to do anything if the graph is empty -// if(g.empty()) { -// return false; -// } -// -// SmallVector src; -// _set_up_graph(g, p, p->_topology, 0, src); -// p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); -// -// _schedule(w, src); -// return true; -//} // Procedure: _invoke_async_task -inline void Executor::_invoke_async_task(Worker& worker, Node* node) { - _observer_prologue(worker, node); - TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { - auto& work = std::get_if(&node->_handle)->work; - switch(work.index()) { - case 0: +inline bool Executor::_invoke_async_task(Worker& worker, Node* node) { + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + // void() + case 0: + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { std::get_if<0>(&work)->operator()(); - break; + }); + _observer_epilogue(worker, node); + break; + + // void(Runtime&) + case 1: + if(_invoke_runtime_task_impl(worker, node, *std::get_if<1>(&work))) { + return true; + } + break; + + // void(Runtime&, bool) + case 2: + if(_invoke_runtime_task_impl(worker, node, *std::get_if<2>(&work))) { + return true; + } + break; + } - case 1: - Runtime rt(*this, worker, node); - std::get_if<1>(&work)->operator()(rt); - break; - } - }); - _observer_epilogue(worker, node); + return false; } // Procedure: _invoke_dependent_async_task -inline void Executor::_invoke_dependent_async_task(Worker& worker, Node* node) { - _observer_prologue(worker, node); - TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { - auto& work = std::get_if(&node->_handle)->work; - switch(work.index()) { - case 0: +inline bool Executor::_invoke_dependent_async_task(Worker& worker, Node* node) { + auto& work = 
std::get_if(&node->_handle)->work; + switch(work.index()) { + // void() + case 0: + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { std::get_if<0>(&work)->operator()(); - break; + }); + _observer_epilogue(worker, node); + break; + + // void(Runtime&) - silent async + case 1: + if(_invoke_runtime_task_impl(worker, node, *std::get_if<1>(&work))) { + return true; + } + break; - case 1: - Runtime rt(*this, worker, node); - std::get_if<1>(&work)->operator()(rt); - break; - } - }); - _observer_epilogue(worker, node); + // void(Runtime&, bool) - async + case 2: + if(_invoke_runtime_task_impl(worker, node, *std::get_if<2>(&work))) { + return true; + } + break; + } + return false; } // Function: run @@ -2007,16 +2067,16 @@ tf::Future Executor::run_until(Taskflow& f, P&& p, C&& c) { _increment_topology(); - // Need to check the empty under the lock since subflow task may - // define detached blocks that modify the taskflow at the same time - bool empty; - { - std::lock_guard lock(f._mutex); - empty = f.empty(); - } + //// Need to check the empty under the lock since subflow task may + //// define detached blocks that modify the taskflow at the same time + //bool empty; + //{ + // std::lock_guard lock(f._mutex); + // empty = f.empty(); + //} // No need to create a real topology but returns an dummy future - if(empty || p()) { + if(f.empty() || p()) { c(); std::promise promise; promise.set_value(); @@ -2035,7 +2095,7 @@ tf::Future Executor::run_until(Taskflow& f, P&& p, C&& c) { std::lock_guard lock(f._mutex); f._topologies.push(t); if(f._topologies.size() == 1) { - _set_up_topology(_this_worker(), t.get()); + _set_up_topology(pt::this_worker, t.get()); } } @@ -2060,36 +2120,53 @@ tf::Future Executor::run_until(Taskflow&& f, P&& pred, C&& c) { // Function: corun template void Executor::corun(T& target) { - - auto w = _this_worker(); - if(w == nullptr) { + static_assert(has_graph_v, "target must define a member function 'Graph& graph()'"); + + if(pt::this_worker == nullptr || pt::this_worker->_executor != this) { TF_THROW("corun must be called by a worker of the executor"); } - Node parent; // auxiliary parent - _corun_graph(*w, &parent, target.graph()); - parent._process_exception(); + Node anchor; + _corun_graph(*pt::this_worker, &anchor, target.graph().begin(), target.graph().end()); } // Function: corun_until template void Executor::corun_until(P&& predicate) { - auto w = _this_worker(); - - if(w == nullptr) { + if(pt::this_worker == nullptr || pt::this_worker->_executor != this) { TF_THROW("corun_until must be called by a worker of the executor"); } - _corun_until(*w, std::forward
<P>
      (predicate)); + _corun_until(*pt::this_worker, std::forward
<P>
      (predicate)); +} + +// Procedure: _corun_graph +template +void Executor::_corun_graph(Worker& w, Node* p, I first, I last) { + + // empty graph + if(first == last) { + return; + } + + // anchor this parent as the blocking point + { + AnchorGuard anchor(p); + _schedule_graph_with_parent(w, first, last, p); + _corun_until(w, [p] () -> bool { + return p->_join_counter.load(std::memory_order_acquire) == 0; } + ); + } - // TODO: exception? + // rethrow the exception to the blocker + p->_rethrow_exception(); } // Procedure: _increment_topology inline void Executor::_increment_topology() { -#ifdef __cpp_lib_atomic_wait +#if __cplusplus >= TF_CPP20 _num_topologies.fetch_add(1, std::memory_order_relaxed); #else std::lock_guard lock(_topology_mutex); @@ -2099,7 +2176,7 @@ inline void Executor::_increment_topology() { // Procedure: _decrement_topology inline void Executor::_decrement_topology() { -#ifdef __cpp_lib_atomic_wait +#if __cplusplus >= TF_CPP20 if(_num_topologies.fetch_sub(1, std::memory_order_acq_rel) == 1) { _num_topologies.notify_all(); } @@ -2113,7 +2190,7 @@ inline void Executor::_decrement_topology() { // Procedure: wait_for_all inline void Executor::wait_for_all() { -#ifdef __cpp_lib_atomic_wait +#if __cplusplus >= TF_CPP20 size_t n = _num_topologies.load(std::memory_order_acquire); while(n != 0) { _num_topologies.wait(n, std::memory_order_acquire); @@ -2126,37 +2203,39 @@ inline void Executor::wait_for_all() { } // Function: _set_up_topology -inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) { +inline void Executor::_set_up_topology(Worker* w, Topology* tpg) { // ---- under taskflow lock ---- + auto& g = tpg->_taskflow._graph; + + auto send = _set_up_graph(g.begin(), g.end(), tpg, nullptr); + tpg->_join_counter.store(send - g.begin(), std::memory_order_relaxed); - tpg->_sources.clear(); - tpg->_taskflow._graph._clear_detached(); - _set_up_graph(tpg->_taskflow._graph, nullptr, tpg, 0, tpg->_sources); - tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); - - if(worker) { - _schedule(*worker, tpg->_sources); - } - else { - _schedule(tpg->_sources); - } + w ? _schedule(*w, g.begin(), send) : _schedule(g.begin(), send); } // Function: _set_up_graph -inline void Executor::_set_up_graph( - Graph& g, Node* parent, Topology* tpg, int state, SmallVector& src -) { - for(auto node : g._nodes) { +template +I Executor::_set_up_graph(I first, I last, Topology* tpg, Node* parent) { + + auto send = first; + for(; first != last; ++first) { + + auto node = first->get(); node->_topology = tpg; node->_parent = parent; - node->_state.store(state, std::memory_order_relaxed); - if(node->num_dependents() == 0) { - src.push_back(node); - } + node->_nstate = NSTATE::NONE; + node->_estate.store(ESTATE::NONE, std::memory_order_relaxed); node->_set_up_join_counter(); node->_exception_ptr = nullptr; + + // move source to the first partition + // root, root, root, v1, v2, v3, v4, ... 
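+ // e.g., given nodes {v1(root), v2, v3(root), v4}, the iter_swap below
+ // yields {v1, v3, v2, v4} with send pointing past v3, so [first, send)
+ // holds exactly the dependency-free source nodes scheduled first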
+ if(node->num_predecessors() == 0) { + std::iter_swap(send++, first); + } } + return send; } // Function: _tear_down_topology @@ -2170,13 +2249,12 @@ inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) { if(!tpg->_exception_ptr && !tpg->cancelled() && !tpg->_pred()) { //assert(tpg->_join_counter == 0); std::lock_guard lock(f._mutex); - tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); - _schedule(worker, tpg->_sources); + _set_up_topology(&worker, tpg); } // case 2: the final run of this topology else { - // TODO: if the topology is cancelled, need to release all semaphores + // invoke the callback after each run if(tpg->_call != nullptr) { tpg->_call(); } @@ -2190,7 +2268,7 @@ inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) { f._topologies.pop(); tpg = f._topologies.front().get(); - // decrement the topology but since this is not the last we don't notify + // decrement the topology _decrement_topology(); // set up topology needs to be under the lock or it can @@ -2229,161 +2307,18 @@ inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) { inline void Subflow::join() { - // assert(this_worker().worker == &_worker); - - if(!_joinable) { - TF_THROW("subflow not joinable"); - } - - // only the parent worker can join the subflow - _executor._corun_graph(_worker, _parent, _graph); - - // if any exception is caught from subflow tasks, rethrow it - _parent->_process_exception(); - - _joinable = false; -} - -inline void Subflow::detach() { - - // assert(this_worker().worker == &_worker); - - if(!_joinable) { - TF_THROW("subflow already joined or detached"); + if(!joinable()) { + TF_THROW("subflow already joined"); } - - // only the parent worker can detach the subflow - _executor._detach_subflow_task(_worker, _parent, _graph); - _joinable = false; -} - -// ############################################################################ -// Forward Declaration: Runtime -// ############################################################################ - -// Procedure: schedule -inline void Runtime::schedule(Task task) { + + _executor._corun_graph(_worker, _parent, _graph.begin(), _graph.end()); - auto node = task._node; - // need to keep the invariant: when scheduling a task, the task must have - // zero dependency (join counter is 0) - // or we can encounter bug when inserting a nested flow (e.g., module task) - node->_join_counter.store(0, std::memory_order_relaxed); - - auto& j = node->_parent ? node->_parent->_join_counter : - node->_topology->_join_counter; - j.fetch_add(1, std::memory_order_relaxed); - _executor._schedule(_worker, node); -} - -// Procedure: corun -template -void Runtime::corun(T&& target) { - _executor._corun_graph(_worker, _parent, target.graph()); - _parent->_process_exception(); -} - -// Procedure: corun_until -template -void Runtime::corun_until(P&& predicate) { - _executor._corun_until(_worker, std::forward
<P>
      (predicate)); - // TODO: exception? -} - -// Function: corun_all -inline void Runtime::corun_all() { - _executor._corun_until(_worker, [this] () -> bool { - return _parent->_join_counter.load(std::memory_order_acquire) == 0; - }); - _parent->_process_exception(); -} - -// Destructor -inline Runtime::~Runtime() { - _executor._corun_until(_worker, [this] () -> bool { - return _parent->_join_counter.load(std::memory_order_acquire) == 0; - }); -} - -// ------------------------------------ -// Runtime::silent_async series -// ------------------------------------ - -// Function: _silent_async -template -void Runtime::_silent_async(Worker& w, P&& params, F&& f) { - - _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); - - auto node = node_pool.animate( - std::forward
<P>
      (params), _parent->_topology, _parent, 0, - std::in_place_type_t{}, std::forward(f) - ); - - _executor._schedule(w, node); + // join here since corun graph may throw exception + _parent->_nstate |= NSTATE::JOINED_SUBFLOW; } -// Function: silent_async -template -void Runtime::silent_async(F&& f) { - _silent_async(*_executor._this_worker(), DefaultTaskParams{}, std::forward(f)); -} - -// Function: silent_async -template -void Runtime::silent_async(P&& params, F&& f) { - _silent_async(*_executor._this_worker(), std::forward
<P>
      (params), std::forward(f)); -} - -// Function: silent_async_unchecked -template -void Runtime::silent_async_unchecked(F&& f) { - _silent_async(_worker, DefaultTaskParams{}, std::forward(f)); -} - -// Function: silent_async_unchecked -template -void Runtime::silent_async_unchecked(P&& params, F&& f) { - _silent_async(_worker, std::forward
<P>
      (params), std::forward(f)); -} - -// ------------------------------------ -// Runtime::async series -// ------------------------------------ - -// Function: _async -template -auto Runtime::_async(Worker& w, P&& params, F&& f) { - - _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); - - using R = std::invoke_result_t>; - - std::packaged_task p(std::forward(f)); - auto fu{p.get_future()}; - - auto node = node_pool.animate( - std::forward
<P>
      (params), _parent->_topology, _parent, 0, - std::in_place_type_t{}, - [p=make_moc(std::move(p))] () mutable { p.object(); } - ); - - _executor._schedule(w, node); - - return fu; -} - -// Function: async -template -auto Runtime::async(F&& f) { - return _async(*_executor._this_worker(), DefaultTaskParams{}, std::forward(f)); -} +#endif -// Function: async -template -auto Runtime::async(P&& params, F&& f) { - return _async(*_executor._this_worker(), std::forward
<P>
      (params), std::forward(f)); -} diff --git a/taskflow/core/flow_builder.hpp b/taskflow/core/flow_builder.hpp index df1d02fc5..cd8b281ff 100644 --- a/taskflow/core/flow_builder.hpp +++ b/taskflow/core/flow_builder.hpp @@ -44,13 +44,37 @@ class FlowBuilder { @code{.cpp} tf::Task static_task = taskflow.emplace([](){}); @endcode - + + @note Please refer to @ref StaticTasking for details. */ template , void>* = nullptr > Task emplace(C&& callable); + + /** + @brief creates a runtime task + + @tparam C callable type constructible from std::function + + @param callable callable to construct a runtime task + + @return a tf::Task handle + + The following example creates a runtime task. + + @code{.cpp} + tf::Task static_task = taskflow.emplace([](tf::Runtime&){}); + @endcode + + @note + Please refer to @ref RuntimeTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); /** @brief creates a dynamic task @@ -71,6 +95,7 @@ class FlowBuilder { }); @endcode + @note Please refer to @ref SubflowTasking for details. */ template @@ -343,17 +371,18 @@ class FlowBuilder { } @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper The callable needs to take a single argument of the dereferenced iterator type. + @note Please refer to @ref ParallelIterations for details. */ template Task for_each(B first, E last, C callable, P part = P()); /** - @brief constructs an STL-styled index-based parallel-for task + @brief constructs an index-based parallel-for task @tparam B beginning index type (must be integral) @tparam E ending index type (must be integral) @@ -385,15 +414,53 @@ class FlowBuilder { } @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper The callable needs to take a single argument of the integral index type. + @note Please refer to @ref ParallelIterations for details. */ template - Task for_each_index( - B first, E last, S step, C callable, P part = P() - ); + Task for_each_index(B first, E last, S step, C callable, P part = P()); + + /** + @brief constructs an index range-based parallel-for task + + @tparam R index range type (tf::IndexRange) + @tparam C callable type + @tparam P partitioner type (default tf::DefaultPartitioner) + + @param range index range + @param callable callable object to apply to each valid index + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks that applies the callable object to + in the range [first, last) with the step size. + + @code{.cpp} + // [0, 17) with a step size of 2 using tf::IndexRange + tf::IndexRange range(0, 17, 2); + + // parallelize the sequence [0, 2, 4, 6, 8, 10, 12, 14, 16] + taskflow.for_each_by_index(range, [](tf::IndexRange range) { + // iterate each index in the subrange + for(int i=range.begin(); i + Task for_each_by_index(R range, C callable, P part = P()); // ------------------------------------------------------------------------ // transform @@ -426,10 +493,11 @@ class FlowBuilder { } @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper The callable needs to take a single argument of the dereferenced iterator type. + @note Please refer to @ref ParallelTransforms for details. 
*/ template < @@ -467,10 +535,11 @@ class FlowBuilder { } @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper The callable needs to take two arguments of dereferenced elements from the two input ranges. + @note Please refer to @ref ParallelTransforms for details. */ template < @@ -484,7 +553,7 @@ class FlowBuilder { // ------------------------------------------------------------------------ /** - @brief constructs an STL-styled parallel-reduce task + @brief constructs an STL-styled parallel-reduction task @tparam B beginning iterator type @tparam E ending iterator type @@ -511,12 +580,70 @@ class FlowBuilder { } @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper + @note Please refer to @ref ParallelReduction for details. */ template Task reduce(B first, E last, T& init, O bop, P part = P()); + + /** + @brief constructs an index range-based parallel-reduction task + + @tparam R index range type (tf::IndexRange) + @tparam T result type + @tparam L local reducer type + @tparam G global reducer type + @tparam P partitioner type (default tf::DefaultPartitioner) + + @param range index range + @param init initial value of the reduction and the storage for the reduced result + @param lop binary operator that will be applied locally per worker + @param gop binary operator that will be applied globally among worker + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks to perform parallel reduction over a range with @c init. + The reduced result is store in @c init. + Unlike the iterator-based reduction, + index range-based reduction is particularly useful for applications that benefit from SIMD optimizations + or other range-based processing strategies. + + @code{.cpp} + const size_t N = 1000000; + std::vector data(N); // uninitialized data vector + int res = 1; // res will participate in the reduction + + taskflow.reduce_by_index( + tf::IndexRange(0, N, 1), + // final result + res, + // local reducer + [&](tf::IndexRange subrange, std::optional running_total) -> int { + int residual = running_total ? *running_total : 0.0; + for(size_t i=subrange.begin(); i() + ); + executor.run(taskflow).wait(); + assert(res = N + 1); + @endcode + + Range can be made stateful by using std::reference_wrapper. + + @note + Please refer to @ref ParallelReduction for details. + */ + template + Task reduce_by_index(R range, T& init, L lop, G gop, P part = P()); // ------------------------------------------------------------------------ // transform and reduction @@ -552,8 +679,9 @@ class FlowBuilder { } @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper + @note Please refer to @ref ParallelReduction for details. */ template < @@ -593,8 +721,9 @@ class FlowBuilder { } @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper + @note Please refer to @ref ParallelReduction for details. 
*/ @@ -610,28 +739,26 @@ class FlowBuilder { // ------------------------------------------------------------------------ // scan // ------------------------------------------------------------------------ - - /** + + /** @brief creates an STL-styled parallel inclusive-scan task @tparam B beginning iterator type @tparam E ending iterator type @tparam D destination iterator type @tparam BOP summation operator type - @tparam P partitioner type (default tf::DefaultPartitioner) @param first start of input range @param last end of input range @param d_first start of output range (may be the same as input range) @param bop function to perform summation - @param part partitioning algorithm to schedule parallel iterations Performs the cumulative sum (aka prefix sum, aka scan) of the input range - and writes the result to the output range. + and writes the result to the output range. Each element of the output range contains the running total of all earlier elements using the given binary operator for summation. - + This function generates an @em inclusive scan, meaning that the N-th element of the output range is the sum of the first N input elements, so the N-th input element is included. @@ -642,18 +769,17 @@ class FlowBuilder { input.begin(), input.end(), input.begin(), std::plus{} ); executor.run(taskflow).wait(); - + // input is {1, 3, 6, 10, 15} @endcode - - Iterators are templated to enable stateful range using std::reference_wrapper. - + + Iterators can be made stateful by using std::reference_wrapper + + @note Please refer to @ref ParallelScan for details. */ - template >, void>* = nullptr - > - Task inclusive_scan(B first, E last, D d_first, BOP bop, P part = P()); + template + Task inclusive_scan(B first, E last, D d_first, BOP bop); /** @brief creates an STL-styled parallel inclusive-scan task with an initial value @@ -663,14 +789,12 @@ class FlowBuilder { @tparam D destination iterator type @tparam BOP summation operator type @tparam T initial value type - @tparam P partitioner type (default tf::DefaultPartitioner) @param first start of input range @param last end of input range @param d_first start of output range (may be the same as input range) @param bop function to perform summation @param init initial value - @param part partitioning algorithm to schedule parallel iterations Performs the cumulative sum (aka prefix sum, aka scan) of the input range and writes the result to the output range. @@ -692,15 +816,14 @@ class FlowBuilder { // input is {0, 2, 5, 9, 14} @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper + @note Please refer to @ref ParallelScan for details. 
*/ - template >, void>* = nullptr - > - Task inclusive_scan(B first, E last, D d_first, BOP bop, T init, P part = P()); + template + Task inclusive_scan(B first, E last, D d_first, BOP bop, T init); /** @brief creates an STL-styled parallel exclusive-scan task @@ -710,14 +833,12 @@ class FlowBuilder { @tparam D destination iterator type @tparam T initial value type @tparam BOP summation operator type - @tparam P partitioner type (default tf::DefaultPartitioner) @param first start of input range @param last end of input range @param d_first start of output range (may be the same as input range) @param init initial value @param bop function to perform summation - @param part partitioning algorithm to schedule parallel iterations Performs the cumulative sum (aka prefix sum, aka scan) of the input range and writes the result to the output range. @@ -739,12 +860,13 @@ class FlowBuilder { // input is {-1, 0, 2, 5, 9} @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper + @note Please refer to @ref ParallelScan for details. */ - template - Task exclusive_scan(B first, E last, D d_first, T init, BOP bop, P part = P()); + template + Task exclusive_scan(B first, E last, D d_first, T init, BOP bop); // ------------------------------------------------------------------------ // transform scan @@ -758,14 +880,12 @@ class FlowBuilder { @tparam D destination iterator type @tparam BOP summation operator type @tparam UOP transform operator type - @tparam P partitioner type (default tf::DefaultPartitioner) @param first start of input range @param last end of input range @param d_first start of output range (may be the same as input range) @param bop function to perform summation @param uop function to transform elements of the input range - @param part partitioning algorithm to schedule parallel iterations Write the cumulative sum (aka prefix sum, aka scan) of the input range to the output range. Each element of the output range contains the @@ -788,14 +908,13 @@ class FlowBuilder { // input is {-1, -3, -6, -10, -15} @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper + @note Please refer to @ref ParallelScan for details. */ - template >, void>* = nullptr - > - Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, P part = P()); + template + Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop); /** @brief creates an STL-styled parallel transform-inclusive scan task @@ -806,7 +925,6 @@ class FlowBuilder { @tparam BOP summation operator type @tparam UOP transform operator type @tparam T initial value type - @tparam P partitioner type (default tf::DefaultPartitioner) @param first start of input range @param last end of input range @@ -814,7 +932,6 @@ class FlowBuilder { @param bop function to perform summation @param uop function to transform elements of the input range @param init initial value - @param part partitioning algorithm to schedule parallel iterations Write the cumulative sum (aka prefix sum, aka scan) of the input range to the output range. Each element of the output range contains the @@ -838,14 +955,13 @@ class FlowBuilder { // input is {-2, -4, -7, -11, -16} @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. 
+ Iterators can be made stateful by using std::reference_wrapper + @note Please refer to @ref ParallelScan for details. */ - template >, void>* = nullptr - > - Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init, P part = P()); + template + Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init); /** @brief creates an STL-styled parallel transform-exclusive scan task @@ -856,7 +972,6 @@ class FlowBuilder { @tparam BOP summation operator type @tparam UOP transform operator type @tparam T initial value type - @tparam P partitioner type (default tf::DefaultPartitioner) @param first start of input range @param last end of input range @@ -864,7 +979,6 @@ class FlowBuilder { @param bop function to perform summation @param uop function to transform elements of the input range @param init initial value - @param part partitioning algorithm to schedule parallel iterations Write the cumulative sum (aka prefix sum, aka scan) of the input range to the output range. Each element of the output range contains the @@ -887,12 +1001,13 @@ class FlowBuilder { // input is {-1, -2, -4, -7, -11} @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper + @note Please refer to @ref ParallelScan for details. */ - template - Task transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop, P part = P()); + template + Task transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop); // ------------------------------------------------------------------------ // find @@ -941,7 +1056,7 @@ class FlowBuilder { assert(*result == 22); @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper */ template Task find_if(B first, E last, T &result, UOP predicate, P part = P()); @@ -989,7 +1104,7 @@ class FlowBuilder { assert(*result == 22); @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper */ template Task find_if_not(B first, E last, T &result, UOP predicate, P part = P()); @@ -1041,7 +1156,7 @@ class FlowBuilder { assert(*result == -1); @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper */ template Task min_element(B first, E last, T& result, C comp, P part); @@ -1093,7 +1208,7 @@ class FlowBuilder { assert(*result == 2); @endcode - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper */ template Task max_element(B first, E last, T& result, C comp, P part); @@ -1116,8 +1231,9 @@ class FlowBuilder { The task spawns asynchronous tasks to sort elements in the range [first, last) in parallel. - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper + @note Please refer to @ref ParallelSort for details. */ template @@ -1137,8 +1253,9 @@ class FlowBuilder { [first, last) using the @c std::less comparator, where @c T is the dereferenced iterator type. - Iterators are templated to enable stateful range using std::reference_wrapper. + Iterators can be made stateful by using std::reference_wrapper + @note Please refer to @ref ParallelSort for details. 
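+
+ A usage sketch (the data and its values are illustrative):
+
+ @code{.cpp}
+ std::vector<int> data{4, 1, 3, 2};
+ taskflow.sort(data.begin(), data.end());
+ executor.run(taskflow).wait();
+ assert(std::is_sorted(data.begin(), data.end()));
+ @endcode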
*/ template @@ -1165,15 +1282,23 @@ inline FlowBuilder::FlowBuilder(Graph& graph) : // Function: emplace template , void>*> Task FlowBuilder::emplace(C&& c) { - return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, + return Task(_graph._emplace_back(NSTATE::NONE, ESTATE::NONE, DefaultTaskParams{}, nullptr, nullptr, 0, std::in_place_type_t{}, std::forward(c) )); } +// Function: emplace +template , void>*> +Task FlowBuilder::emplace(C&& c) { + return Task(_graph._emplace_back(NSTATE::NONE, ESTATE::NONE, DefaultTaskParams{}, nullptr, nullptr, 0, + std::in_place_type_t{}, std::forward(c) + )); +} + // Function: emplace template , void>*> Task FlowBuilder::emplace(C&& c) { - return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, + return Task(_graph._emplace_back(NSTATE::NONE, ESTATE::NONE, DefaultTaskParams{}, nullptr, nullptr, 0, std::in_place_type_t{}, std::forward(c) )); } @@ -1181,7 +1306,7 @@ Task FlowBuilder::emplace(C&& c) { // Function: emplace template , void>*> Task FlowBuilder::emplace(C&& c) { - return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, + return Task(_graph._emplace_back(NSTATE::NONE, ESTATE::NONE, DefaultTaskParams{}, nullptr, nullptr, 0, std::in_place_type_t{}, std::forward(c) )); } @@ -1189,11 +1314,28 @@ Task FlowBuilder::emplace(C&& c) { // Function: emplace template , void>*> Task FlowBuilder::emplace(C&& c) { - return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, + return Task(_graph._emplace_back(NSTATE::NONE, ESTATE::NONE, DefaultTaskParams{}, nullptr, nullptr, 0, std::in_place_type_t{}, std::forward(c) )); } +// Function: composed_of +template +Task FlowBuilder::composed_of(T& object) { + auto node = _graph._emplace_back(NSTATE::NONE, ESTATE::NONE, DefaultTaskParams{}, nullptr, nullptr, 0, + std::in_place_type_t{}, object + ); + return Task(node); +} + +// Function: placeholder +inline Task FlowBuilder::placeholder() { + auto node = _graph._emplace_back(NSTATE::NONE, ESTATE::NONE, DefaultTaskParams{}, nullptr, nullptr, 0, + std::in_place_type_t{} + ); + return Task(node); +} + // Function: emplace template 1), void>*> auto FlowBuilder::emplace(C&&... 
cs) { @@ -1207,39 +1349,19 @@ inline void FlowBuilder::erase(Task task) { return; } - task.for_each_dependent([&] (Task dependent) { - auto& S = dependent._node->_successors; - if(auto I = std::find(S.begin(), S.end(), task._node); I != S.end()) { - S.erase(I); - } - }); + // remove task from its successors' predecessor list + for(size_t i=0; i_num_successors; ++i) { + task._node->_edges[i]->_remove_predecessors(task._node); + } - task.for_each_successor([&] (Task dependent) { - auto& D = dependent._node->_dependents; - if(auto I = std::find(D.begin(), D.end(), task._node); I != D.end()) { - D.erase(I); - } - }); + // remove task from its precedessors' successor list + for(size_t i=task._node->_num_successors; i_edges.size(); ++i) { + task._node->_edges[i]->_remove_successors(task._node); + } _graph._erase(task._node); } -// Function: composed_of -template -Task FlowBuilder::composed_of(T& object) { - auto node = _graph._emplace_back("", 0, nullptr, nullptr, 0, - std::in_place_type_t{}, object - ); - return Task(node); -} - -// Function: placeholder -inline Task FlowBuilder::placeholder() { - auto node = _graph._emplace_back("", 0, nullptr, nullptr, 0, - std::in_place_type_t{} - ); - return Task(node); -} // Procedure: _linearize template @@ -1276,11 +1398,11 @@ inline void FlowBuilder::linearize(std::initializer_list keys) { @brief class to construct a subflow graph from the execution of a dynamic task -tf::Subflow is a derived class from tf::Runtime with a specialized mechanism -to manage the execution of a child graph. -By default, a subflow automatically @em joins its parent node. -You may explicitly join or detach a subflow by calling tf::Subflow::join -or tf::Subflow::detach, respectively. +tf::Subflow is spawned from the execution of a task to dynamically manage a +child graph that may depend on runtime variables. +You can explicitly join a subflow by calling tf::Subflow::join, respectively. +By default, the %Taskflow runtime will implicitly join a subflow it is is joinable. + The following example creates a taskflow graph that spawns a subflow from the execution of task @c B, and the subflow contains three tasks, @c B1, @c B2, and @c B3, where @c B3 runs after @c B1 and @c B2. @@ -1307,15 +1429,13 @@ C.precede(D); // D runs after C @endcode */ -class Subflow : public FlowBuilder, - public Runtime { +class Subflow : public FlowBuilder { friend class Executor; friend class FlowBuilder; - friend class Runtime; public: - + /** @brief enables the subflow to join its parent task @@ -1334,77 +1454,103 @@ class Subflow : public FlowBuilder, void join(); /** - @brief enables the subflow to detach from its parent task + @brief queries if the subflow is joinable - Performs an immediate action to detach the subflow. Once the subflow is detached, - it is considered finished and you may not modify the subflow anymore. + This member function queries if the subflow is joinable. + When a subflow is joined, it becomes not joinable. @code{.cpp} taskflow.emplace([](tf::Subflow& sf){ sf.emplace([](){}); - sf.detach(); + std::cout << sf.joinable() << '\n'; // true + sf.join(); + std::cout << sf.joinable() << '\n'; // false }); @endcode - - Only the worker that spawns this subflow can detach it. 
*/ - void detach(); + bool joinable() const noexcept; /** - @brief resets the subflow to a joinable state + @brief acquires the associated executor + */ + Executor& executor() noexcept; + + /** + @brief acquires the associated graph + */ + Graph& graph() { return _graph; } + + /** + @brief specifies whether to keep the subflow after it is joined - @param clear_graph specifies whether to clear the associated graph (default @c true) + @param flag `true` to retain the subflow after it is joined; `false` to discard it - Clears the underlying task graph depending on the - given variable @c clear_graph (default @c true) and then - updates the subflow to a joinable state. + By default, the runtime automatically clears a spawned subflow once it is joined. + Setting this flag to `true` allows the application to retain the subflow's structure + for post-execution analysis like visualization. */ - void reset(bool clear_graph = true); + void retain(bool flag) noexcept; /** - @brief queries if the subflow is joinable - - This member function queries if the subflow is joinable. - When a subflow is joined or detached, it becomes not joinable. - - @code{.cpp} - taskflow.emplace([](tf::Subflow& sf){ - sf.emplace([](){}); - std::cout << sf.joinable() << '\n'; // true - sf.join(); - std::cout << sf.joinable() << '\n'; // false - }); - @endcode + @brief queries if the subflow will be retained after it is joined + @return `true` if the subflow will be retained after it is joined; `false` otherwise */ - bool joinable() const noexcept; + bool retain() const; private: - - bool _joinable {true}; - + Subflow(Executor&, Worker&, Node*, Graph&); + + Subflow() = delete; + Subflow(const Subflow&) = delete; + Subflow(Subflow&&) = delete; + + Executor& _executor; + Worker& _worker; + Node* _parent; }; // Constructor -inline Subflow::Subflow( - Executor& executor, Worker& worker, Node* parent, Graph& graph -) : - FlowBuilder {graph}, - Runtime {executor, worker, parent} { - // assert(_parent != nullptr); +inline Subflow::Subflow(Executor& executor, Worker& worker, Node* parent, Graph& graph) : + FlowBuilder {graph}, + _executor {executor}, + _worker {worker}, + _parent {parent} { + + // need to reset since there could have iterative control flow + _parent->_nstate &= ~(NSTATE::JOINED_SUBFLOW | NSTATE::RETAIN_SUBFLOW); + + // clear the graph + graph.clear(); } -// Function: joined +// Function: joinable inline bool Subflow::joinable() const noexcept { - return _joinable; + return !(_parent->_nstate & NSTATE::JOINED_SUBFLOW); +} + +// Function: executor +inline Executor& Subflow::executor() noexcept { + return _executor; } -// Procedure: reset -inline void Subflow::reset(bool clear_graph) { - if(clear_graph) { - _graph._clear(); +// Function: retain +inline void Subflow::retain(bool flag) noexcept { + // default value is not to retain + if TF_LIKELY(flag == true) { + _parent->_nstate |= NSTATE::RETAIN_SUBFLOW; + } + else { + _parent->_nstate &= ~NSTATE::RETAIN_SUBFLOW; } - _joinable = true; + + //_parent->_nstate = (_parent->_nstate & ~NSTATE::RETAIN_SUBFLOW) | + // (-static_cast(flag) & NSTATE::RETAIN_SUBFLOW); +} + +// Function: retain +inline bool Subflow::retain() const { + return _parent->_nstate & NSTATE::RETAIN_SUBFLOW; } } // end of namespace tf. 
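A combined usage sketch of the tf::Subflow API introduced above (joinable,
implicit join, and retain; the executor/taskflow setup is assumed):

@code{.cpp}
tf::Executor executor;
tf::Taskflow taskflow;

taskflow.emplace([](tf::Subflow& sf){
  tf::Task b1 = sf.emplace([](){});
  tf::Task b2 = sf.emplace([](){});
  b1.precede(b2);
  assert(sf.joinable() == true);   // not yet joined
  sf.retain(true);                 // keep the graph after the implicit join
  // no explicit sf.join(): a joinable subflow is joined implicitly
});

executor.run(taskflow).wait();
@endcode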
--------------------------------------------------- diff --git a/taskflow/core/freelist.hpp b/taskflow/core/freelist.hpp new file mode 100644 index 000000000..ab9431c3c --- /dev/null +++ b/taskflow/core/freelist.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include "tsq.hpp" + +namespace tf { + +/** +@private +*/ +template +class Freelist { + + friend class Executor; + + public: + struct Bucket { + std::mutex mutex; + UnboundedTaskQueue queue; + }; + + // Here, we don't create just N task queues in the freelist as it will cause + // the work-stealing loop to spand a lot of time on stealing tasks. + // Experimentally speaking, we found floor_log2(N) is the best. + TF_FORCE_INLINE Freelist(size_t N) : _buckets(N < 4 ? 1 : floor_log2(N)) {} + + // Pointers are aligned to 8 bytes. We perform a simple hash to avoid contention caused + // by hashing to the same slot. + TF_FORCE_INLINE void push(T item) { + //auto b = reinterpret_cast(item) % _buckets.size(); + auto b = (reinterpret_cast(item) >> 16) % _buckets.size(); + std::scoped_lock lock(_buckets[b].mutex); + _buckets[b].queue.push(item); + } + + TF_FORCE_INLINE T steal(size_t w) { + return _buckets[w].queue.steal(); + } + + TF_FORCE_INLINE T steal_with_hint(size_t w, size_t& num_empty_steals) { + return _buckets[w].queue.steal_with_hint(num_empty_steals); + } + + TF_FORCE_INLINE size_t size() const { + return _buckets.size(); + } + + private: + + std::vector _buckets; +}; + + +} // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/core/graph.hpp b/taskflow/core/graph.hpp index efaa4ffc4..2e2bdc2af 100644 --- a/taskflow/core/graph.hpp +++ b/taskflow/core/graph.hpp @@ -1,12 +1,18 @@ #pragma once +#include "../utility/macros.hpp" #include "../utility/traits.hpp" #include "../utility/iterator.hpp" + +#ifdef TF_ENABLE_TASK_POOL #include "../utility/object_pool.hpp" +#endif + #include "../utility/os.hpp" #include "../utility/math.hpp" #include "../utility/small_vector.hpp" #include "../utility/serializer.hpp" +#include "../utility/lazy_string.hpp" #include "error.hpp" #include "declarations.hpp" #include "semaphore.hpp" @@ -14,6 +20,7 @@ #include "topology.hpp" #include "tsq.hpp" + /** @file graph.hpp @brief graph include file @@ -41,7 +48,7 @@ class to interact with the executor through taskflow composition. A graph object is move-only. 
*/ -class Graph { +class Graph : public std::vector> { friend class Node; friend class FlowBuilder; @@ -51,493 +58,61 @@ class Graph { public: - /** - @brief constructs a graph object - */ - Graph() = default; - - /** - @brief disabled copy constructor - */ - Graph(const Graph&) = delete; - - /** - @brief constructs a graph using move semantics - */ - Graph(Graph&&); - - /** - @brief destructs the graph object - */ - ~Graph(); - - /** - @brief disabled copy assignment operator - */ - Graph& operator = (const Graph&) = delete; - - /** - @brief assigns a graph using move semantics - */ - Graph& operator = (Graph&&); - - /** - @brief queries if the graph is empty - */ - bool empty() const; - - /** - @brief queries the number of nodes in the graph - */ - size_t size() const; - - /** - @brief clears the graph - */ - void clear(); - - private: - - std::vector _nodes; - - void _clear(); - void _clear_detached(); - void _merge(Graph&&); - void _erase(Node*); - - /** - @private - */ - template - Node* _emplace_back(ArgsT&&...); -}; - -// ---------------------------------------------------------------------------- - -/** -@class Runtime - -@brief class to include a runtime object in a task - -A runtime object allows users to interact with the -scheduling runtime inside a task, such as scheduling an active task, -spawning a subflow, and so on. - -@code{.cpp} -tf::Task A, B, C, D; -std::tie(A, B, C, D) = taskflow.emplace( - [] () { return 0; }, - [&C] (tf::Runtime& rt) { // C must be captured by reference - std::cout << "B\n"; - rt.schedule(C); - }, - [] () { std::cout << "C\n"; }, - [] () { std::cout << "D\n"; } -); -A.precede(B, C, D); -executor.run(taskflow).wait(); -@endcode - -A runtime object is associated with the worker and the executor -that runs the task. - -*/ -class Runtime { - - friend class Executor; - friend class FlowBuilder; - - public: - - /** - @brief destroys the runtime object - - Issues a tf::Runtime::corun_all to finish all spawned asynchronous tasks - and then destroys the runtime object. - */ - ~Runtime(); - - /** - @brief obtains the running executor - - The running executor of a runtime task is the executor that runs - the parent taskflow of that runtime task. - - @code{.cpp} - tf::Executor executor; - tf::Taskflow taskflow; - taskflow.emplace([&](tf::Runtime& rt){ - assert(&(rt.executor()) == &executor); - }); - executor.run(taskflow).wait(); - @endcode - */ - Executor& executor(); - - /** - @brief schedules an active task immediately to the worker's queue - - @param task the given active task to schedule immediately - - This member function immediately schedules an active task to the - task queue of the associated worker in the runtime task. - An active task is a task in a running taskflow. - The task may or may not be running, and scheduling that task - will immediately put the task into the task queue of the worker - that is running the runtime task. - Consider the following example: - - @code{.cpp} - tf::Task A, B, C, D; - std::tie(A, B, C, D) = taskflow.emplace( - [] () { return 0; }, - [&C] (tf::Runtime& rt) { // C must be captured by reference - std::cout << "B\n"; - rt.schedule(C); - }, - [] () { std::cout << "C\n"; }, - [] () { std::cout << "D\n"; } - ); - A.precede(B, C, D); - executor.run(taskflow).wait(); - @endcode - - The executor will first run the condition task @c A which returns @c 0 - to inform the scheduler to go to the runtime task @c B. 
- During the execution of @c B, it directly schedules task @c C without - going through the normal taskflow graph scheduling process. - At this moment, task @c C is active because its parent taskflow is running. - When the taskflow finishes, we will see both @c B and @c C in the output. - */ - void schedule(Task task); - - /** - @brief runs the given callable asynchronously - - @tparam F callable type - @param f callable object - - The method creates an asynchronous task to launch the given - function on the given arguments. - The difference to tf::Executor::async is that the created asynchronous task - pertains to the runtime object. - Applications can explicitly issue tf::Runtime::corun_all - to wait for all spawned asynchronous tasks to finish. - For example: - - @code{.cpp} - std::atomic counter(0); - taskflow.emplace([&](tf::Runtime& rt){ - auto fu1 = rt.async([&](){ counter++; }); - auto fu2 = rt.async([&](){ counter++; }); - fu1.get(); - fu2.get(); - assert(counter == 2); - - // spawn 100 asynchronous tasks from the worker of the runtime - for(int i=0; i<100; i++) { - rt.async([&](){ counter++; }); - } - - // wait for the 100 asynchronous tasks to finish - rt.corun_all(); - assert(counter == 102); - }); - @endcode - - This method is thread-safe and can be called by multiple workers - that hold the reference to the runtime. - For example, the code below spawns 100 tasks from the worker of - a runtime, and each of the 100 tasks spawns another task - that will be run by another worker. - - @code{.cpp} - std::atomic counter(0); - taskflow.emplace([&](tf::Runtime& rt){ - // worker of the runtime spawns 100 tasks each spawning another task - // that will be run by another worker - for(int i=0; i<100; i++) { - rt.async([&](){ - counter++; - rt.async([](){ counter++; }); - }); - } - - // wait for the 200 asynchronous tasks to finish - rt.corun_all(); - assert(counter == 200); - }); - @endcode - */ - template - auto async(F&& f); - /** - @brief runs the given callable asynchronously - - @tparam F callable type - @tparam P task parameters type - - @param params task parameters - @param f callable - - @code{.cpp} - taskflow.emplace([&](tf::Runtime& rt){ - auto future = rt.async("my task", [](){}); - future.get(); - }); - @endcode - + @brief constructs a graph object */ - template - auto async(P&& params, F&& f); - - /** - @brief runs the given function asynchronously without returning any future object - - @tparam F callable type - @param f callable - - This member function is more efficient than tf::Runtime::async - and is encouraged to use when there is no data returned. + Graph() = default; - @code{.cpp} - std::atomic counter(0); - taskflow.emplace([&](tf::Runtime& rt){ - for(int i=0; i<100; i++) { - rt.silent_async([&](){ counter++; }); - } - rt.corun_all(); - assert(counter == 100); - }); - @endcode - - This member function is thread-safe. 
- */ - template - void silent_async(F&& f); - /** - @brief runs the given function asynchronously without returning any future object - - @tparam F callable type - @param params task parameters - @param f callable - - @code{.cpp} - taskflow.emplace([&](tf::Runtime& rt){ - rt.silent_async("my task", [](){}); - rt.corun_all(); - }); - @endcode + @brief disabled copy constructor */ - template - void silent_async(P&& params, F&& f); - - /** - @brief similar to tf::Runtime::silent_async but the caller must be the worker of the runtime - - @tparam F callable type + Graph(const Graph&) = delete; - @param f callable - - The method bypass the check of the caller worker from the executor - and thus can only called by the worker of this runtime. - - @code{.cpp} - taskflow.emplace([&](tf::Runtime& rt){ - // running by the worker of this runtime - rt.silent_async_unchecked([](){}); - rt.corun_all(); - }); - @endcode - */ - template - void silent_async_unchecked(F&& f); - /** - @brief similar to tf::Runtime::silent_async but the caller must be the worker of the runtime - - @tparam F callable type - @tparam P task parameters type - - @param params task parameters - @param f callable - - The method bypass the check of the caller worker from the executor - and thus can only called by the worker of this runtime. - - @code{.cpp} - taskflow.emplace([&](tf::Runtime& rt){ - // running by the worker of this runtime - rt.silent_async_unchecked("my task", [](){}); - rt.corun_all(); - }); - @endcode + @brief constructs a graph using move semantics */ - template - void silent_async_unchecked(P&& params, F&& f); + Graph(Graph&&) = default; /** - @brief co-runs the given target and waits until it completes - - A target can be one of the following forms: - + a subflow task to spawn a subflow or - + a composable graph object with `tf::Graph& T::graph()` defined - - @code{.cpp} - // co-run a subflow and wait until all tasks complete - taskflow.emplace([](tf::Runtime& rt){ - rt.corun([](tf::Subflow& sf){ - tf::Task A = sf.emplace([](){}); - tf::Task B = sf.emplace([](){}); - }); - }); - - // co-run a taskflow and wait until all tasks complete - tf::Taskflow taskflow1, taskflow2; - taskflow1.emplace([](){ std::cout << "running taskflow1\n"; }); - taskflow2.emplace([&](tf::Runtime& rt){ - std::cout << "running taskflow2\n"; - rt.corun(taskflow1); - }); - executor.run(taskflow2).wait(); - @endcode - - Although tf::Runtime::corun blocks until the operation completes, - the caller thread (worker) is not blocked (e.g., sleeping or holding any lock). - Instead, the caller thread joins the work-stealing loop of the executor - and returns when all tasks in the target completes. - - @attention - Only the worker of this tf::Runtime can issue corun. + @brief disabled copy assignment operator */ - template - void corun(T&& target); + Graph& operator = (const Graph&) = delete; /** - @brief keeps running the work-stealing loop until the predicate becomes true - - @tparam P predicate type - @param predicate a boolean predicate to indicate when to stop the loop - - The method keeps the caller worker running in the work-stealing loop - until the stop predicate becomes true. - - @attention - Only the worker of this tf::Runtime can issue corun. 
+ @brief assigns a graph using move semantics */ - template - void corun_until(P&& predicate); + Graph& operator = (Graph&&) = default; - /** - @brief corun all asynchronous tasks spawned by this runtime with other workers - - Coruns all asynchronous tasks (tf::Runtime::async, - tf::Runtime::silent_async) with other workers until all those - asynchronous tasks finish. - - @code{.cpp} - std::atomic counter{0}; - taskflow.emplace([&](tf::Runtime& rt){ - // spawn 100 async tasks and wait - for(int i=0; i<100; i++) { - rt.silent_async([&](){ counter++; }); - } - rt.corun_all(); - assert(counter == 100); - - // spawn another 100 async tasks and wait - for(int i=0; i<100; i++) { - rt.silent_async([&](){ counter++; }); - } - rt.corun_all(); - assert(counter == 200); - }); - @endcode - - @attention - Only the worker of this tf::Runtime can issue tf::Runtime::corun_all. - */ - inline void corun_all(); - - /** - @brief acquire a reference to the underlying worker - */ - inline Worker& worker(); - protected: - - /** - @private - */ - explicit Runtime(Executor&, Worker&, Node*); - - /** - @private - */ - Executor& _executor; - - /** - @private - */ - Worker& _worker; - - /** - @private - */ - Node* _parent; + private: - /** - @private - */ - template - auto _async(Worker& w, P&& params, F&& f); + void _erase(Node*); /** @private */ - template - void _silent_async(Worker& w, P&& params, F&& f); + template + Node* _emplace_back(ArgsT&&...); }; -// constructor -inline Runtime::Runtime(Executor& e, Worker& w, Node* p) : - _executor{e}, - _worker {w}, - _parent {p}{ -} - -// Function: executor -inline Executor& Runtime::executor() { - return _executor; -} - -// Function: worker -inline Worker& Runtime::worker() { - return _worker; -} - // ---------------------------------------------------------------------------- // TaskParams // ---------------------------------------------------------------------------- /** -@struct TaskParams +@class TaskParams -@brief task parameters to use when creating an asynchronous task +@brief class to create a task parameter object */ -struct TaskParams { +class TaskParams { + + public: + /** @brief name of the task */ std::string name; - /** - @brief priority of the tassk - */ - unsigned priority {0}; - /** @brief C-styled pointer to user data */ @@ -545,20 +120,19 @@ struct TaskParams { }; /** -@struct DefaultTaskParams +@class DefaultTaskParams -@brief empty task parameter type for compile-time optimization +@brief class to create an empty task parameter for compile-time optimization */ -struct DefaultTaskParams { -}; +class DefaultTaskParams {}; /** @brief determines if the given type is a task parameter type Task parameters can be specified in one of the following types: - + tf::TaskParams: assign the struct of defined parameters - + tf::DefaultTaskParams: assign nothing - + std::string: assign a name to the task + + tf::TaskParams + + tf::DefaultTaskParams + + std::string */ template constexpr bool is_task_params_v = @@ -584,21 +158,15 @@ class Node { friend class FlowBuilder; friend class Subflow; friend class Runtime; + friend class AnchorGuard; + friend class PreemptionGuard; - enum class AsyncState : int { - UNFINISHED = 0, - LOCKED = 1, - FINISHED = 2 - }; + //template + //friend class Freelist; +#ifdef TF_ENABLE_TASK_POOL TF_ENABLE_POOLABLE_ON_THIS; - - // state bit flag - constexpr static int CONDITIONED = 1; - constexpr static int DETACHED = 2; - constexpr static int ACQUIRED = 4; - constexpr static int READY = 8; - constexpr static int EXCEPTION = 16; +#endif 
using Placeholder = std::monostate; @@ -608,9 +176,16 @@ class Node { template Static(C&&); - std::variant< - std::function, std::function - > work; + std::function work; + }; + + // runtime work handle + struct Runtime { + + template + Runtime(C&&); + + std::function work; }; // subflow work handle @@ -629,9 +204,7 @@ class Node { template Condition(C&&); - std::variant< - std::function, std::function - > work; + std::function work; }; // multi-condition work handle @@ -640,9 +213,7 @@ class Node { template MultiCondition(C&&); - std::variant< - std::function()>, std::function(Runtime&)> - > work; + std::function()> work; }; // module work handle @@ -661,7 +232,9 @@ class Node { Async(T&&); std::variant< - std::function, std::function + std::function, + std::function, // silent async + std::function // async > work; }; @@ -672,16 +245,19 @@ class Node { DependentAsync(C&&); std::variant< - std::function, std::function + std::function, + std::function, // silent async + std::function // async > work; std::atomic use_count {1}; - std::atomic state {AsyncState::UNFINISHED}; + std::atomic state {ASTATE::UNFINISHED}; }; using handle_t = std::variant< Placeholder, // placeholder Static, // static tasking + Runtime, // runtime tasking Subflow, // subflow tasking Condition, // conditional tasking MultiCondition, // multi-conditional tasking @@ -700,6 +276,7 @@ class Node { // variant index constexpr static auto PLACEHOLDER = get_index_v; constexpr static auto STATIC = get_index_v; + constexpr static auto RUNTIME = get_index_v; constexpr static auto SUBFLOW = get_index_v; constexpr static auto CONDITION = get_index_v; constexpr static auto MULTI_CONDITION = get_index_v; @@ -708,59 +285,53 @@ class Node { constexpr static auto DEPENDENT_ASYNC = get_index_v; Node() = default; - - template - Node(const std::string&, unsigned, Topology*, Node*, size_t, Args&&...); - - template - Node(const std::string&, Topology*, Node*, size_t, Args&&...); template - Node(const TaskParams&, Topology*, Node*, size_t, Args&&...); + Node(nstate_t, estate_t, const TaskParams&, Topology*, Node*, size_t, Args&&...); template - Node(const DefaultTaskParams&, Topology*, Node*, size_t, Args&&...); - - ~Node(); + Node(nstate_t, estate_t, const DefaultTaskParams&, Topology*, Node*, size_t, Args&&...); size_t num_successors() const; - size_t num_dependents() const; - size_t num_strong_dependents() const; - size_t num_weak_dependents() const; + size_t num_predecessors() const; + size_t num_strong_dependencies() const; + size_t num_weak_dependencies() const; const std::string& name() const; private: + + nstate_t _nstate {NSTATE::NONE}; + std::atomic _estate {ESTATE::NONE}; std::string _name; - unsigned _priority {0}; - void* _data {nullptr}; Topology* _topology {nullptr}; Node* _parent {nullptr}; - SmallVector _successors; - SmallVector _dependents; + size_t _num_successors {0}; + SmallVector _edges; - std::atomic _state {0}; std::atomic _join_counter {0}; - - std::unique_ptr _semaphores; - std::exception_ptr _exception_ptr {nullptr}; handle_t _handle; - - void _precede(Node*); - void _set_up_join_counter(); - void _process_exception(); + + std::unique_ptr _semaphores; + + std::exception_ptr _exception_ptr {nullptr}; bool _is_cancelled() const; bool _is_conditioner() const; + bool _is_preempted() const; bool _acquire_all(SmallVector&); - - SmallVector _release_all(); + void _release_all(SmallVector&); + void _precede(Node*); + void _set_up_join_counter(); + void _rethrow_exception(); + void _remove_successors(Node*); + void 
_remove_predecessors(Node*); }; // ---------------------------------------------------------------------------- @@ -770,7 +341,32 @@ class Node { /** @private */ -inline ObjectPool node_pool; +#ifdef TF_ENABLE_TASK_POOL +inline ObjectPool _task_pool; +#endif + +/** +@private +*/ +template +TF_FORCE_INLINE Node* animate(ArgsT&&... args) { +#ifdef TF_ENABLE_TASK_POOL + return _task_pool.animate(std::forward(args)...); +#else + return new Node(std::forward(args)...); +#endif +} + +/** +@private +*/ +TF_FORCE_INLINE void recycle(Node* ptr) { +#ifdef TF_ENABLE_TASK_POOL + _task_pool.recycle(ptr); +#else + delete ptr; +#endif +} // ---------------------------------------------------------------------------- // Definition for Node::Static @@ -781,6 +377,15 @@ template Node::Static::Static(C&& c) : work {std::forward(c)} { } +// ---------------------------------------------------------------------------- +// Definition for Node::Runtime +// ---------------------------------------------------------------------------- + +// Constructor +template +Node::Runtime::Runtime(C&& c) : work {std::forward(c)} { +} + // ---------------------------------------------------------------------------- // Definition for Node::Subflow // ---------------------------------------------------------------------------- @@ -842,48 +447,17 @@ Node::DependentAsync::DependentAsync(C&& c) : work {std::forward(c)} { // Constructor template Node::Node( - const std::string& name, - unsigned priority, - Topology* topology, - Node* parent, - size_t join_counter, - Args&&... args -) : - _name {name}, - _priority {priority}, - _topology {topology}, - _parent {parent}, - _join_counter {join_counter}, - _handle {std::forward(args)...} { -} - -// Constructor -template -Node::Node( - const std::string& name, - Topology* topology, - Node* parent, - size_t join_counter, - Args&&... args -) : - _name {name}, - _topology {topology}, - _parent {parent}, - _join_counter {join_counter}, - _handle {std::forward(args)...} { -} - -// Constructor -template -Node::Node( + nstate_t nstate, + estate_t estate, const TaskParams& params, Topology* topology, Node* parent, size_t join_counter, Args&&... args ) : + _nstate {nstate}, + _estate {estate}, _name {params.name}, - _priority {params.priority}, _data {params.data}, _topology {topology}, _parent {parent}, @@ -894,94 +468,80 @@ Node::Node( // Constructor template Node::Node( + nstate_t nstate, + estate_t estate, const DefaultTaskParams&, Topology* topology, Node* parent, size_t join_counter, Args&&... 
args ) : + _nstate {nstate}, + _estate {estate}, _topology {topology}, _parent {parent}, _join_counter {join_counter}, _handle {std::forward(args)...} { } -// Destructor -inline Node::~Node() { - // this is to avoid stack overflow - - if(_handle.index() == SUBFLOW) { - // using std::get_if instead of std::get makes this compatible - // with older macOS versions - // the result of std::get_if is guaranteed to be non-null - // due to the index check above - auto& subgraph = std::get_if(&_handle)->subgraph; - std::vector nodes; - nodes.reserve(subgraph.size()); - - std::move( - subgraph._nodes.begin(), subgraph._nodes.end(), std::back_inserter(nodes) - ); - subgraph._nodes.clear(); - - size_t i = 0; - - while(i < nodes.size()) { - - if(nodes[i]->_handle.index() == SUBFLOW) { - auto& sbg = std::get_if(&(nodes[i]->_handle))->subgraph; - std::move( - sbg._nodes.begin(), sbg._nodes.end(), std::back_inserter(nodes) - ); - sbg._nodes.clear(); - } - - ++i; - } +// Procedure: _precede +/* +u successor layout: s1, s2, s3, p1, p2 (num_successors = 3) +v predecessor layout: s1, p1, p2 + +add a new successor: u->v +u successor layout: + s1, s2, s3, p1, p2, v (push_back v) + s1, s2, s3, v, p2, p1 (swap adj[num_successors] with adj[n-1]) +v predecessor layout: + s1, p1, p2, u (push_back u) +*/ +inline void Node::_precede(Node* v) { + _edges.push_back(v); + std::swap(_edges[_num_successors++], _edges[_edges.size() - 1]); + v->_edges.push_back(this); +} - //auto& np = Graph::_node_pool(); - for(i=0; i_dependents.push_back(this); +// Function: _remove_predecessors +inline void Node::_remove_predecessors(Node* node) { + _edges.erase( + std::remove(_edges.begin() + _num_successors, _edges.end(), node), _edges.end() + ); } // Function: num_successors inline size_t Node::num_successors() const { - return _successors.size(); + return _num_successors; } -// Function: dependents -inline size_t Node::num_dependents() const { - return _dependents.size(); +// Function: predecessors +inline size_t Node::num_predecessors() const { + return _edges.size() - _num_successors; } -// Function: num_weak_dependents -inline size_t Node::num_weak_dependents() const { +// Function: num_weak_dependencies +inline size_t Node::num_weak_dependencies() const { size_t n = 0; - for(size_t i=0; i<_dependents.size(); i++) { - //if(_dependents[i]->_handle.index() == Node::CONDITION) { - if(_dependents[i]->_is_conditioner()) { - n++; - } + for(size_t i=_num_successors; i<_edges.size(); i++) { + n += _edges[i]->_is_conditioner(); } return n; } -// Function: num_strong_dependents -inline size_t Node::num_strong_dependents() const { +// Function: num_strong_dependencies +inline size_t Node::num_strong_dependencies() const { size_t n = 0; - for(size_t i=0; i<_dependents.size(); i++) { - //if(_dependents[i]->_handle.index() != Node::CONDITION) { - if(!_dependents[i]->_is_conditioner()) { - n++; - } + for(size_t i=_num_successors; i<_edges.size(); i++) { + n += !_edges[i]->_is_conditioner(); } return n; } @@ -997,31 +557,33 @@ inline bool Node::_is_conditioner() const { _handle.index() == Node::MULTI_CONDITION; } +// Function: _is_preempted +inline bool Node::_is_preempted() const { + return _nstate & NSTATE::PREEMPTED; +} + // Function: _is_cancelled // we currently only support cancellation of taskflow (no async task) inline bool Node::_is_cancelled() const { - //return _topology && _topology->_is_cancelled.load(std::memory_order_relaxed); - return _topology && - (_topology->_state.load(std::memory_order_relaxed) & Topology::CANCELLED); + return 
(_topology && (_topology->_estate.load(std::memory_order_relaxed) & ESTATE::CANCELLED)) + || + (_parent && (_parent->_estate.load(std::memory_order_relaxed) & ESTATE::CANCELLED)); } // Procedure: _set_up_join_counter inline void Node::_set_up_join_counter() { size_t c = 0; - for(auto p : _dependents) { - //if(p->_handle.index() == Node::CONDITION) { - if(p->_is_conditioner()) { - _state.fetch_or(Node::CONDITIONED, std::memory_order_relaxed); - } - else { - c++; - } + //for(auto p : _predecessors) { + for(size_t i=_num_successors; i<_edges.size(); i++) { + bool is_cond = _edges[i]->_is_conditioner(); + _nstate = (_nstate + is_cond) | (is_cond * NSTATE::CONDITIONED); // weak dependency + c += !is_cond; // strong dependency } _join_counter.store(c, std::memory_order_relaxed); } -// Procedure: _process_exception -inline void Node::_process_exception() { +// Procedure: _rethrow_exception +inline void Node::_rethrow_exception() { if(_exception_ptr) { auto e = _exception_ptr; _exception_ptr = nullptr; @@ -1031,14 +593,12 @@ inline void Node::_process_exception() { // Function: _acquire_all inline bool Node::_acquire_all(SmallVector& nodes) { - + // assert(_semaphores != nullptr); auto& to_acquire = _semaphores->to_acquire; - for(size_t i = 0; i < to_acquire.size(); ++i) { if(!to_acquire[i]->_try_acquire_or_wait(this)) { for(size_t j = 1; j <= i; ++j) { - auto r = to_acquire[i-j]->_release(); - nodes.insert(std::end(nodes), std::begin(r), std::end(r)); + to_acquire[i-j]->_release(nodes); } return false; } @@ -1047,103 +607,53 @@ inline bool Node::_acquire_all(SmallVector& nodes) { } // Function: _release_all -inline SmallVector Node::_release_all() { - +inline void Node::_release_all(SmallVector& nodes) { + // assert(_semaphores != nullptr); auto& to_release = _semaphores->to_release; - - SmallVector nodes; for(const auto& sem : to_release) { - auto r = sem->_release(); - nodes.insert(std::end(nodes), std::begin(r), std::end(r)); + sem->_release(nodes); } - - return nodes; } + + // ---------------------------------------------------------------------------- -// Node Deleter +// AnchorGuard // ---------------------------------------------------------------------------- /** @private */ -struct NodeDeleter { - void operator ()(Node* ptr) { - node_pool.recycle(ptr); - } -}; - -// ---------------------------------------------------------------------------- -// Graph definition -// ---------------------------------------------------------------------------- - -// Destructor -inline Graph::~Graph() { - _clear(); -} - -// Move constructor -inline Graph::Graph(Graph&& other) : - _nodes {std::move(other._nodes)} { -} +class AnchorGuard { -// Move assignment -inline Graph& Graph::operator = (Graph&& other) { - _clear(); - _nodes = std::move(other._nodes); - return *this; -} - -// Procedure: clear -inline void Graph::clear() { - _clear(); -} - -// Procedure: clear -inline void Graph::_clear() { - for(auto node : _nodes) { - node_pool.recycle(node); + public: + + // anchor is at estate as it may be accessed by multiple threads (e.g., corun's + // parent with tear_down_async's parent). 
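+ // This guard follows the standard RAII pattern for an atomic state bit:
+ // the constructor sets ESTATE::ANCHORED with fetch_or and the destructor
+ // clears it with fetch_and, so the bit is released on every exit path,
+ // including exceptional ones.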
+ AnchorGuard(Node* node) : _node{node} { + _node->_estate.fetch_or(ESTATE::ANCHORED, std::memory_order_relaxed); } - _nodes.clear(); -} -// Procedure: clear_detached -inline void Graph::_clear_detached() { + ~AnchorGuard() { + _node->_estate.fetch_and(~ESTATE::ANCHORED, std::memory_order_relaxed); + } + + private: - auto mid = std::partition(_nodes.begin(), _nodes.end(), [] (Node* node) { - return !(node->_state.load(std::memory_order_relaxed) & Node::DETACHED); - }); + Node* _node; +}; - for(auto itr = mid; itr != _nodes.end(); ++itr) { - node_pool.recycle(*itr); - } - _nodes.resize(std::distance(_nodes.begin(), mid)); -} -// Procedure: merge -inline void Graph::_merge(Graph&& g) { - for(auto n : g._nodes) { - _nodes.push_back(n); - } - g._nodes.clear(); -} +// ---------------------------------------------------------------------------- +// Graph definition +// ---------------------------------------------------------------------------- // Function: erase inline void Graph::_erase(Node* node) { - if(auto I = std::find(_nodes.begin(), _nodes.end(), node); I != _nodes.end()) { - _nodes.erase(I); - node_pool.recycle(node); - } -} - -// Function: size -inline size_t Graph::size() const { - return _nodes.size(); -} - -// Function: empty -inline bool Graph::empty() const { - return _nodes.empty(); + erase( + std::remove_if(begin(), end(), [&](auto& p){ return p.get() == node; }), + end() + ); } /** @@ -1151,19 +661,84 @@ inline bool Graph::empty() const { */ template Node* Graph::_emplace_back(ArgsT&&... args) { - _nodes.push_back(node_pool.animate(std::forward(args)...)); - return _nodes.back(); + push_back(std::make_unique(std::forward(args)...)); + return back().get(); } +// ---------------------------------------------------------------------------- +// Graph checker +// ---------------------------------------------------------------------------- + +/** +@private + */ +template +struct has_graph : std::false_type {}; -} // end of namespace tf. --------------------------------------------------- +/** +@private + */ +template +struct has_graph().graph())>> + : std::is_same().graph()), Graph&> {}; +/** + * @brief determines if the given type has a member function `Graph& graph()` + * + * This trait determines if the provided type `T` contains a member function + * with the exact signature `tf::Graph& graph()`. It uses SFINAE and `std::void_t` + * to detect the presence of the member function and its return type. + * + * @tparam T The type to inspect. + * @retval true If the type `T` has a member function `tf::Graph& graph()`. + * @retval false Otherwise. 
+ * + * Example usage: + * @code + * + * struct A { + * tf::Graph& graph() { return my_graph; }; + * tf::Graph my_graph; + * + * // other custom members to alter my_graph + * }; + * + * struct C {}; // No graph function + * + * static_assert(has_graph_v, "A has graph()"); + * static_assert(!has_graph_v, "C does not have graph()"); + * @endcode + */ +template +constexpr bool has_graph_v = has_graph::value; +// ---------------------------------------------------------------------------- +// detailed helper functions +// ---------------------------------------------------------------------------- +namespace detail { +/** +@private +*/ +template +TF_FORCE_INLINE Node* get_node_ptr(T& node) { + using U = std::decay_t; + if constexpr (std::is_same_v) { + return node; + } + else if constexpr (std::is_same_v>) { + return node.get(); + } + else { + static_assert(dependent_false_v, "Unsupported type for get_node_ptr"); + } +} +} // end of namespace tf::detail --------------------------------------------- +} // end of namespace tf. ---------------------------------------------------- diff --git a/taskflow/core/notifier.hpp b/taskflow/core/nonblocking_notifier.hpp similarity index 50% rename from taskflow/core/notifier.hpp rename to taskflow/core/nonblocking_notifier.hpp index 61663798a..e4f4b9e28 100644 --- a/taskflow/core/notifier.hpp +++ b/taskflow/core/nonblocking_notifier.hpp @@ -1,6 +1,3 @@ -// 2019/02/09 - created by Tsung-Wei Huang -// - modified the event count from Eigen - #pragma once #include @@ -16,7 +13,7 @@ #include #include #include - +#include "../utility/os.hpp" // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // @@ -49,10 +46,10 @@ namespace tf { // ec.notify(true); // // notify is cheap if there are no waiting threads. prepare_wait/commit_wait are not -// cheap, but they are executed only if the preceeding predicate check has +// cheap, but they are executed only if the preceding predicate check has // failed. // -// Algorihtm outline: +// Algorithm outline: // There are two main variables: predicate (managed by user) and _state. // Operation closely resembles Dekker mutual algorithm: // https://en.wikipedia.org/wiki/Dekker%27s_algorithm @@ -62,14 +59,15 @@ namespace tf { // and won't block, or notifying thread will see _state change and will unblock // the waiter, or both. But it can't happen that both threads don't see each // other changes, which would lead to deadlock. -class Notifier { + +class NonblockingNotifierV1 { friend class Executor; public: struct Waiter { - std::atomic next; + alignas (2*TF_CACHELINE_SIZE) std::atomic next; uint64_t epoch; enum : unsigned { kNotSignaled = 0, @@ -77,7 +75,7 @@ class Notifier { kSignaled, }; -#ifdef __cpp_lib_atomic_wait +#if __cplusplus >= TF_CPP20 std::atomic state {0}; #else std::mutex mu; @@ -86,13 +84,13 @@ class Notifier { #endif }; - explicit Notifier(size_t N) : _waiters{N} { + explicit NonblockingNotifierV1(size_t N) : _state(kStackMask), _waiters(N) { assert(_waiters.size() < (1 << kWaiterBits) - 1); // Initialize epoch to something close to overflow to test overflow. - _state = kStackMask | (kEpochMask - kEpochInc * _waiters.size() * 2); + //_state = kStackMask | (kEpochMask - kEpochInc * _waiters.size() * 2); } - ~Notifier() { + ~NonblockingNotifierV1() { // Ensure there are no waiters. assert((_state.load() & (kStackMask | kWaiterMask)) == kStackMask); } @@ -108,7 +106,7 @@ class Notifier { // commit_wait commits waiting. 
// only the waiter itself can call void commit_wait(Waiter* w) { -#ifdef __cpp_lib_atomic_wait +#if __cplusplus >= TF_CPP20 w->state.store(Waiter::kNotSignaled, std::memory_order_relaxed); #else w->state = Waiter::kNotSignaled; @@ -120,7 +118,7 @@ class Notifier { uint64_t state = _state.load(std::memory_order_seq_cst); for (;;) { if (int64_t((state & kEpochMask) - epoch) < 0) { - // The preceeding waiter has not decided on its fate. Wait until it + // The preceding waiter has not decided on its fate. Wait until it // calls either cancel_wait or commit_wait, or is notified. std::this_thread::yield(); state = _state.load(std::memory_order_seq_cst); @@ -152,7 +150,7 @@ class Notifier { uint64_t state = _state.load(std::memory_order_relaxed); for (;;) { if (int64_t((state & kEpochMask) - epoch) < 0) { - // The preceeding waiter has not decided on its fate. Wait until it + // The preceding waiter has not decided on its fate. Wait until it // calls either cancel_wait or commit_wait, or is notified. std::this_thread::yield(); state = _state.load(std::memory_order_relaxed); @@ -168,9 +166,96 @@ class Notifier { } } + void notify_one() { + _notify(); + } + + void notify_all() { + _notify(); + } + + // notify n workers + void notify_n(size_t n) { + if(n >= _waiters.size()) { + _notify(); + } + else { + for(size_t k=0; k(); + } + } + } + + size_t size() const { + return _waiters.size(); + } + + private: + + // State_ layout: + // - low kStackBits is a stack of waiters committed wait. + // - next kWaiterBits is count of waiters in prewait state. + // - next kEpochBits is modification counter. + static const uint64_t kStackBits = 16; + static const uint64_t kStackMask = (1ull << kStackBits) - 1; + static const uint64_t kWaiterBits = 16; + static const uint64_t kWaiterShift = 16; + static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1) + << kWaiterShift; + static const uint64_t kWaiterInc = 1ull << kWaiterBits; + static const uint64_t kEpochBits = 32; + static const uint64_t kEpochShift = 32; + static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift; + static const uint64_t kEpochInc = 1ull << kEpochShift; + std::atomic _state; + std::vector _waiters; + + void _park(Waiter* w) { +#if __cplusplus >= TF_CPP20 + unsigned target = Waiter::kNotSignaled; + if(w->state.compare_exchange_strong(target, Waiter::kWaiting, + std::memory_order_relaxed, + std::memory_order_relaxed)) { + w->state.wait(Waiter::kWaiting, std::memory_order_relaxed); + } +#else + std::unique_lock lock(w->mu); + while (w->state != Waiter::kSignaled) { + w->state = Waiter::kWaiting; + w->cv.wait(lock); + } +#endif + } + + void _unpark(Waiter* waiters) { + Waiter* next = nullptr; + for (Waiter* w = waiters; w; w = next) { + next = w->next.load(std::memory_order_relaxed); +#if __cplusplus >= TF_CPP20 + // We only notify if the other is waiting - this is why we use tri-state + // variable instead of binary-state variable (i.e., atomic_flag) + // Performance is about 0.1% faster + if(w->state.exchange(Waiter::kSignaled, std::memory_order_relaxed) == + Waiter::kWaiting) { + w->state.notify_one(); + } +#else + unsigned state; + { + std::unique_lock lock(w->mu); + state = w->state; + w->state = Waiter::kSignaled; + } + // Avoid notifying if it wasn't waiting. + if (state == Waiter::kWaiting) w->cv.notify_one(); +#endif + } + } + // notify wakes one or all waiting threads. // Must be called after changing the associated wait predicate. 
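// A typical caller-side sketch of this two-phase protocol (queue_empty() and
// act() are hypothetical placeholders for the user-managed predicate and work):
//
//   if(!queue_empty()) return act();   // fast path: no waiting needed
//   notifier.prepare_wait(&w);         // announce the intent to sleep
//   if(!queue_empty()) {               // re-check after announcing
//     notifier.cancel_wait(&w);        // work raced in: do not block
//     return act();
//   }
//   notifier.commit_wait(&w);          // now safe to block until notified
//
// A notifying thread must make the predicate true before calling
// notify_one()/notify_all().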
- void notify(bool all) { + template + void _notify() { std::atomic_thread_fence(std::memory_order_seq_cst); uint64_t state = _state.load(std::memory_order_acquire); for (;;) { @@ -214,15 +299,150 @@ class Notifier { } } } +}; + + +// ---------------------------------------------------------------------------- +// NonblockingNotifierV2 +// reference: https://gitlab.com/libeigen/eigen/-/blob/master/Eigen/src/ThreadPool/EventCount.h +// ---------------------------------------------------------------------------- +class NonblockingNotifierV2 { + + friend class Executor; + + // State_ layout: + // - low kWaiterBits is a stack of waiters committed wait + // (indexes in _waiters array are used as stack elements, + // kStackMask means empty stack). + // - next kWaiterBits is count of waiters in prewait state. + // - next kWaiterBits is count of pending signals. + // - remaining bits are ABA counter for the stack. + // (stored in Waiter node and incremented on push). + static const uint64_t kWaiterBits = 14; + static const uint64_t kStackMask = (1ull << kWaiterBits) - 1; + static const uint64_t kWaiterShift = kWaiterBits; + static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1) << kWaiterShift; + static const uint64_t kWaiterInc = 1ull << kWaiterShift; + static const uint64_t kSignalShift = 2 * kWaiterBits; + static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1) << kSignalShift; + static const uint64_t kSignalInc = 1ull << kSignalShift; + static const uint64_t kEpochShift = 3 * kWaiterBits; + static const uint64_t kEpochBits = 64 - kEpochShift; + static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift; + static const uint64_t kEpochInc = 1ull << kEpochShift; + + static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem"); + + public: + + struct Waiter { + alignas (2*TF_CACHELINE_SIZE) std::atomic next{kStackMask}; + uint64_t epoch{0}; + enum : unsigned { + kNotSignaled = 0, + kWaiting, + kSignaled, + }; + +#if __cplusplus >= TF_CPP20 + std::atomic state {kNotSignaled}; +#else + std::mutex mu; + std::condition_variable cv; + unsigned state {kNotSignaled}; +#endif + }; + + explicit NonblockingNotifierV2(size_t N) : _state(kStackMask), _waiters(N) { + assert(N < ((1 << kWaiterBits) - 1)); + } + + ~NonblockingNotifierV2() { + // Ensure there are no waiters. + assert(_state.load() == kStackMask); + } + + // prepare_wait prepares for waiting. + // After calling prepare_wait, the thread must re-check the wait predicate + // and then call either cancel_wait or commit_wait. + //void prepare_wait(Waiter*) { + // uint64_t state = _state.load(std::memory_order_relaxed); + // for (;;) { + // //_check_state(state); + // uint64_t newstate = state + kWaiterInc; + // //_check_state(newstate); + // if (_state.compare_exchange_weak(state, newstate, std::memory_order_seq_cst)) return; + // } + //} + + void prepare_wait(Waiter*) { + _state.fetch_add(kWaiterInc, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + } + + // commit_wait commits waiting after prepare_wait. + void commit_wait(Waiter* w) { +#if __cplusplus >= TF_CPP20 + w->state.store(Waiter::kNotSignaled, std::memory_order_relaxed); +#else + w->state = Waiter::kNotSignaled; +#endif + const uint64_t me = (w - &_waiters[0]) | w->epoch; + uint64_t state = _state.load(std::memory_order_seq_cst); + for (;;) { + //_check_state(state, true); + uint64_t newstate; + if ((state & kSignalMask) != 0) { + // Consume the signal and return immediately. 
+ newstate = state - kWaiterInc - kSignalInc; + } else { + // Remove this thread from pre-wait counter and add to the waiter stack. + newstate = ((state & kWaiterMask) - kWaiterInc) | me; + w->next.store(state & (kStackMask | kEpochMask), std::memory_order_relaxed); + } + //_check_state(newstate); + if (_state.compare_exchange_weak(state, newstate, std::memory_order_acq_rel)) { + if ((state & kSignalMask) == 0) { + w->epoch += kEpochInc; + _park(w); + } + return; + } + } + } + // cancel_wait cancels effects of the previous prepare_wait call. + void cancel_wait(Waiter*) { + uint64_t state = _state.load(std::memory_order_relaxed); + for (;;) { + //_check_state(state, true); + uint64_t newstate = state - kWaiterInc; + // We don't know if the thread was also notified or not, + // so we should not consume a signal unconditionally. + // Only if number of waiters is equal to number of signals, + // we know that the thread was notified and we must take away the signal. + if (((state & kWaiterMask) >> kWaiterShift) == ((state & kSignalMask) >> kSignalShift)) newstate -= kSignalInc; + //_check_state(newstate); + if (_state.compare_exchange_weak(state, newstate, std::memory_order_acq_rel)) return; + } + } + + void notify_one() { + _notify(); + } + + void notify_all() { + _notify(); + } + // notify n workers void notify_n(size_t n) { if(n >= _waiters.size()) { - notify(true); + _notify(); } else { for(size_t k=0; k(); } } } @@ -231,28 +451,14 @@ class Notifier { return _waiters.size(); } - private: + private: + - // State_ layout: - // - low kStackBits is a stack of waiters committed wait. - // - next kWaiterBits is count of waiters in prewait state. - // - next kEpochBits is modification counter. - static const uint64_t kStackBits = 16; - static const uint64_t kStackMask = (1ull << kStackBits) - 1; - static const uint64_t kWaiterBits = 16; - static const uint64_t kWaiterShift = 16; - static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1) - << kWaiterShift; - static const uint64_t kWaiterInc = 1ull << kWaiterBits; - static const uint64_t kEpochBits = 32; - static const uint64_t kEpochShift = 32; - static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift; - static const uint64_t kEpochInc = 1ull << kEpochShift; std::atomic _state; std::vector _waiters; void _park(Waiter* w) { -#ifdef __cpp_lib_atomic_wait +#if __cplusplus >= TF_CPP20 unsigned target = Waiter::kNotSignaled; if(w->state.compare_exchange_strong(target, Waiter::kWaiting, std::memory_order_relaxed, @@ -268,19 +474,16 @@ class Notifier { #endif } - void _unpark(Waiter* waiters) { - Waiter* next = nullptr; - for (Waiter* w = waiters; w; w = next) { - next = w->next.load(std::memory_order_relaxed); -#ifdef __cpp_lib_atomic_wait - // We only notify if the other is waiting - this is why we use tri-state - // variable instead of binary-state variable (i.e., atomic_flag) - // Performance is about 0.1% faster + void _unpark(Waiter* w) { + for (Waiter* next; w; w = next) { + uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask; + next = (wnext == kStackMask) ? nullptr : &_waiters[static_cast(wnext)]; +#if __cplusplus >= TF_CPP20 if(w->state.exchange(Waiter::kSignaled, std::memory_order_relaxed) == Waiter::kWaiting) { w->state.notify_one(); } -#else +#else unsigned state; { std::unique_lock lock(w->mu); @@ -292,10 +495,59 @@ class Notifier { #endif } } + + // Notify wakes one or all waiting threads. + // Must be called after changing the associated wait predicate. 
+ template + void _notify() { + std::atomic_thread_fence(std::memory_order_seq_cst); + uint64_t state = _state.load(std::memory_order_acquire); + for (;;) { + //_check_state(state); + const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; + const uint64_t sigs = (state & kSignalMask) >> kSignalShift; + // Easy case: no waiters. + if ((state & kStackMask) == kStackMask && waiters == sigs) return; + uint64_t newstate; + if (notifyAll) { + // Empty wait stack and set signal to number of pre-wait threads. + newstate = (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask; + } else if (sigs < waiters) { + // There is a thread in pre-wait state, unblock it. + newstate = state + kSignalInc; + } else { + // Pop a waiter from list and unpark it. + Waiter* w = &_waiters[state & kStackMask]; + uint64_t next = w->next.load(std::memory_order_relaxed); + newstate = (state & (kWaiterMask | kSignalMask)) | next; + } + //_check_state(newstate); + if (_state.compare_exchange_weak(state, newstate, std::memory_order_acq_rel)) { + if (!notifyAll && (sigs < waiters)) return; // unblocked pre-wait thread + if ((state & kStackMask) == kStackMask) return; + Waiter* w = &_waiters[state & kStackMask]; + if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed); + _unpark(w); + return; + } + } + } -}; + //static void _check_state(uint64_t state, bool waiter = false) { + // const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; + // const uint64_t signals = (state & kSignalMask) >> kSignalShift; + // assert(waiters >= signals); + // assert(waiters < (1 << kWaiterBits) - 1); + // assert(!waiter || waiters > 0); + // (void)waiters; + // (void)signals; + //} + NonblockingNotifierV2(const NonblockingNotifierV2&) = delete; + void operator=(const NonblockingNotifierV2&) = delete; +}; + } // namespace tf ------------------------------------------------------------ diff --git a/taskflow/core/observer.hpp b/taskflow/core/observer.hpp index 3c1873efa..55546ed85 100644 --- a/taskflow/core/observer.hpp +++ b/taskflow/core/observer.hpp @@ -443,7 +443,9 @@ class TFProfObserver : public ObserverInterface { friend class Executor; friend class TFProfManager; - /** @private overall task summary */ + /** + @private + */ struct TaskSummary { size_t count {0}; size_t total_span {0}; @@ -453,7 +455,9 @@ class TFProfObserver : public ObserverInterface { float avg_span() const { return total_span * 1.0f / count; } }; - /** @private worker summary at a level */ + /** + @private + */ struct WorkerSummary { size_t id; @@ -469,7 +473,9 @@ class TFProfObserver : public ObserverInterface { //return count < 2 ? 
0.0f : total_delay * 1.0f / (count-1); }; - /** @private */ + /** + @private + */ struct Summary { std::array tsum; std::vector wsum; @@ -537,27 +543,27 @@ inline void TFProfObserver::Summary::dump_tsum(std::ostream& os) const { std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ if(i.count == 0) return; - count_w = std::max(count_w, std::to_string(i.count).size()); + count_w = (std::max)(count_w, std::to_string(i.count).size()); }); std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ if(i.count == 0) return; - time_w = std::max(time_w, std::to_string(i.total_span).size()); + time_w = (std::max)(time_w, std::to_string(i.total_span).size()); }); std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ if(i.count == 0) return; - avg_w = std::max(time_w, std::to_string(i.avg_span()).size()); + avg_w = (std::max)(time_w, std::to_string(i.avg_span()).size()); }); std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ if(i.count == 0) return; - min_w = std::max(min_w, std::to_string(i.min_span).size()); + min_w = (std::max)(min_w, std::to_string(i.min_span).size()); }); std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ if(i.count == 0) return; - max_w = std::max(max_w, std::to_string(i.max_span).size()); + max_w = (std::max)(max_w, std::to_string(i.max_span).size()); }); os << std::setw(type_w) << "-Task-" @@ -590,32 +596,32 @@ inline void TFProfObserver::Summary::dump_wsum(std::ostream& os) const { std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ if(i.count == 0) return; - l_w = std::max(l_w, std::to_string(i.level).size()); + l_w = (std::max)(l_w, std::to_string(i.level).size()); }); std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ if(i.count == 0) return; - c_w = std::max(c_w, std::to_string(i.count).size()); + c_w = (std::max)(c_w, std::to_string(i.count).size()); }); std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ if(i.count == 0) return; - d_w = std::max(d_w, std::to_string(i.total_span).size()); + d_w = (std::max)(d_w, std::to_string(i.total_span).size()); }); std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ if(i.count == 0) return; - avg_w = std::max(avg_w, std::to_string(i.avg_span()).size()); + avg_w = (std::max)(avg_w, std::to_string(i.avg_span()).size()); }); std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ if(i.count == 0) return; - min_w = std::max(min_w, std::to_string(i.min_span).size()); + min_w = (std::max)(min_w, std::to_string(i.min_span).size()); }); std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ if(i.count == 0) return; - max_w = std::max(max_w, std::to_string(i.max_span).size()); + max_w = (std::max)(max_w, std::to_string(i.max_span).size()); }); os << std::setw(w_w) << "-Worker-" @@ -840,8 +846,8 @@ inline void TFProfObserver::summary(std::ostream& os) const { // update the entire span auto& s = _timeline.segments[w][l][i]; - view_beg = view_beg ? std::min(*view_beg, s.beg) : s.beg; - view_end = view_end ? std::max(*view_end, s.end) : s.end; + view_beg = view_beg ? (std::min)(*view_beg, s.beg) : s.beg; + view_end = view_end ? (std::max)(*view_end, s.end) : s.end; // update the task summary size_t t = duration_cast(s.end - s.beg).count(); @@ -849,19 +855,19 @@ inline void TFProfObserver::summary(std::ostream& os) const { auto& x = summary.tsum[static_cast(s.type)]; x.count += 1; x.total_span += t; - x.min_span = (x.count == 1) ? t : std::min(t, x.min_span); - x.max_span = (x.count == 1) ? t : std::max(t, x.max_span); + x.min_span = (x.count == 1) ? 
t : (std::min)(t, x.min_span); + x.max_span = (x.count == 1) ? t : (std::max)(t, x.max_span); // update the worker summary ws.total_span += t; - ws.min_span = (i == 0) ? t : std::min(t, ws.min_span); - ws.max_span = (i == 0) ? t : std::max(t, ws.max_span); + ws.min_span = (i == 0) ? t : (std::min)(t, ws.min_span); + ws.max_span = (i == 0) ? t : (std::max)(t, ws.max_span); auto&y = ws.tsum[static_cast(s.type)]; y.count += 1; y.total_span += t; - y.min_span = (y.count == 1) ? t : std::min(t, y.min_span); - y.max_span = (y.count == 1) ? t : std::max(t, y.max_span); + y.min_span = (y.count == 1) ? t : (std::min)(t, y.min_span); + y.max_span = (y.count == 1) ? t : (std::max)(t, y.max_span); // update the delay //if(i) { diff --git a/taskflow/core/runtime.hpp b/taskflow/core/runtime.hpp new file mode 100644 index 000000000..c35cce3c2 --- /dev/null +++ b/taskflow/core/runtime.hpp @@ -0,0 +1,559 @@ +#pragma once + +#include "executor.hpp" + +namespace tf { + +/** +@class Runtime + +@brief class to include a runtime object in a task + +A runtime object allows users to interact with the +scheduling runtime inside a task (or the *parent task* of this runtime), such as scheduling an active task, +spawning an asynchronous task, corunning a graph target, and so on. + +@code{.cpp} +tf::Task A, B, C, D; +std::tie(A, B, C, D) = taskflow.emplace( + [] () { return 0; }, + [&C] (tf::Runtime& rt) { // C must be captured by reference + std::cout << "B\n"; + rt.schedule(C); + }, + [] () { std::cout << "C\n"; }, + [] () { std::cout << "D\n"; } +); +A.precede(B, C, D); +executor.run(taskflow).wait(); +@endcode + +A runtime object is associated with the worker and the executor that runs its parent task. + +@note +To understand how %Taskflow schedules a runtime task, please refer to @ref RuntimeTasking. + +*/ +class Runtime { + + friend class Executor; + friend class FlowBuilder; + friend class PreemptionGuard; + friend class Algorithm; + + #define TF_RUNTIME_CHECK_CALLER(msg) \ + if(pt::this_worker != &_worker) { \ + TF_THROW(msg); \ + } + + public: + + /** + @brief obtains the running executor + + The running executor of a runtime task is the executor that runs + the parent taskflow of that runtime task. + + @code{.cpp} + tf::Executor executor; + tf::Taskflow taskflow; + taskflow.emplace([&](tf::Runtime& rt){ + assert(&(rt.executor()) == &executor); + }); + executor.run(taskflow).wait(); + @endcode + */ + Executor& executor(); + + /** + @brief acquire a reference to the underlying worker + */ + inline Worker& worker(); + + /** + @brief schedules an active task immediately to the worker's queue + + @param task the given active task to schedule immediately + + This member function immediately schedules an active task to the + task queue of the associated worker in the runtime task. + An active task is a task in a running taskflow. + The task may or may not be running, and scheduling that task + will immediately put the task into the task queue of the worker + that is running the runtime task. + Consider the following example: + + @code{.cpp} + tf::Task A, B, C, D; + std::tie(A, B, C, D) = taskflow.emplace( + [] () { return 0; }, + [&C] (tf::Runtime& rt) { // C must be captured by reference + std::cout << "B\n"; + rt.schedule(C); + }, + [] () { std::cout << "C\n"; }, + [] () { std::cout << "D\n"; } + ); + A.precede(B, C, D); + executor.run(taskflow).wait(); + @endcode + + The executor will first run the condition task @c A which returns @c 0 + to inform the scheduler to go to the runtime task @c B. 
+ During the execution of @c B, it directly schedules task @c C without + going through the normal taskflow graph scheduling process. + At this moment, task @c C is active because its parent taskflow is running. + When the taskflow finishes, we will see both @c B and @c C in the output. + + @attention + This method can only be called by the parent worker of this runtime, + or the behavior is undefined. + */ + void schedule(Task task); + + /** + @brief runs the given callable asynchronously + + @tparam F callable type + @param f callable object + + The method creates an asynchronous task to launch the given + function on the given arguments. + The difference to tf::Executor::async is that the created asynchronous task + pertains to the runtime object. + Applications can explicitly issue tf::Runtime::corun + to wait for all spawned asynchronous tasks to finish. + For example: + + @code{.cpp} + std::atomic counter(0); + taskflow.emplace([&](tf::Runtime& rt){ + auto fu1 = rt.async([&](){ counter++; }); + auto fu2 = rt.async([&](){ counter++; }); + fu1.get(); + fu2.get(); + assert(counter == 2); + + // spawn 100 asynchronous tasks from the worker of the runtime + for(int i=0; i<100; i++) { + rt.silent_async([&](){ counter++; }); + } + + // wait for the 100 asynchronous tasks to finish + rt.corun(); + assert(counter == 102); + }); + @endcode + + This method is thread-safe and can be called by multiple workers + that hold the reference to the runtime. + For example, the code below spawns 100 tasks from the worker of + a runtime, and each of the 100 tasks spawns another task + that will be run by another worker. + + @code{.cpp} + std::atomic counter(0); + taskflow.emplace([&](tf::Runtime& rt){ + // worker of the runtime spawns 100 tasks each spawning another task + // that will be run by another worker + for(int i=0; i<100; i++) { + rt.async([&](){ + counter++; + rt.async([](){ counter++; }); + }); + } + + // wait for the 200 asynchronous tasks to finish + rt.corun(); + assert(counter == 200); + }); + @endcode + */ + template + auto async(F&& f); + + /** + @brief runs the given callable asynchronously + + @tparam F callable type + @tparam P task parameters type + + @param params task parameters + @param f callable + +

      + + @code{.cpp} + taskflow.emplace([&](tf::Runtime& rt){ + auto future = rt.async("my task", [](){}); + future.get(); + }); + @endcode + + */ + template + auto async(P&& params, F&& f); + + /** + @brief runs the given function asynchronously without returning any future object + + @tparam F callable type + @param f callable + + This member function is more efficient than tf::Runtime::async + and is encouraged to use when there is no data returned. + + @code{.cpp} + std::atomic counter(0); + taskflow.emplace([&](tf::Runtime& rt){ + for(int i=0; i<100; i++) { + rt.silent_async([&](){ counter++; }); + } + rt.corun(); + assert(counter == 100); + }); + @endcode + + This member function is thread-safe. + */ + template + void silent_async(F&& f); + + /** + @brief runs the given function asynchronously without returning any future object + + @tparam F callable type + @param params task parameters + @param f callable + +

      + + @code{.cpp} + taskflow.emplace([&](tf::Runtime& rt){ + rt.silent_async("my task", [](){}); + rt.corun(); + }); + @endcode + */ + template + void silent_async(P&& params, F&& f); + + /** + @brief co-runs the given target and waits until it completes + + A corunnable target must have `tf::Graph& T::graph()` defined. + + // co-run a taskflow and wait until all tasks complete + @code{.cpp} + tf::Taskflow taskflow1, taskflow2; + taskflow1.emplace([](){ std::cout << "running taskflow1\n"; }); + taskflow2.emplace([&](tf::Runtime& rt){ + std::cout << "running taskflow2\n"; + rt.corun(taskflow1); + }); + executor.run(taskflow2).wait(); + @endcode + + Although tf::Runtime::corun blocks until the operation completes, + the caller thread (worker) is not blocked (e.g., sleeping or holding any lock). + Instead, the caller thread joins the work-stealing loop of the executor + and returns when all tasks in the target completes. + + @attention + This method can only be called by the parent worker of this runtime, + or the behavior is undefined. + */ + template + void corun(T&& target); + + /** + @brief corun all tasks spawned by this runtime with other workers + + Coruns all tasks spawned by this runtime with other workers until all these tasks finish. + + @code{.cpp} + std::atomic counter{0}; + taskflow.emplace([&](tf::Runtime& rt){ + // spawn 100 async tasks and wait + for(int i=0; i<100; i++) { + rt.silent_async([&](){ counter++; }); + } + rt.corun(); + assert(counter == 100); + + // spawn another 100 async tasks and wait + for(int i=0; i<100; i++) { + rt.silent_async([&](){ counter++; }); + } + rt.corun(); + assert(counter == 200); + }); + @endcode + + @attention + This method can only be called by the parent worker of this runtime, + or the behavior is undefined. + */ + void corun(); + + /** + @brief equivalent to tf::Runtime::corun - just an alias for legacy purpose + */ + void corun_all(); + + /** + @brief This method verifies if the task has been cancelled. + */ + bool is_cancelled(); + +protected: + /** + @private + */ + explicit Runtime(Executor&, Worker&, Node*); + + /** + @private + */ + Executor& _executor; + + /** + @private + */ + Worker& _worker; + + /** + @private + */ + Node* _parent; + + /** + @private + */ + bool _preempted {false}; +}; + +// constructor +inline Runtime::Runtime(Executor& executor, Worker& worker, Node* parent) : + _executor {executor}, + _worker {worker}, + _parent {parent} { +} + +// Function: executor +inline Executor& Runtime::executor() { + return _executor; +} + +// Function: worker +inline Worker& Runtime::worker() { + return _worker; +} + +// Procedure: schedule +inline void Runtime::schedule(Task task) { + + auto node = task._node; + // need to keep the invariant: when scheduling a task, the task must have + // zero dependency (join counter is 0) + // or we can encounter bug when inserting a nested flow (e.g., module task) + node->_join_counter.store(0, std::memory_order_relaxed); + + auto& j = node->_parent ? 
node->_parent->_join_counter : + node->_topology->_join_counter; + j.fetch_add(1, std::memory_order_relaxed); + _executor._schedule(_worker, node); +} + +// Procedure: corun +template +void Runtime::corun(T&& target) { + static_assert(has_graph_v, "target must define a member function 'Graph& graph()'"); + _executor._corun_graph(*pt::this_worker, _parent, target.graph().begin(), target.graph().end()); +} + +// Function: corun +inline void Runtime::corun() { + { + AnchorGuard anchor(_parent); + _executor._corun_until(_worker, [this] () -> bool { + return _parent->_join_counter.load(std::memory_order_acquire) == 0; + }); + } + _parent->_rethrow_exception(); +} + +// Function: corun_all +inline void Runtime::corun_all() { + corun(); +} + +inline bool Runtime::is_cancelled() { + return _parent->_is_cancelled(); +} + +// ------------------------------------ +// Runtime::silent_async series +// ------------------------------------ + +// Function: silent_async +template +void Runtime::silent_async(F&& f) { + silent_async(DefaultTaskParams{}, std::forward(f)); +} + +// Function: silent_async +template +void Runtime::silent_async(P&& params, F&& f) { + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); + _executor._silent_async( + std::forward

      (params), std::forward(f), _parent->_topology, _parent + ); +} + +// ------------------------------------ +// Runtime::async series +// ------------------------------------ + +// Function: async +template +auto Runtime::async(F&& f) { + return async(DefaultTaskParams{}, std::forward(f)); +} + +// Function: async +template +auto Runtime::async(P&& params, F&& f) { + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); + return _executor._async( + std::forward

(params), std::forward(f), _parent->_topology, _parent + ); +} + +// ---------------------------------------------------------------------------- +// Preemption guard +// ---------------------------------------------------------------------------- + +/** +@private +*/ +class PreemptionGuard { + + public: + + PreemptionGuard(Runtime& runtime) : _runtime {runtime} { + if(_runtime._preempted == true) { + TF_THROW("runtime is not preemptible"); + } + _runtime._parent->_nstate |= NSTATE::PREEMPTED; + _runtime._preempted = true; + _runtime._parent->_join_counter.fetch_add(1, std::memory_order_release); + } + + ~PreemptionGuard() { + // If I am the last to join, then there is no need to preempt the runtime. + if(_runtime._parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { + _runtime._preempted = false; + _runtime._parent->_nstate &= ~NSTATE::PREEMPTED; + } + } + + PreemptionGuard(const PreemptionGuard&) = delete; + PreemptionGuard(PreemptionGuard&&) = delete; + + PreemptionGuard& operator = (const PreemptionGuard&) = delete; + PreemptionGuard& operator = (PreemptionGuard&&) = delete; + + private: + + Runtime& _runtime; +}; + + +// ---------------------------------------------------------------------------- +// Executor Forward Declaration +// ---------------------------------------------------------------------------- + +// Procedure: _invoke_runtime_task +inline bool Executor::_invoke_runtime_task(Worker& worker, Node* node) { + return _invoke_runtime_task_impl( + worker, node, std::get_if(&node->_handle)->work + ); +} + +// Function: _invoke_runtime_task_impl +inline bool Executor::_invoke_runtime_task_impl( + Worker& worker, Node* node, std::function& work +) { + // first time + if((node->_nstate & NSTATE::PREEMPTED) == 0) { + + Runtime rt(*this, worker, node); + + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + work(rt); + }); + _observer_epilogue(worker, node); + + // here, we cannot check the state from node->_nstate due to data race + if(rt._preempted) { + return true; + } + } + // second time - previously preempted + else { + node->_nstate &= ~NSTATE::PREEMPTED; + } + return false; +} + +// Function: _invoke_runtime_task_impl +inline bool Executor::_invoke_runtime_task_impl( + Worker& worker, Node* node, std::function& work +) { + + Runtime rt(*this, worker, node); + + // first time + if((node->_nstate & NSTATE::PREEMPTED) == 0) { + + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + work(rt, false); + }); + _observer_epilogue(worker, node); + + // here, we cannot check the state from node->_nstate due to a data race. + // Ex: if preempted, another task may finish very quickly and insert this parent task + // again into the scheduling queue. When running this parent task, it will jump to + // the else branch below and modify the nstate, thus incurring a data race.
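+ // Summary of the preemption protocol for clarity: on the first invocation,
+ // work(rt, false) runs the user callable; if it preempted the runtime (e.g.,
+ // through a PreemptionGuard), we return true so the scheduler re-queues this
+ // node instead of tearing it down. On the re-invocation, the PREEMPTED bit is
+ // cleared and control falls through to work(rt, true) for the cleanup pass.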
+ if(rt._preempted) { + return true; + } + } + // second time - previously preempted + else { + node->_nstate &= ~NSTATE::PREEMPTED; + } + + // clean up outstanding work + work(rt, true); + + return false; +} + + + + + +} // end of namespace tf ----------------------------------------------------- + + + + + + + + + diff --git a/taskflow/core/semaphore.hpp b/taskflow/core/semaphore.hpp index 12d6069b1..f7502e982 100644 --- a/taskflow/core/semaphore.hpp +++ b/taskflow/core/semaphore.hpp @@ -1,9 +1,9 @@ #pragma once -#include #include #include "declarations.hpp" +#include "../utility/small_vector.hpp" /** @file semaphore.hpp @@ -41,7 +41,7 @@ tf::Taskflow taskflow; tf::Semaphore semaphore(1); // create a semaphore with initial count 1 -std::vector tasks { +SmallVector tasks { taskflow.emplace([](){ std::cout << "A" << std::endl; }), taskflow.emplace([](){ std::cout << "B" << std::endl; }), taskflow.emplace([](){ std::cout << "C" << std::endl; }), @@ -68,11 +68,20 @@ This arrangement limits the number of concurrently running tasks to only one. class Semaphore { friend class Node; + friend class Executor; public: /** - @brief constructs a semaphore with the given counter + @brief constructs a default semaphore + + A default semaphore has the value of zero. Users can call tf::Semaphore::reset + to reassign a new value to the semaphore. + */ + Semaphore() = default; + + /** + @brief constructs a semaphore with the given value (i.e., counter) A semaphore creates a constraint that limits the maximum concurrency, i.e., the number of workers, in a set of tasks. @@ -81,34 +90,51 @@ class Semaphore { tf::Semaphore semaphore(4); // concurrency constraint of 4 workers @endcode */ - explicit Semaphore(size_t max_workers); + explicit Semaphore(size_t max_value); /** - @brief queries the counter value (not thread-safe during the run) + @brief queries the current counter value */ - size_t count() const; + size_t value() const; - private: + /** + @brief queries the maximum allowable value of this semaphore + */ + size_t max_value() const; - std::mutex _mtx; + /** + @brief resets the semaphore to a clean state + */ + void reset(); + + /** + @brief resets the semaphore to a clean state with the given new maximum value + */ + void reset(size_t new_max_value); - size_t _counter; + private: - std::vector _waiters; + mutable std::mutex _mtx; + + size_t _max_value{0}; + size_t _cur_value{0}; + + SmallVector _waiters; bool _try_acquire_or_wait(Node*); - std::vector _release(); + void _release(SmallVector&); }; -inline Semaphore::Semaphore(size_t max_workers) : - _counter(max_workers) { +inline Semaphore::Semaphore(size_t max_value) : - _max_value(max_value), + _cur_value(max_value) { } inline bool Semaphore::_try_acquire_or_wait(Node* me) { std::lock_guard lock(_mtx); - if(_counter > 0) { - --_counter; + if(_cur_value > 0) { + --_cur_value; return true; } else { @@ -117,15 +143,45 @@ inline bool Semaphore::_try_acquire_or_wait(Node* me) { } } -inline std::vector Semaphore::_release() { +inline void Semaphore::_release(SmallVector& dst) { + std::lock_guard lock(_mtx); - ++_counter; - std::vector r{std::move(_waiters)}; - return r; + + if(_cur_value >= _max_value) { + TF_THROW("can't release the semaphore more than its maximum value: ", _max_value); + } + + ++_cur_value; + + if(dst.empty()) { + dst.swap(_waiters); + } + else { + dst.reserve(dst.size() + _waiters.size()); + dst.insert(dst.end(), _waiters.begin(), _waiters.end()); + _waiters.clear(); + } +} + +inline size_t Semaphore::max_value() const { + return
_max_value; } -inline size_t Semaphore::count() const { - return _counter; +inline size_t Semaphore::value() const { + std::lock_guard lock(_mtx); + return _cur_value; +} + +inline void Semaphore::reset() { + std::lock_guard lock(_mtx); + _cur_value = _max_value; + _waiters.clear(); +} + +inline void Semaphore::reset(size_t new_max_value) { + std::lock_guard lock(_mtx); + _cur_value = (_max_value = new_max_value); + _waiters.clear(); } } // end of namespace tf. --------------------------------------------------- diff --git a/taskflow/core/task.hpp b/taskflow/core/task.hpp index 1070671c4..ec05481ed 100644 --- a/taskflow/core/task.hpp +++ b/taskflow/core/task.hpp @@ -23,6 +23,8 @@ enum class TaskType : int { PLACEHOLDER = 0, /** @brief static task type */ STATIC, + /** @brief runtime task type */ + RUNTIME, /** @brief dynamic (subflow) task type */ SUBFLOW, /** @brief condition task type */ @@ -39,9 +41,10 @@ enum class TaskType : int { @private @brief array of all task types (used for iterating task types) */ -inline constexpr std::array TASK_TYPES = { +inline constexpr std::array TASK_TYPES = { TaskType::PLACEHOLDER, TaskType::STATIC, + TaskType::RUNTIME, TaskType::SUBFLOW, TaskType::CONDITION, TaskType::MODULE, @@ -52,83 +55,129 @@ inline constexpr std::array TASK_TYPES = { @brief convert a task type to a human-readable string The name of each task type is the lower-case string of its characters. - -@code{.cpp} -TaskType::PLACEHOLDER -> "placeholder" -TaskType::STATIC -> "static" -TaskType::SUBFLOW -> "subflow" -TaskType::CONDITION -> "condition" -TaskType::MODULE -> "module" -TaskType::ASYNC -> "async" -@endcode + + TaskType::PLACEHOLDER maps to the string `placeholder` + + TaskType::STATIC maps to the string `static` + + TaskType::RUNTIME maps to the string `runtime` + + TaskType::SUBFLOW maps to the string `subflow` + + TaskType::CONDITION maps to the string `condition` + + TaskType::MODULE maps to the string `module` + + TaskType::ASYNC maps to the string `async` */ inline const char* to_string(TaskType type) { const char* val; switch(type) { - case TaskType::PLACEHOLDER: val = "placeholder"; break; - case TaskType::STATIC: val = "static"; break; - case TaskType::SUBFLOW: val = "subflow"; break; - case TaskType::CONDITION: val = "condition"; break; - case TaskType::MODULE: val = "module"; break; - case TaskType::ASYNC: val = "async"; break; - default: val = "undefined"; break; + case TaskType::PLACEHOLDER: val = "placeholder"; break; + case TaskType::STATIC: val = "static"; break; + case TaskType::RUNTIME: val = "runtime"; break; + case TaskType::SUBFLOW: val = "subflow"; break; + case TaskType::CONDITION: val = "condition"; break; + case TaskType::MODULE: val = "module"; break; + case TaskType::ASYNC: val = "async"; break; + default: val = "undefined"; break; } return val; } // ---------------------------------------------------------------------------- -// Task Traits +// Static Task Trait // ---------------------------------------------------------------------------- /** -@brief determines if a callable is a dynamic task +@private +*/ +template +struct is_static_task : std::false_type {}; -A dynamic task is a callable object constructible from std::function. +/** +@private */ template -constexpr bool is_subflow_task_v = - std::is_invocable_r_v && - !std::is_invocable_r_v; +struct is_static_task>> + : std::is_same, void> {}; /** -@brief determines if a callable is a condition task +@brief determines if a callable is a static task -A condition task is a callable object constructible from std::function -or std::function.
+A static task is a callable object constructible from std::function. */ template -constexpr bool is_condition_task_v = - (std::is_invocable_r_v || std::is_invocable_r_v) && - !is_subflow_task_v; +constexpr bool is_static_task_v = is_static_task::value; + +// ---------------------------------------------------------------------------- +// Subflow Task Trait +// ---------------------------------------------------------------------------- /** -@brief determines if a callable is a multi-condition task +@private +*/ +template +struct is_subflow_task : std::false_type {}; -A multi-condition task is a callable object constructible from -std::function()> or -std::function(tf::Runtime&)>. +/** +@private */ template -constexpr bool is_multi_condition_task_v = - (std::is_invocable_r_v, C> || - std::is_invocable_r_v, C, Runtime&>) && - !is_subflow_task_v; +struct is_subflow_task>> + : std::is_same, void> {}; /** -@brief determines if a callable is a static task +@brief determines if a callable is a subflow task -A static task is a callable object constructible from std::function -or std::function. +A subflow task is a callable object constructible from std::function. */ template -constexpr bool is_static_task_v = - (std::is_invocable_r_v || std::is_invocable_r_v) && - !is_condition_task_v && - !is_multi_condition_task_v && - !is_subflow_task_v; +constexpr bool is_subflow_task_v = is_subflow_task::value; + +// ---------------------------------------------------------------------------- +// Runtime Task Trait +// ---------------------------------------------------------------------------- + +/** +@private +*/ +template +struct is_runtime_task : std::false_type {}; + +/** +@private +*/ +template +struct is_runtime_task>> + : std::is_same, void> {}; + +/** +@brief determines if a callable is a runtime task + +A runtime task is a callable object constructible from std::function. +*/ +template +constexpr bool is_runtime_task_v = is_runtime_task::value; + + +// ---------------------------------------------------------------------------- +// Condition Task Trait +// ---------------------------------------------------------------------------- + +/** +@brief determines if a callable is a condition task + +A condition task is a callable object constructible from std::function. +*/ +template +constexpr bool is_condition_task_v = std::is_invocable_r_v; + +/** +@brief determines if a callable is a multi-condition task + +A multi-condition task is a callable object constructible from +std::function()>. +*/ +template +constexpr bool is_multi_condition_task_v = std::is_invocable_r_v, C>; + // ---------------------------------------------------------------------------- // Task @@ -137,14 +186,59 @@ constexpr bool is_static_task_v = /** @class Task -@brief class to create a task handle over a node in a taskflow graph +@brief class to create a task handle over a taskflow node + +A task points to a node in a taskflow graph and provides a set of methods for users to access and modify +attributes of the associated node, +such as dependencies, callable, names, and so on. +A task is a very lightweight object (i.e., it only stores a node pointer) and can be trivially +copied around. 
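+ + For instance, copying a task merely copies the underlying node pointer (illustrative sketch): + + @code{.cpp} + tf::Task t1 = taskflow.emplace([](){}); + tf::Task t2 = t1;  // t1 and t2 now refer to the same underlying node + @endcode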
+ +@code{.cpp} +// create two tasks with one dependency +auto task1 = taskflow.emplace([](){}).name("task1"); +auto task2 = taskflow.emplace([](){}).name("task2"); +task1.precede(task2); + +// dump the task information through std::cout +task1.dump(std::cout); +@endcode + +A task created from a taskflow can be one of the following types: + + tf::TaskType::STATIC - @ref StaticTasking + + tf::TaskType::CONDITION - @ref ConditionalTasking + + tf::TaskType::RUNTIME - @ref RuntimeTasking + + tf::TaskType::SUBFLOW - @ref SubflowTasking + + tf::TaskType::MODULE - @ref ComposableTasking + +@code{.cpp} +tf::Task task1 = taskflow.emplace([](){}).name("static task"); +tf::Task task2 = taskflow.emplace([](){ return 3; }).name("condition task"); +tf::Task task3 = taskflow.emplace([](tf::Runtime&){}).name("runtime task"); +tf::Task task4 = taskflow.emplace([](tf::Subflow& sf){ + tf::Task stask1 = sf.emplace([](){}); + tf::Task stask2 = sf.emplace([](){}); +}).name("subflow task"); +tf::Task task5 = taskflow.composed_of(taskflow2).name("module task"); +@endcode + +A tf::Task is polymorphic. +Once created, you can assign a different task type to it using tf::Task::work. +For example, the code below creates a static task and then reworks it to a subflow task: + +@code{.cpp} +tf::Task task = taskflow.emplace([](){}).name("static task"); +task.work([](tf::Subflow& sf){ + tf::Task stask1 = sf.emplace([](){}); + tf::Task stask2 = sf.emplace([](){}); +}).name("subflow task"); +@endcode + +@attention +tf::Task does not own the lifetime of the associated node. +Accessing the attributes of the associated node after the taskflow has been destroyed +can result in undefined behavior. -A task is a wrapper over a node in a taskflow graph. -It provides a set of methods for users to access and modify the attributes of -the associated node in the taskflow graph. -A task is very lightweight object (i.e., only storing a node pointer) that -can be trivially copied around, -and it does not own the lifetime of the associated node. */ class Task { @@ -158,65 +252,186 @@ class Task { /** @brief constructs an empty task + + An empty task is not associated with any node in a taskflow. 
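+ + For instance, a default-constructed task is empty (illustrative): + + @code{.cpp} + tf::Task task; + assert(task.empty());  // an empty task refers to no taskflow node + @endcode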
*/ Task() = default; /** @brief constructs the task with the copy of the other task + + @param other the other task to copy + + @code{.cpp} + tf::Taskflow taskflow; + tf::Task A = taskflow.emplace([](){ std::cout << "Task A\n"; }); + tf::Task B(A); + assert(B == A); // Now, B and A refer to the same underlying node + @endcode */ Task(const Task& other); /** @brief replaces the contents with a copy of the other task + + @param other the other task to copy + + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A\n"; }); + tf::Task B; + B = A; // B now refers to the same node as A + @endcode */ - Task& operator = (const Task&); + Task& operator = (const Task& other); /** @brief replaces the contents with a null pointer + + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A\n"; }); + A = nullptr; // A no longer refers to any node + @endcode */ Task& operator = (std::nullptr_t); /** - @brief compares if two tasks are associated with the same graph node + @brief compares if two tasks are associated with the same taskflow node + + @param rhs the other task to compare with + @return true if both tasks refer to the same node; false otherwise + + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A\n"; }); + tf::Task B = A; + assert(A == B); // A and B refer to the same node + @endcode */ bool operator == (const Task& rhs) const; /** - @brief compares if two tasks are not associated with the same graph node + @brief compares if two tasks are not associated with the same taskflow node + + @param rhs the other task to compare with + @return true if they refer to different nodes; false otherwise + + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A\n"; }); + tf::Task B = taskflow.emplace([](){ std::cout << "B\n"; }); + assert(A != B); // A and B refer to different nodes + @endcode */ bool operator != (const Task& rhs) const; /** @brief queries the name of the task + + @return the name of the task as a constant string reference + + @code{.cpp} + tf::Task task = taskflow.emplace([](){}); + task.name("MyTask"); + std::cout << "Task name: " << task.name() << std::endl; + @endcode */ const std::string& name() const; /** @brief queries the number of successors of the task + + @return the number of successor tasks. + + @code{.cpp} + tf::Task A = taskflow.emplace([](){}); + tf::Task B = taskflow.emplace([](){}); + A.precede(B); // B is a successor of A + std::cout << "A has " << A.num_successors() << " successor(s)." << std::endl; + @endcode */ size_t num_successors() const; /** @brief queries the number of predecessors of the task + + @return the number of predecessor tasks + + @code{.cpp} + tf::Task A = taskflow.emplace([](){}); + tf::Task B = taskflow.emplace([](){}); + A.precede(B); // A is a predecessor of B + std::cout << "B has " << B.num_predecessors() << " predecessor(s)." << std::endl; + @endcode */ - size_t num_dependents() const; + size_t num_predecessors() const; /** - @brief queries the number of strong dependents of the task + @brief queries the number of strong dependencies of the task + + @return the number of strong dependencies to this task + + A strong dependency is a preceding link from one non-condition task to another task. + For instance, task `cond` below has one strong dependency, while tasks `yes` and `no` + each have one weak dependency. 
+ + @code{.cpp} + auto [init, cond, yes, no] = taskflow.emplace( + [] () { }, + [] () { return 0; }, + [] () { std::cout << "yes\n"; }, + [] () { std::cout << "no\n"; } + ); + cond.succeed(init) + .precede(yes, no); // executes yes if cond returns 0 + // executes no if cond returns 1 + @endcode + + @dotfile images/conditional-tasking-if-else.dot + + @note + To understand how %Taskflow schedules tasks under strong and weak dependencies, + please refer to @ref ConditionalTasking. */ - size_t num_strong_dependents() const; + size_t num_strong_dependencies() const; /** - @brief queries the number of weak dependents of the task + @brief queries the number of weak dependencies of the task + + @return the number of weak dependencies to this task + + A weak dependency is a preceding link from one condition task to another task. + For instance, task `cond` below has one strong dependency, while tasks `yes` and `no` + each have one weak dependency. + + @code{.cpp} + auto [init, cond, yes, no] = taskflow.emplace( + [] () { }, + [] () { return 0; }, + [] () { std::cout << "yes\n"; }, + [] () { std::cout << "no\n"; } + ); + cond.succeed(init) + .precede(yes, no); // executes yes if cond returns 0 + // executes no if cond returns 1 + @endcode + + @dotfile images/conditional-tasking-if-else.dot + + @note + To understand how %Taskflow schedules tasks under strong and weak dependencies, + please refer to @ref ConditionalTasking. */ - size_t num_weak_dependents() const; + size_t num_weak_dependencies() const; /** @brief assigns a name to the task - @param name a @std_string acceptable string + @param name a @std_string @return @c *this + + @code{.cpp} + tf::Task task = taskflow.emplace([](){}).name("foo"); + assert(task.name() == "foo"); + @endcode */ Task& name(const std::string& name); @@ -228,6 +443,19 @@ class Task { @param callable callable to construct a task @return @c *this + + A tf::Task is polymorphic. + Once created, you can reassign it to a different callable of a different task type + using tf::Task::work. + For example, the code below creates a static task and reworks it to a subflow task: + + @code{.cpp} + tf::Task task = taskflow.emplace([](){}).name("static task"); + task.work([](tf::Subflow& sf){ + tf::Task stask1 = sf.emplace([](){}); + tf::Task stask2 = sf.emplace([](){}); + }).name("subflow task"); + @endcode */ template Task& work(C&& callable); @@ -239,6 +467,15 @@ class Task { @param object a custom object that defines @c T::graph() method @return @c *this + + The example below creates a module task from a taskflow: + + @code{.cpp} + task.composed_of(taskflow); + @endcode + + To understand how %Taskflow schedules a module task including how to create a schedulable graph, + please refer to @ref CreateACustomComposableGraph. */ template Task& composed_of(T& object); @@ -251,6 +488,16 @@ class Task { @param tasks one or multiple tasks @return @c *this + + The example below creates a taskflow of two tasks, where `task1` runs before `task2`. + + @code{.cpp} + auto [task1, task2] = taskflow.emplace( + [](){ std::cout << "task1\n"; }, + [](){ std::cout << "task2\n"; } + ); + task1.precede(task2); + @endcode */ template Task& precede(Ts&&... tasks); @@ -263,35 +510,130 @@ class Task { @param tasks one or multiple tasks @return @c *this + + The example below creates a taskflow of two tasks, where `task1` runs before `task2`.
+ + @code{.cpp} + auto [task1, task2] = taskflow.emplace( + [](){ std::cout << "task1\n"; }, + [](){ std::cout << "task2\n"; } + ); + task2.succeed(task1); + @endcode */ template Task& succeed(Ts&&... tasks); + + /** + @brief removes predecessor links from other tasks to this + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + + This method removes the dependency links where the given tasks are predecessors + of this task (i.e., tasks -> this). It ensures both sides of the dependency + are updated to maintain graph consistency. + + @code{.cpp} + tf::Task A = taskflow.emplace([](){}); + tf::Task B = taskflow.emplace([](){}); + tf::Task C = taskflow.emplace([](){}); + // create a linear chain of tasks, A->B->C + B.succeed(A) + .precede(C); + assert(B.num_successors() == 1 && C.num_predecessors() == 1); + + // remove B from C's predecessor list + C.remove_predecessors(B); + assert(B.num_successors() == 0 && C.num_predecessors() == 0); + @endcode + */ + template + Task& remove_predecessors(Ts&&... tasks); /** - @brief makes the task release this semaphore + @brief removes successor links from this to other tasks + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + + This method removes the dependency links where this task is a predecessor + of the given tasks (i.e., this -> tasks). It ensures both sides of the dependency + are updated to maintain graph consistency. + + @code{.cpp} + tf::Task A = taskflow.emplace([](){}); + tf::Task B = taskflow.emplace([](){}); + tf::Task C = taskflow.emplace([](){}); + // create a linear chain of tasks, A->B->C + B.succeed(A) + .precede(C); + assert(B.num_successors() == 1 && C.num_predecessors() == 1); + + // remove C from B's successor list + B.remove_successors(C); + assert(B.num_successors() == 0 && C.num_predecessors() == 0); + @endcode + */ + template + Task& remove_successors(Ts&&... tasks); + + /** + @brief makes the task release the given semaphore + + @note + To know more about tf::Semaphore, please refer to @ref LimitTheMaximumConcurrency. */ Task& release(Semaphore& semaphore); + + /** + @brief makes the task release the given range of semaphores + + @note + To know more about tf::Semaphore, please refer to @ref LimitTheMaximumConcurrency. + */ + template + Task& release(I first, I last); /** - @brief makes the task acquire this semaphore + @brief makes the task acquire the given semaphore + + @note + To know more about tf::Semaphore, please refer to @ref LimitTheMaximumConcurrency. */ Task& acquire(Semaphore& semaphore); + /** + @brief makes the task acquire the given range of semaphores + + @note + To know more about tf::Semaphore, please refer to @ref LimitTheMaximumConcurrency. + */ + template + Task& acquire(I first, I last); + /** @brief assigns pointer to user data @param data pointer to user data + @return @c *this - The following example shows how to attach user data to a task and - run the task iteratively while changing the data value: + The following example shows how to attach user data to a task and retrieve it + during the execution of the task.
@code{.cpp} tf::Executor executor; tf::Taskflow taskflow("attach data to a task"); + + int data; // user data - int data; - - // create a task and attach it the data + // create a task and attach user data to it auto A = taskflow.placeholder(); A.data(&data).work([A](){ auto d = *static_cast(A.data()); @@ -304,28 +646,20 @@ class Task { } @endcode - @return @c *this */ Task& data(void* data); /** - @brief assigns a priority value to the task + @brief resets the task handle to null - A priority value can be one of the following three levels, - tf::TaskPriority::HIGH (numerically equivalent to 0), - tf::TaskPriority::NORMAL (numerically equivalent to 1), and - tf::TaskPriority::LOW (numerically equivalent to 2). - The smaller the priority value, the higher the priority. - */ - Task& priority(TaskPriority p); - - /** - @brief queries the priority value of the task - */ - TaskPriority priority() const; + Resetting a task detaches it from its associated taskflow node, making it an empty task. - /** - @brief resets the task handle to null + @code{.cpp} + tf::Task task = taskflow.emplace([](){}); + assert(task.empty() == false); + task.reset(); + assert(task.empty() == true); + @endcode */ void reset(); @@ -335,48 +669,176 @@ class Task { void reset_work(); /** - @brief queries if the task handle points to a task node + @brief queries if the task handle is associated with a taskflow node + + @return `true` if the task is not associated with any taskflow node; otherwise `false` + + @code{.cpp} + tf::Task task; + assert(task.empty() == true); + @endcode + + Note that an empty task is not equal to a placeholder task. + A placeholder task is created from tf::Taskflow::placeholder and is associated with + a taskflow node, but its work is not assigned yet. */ bool empty() const; /** @brief queries if the task has a work assigned + + @return `true` if the task has work assigned (i.e., not a placeholder); otherwise `false` + + @code{.cpp} + tf::Task task = taskflow.placeholder(); + assert(task.has_work() == false); + // assign a static task callable to this task + task.work([](){}); + assert(task.has_work() == true); + @endcode */ bool has_work() const; /** @brief applies a visitor callable to each successor of the task + + @tparam V a callable type (function, lambda, etc.) that accepts a tf::Task handle + @param visitor visitor to apply to each successor task + + This method allows you to traverse and inspect successor tasks of this task. + For instance, the code below iterates the two successors (`task2` and `task3`) of `task1`. + + @code{.cpp} + auto [task1, task2, task3] = taskflow.emplace( + [](){ std::cout << "task 1\n"; }, + [](){ std::cout << "task 2\n"; }, + [](){ std::cout << "task 3\n"; } + ); + task1.precede(task2, task3); + task1.for_each_successor([](tf::Task successor){ + std::cout << "successor task " << successor.name() << '\n'; + }); + @endcode + */ template void for_each_successor(V&& visitor) const; /** - @brief applies an visitor callable to each dependents of the task + @brief applies a visitor callable to each predecessor of the task + + @tparam V a callable type (function, lambda, etc.) that accepts a tf::Task handle + @param visitor visitor to apply to each predecessor task + + This method allows you to traverse and inspect predecessor tasks of this task. + For instance, the code below iterates the two predecessors (`task2` and `task3`) of `task1`.
+ + @code{.cpp} + auto [task1, task2, task3] = taskflow.emplace( + [](){ std::cout << "task 1\n"; }, + [](){ std::cout << "task 2\n"; }, + [](){ std::cout << "task 3\n"; } + ); + task1.succeed(task2, task3); + task1.for_each_predecessor([](tf::Task predecessor){ + std::cout << "predecessor task " << predecessor.name() << '\n'; + }); + @endcode + */ + template + void for_each_predecessor(V&& visitor) const; + + /** + @brief applies a visitor callable to each subflow task + + @tparam V a callable type (function, lambda, etc.) that accepts a tf::Task handle + @param visitor visitor to apply to each subflow task + + This method allows you to traverse and inspect tasks within a subflow. + It only applies to a subflow task. + + @code{.cpp} + tf::Task task = taskflow.emplace([](tf::Subflow& sf){ + tf::Task stask1 = sf.emplace([](){}).name("stask1"); + tf::Task stask2 = sf.emplace([](){}).name("stask2"); + }); + // Iterate tasks in the subflow and print each subflow task. + task.for_each_subflow_task([](tf::Task stask){ + std::cout << "subflow task " << stask.name() << '\n'; + }); + @endcode */ template - void for_each_dependent(V&& visitor) const; + void for_each_subflow_task(V&& visitor) const; /** @brief obtains a hash value of the underlying node + + @return the hash value of the underlying node + + The method returns std::hash on the underlying node pointer. + + @code{.cpp} + tf::Task task = taskflow.emplace([](){}); + std::cout << "hash value of task is " << task.hash_value() << '\n'; + @endcode */ size_t hash_value() const; /** @brief returns the task type + + A task can be one of the types defined in tf::TaskType and can be printed in + a human-readable form using tf::to_string. + + @code{.cpp} + auto task = taskflow.emplace([](){}).name("task"); + std::cout << task.name() << " type=[" << tf::to_string(task.type()) << "]\n"; + @endcode + */ TaskType type() const; /** @brief dumps the task through an output stream + + The method dumps the name and the type of this task through the given output stream (e.g., std::cout). + + @code{.cpp} + task.dump(std::cout); + @endcode */ void dump(std::ostream& ostream) const; /** @brief queries pointer to user data + + @return a C-style pointer to the user data attached by tf::Task::data(void* data) + + The following example shows how to attach user data to a task and retrieve it + during the execution of the task. + + @code{.cpp} + tf::Executor executor; + tf::Taskflow taskflow("attach data to a task"); + + int data; // user data + + // create a task and attach user data to it + auto A = taskflow.placeholder(); + A.data(&data).work([A](){ + auto d = *static_cast(A.data()); + std::cout << "data is " << d << std::endl; + }); + + // run the taskflow iteratively with changing data + for(data = 0; data<10; data++){ + executor.run(taskflow).wait(); + } + @endcode */ void* data() const; - private: Task(Node*); @@ -408,6 +870,22 @@ Task& Task::succeed(Ts&&... tasks) { return *this; } +// Function: remove_predecessors +template +Task& Task::remove_predecessors(Ts&&... tasks) { + (tasks._node->_remove_successors(_node), ...); + (_node->_remove_predecessors(tasks._node), ...); + return *this; +} + +// Function: remove_successors +template +Task& Task::remove_successors(Ts&&...
tasks) { + (_node->_remove_successors(tasks._node), ...); + (tasks._node->_remove_predecessors(_node), ...); + return *this; +} + // Function: composed_of template Task& Task::composed_of(T& object) { @@ -452,16 +930,45 @@ inline Task& Task::acquire(Semaphore& s) { return *this; } +// Function: acquire +template +Task& Task::acquire(I first, I last) { + if(!_node->_semaphores) { + _node->_semaphores = std::make_unique(); + } + _node->_semaphores->to_acquire.reserve( + _node->_semaphores->to_acquire.size() + std::distance(first, last) + ); + for(auto s = first; s != last; ++s){ + _node->_semaphores->to_acquire.push_back(&(*s)); + } + return *this; +} + // Function: release inline Task& Task::release(Semaphore& s) { if(!_node->_semaphores) { - //_node->_semaphores.emplace(); _node->_semaphores = std::make_unique(); } _node->_semaphores->to_release.push_back(&s); return *this; } +// Function: release +template +Task& Task::release(I first, I last) { + if(!_node->_semaphores) { + _node->_semaphores = std::make_unique(); + } + _node->_semaphores->to_release.reserve( + _node->_semaphores->to_release.size() + std::distance(first, last) + ); + for(auto s = first; s != last; ++s) { + _node->_semaphores->to_release.push_back(&(*s)); + } + return *this; +} + // Procedure: reset inline void Task::reset() { _node = nullptr; @@ -477,19 +984,19 @@ inline const std::string& Task::name() const { return _node->_name; } -// Function: num_dependents -inline size_t Task::num_dependents() const { - return _node->num_dependents(); +// Function: num_predecessors +inline size_t Task::num_predecessors() const { + return _node->num_predecessors(); } -// Function: num_strong_dependents -inline size_t Task::num_strong_dependents() const { - return _node->num_strong_dependents(); +// Function: num_strong_dependencies +inline size_t Task::num_strong_dependencies() const { + return _node->num_strong_dependencies(); } -// Function: num_weak_dependents -inline size_t Task::num_weak_dependents() const { - return _node->num_weak_dependents(); +// Function: num_weak_dependencies +inline size_t Task::num_weak_dependencies() const { + return _node->num_weak_dependencies(); } // Function: num_successors @@ -512,6 +1019,7 @@ inline TaskType Task::type() const { switch(_node->_handle.index()) { case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; case Node::STATIC: return TaskType::STATIC; + case Node::RUNTIME: return TaskType::RUNTIME; case Node::SUBFLOW: return TaskType::SUBFLOW; case Node::CONDITION: return TaskType::CONDITION; case Node::MULTI_CONDITION: return TaskType::CONDITION; @@ -525,16 +1033,26 @@ inline TaskType Task::type() const { // Function: for_each_successor template void Task::for_each_successor(V&& visitor) const { - for(size_t i=0; i<_node->_successors.size(); ++i) { - visitor(Task(_node->_successors[i])); + for(size_t i=0; i<_node->_num_successors; ++i) { + visitor(Task(_node->_edges[i])); } } -// Function: for_each_dependent +// Function: for_each_predecessor template -void Task::for_each_dependent(V&& visitor) const { - for(size_t i=0; i<_node->_dependents.size(); ++i) { - visitor(Task(_node->_dependents[i])); +void Task::for_each_predecessor(V&& visitor) const { + for(size_t i=_node->_num_successors; i<_node->_edges.size(); ++i) { + visitor(Task(_node->_edges[i])); + } +} + +// Function: for_each_subflow_task +template +void Task::for_each_subflow_task(V&& visitor) const { + if(auto ptr = std::get_if(&_node->_handle); ptr) { + for(auto itr = ptr->subgraph.begin(); itr != ptr->subgraph.end(); ++itr) { + 
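+ // each element of the subgraph is an owning pointer to a node; wrap the raw + // pointer in a lightweight tf::Task handle before passing it to the visitor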
visitor(Task(itr->get())); + } } } @@ -558,6 +1076,9 @@ Task& Task::work(C&& c) { if constexpr(is_static_task_v) { _node->_handle.emplace(std::forward(c)); } + else if constexpr(is_runtime_task_v) { + _node->_handle.emplace(std::forward(c)); + } else if constexpr(is_subflow_task_v) { _node->_handle.emplace(std::forward(c)); } @@ -584,17 +1105,6 @@ inline Task& Task::data(void* data) { return *this; } -// Function: priority -inline Task& Task::priority(TaskPriority p) { - _node->_priority = static_cast(p); - return *this; -} - -// Function: priority -inline TaskPriority Task::priority() const { - return static_cast(_node->_priority); -} - // ---------------------------------------------------------------------------- // global ostream // ---------------------------------------------------------------------------- @@ -635,29 +1145,39 @@ class TaskView { /** @brief queries the number of predecessors of the task */ - size_t num_dependents() const; + size_t num_predecessors() const; /** - @brief queries the number of strong dependents of the task + @brief queries the number of strong dependencies of the task */ - size_t num_strong_dependents() const; + size_t num_strong_dependencies() const; /** - @brief queries the number of weak dependents of the task + @brief queries the number of weak dependencies of the task */ - size_t num_weak_dependents() const; + size_t num_weak_dependencies() const; /** @brief applies a visitor callable to each successor of the task + + @tparam V a callable type (function, lambda, etc.) that accepts a tf::TaskView handle + @param visitor visitor to apply to each successor task + + This method allows you to traverse and inspect successor tasks of this task. */ template void for_each_successor(V&& visitor) const; /** - @brief applies an visitor callable to each dependents of the task + @brief applies a visitor callable to each predecessor of the task + + @tparam V a callable type (function, lambda, etc.) that accepts a tf::TaskView handle + @param visitor visitor to apply to each predecessor task + + This method allows you to traverse and inspect predecessor tasks of this task.
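+ + A minimal sketch (illustrative; assumes `tv` is a tf::TaskView handle passed to an observer callback): + + @code{.cpp} + tv.for_each_predecessor([](tf::TaskView p){ + std::cout << "predecessor " << p.name() << '\n'; + }); + @endcode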
*/ template - void for_each_dependent(V&& visitor) const; + void for_each_predecessor(V&& visitor) const; /** @brief queries the task type @@ -686,19 +1206,19 @@ inline const std::string& TaskView::name() const { return _node._name; } -// Function: num_dependents -inline size_t TaskView::num_dependents() const { - return _node.num_dependents(); +// Function: num_predecessors +inline size_t TaskView::num_predecessors() const { + return _node.num_predecessors(); } -// Function: num_strong_dependents -inline size_t TaskView::num_strong_dependents() const { - return _node.num_strong_dependents(); +// Function: num_strong_dependencies +inline size_t TaskView::num_strong_dependencies() const { + return _node.num_strong_dependencies(); } -// Function: num_weak_dependents -inline size_t TaskView::num_weak_dependents() const { - return _node.num_weak_dependents(); +// Function: num_weak_dependencies +inline size_t TaskView::num_weak_dependencies() const { + return _node.num_weak_dependencies(); } // Function: num_successors @@ -711,6 +1231,7 @@ inline TaskType TaskView::type() const { switch(_node._handle.index()) { case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; case Node::STATIC: return TaskType::STATIC; + case Node::RUNTIME: return TaskType::RUNTIME; case Node::SUBFLOW: return TaskType::SUBFLOW; case Node::CONDITION: return TaskType::CONDITION; case Node::MULTI_CONDITION: return TaskType::CONDITION; @@ -729,17 +1250,23 @@ inline size_t TaskView::hash_value() const { // Function: for_each_successor template void TaskView::for_each_successor(V&& visitor) const { - for(size_t i=0; i<_node._successors.size(); ++i) { - visitor(TaskView(*_node._successors[i])); + for(size_t i=0; i<_node._num_successors; ++i) { + visitor(TaskView(*_node._edges[i])); } + //for(size_t i=0; i<_node._successors.size(); ++i) { + // visitor(TaskView(*_node._successors[i])); + //} } -// Function: for_each_dependent +// Function: for_each_predecessor template -void TaskView::for_each_dependent(V&& visitor) const { - for(size_t i=0; i<_node._dependents.size(); ++i) { - visitor(TaskView(*_node._dependents[i])); +void TaskView::for_each_predecessor(V&& visitor) const { + for(size_t i=_node._num_successors; i<_node._edges.size(); ++i) { + visitor(TaskView(*_node._edges[i])); } + //for(size_t i=0; i<_node._predecessors.size(); ++i) { + // visitor(TaskView(*_node._predecessors[i])); + //} } } // end of namespace tf. ---------------------------------------------------- diff --git a/taskflow/core/taskflow.hpp b/taskflow/core/taskflow.hpp index f6a0f424a..19ffee2a8 100644 --- a/taskflow/core/taskflow.hpp +++ b/taskflow/core/taskflow.hpp @@ -69,6 +69,7 @@ class Taskflow : public FlowBuilder { friend class Topology; friend class Executor; friend class FlowBuilder; + friend class Subflow; struct Dumper { size_t id; @@ -105,8 +106,8 @@ class Taskflow : public FlowBuilder { assert(taskflow2.empty()); @endcode - Notice that @c taskflow2 should not be running in an executor - during the move operation, or the behavior is undefined. + @attention You should avoid moving a taskflow that is currently running on an executor. + Doing so results in undefined behavior. */ Taskflow(Taskflow&& rhs); @@ -122,8 +123,8 @@ class Taskflow : public FlowBuilder { assert(taskflow2.empty()); @endcode - Notice that both @c taskflow1 and @c taskflow2 should not be running - in an executor during the move operation, or the behavior is undefined. + @attention You should avoid moving a taskflow that is currently running on an executor. 
+ Doing so results in undefined behavior. */ Taskflow& operator = (Taskflow&& rhs); @@ -191,32 +192,62 @@ class Taskflow : public FlowBuilder { std::string dump() const; /** - @brief queries the number of tasks + @brief queries the number of tasks in this taskflow + + The number of tasks in this taskflow is defined at the first level of hierarchy. + Tasks that are created dynamically, such as those via tf::Subflow, are not counted. + + @code{.cpp} + tf::Taskflow taskflow; + auto my_task = taskflow.emplace([](){}); + assert(taskflow.num_tasks() == 1); + + // reassign my_task to a subflow of four tasks + my_task.work([](tf::Subflow& sf){ + sf.emplace( + [](){ std::cout << "Task A\n"; }, + [](){ std::cout << "Task B\n"; }, + [](){ std::cout << "Task C\n"; }, + [](){ std::cout << "Task D\n"; } + ); + }); + + // subflow tasks will not be counted + assert(taskflow.num_tasks() == 1); + @endcode */ size_t num_tasks() const; /** - @brief queries the emptiness of the taskflow + @brief queries if this taskflow is empty (has no tasks) - An empty taskflow has no tasks. That is the return of - tf::Taskflow::num_tasks is zero. + An empty taskflow has no tasks, i.e., the return of tf::Taskflow::num_tasks is `0`. + + @code{.cpp} + tf::Taskflow taskflow; + assert(taskflow.empty() == true); + taskflow.emplace([](){}); + assert(taskflow.empty() == false); + @endcode */ bool empty() const; /** - @brief assigns a name to the taskflow + @brief assigns a new name to this taskflow @code{.cpp} - taskflow.name("assign another name"); + taskflow.name("foo"); + assert(taskflow.name() == "foo"); @endcode */ void name(const std::string&); /** - @brief queries the name of the taskflow + @brief queries the name of this taskflow @code{.cpp} - std::cout << "my name is: " << taskflow.name(); + tf::Taskflow taskflow("foo"); + assert(taskflow.name() == "foo"); @endcode */ const std::string& name() const; @@ -231,7 +262,7 @@ class Taskflow : public FlowBuilder { void clear(); /** - @brief applies a visitor to each task in the taskflow + @brief applies a visitor to each task in this taskflow A visitor is a callable that takes an argument of type tf::Task and returns nothing. The following example iterates each task in a @@ -251,7 +282,11 @@ class Taskflow : public FlowBuilder { @param from from task (dependent) @param to to task (successor) - + + Removing the dependency from task `from` to task `to` is equivalent to + removing `to` from the successor list of `from` and + removing `from` from the predecessor list of `to`. + @code{.cpp} tf::Taskflow taskflow; auto a = taskflow.placeholder().name("a"); auto b = taskflow.placeholder().name("b"); auto c = taskflow.placeholder().name("c"); auto d = taskflow.placeholder().name("d"); a.precede(b, c, d); assert(a.num_successors() == 3); - assert(b.num_dependents() == 1); - assert(c.num_dependents() == 1); - assert(d.num_dependents() == 1); + assert(b.num_predecessors() == 1); + assert(c.num_predecessors() == 1); + assert(d.num_predecessors() == 1); taskflow.remove_dependency(a, b); assert(a.num_successors() == 2); - assert(b.num_dependents() == 0); + assert(b.num_predecessors() == 0); @endcode + + @attention For performance reasons, %Taskflow does not store the graph using linked lists but + vectors with contiguous space. + Therefore, removing tasks or dependencies incurs linear time complexity proportional + to the size of the graph and the dependency count of a task.
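+ + Given the task-level API introduced in this patch, the call above is equivalent to the following sketch (illustrative): + + @code{.cpp} + a.remove_successors(b);  // removes the a->b link from both endpoints, like remove_dependency(a, b) + @endcode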
*/ - inline void remove_dependency(Task from, Task to); + void remove_dependency(Task from, Task to); /** @brief returns a reference to the underlying graph object - A graph object (of type tf::Graph) is the ultimate storage for the - task dependency graph and should only be used as an opaque - data structure to interact with the executor (e.g., composition). + A graph object is of type tf::Graph and stores a task dependency graph that can be executed + by a tf::Executor. + */ Graph& graph(); @@ -335,7 +375,7 @@ inline Taskflow& Taskflow::operator = (Taskflow&& rhs) { // Procedure: inline void Taskflow::clear() { - _graph._clear(); + _graph.clear(); } // Function: num_tasks @@ -366,24 +406,18 @@ inline Graph& Taskflow::graph() { // Function: for_each_task template void Taskflow::for_each_task(V&& visitor) const { - for(size_t i=0; i<_graph._nodes.size(); ++i) { - visitor(Task(_graph._nodes[i])); + for(auto itr = _graph.begin(); itr != _graph.end(); ++itr) { + visitor(Task(itr->get())); } } // Procedure: remove_dependency inline void Taskflow::remove_dependency(Task from, Task to) { - from._node->_successors.erase(std::remove_if( - from._node->_successors.begin(), from._node->_successors.end(), [&](Node* i){ - return i == to._node; - } - ), from._node->_successors.end()); - - to._node->_dependents.erase(std::remove_if( - to._node->_dependents.begin(), to._node->_dependents.end(), [&](Node* i){ - return i == from._node; - } - ), to._node->_dependents.end()); + // remove "to" from the successor list of "from" + from._node->_remove_successors(to._node); + + // remove "from" from the predecessor list of "to" + to._node->_remove_predecessors(from._node); } // Procedure: dump @@ -439,12 +473,13 @@ inline void Taskflow::_dump( std::ostream& os, const Node* node, Dumper& dumper ) const { + // label of the node os << 'p' << node << "[label=\""; if(node->_name.empty()) os << 'p' << node; else os << node->_name; os << "\" "; - // shape for node + // shape of the node switch(node->_handle.index()) { case Node::CONDITION: @@ -458,21 +493,21 @@ inline void Taskflow::_dump( os << "];\n"; - for(size_t s=0; s_successors.size(); ++s) { + for(size_t s=0; s_num_successors; ++s) { if(node->_is_conditioner()) { // case edge is dashed - os << 'p' << node << " -> p" << node->_successors[s] << " [style=dashed label=\"" << s << "\"];\n"; } else { - os << 'p' << node << " -> p" << node->_successors[s] << ";\n"; + os << 'p' << node << " -> p" << node->_edges[s] << ";\n"; } } // subflow join node if(node->_parent && node->_parent->_handle.index() == Node::SUBFLOW && - node->_successors.size() == 0 + node->_num_successors == 0 ) { - os << 'p' << node << " -> p" << node->_parent << ";\n"; + os << 'p' << node << " -> p" << node->_parent << " [style=dashed color=blue];\n"; } // node info @@ -502,7 +537,9 @@ inline void Taskflow::_dump( std::ostream& os, const Graph* graph, Dumper& dumper ) const { - for(const auto& n : graph->_nodes) { + for(auto itr = graph->begin(); itr != graph->end(); ++itr) { + + Node* n = itr->get(); // regular task if(n->_handle.index() != Node::MODULE) { @@ -524,8 +561,9 @@ inline void Taskflow::_dump( os << " [m" << dumper.visited[module] << "]\"];\n"; - for(const auto s : n->_successors) { - os << 'p' << n << "->" << 'p' << s << ";\n"; + //for(const auto s : n->_successors) { + for(size_t i=0; i_num_successors; ++i) { + os << 'p' << n << "->" << 'p' << n->_edges[i] << ";\n"; } } } @@ -541,7 +579,7 @@ inline void Taskflow::_dump( @brief class to
access the result of an execution tf::Future is a derived class from std::future that will eventually hold the -execution result of a submitted taskflow (tf::Executor::run) +execution result of a submitted taskflow (tf::Executor::run series). In addition to the base methods inherited from std::future, you can call tf::Future::cancel to cancel the execution of the running taskflow associated with this future object. @@ -609,10 +647,30 @@ class Future : public std::future { @return @c true if the execution can be cancelled or @c false if the execution has already completed - When you request a cancellation, the executor will stop scheduling - any tasks onwards. Tasks that are already running will continue to finish - (non-preemptive). + When you request a cancellation, the executor will stop scheduling any tasks onwards. + Tasks that are already running will continue to finish as their executions are non-preemptive. You can call tf::Future::wait to wait for the cancellation to complete. + + @code{.cpp} + // create a taskflow of four tasks and submit it to an executor + taskflow.emplace( + [](){ std::cout << "Task A\n"; }, + [](){ std::cout << "Task B\n"; }, + [](){ std::cout << "Task C\n"; }, + [](){ std::cout << "Task D\n"; } + ); + auto future = executor.run(taskflow); + + // cancel the execution of the taskflow and wait until it finishes all running tasks + future.cancel(); + future.wait(); + @endcode + + In the above example, we submit a taskflow of four tasks to the executor and then + issue a cancellation to stop its execution. + Since the cancellation request races with the executor runtime, + some of the remaining tasks may still complete their execution, or none at all. + */ bool cancel(); @@ -633,7 +691,7 @@ Future::Future(std::future&& f, std::weak_ptr p) : template bool Future::cancel() { if(auto ptr = _topology.lock(); ptr) { - ptr->_state.fetch_or(Topology::CANCELLED, std::memory_order_relaxed); + ptr->_estate.fetch_or(ESTATE::CANCELLED, std::memory_order_relaxed); return true; } return false; diff --git a/taskflow/core/topology.hpp b/taskflow/core/topology.hpp index 335ccfb80..354b72c69 100644 --- a/taskflow/core/topology.hpp +++ b/taskflow/core/topology.hpp @@ -12,16 +12,13 @@ class TopologyBase { class Topology { friend class Executor; + friend class Subflow; friend class Runtime; friend class Node; template friend class Future; - constexpr static int CLEAN = 0; - constexpr static int CANCELLED = 1; - constexpr static int EXCEPTION = 2; - public: template @@ -34,14 +31,12 @@ class Topology { Taskflow& _taskflow; std::promise _promise; - - SmallVector _sources; - + std::function _pred; std::function _call; std::atomic _join_counter {0}; - std::atomic _state {CLEAN}; + std::atomic _estate {ESTATE::NONE}; std::exception_ptr _exception_ptr {nullptr}; @@ -70,7 +65,7 @@ inline void Topology::_carry_out_promise() { // Function: cancelled inline bool Topology::cancelled() const { - return _state.load(std::memory_order_relaxed) & CANCELLED; + return _estate.load(std::memory_order_relaxed) & ESTATE::CANCELLED; } } // end of namespace tf.
---------------------------------------------------- diff --git a/taskflow/core/tsq.hpp b/taskflow/core/tsq.hpp index e4ea76c28..220be052e 100644 --- a/taskflow/core/tsq.hpp +++ b/taskflow/core/tsq.hpp @@ -8,36 +8,27 @@ @brief task queue include file */ -namespace tf { - - -// ---------------------------------------------------------------------------- -// Task Types -// ---------------------------------------------------------------------------- - -/** -@enum TaskPriority - -@brief enumeration of all task priority values - -A priority is an enumerated value of type @c unsigned. -Currently, %Taskflow defines three priority levels, -@c HIGH, @c NORMAL, and @c LOW, starting from 0, 1, to 2. -That is, the lower the value, the higher the priority. - -*/ -enum class TaskPriority : unsigned { - /** @brief value of the highest priority (i.e., 0) */ - HIGH = 0, - /** @brief value of the normal priority (i.e., 1) */ - NORMAL = 1, - /** @brief value of the lowest priority (i.e., 2) */ - LOW = 2, - /** @brief conventional value for iterating priority values */ - MAX = 3 -}; - +#ifndef TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE + /** + @def TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE + + This macro defines the default size of the bounded task queue in Log2. + The bounded task queue is used by each worker. + */ + #define TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE 8 +#endif + +#ifndef TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE + /** + @def TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE + + This macro defines the default size of the unbounded task queue in Log2. + The unbounded task queue is used by the executor. + */ + #define TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE 10 +#endif +namespace tf { // ---------------------------------------------------------------------------- // Task Queue // ---------------------------------------------------------------------------- /** -@class: TaskQueue +@class: UnboundedTaskQueue @tparam T data type (must be a pointer type) -@tparam TF_MAX_PRIORITY maximum level of the priority -@brief class to create a lock-free unbounded single-producer multiple-consumer queue +@brief class to create a lock-free unbounded work-stealing queue This class implements the work-stealing queue described in the paper, -Correct and Efficient Work-Stealing for Weak Memory Models, -and extends it to include priority. +Correct and Efficient Work-Stealing for Weak Memory Models. Only the queue owner can perform pop and push operations, while others can steal data from the queue simultaneously. -Priority starts from zero (highest priority) to the template value -`TF_MAX_PRIORITY-1` (lowest priority). -All operations are associated with priority values to indicate -the corresponding queues to which an operation is applied. - -The default template value, `TF_MAX_PRIORITY`, is `TaskPriority::MAX` -which applies only three priority levels to the task queue. - -@code{.cpp} -auto [A, B, C, D, E] = taskflow.emplace( - [] () { }, - [&] () { - std::cout << "Task B: " << counter++ << '\n'; // 0 - }, - [&] () { - std::cout << "Task C: " << counter++ << '\n'; // 2 - }, - [&] () { - std::cout << "Task D: " << counter++ << '\n'; // 1 - }, - [] () { } -); - -A.precede(B, C, D); -E.succeed(B, C, D); - -B.priority(tf::TaskPriority::HIGH); -C.priority(tf::TaskPriority::LOW); -D.priority(tf::TaskPriority::NORMAL); - -executor.run(taskflow).wait(); -@endcode - -In the above example, we have a task graph of five tasks, -@c A, @c B, @c C, @c D, and @c E, in which @c B, @c C, and @c D -can run in simultaneously when @c A finishes.
-Since we only uses one worker thread in the executor, -we can deterministically run @c B first, then @c D, and @c C -in order of their priority values. -The output is as follows: - -@code{.shell-session} -Task B: 0 -Task D: 1 -Task C: 2 -@endcode */ -template (TaskPriority::MAX)> -class TaskQueue { +template +class UnboundedTaskQueue { - static_assert(TF_MAX_PRIORITY > 0, "TF_MAX_PRIORITY must be at least one"); static_assert(std::is_pointer_v, "T must be a pointer type"); struct Array { @@ -152,206 +94,148 @@ class TaskQueue { // Doubling the alignment by 2 seems to generate the most // decent performance. - CachelineAligned> _top[TF_MAX_PRIORITY]; - CachelineAligned> _bottom[TF_MAX_PRIORITY]; - std::atomic _array[TF_MAX_PRIORITY]; - std::vector _garbage[TF_MAX_PRIORITY]; - - //std::atomic _cache {nullptr}; + alignas(2*TF_CACHELINE_SIZE) std::atomic _top; + alignas(2*TF_CACHELINE_SIZE) std::atomic _bottom; + std::atomic _array; + std::vector _garbage; public: - /** - @brief constructs the queue with a given capacity - - @param capacity the capacity of the queue (must be power of 2) - */ - explicit TaskQueue(int64_t capacity = 512); - - /** - @brief destructs the queue - */ - ~TaskQueue(); - - /** - @brief queries if the queue is empty at the time of this call - */ - bool empty() const noexcept; - - /** - @brief queries if the queue is empty at a specific priority value - */ - bool empty(unsigned priority) const noexcept; - - /** - @brief queries the number of items at the time of this call - */ - size_t size() const noexcept; - - /** - @brief queries the number of items with the given priority - at the time of this call - */ - size_t size(unsigned priority) const noexcept; - - /** - @brief queries the capacity of the queue - */ - int64_t capacity() const noexcept; - - /** - @brief queries the capacity of the queue at a specific priority value - */ - int64_t capacity(unsigned priority) const noexcept; - - /** - @brief inserts an item to the queue - - @param item the item to push to the queue - @param priority priority value of the item to push (default = 0) - - Only the owner thread can insert an item to the queue. - The operation can trigger the queue to resize its capacity - if more space is required. - */ - TF_FORCE_INLINE void push(T item, unsigned priority); + /** + @brief constructs the queue with the given size in the base-2 logarithm - /** - @brief pops out an item from the queue + @param LogSize the base-2 logarithm of the queue size + */ + explicit UnboundedTaskQueue(int64_t LogSize = TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE); - Only the owner thread can pop out an item from the queue. - The return can be a @c nullptr if this operation failed (empty queue). - */ - T pop(); + /** + @brief destructs the queue + */ + ~UnboundedTaskQueue(); - /** - @brief pops out an item with a specific priority value from the queue + /** + @brief queries if the queue is empty at the time of this call + */ + bool empty() const noexcept; - @param priority priority of the item to pop + /** + @brief queries the number of items at the time of this call + */ + size_t size() const noexcept; - Only the owner thread can pop out an item from the queue. - The return can be a @c nullptr if this operation failed (empty queue). - */ - TF_FORCE_INLINE T pop(unsigned priority); - - /** - @brief steals an item from the queue - - Any threads can try to steal an item from the queue. - The return can be a @c nullptr if this operation failed (not necessary empty). 
- */ - T steal(); - - /** - @brief steals an item with a specific priority value from the queue + /** + @brief queries the capacity of the queue + */ + int64_t capacity() const noexcept; + + /** + @brief inserts an item to the queue - @param priority priority of the item to steal + @param item the item to push to the queue + + Only the owner thread can insert an item to the queue. + The operation can trigger the queue to resize its capacity + if more space is required. + */ + void push(T item); + + /** + @brief pops out an item from the queue + + Only the owner thread can pop out an item from the queue. + The return can be a @c nullptr if this operation failed (empty queue). + */ + T pop(); + + /** + @brief steals an item from the queue + + Any threads can try to steal an item from the queue. + The return can be a @c nullptr if this operation failed (not necessarily empty). + */ + T steal(); + + /** + @brief attempts to steal a task with a hint mechanism + + @param num_empty_steals a reference to a counter tracking consecutive empty steal attempts + + This function tries to steal a task from the queue. If the steal attempt + is successful, the stolen task is returned. + Additionally, if the queue is empty, the provided counter `num_empty_steals` is incremented; + otherwise, `num_empty_steals` is reset to zero. - Any threads can try to steal an item from the queue. - The return can be a @c nullptr if this operation failed (not necessary empty). - */ - T steal(unsigned priority); + */ + T steal_with_hint(size_t& num_empty_steals); private: - TF_NO_INLINE Array* resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t); + + Array* resize_array(Array* a, int64_t b, int64_t t); }; // Constructor -template -TaskQueue::TaskQueue(int64_t c) { - assert(c && (!(c & (c-1)))); - unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){ - _top[p].data.store(0, std::memory_order_relaxed); - _bottom[p].data.store(0, std::memory_order_relaxed); - _array[p].store(new Array{c}, std::memory_order_relaxed); - _garbage[p].reserve(32); - }); +template +UnboundedTaskQueue::UnboundedTaskQueue(int64_t LogSize) { + _top.store(0, std::memory_order_relaxed); + _bottom.store(0, std::memory_order_relaxed); + _array.store(new Array{(int64_t{1} << LogSize)}, std::memory_order_relaxed); + _garbage.reserve(32); } // Destructor -template -TaskQueue::~TaskQueue() { - unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){ - for(auto a : _garbage[p]) { - delete a; - } - delete _array[p].load(); - }); -} - -// Function: empty -template -bool TaskQueue::empty() const noexcept { - for(unsigned i=0; i +UnboundedTaskQueue::~UnboundedTaskQueue() { + for(auto a : _garbage) { + delete a; + } + delete _array.load(); } // Function: empty -template -bool TaskQueue::empty(unsigned p) const noexcept { - int64_t b = _bottom[p].data.load(std::memory_order_relaxed); - int64_t t = _top[p].data.load(std::memory_order_relaxed); +template +bool UnboundedTaskQueue::empty() const noexcept { + int64_t t = _top.load(std::memory_order_relaxed); + int64_t b = _bottom.load(std::memory_order_relaxed); return (b <= t); } // Function: size -template -size_t TaskQueue::size() const noexcept { - size_t s; - unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { s = i ?
size(i) + s : size(i); }); - return s; -} - -// Function: size -template -size_t TaskQueue::size(unsigned p) const noexcept { - int64_t b = _bottom[p].data.load(std::memory_order_relaxed); - int64_t t = _top[p].data.load(std::memory_order_relaxed); +template +size_t UnboundedTaskQueue::size() const noexcept { + int64_t t = _top.load(std::memory_order_relaxed); + int64_t b = _bottom.load(std::memory_order_relaxed); return static_cast(b >= t ? b - t : 0); } // Function: push -template -TF_FORCE_INLINE void TaskQueue::push(T o, unsigned p) { +template +void UnboundedTaskQueue::push(T o) { - int64_t b = _bottom[p].data.load(std::memory_order_relaxed); - int64_t t = _top[p].data.load(std::memory_order_acquire); - Array* a = _array[p].load(std::memory_order_relaxed); + int64_t b = _bottom.load(std::memory_order_relaxed); + int64_t t = _top.load(std::memory_order_acquire); + Array* a = _array.load(std::memory_order_relaxed); - // queue is full - if(a->capacity() - 1 < (b - t)) { - a = resize_array(a, p, b, t); + // queue is full with one additional item (b-t+1) + if TF_UNLIKELY(a->capacity() - 1 < (b - t)) { + a = resize_array(a, b, t); } a->push(b, o); std::atomic_thread_fence(std::memory_order_release); - _bottom[p].data.store(b + 1, std::memory_order_relaxed); -} -// Function: pop -template -T TaskQueue::pop() { - for(unsigned i=0; i -TF_FORCE_INLINE T TaskQueue::pop(unsigned p) { +template +T UnboundedTaskQueue::pop() { - int64_t b = _bottom[p].data.load(std::memory_order_relaxed) - 1; - Array* a = _array[p].load(std::memory_order_relaxed); - _bottom[p].data.store(b, std::memory_order_relaxed); + int64_t b = _bottom.load(std::memory_order_relaxed) - 1; + Array* a = _array.load(std::memory_order_relaxed); + _bottom.store(b, std::memory_order_relaxed); std::atomic_thread_fence(std::memory_order_seq_cst); - int64_t t = _top[p].data.load(std::memory_order_relaxed); + int64_t t = _top.load(std::memory_order_relaxed); T item {nullptr}; @@ -359,83 +243,569 @@ TF_FORCE_INLINE T TaskQueue::pop(unsigned p) { item = a->pop(b); if(t == b) { // the last item just got stolen - if(!_top[p].data.compare_exchange_strong(t, t+1, + if(!_top.compare_exchange_strong(t, t+1, std::memory_order_seq_cst, std::memory_order_relaxed)) { item = nullptr; } - _bottom[p].data.store(b + 1, std::memory_order_relaxed); + _bottom.store(b + 1, std::memory_order_relaxed); } } else { - _bottom[p].data.store(b + 1, std::memory_order_relaxed); + _bottom.store(b + 1, std::memory_order_relaxed); } return item; } // Function: steal -template -T TaskQueue::steal() { - for(unsigned i=0; i +T UnboundedTaskQueue::steal() { + + int64_t t = _top.load(std::memory_order_acquire); + std::atomic_thread_fence(std::memory_order_seq_cst); + int64_t b = _bottom.load(std::memory_order_acquire); + + T item {nullptr}; + + if(t < b) { + Array* a = _array.load(std::memory_order_consume); + item = a->pop(t); + if(!_top.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + return nullptr; } } - return nullptr; + + return item; } // Function: steal -template -T TaskQueue::steal(unsigned p) { +template +T UnboundedTaskQueue::steal_with_hint(size_t& num_empty_steals) { - int64_t t = _top[p].data.load(std::memory_order_acquire); + int64_t t = _top.load(std::memory_order_acquire); std::atomic_thread_fence(std::memory_order_seq_cst); - int64_t b = _bottom[p].data.load(std::memory_order_acquire); + int64_t b = _bottom.load(std::memory_order_acquire); T item {nullptr}; if(t < b) { - Array* a = 
_array[p].load(std::memory_order_consume); + num_empty_steals = 0; + Array* a = _array.load(std::memory_order_consume); item = a->pop(t); - if(!_top[p].data.compare_exchange_strong(t, t+1, - std::memory_order_seq_cst, - std::memory_order_relaxed)) { + if(!_top.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { return nullptr; } } - + else { + ++num_empty_steals; + } return item; } // Function: capacity -template -int64_t TaskQueue::capacity() const noexcept { - size_t s; - unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { - s = i ? capacity(i) + s : capacity(i); - }); - return s; +template +int64_t UnboundedTaskQueue::capacity() const noexcept { + return _array.load(std::memory_order_relaxed)->capacity(); } -// Function: capacity -template -int64_t TaskQueue::capacity(unsigned p) const noexcept { - return _array[p].load(std::memory_order_relaxed)->capacity(); -} - -template -TF_NO_INLINE typename TaskQueue::Array* - TaskQueue::resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t) { +template +typename UnboundedTaskQueue::Array* +UnboundedTaskQueue::resize_array(Array* a, int64_t b, int64_t t) { + + //Array* tmp = a->resize(b, t); + //_garbage.push_back(a); + //std::swap(a, tmp); + //_array.store(a, std::memory_order_release); + //// Note: the original paper using relaxed causes t-san to complain + ////_array.store(a, std::memory_order_relaxed); + //return a; + Array* tmp = a->resize(b, t); - _garbage[p].push_back(a); - std::swap(a, tmp); - _array[p].store(a, std::memory_order_release); + _garbage.push_back(a); + _array.store(tmp, std::memory_order_release); // Note: the original paper using relaxed causes t-san to complain //_array.store(a, std::memory_order_relaxed); - return a; + return tmp; } +// ---------------------------------------------------------------------------- +// BoundedTaskQueue +// ---------------------------------------------------------------------------- + +/** +@class: BoundedTaskQueue + +@tparam T data type +@tparam LogSize the base-2 logarithm of the queue size + +@brief class to create a lock-free bounded work-stealing queue + +This class implements the work-stealing queue described in the paper, +"Correct and Efficient Work-Stealing for Weak Memory Models," +available at https://www.di.ens.fr/~zappa/readings/ppopp13.pdf. + +Only the queue owner can perform pop and push operations, +while others can steal data from the queue. 
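Because the ring buffer has a fixed power-of-two capacity, insertion can fail and the owner must handle overflow. A minimal sketch of the owner-side protocol (the @c overflow fallback queue is hypothetical):

@code{.cpp}
tf::BoundedTaskQueue<Node*, 8> queue;  // 2^8 = 256 slots

// owner thread: fast path first, fall back when the queue is full
if(!queue.try_push(node)) {
  overflow.push(node);                 // hypothetical unbounded fallback
}

// equivalently, supply the fallback as an on-full callback
queue.push(node, [&](){ overflow.push(node); });

// thief threads: steal with a hint counter usable for back-off decisions
size_t num_empty_steals = 0;
if(Node* task = queue.steal_with_hint(num_empty_steals)) {
  // ... run stolen task
}
@endcode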
+*/ +template +class BoundedTaskQueue { + + static_assert(std::is_pointer_v, "T must be a pointer type"); + + constexpr static int64_t BufferSize = int64_t{1} << LogSize; + constexpr static int64_t BufferMask = (BufferSize - 1); + + static_assert((BufferSize >= 2) && ((BufferSize & (BufferSize - 1)) == 0)); + + alignas(2*TF_CACHELINE_SIZE) std::atomic _top {0}; + alignas(2*TF_CACHELINE_SIZE) std::atomic _bottom {0}; + alignas(2*TF_CACHELINE_SIZE) std::atomic _buffer[BufferSize]; + + public: + + /** + @brief constructs the queue with a given capacity + */ + BoundedTaskQueue() = default; + + /** + @brief destructs the queue + */ + ~BoundedTaskQueue() = default; + + /** + @brief queries if the queue is empty at the time of this call + */ + bool empty() const noexcept; + + /** + @brief queries the number of items at the time of this call + */ + size_t size() const noexcept; + + /** + @brief queries the capacity of the queue + */ + constexpr size_t capacity() const; + + /** + @brief tries to insert an item to the queue + + @tparam O data type + @param item the item to perfect-forward to the queue + @return `true` if the insertion succeed or `false` (queue is full) + + Only the owner thread can insert an item to the queue. + + */ + template + bool try_push(O&& item); + + /** + @brief tries to insert an item to the queue or invoke the callable if fails + + @tparam O data type + @tparam C callable type + @param item the item to perfect-forward to the queue + @param on_full callable to invoke when the queue is full (insertion fails) + + Only the owner thread can insert an item to the queue. + + */ + template + void push(O&& item, C&& on_full); + + /** + @brief pops out an item from the queue + + Only the owner thread can pop out an item from the queue. + The return can be a `nullptr` if this operation failed (empty queue). + */ + T pop(); + + /** + @brief steals an item from the queue + + Any threads can try to steal an item from the queue. + The return can be a `nullptr` if this operation failed (not necessary empty). + */ + T steal(); + + /** + @brief attempts to steal a task with a hint mechanism + + @param num_empty_steals a reference to a counter tracking consecutive empty steal attempts + + This function tries to steal a task from the queue. If the steal attempt + is successful, the stolen task is returned. + Additionally, if the queue is empty, the provided counter `num_empty_steals` is incremented; + otherwise, `num_empty_steals` is reset to zero. + */ + T steal_with_hint(size_t& num_empty_steals); +}; + +// Function: empty +template +bool BoundedTaskQueue::empty() const noexcept { + int64_t t = _top.load(std::memory_order_relaxed); + int64_t b = _bottom.load(std::memory_order_relaxed); + return b <= t; +} + +// Function: size +template +size_t BoundedTaskQueue::size() const noexcept { + int64_t t = _top.load(std::memory_order_relaxed); + int64_t b = _bottom.load(std::memory_order_relaxed); + return static_cast(b >= t ? 
b - t : 0); +} + +// Function: try_push +template +template +bool BoundedTaskQueue::try_push(O&& o) { + + int64_t b = _bottom.load(std::memory_order_relaxed); + int64_t t = _top.load(std::memory_order_acquire); + + // queue is full with one additional item (b-t+1) + if TF_UNLIKELY((b - t) > BufferSize - 1) { + return false; + } + + _buffer[b & BufferMask].store(std::forward(o), std::memory_order_relaxed); + + std::atomic_thread_fence(std::memory_order_release); + + // original paper uses relaxed here but tsa complains + _bottom.store(b + 1, std::memory_order_release); + + return true; +} + +// Function: push +template +template +void BoundedTaskQueue::push(O&& o, C&& on_full) { + + int64_t b = _bottom.load(std::memory_order_relaxed); + int64_t t = _top.load(std::memory_order_acquire); + + // queue is full with one additional item (b-t+1) + if TF_UNLIKELY((b - t) > BufferSize - 1) { + on_full(); + return; + } + + _buffer[b & BufferMask].store(std::forward(o), std::memory_order_relaxed); + + std::atomic_thread_fence(std::memory_order_release); + + // original paper uses relaxed here but tsa complains + _bottom.store(b + 1, std::memory_order_release); +} + +// Function: pop +template +T BoundedTaskQueue::pop() { + + int64_t b = _bottom.load(std::memory_order_relaxed) - 1; + _bottom.store(b, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + int64_t t = _top.load(std::memory_order_relaxed); + + T item {nullptr}; + + if(t <= b) { + item = _buffer[b & BufferMask].load(std::memory_order_relaxed); + if(t == b) { + // the last item just got stolen + if(!_top.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + item = nullptr; + } + _bottom.store(b + 1, std::memory_order_relaxed); + } + } + else { + _bottom.store(b + 1, std::memory_order_relaxed); + } + + return item; +} + +// Function: steal +template +T BoundedTaskQueue::steal() { + int64_t t = _top.load(std::memory_order_acquire); + std::atomic_thread_fence(std::memory_order_seq_cst); + int64_t b = _bottom.load(std::memory_order_acquire); + + T item{nullptr}; + + if(t < b) { + item = _buffer[t & BufferMask].load(std::memory_order_relaxed); + if(!_top.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + return nullptr; + } + } + + return item; +} + +// Function: steal +template +T BoundedTaskQueue::steal_with_hint(size_t& num_empty_steals) { + int64_t t = _top.load(std::memory_order_acquire); + std::atomic_thread_fence(std::memory_order_seq_cst); + int64_t b = _bottom.load(std::memory_order_acquire); + + T item {nullptr}; + + if(t < b) { + num_empty_steals = 0; + item = _buffer[t & BufferMask].load(std::memory_order_relaxed); + if(!_top.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + return nullptr; + } + } + else { + ++num_empty_steals; + } + return item; +} + +// Function: capacity +template +constexpr size_t BoundedTaskQueue::capacity() const { + return static_cast(BufferSize); +} + + + +//----------------------------------------------------------------------------- + +//template +//class UnboundedTaskQueue2 { +// +// static_assert(std::is_pointer_v, "T must be a pointer type"); +// +// struct Array { +// +// int64_t C; +// int64_t M; +// std::atomic* S; +// +// explicit Array(int64_t c) : +// C {c}, +// M {c-1}, +// S {new std::atomic[static_cast(C)]} { +// } +// +// ~Array() { +// delete [] S; +// } +// +// int64_t capacity() const noexcept { +// return C; +// } +// +// void 
push(int64_t i, T o) noexcept { +// S[i & M].store(o, std::memory_order_relaxed); +// } +// +// T pop(int64_t i) noexcept { +// return S[i & M].load(std::memory_order_relaxed); +// } +// +// Array* resize(int64_t b, int64_t t) { +// Array* ptr = new Array {2*C}; +// for(int64_t i=t; i!=b; ++i) { +// ptr->push(i, pop(i)); +// } +// return ptr; +// } +// +// }; +// +// // Doubling the alignment by 2 seems to generate the most +// // decent performance. +// alignas(2*TF_CACHELINE_SIZE) std::atomic _top; +// alignas(2*TF_CACHELINE_SIZE) std::atomic _bottom; +// std::atomic _array; +// std::vector _garbage; +// +// static constexpr int64_t BOTTOM_LOCK = std::numeric_limits::min(); +// static constexpr int64_t BOTTOM_MASK = std::numeric_limits::max(); +// +// public: +// +// /** +// @brief constructs the queue with the given size in the base-2 logarithm +// +// @param LogSize the base-2 logarithm of the queue size +// */ +// explicit UnboundedTaskQueue2(int64_t LogSize = TF_DEFAULT_UNBOUNDED_TASK_QUEUE_LOG_SIZE); +// +// /** +// @brief destructs the queue +// */ +// ~UnboundedTaskQueue2(); +// +// /** +// @brief queries if the queue is empty at the time of this call +// */ +// bool empty() const noexcept; +// +// /** +// @brief queries the number of items at the time of this call +// */ +// size_t size() const noexcept; +// +// /** +// @brief queries the capacity of the queue +// */ +// int64_t capacity() const noexcept; +// +// /** +// @brief inserts an item to the queue +// +// @param item the item to push to the queue +// +// Only the owner thread can insert an item to the queue. +// The operation can trigger the queue to resize its capacity +// if more space is required. +// */ +// void push(T item); +// +// /** +// @brief steals an item from the queue +// +// Any threads can try to steal an item from the queue. +// The return can be a @c nullptr if this operation failed (not necessary empty). +// */ +// T steal(); +// +// private: +// +// Array* resize_array(Array* a, int64_t b, int64_t t); +//}; +// +//// Constructor +//template +//UnboundedTaskQueue2::UnboundedTaskQueue2(int64_t LogSize) { +// _top.store(0, std::memory_order_relaxed); +// _bottom.store(0, std::memory_order_relaxed); +// _array.store(new Array{(int64_t{1} << LogSize)}, std::memory_order_relaxed); +// _garbage.reserve(32); +//} +// +//// Destructor +//template +//UnboundedTaskQueue2::~UnboundedTaskQueue2() { +// for(auto a : _garbage) { +// delete a; +// } +// delete _array.load(); +//} +// +//// Function: empty +//template +//bool UnboundedTaskQueue2::empty() const noexcept { +// int64_t b = _bottom.load(std::memory_order_relaxed) & BOTTOM_MASK; +// int64_t t = _top.load(std::memory_order_relaxed); +// return (b <= t); +//} +// +//// Function: size +//template +//size_t UnboundedTaskQueue2::size() const noexcept { +// int64_t b = _bottom.load(std::memory_order_relaxed) & BOTTOM_MASK; +// int64_t t = _top.load(std::memory_order_relaxed); +// return static_cast(b >= t ? 
b - t : 0); +//} +// +//// Function: push +//template +//void UnboundedTaskQueue2::push(T o) { +// +// // spin until getting an exclusive access to b +// int64_t b = _bottom.load(std::memory_order_acquire) & BOTTOM_MASK; +// while(!_bottom.compare_exchange_weak(b, b | BOTTOM_LOCK, std::memory_order_acquire, +// std::memory_order_relaxed)) { +// b = b & BOTTOM_MASK; +// } +// +// // critical region +// int64_t t = _top.load(std::memory_order_acquire); +// Array* a = _array.load(std::memory_order_relaxed); +// +// // queue is full +// if TF_UNLIKELY(a->capacity() - 1 < (b - t)) { +// a = resize_array(a, b, t); +// } +// +// a->push(b, o); +// std::atomic_thread_fence(std::memory_order_release); +// +// // original paper uses relaxed here but tsa complains +// _bottom.store(b + 1, std::memory_order_release); +//} +// +//// Function: steal +//template +//T UnboundedTaskQueue2::steal() { +// +// int64_t t = _top.load(std::memory_order_acquire); +// std::atomic_thread_fence(std::memory_order_seq_cst); +// int64_t b = _bottom.load(std::memory_order_acquire) & BOTTOM_MASK; +// +// T item {nullptr}; +// +// if(t < b) { +// Array* a = _array.load(std::memory_order_consume); +// item = a->pop(t); +// if(!_top.compare_exchange_strong(t, t+1, +// std::memory_order_seq_cst, +// std::memory_order_relaxed)) { +// return nullptr; +// } +// } +// +// return item; +//} +// +//// Function: capacity +//template +//int64_t UnboundedTaskQueue2::capacity() const noexcept { +// return _array.load(std::memory_order_relaxed)->capacity(); +//} +// +//template +//typename UnboundedTaskQueue2::Array* +//UnboundedTaskQueue2::resize_array(Array* a, int64_t b, int64_t t) { +// +// Array* tmp = a->resize(b, t); +// _garbage.push_back(a); +// std::swap(a, tmp); +// _array.store(a, std::memory_order_release); +// // Note: the original paper using relaxed causes t-san to complain +// //_array.store(a, std::memory_order_relaxed); +// return a; +//} } // end of namespace tf ----------------------------------------------------- + + + diff --git a/taskflow/core/worker.hpp b/taskflow/core/worker.hpp index 8f86381a8..174a50e6f 100644 --- a/taskflow/core/worker.hpp +++ b/taskflow/core/worker.hpp @@ -2,7 +2,9 @@ #include "declarations.hpp" #include "tsq.hpp" -#include "notifier.hpp" +#include "atomic_notifier.hpp" +#include "nonblocking_notifier.hpp" + /** @file worker.hpp @@ -11,6 +13,28 @@ namespace tf { +// ---------------------------------------------------------------------------- +// Default Notifier +// ---------------------------------------------------------------------------- + + +/** +@private +*/ +#ifdef TF_ENABLE_ATOMIC_NOTIFIER + using DefaultNotifier = AtomicNotifier; +#elif TF_ENABLE_NONBLOCKING_NOTIFIER_V1 + using DefaultNotifier = NonblockingNotifierV1; +#elif TF_ENABLE_NONBLOCKING_NOTIFIER_V2 + using DefaultNotifier = NonblockingNotifierV2; +#else + #if __cplusplus >= TF_CPP20 + using DefaultNotifier = AtomicNotifier; + #else + using DefaultNotifier = NonblockingNotifierV2; + #endif +#endif + // ---------------------------------------------------------------------------- // Class Definition: Worker // ---------------------------------------------------------------------------- @@ -28,6 +52,7 @@ using tf::WorkerInterface. 
class Worker { friend class Executor; + friend class Runtime; friend class WorkerView; public: @@ -41,11 +66,6 @@ class Worker { */ inline size_t id() const { return _id; } - /** - @brief acquires a pointer access to the underlying thread - */ - inline std::thread* thread() const { return _thread; } - /** @brief queries the size of the queue (i.e., number of enqueued tasks to run) associated with the worker @@ -56,47 +76,56 @@ class Worker { @brief queries the current capacity of the queue */ inline size_t queue_capacity() const { return static_cast(_wsq.capacity()); } + + /** + @brief acquires the associated executor + */ + inline Executor* executor() { return _executor; } + + /** + @brief acquires the associated thread + */ + std::thread& thread() { return _thread; } private: + + #if __cplusplus >= TF_CPP20 + std::atomic_flag _done = ATOMIC_FLAG_INIT; + #else + std::atomic _done {false}; + #endif size_t _id; size_t _vtm; - Executor* _executor; - std::thread* _thread; - Notifier::Waiter* _waiter; - std::default_random_engine _rdgen { std::random_device{}() }; - TaskQueue _wsq; - Node* _cache; + Executor* _executor {nullptr}; + DefaultNotifier::Waiter* _waiter; + std::thread _thread; + + std::default_random_engine _rdgen; + //std::uniform_int_distribution _udist; + + BoundedTaskQueue _wsq; + + //TF_FORCE_INLINE size_t _rdvtm() { + // auto r = _udist(_rdgen); + // return r + (r >= _id); + //} + }; + // ---------------------------------------------------------------------------- -// Class Definition: PerThreadWorker +// Per-thread // ---------------------------------------------------------------------------- -/** -@private -*/ -//struct PerThreadWorker { -// -// Worker* worker; -// -// PerThreadWorker() : worker {nullptr} {} -// -// PerThreadWorker(const PerThreadWorker&) = delete; -// PerThreadWorker(PerThreadWorker&&) = delete; -// -// PerThreadWorker& operator = (const PerThreadWorker&) = delete; -// PerThreadWorker& operator = (PerThreadWorker&&) = delete; -//}; +namespace pt { /** @private */ -//inline PerThreadWorker& this_worker() { -// thread_local PerThreadWorker worker; -// return worker; -//} +inline thread_local Worker* this_worker {nullptr}; +} // ---------------------------------------------------------------------------- // Class Definition: WorkerView @@ -105,7 +134,7 @@ class Worker { /** @class WorkerView -@brief class to create an immutable view of a worker in an executor +@brief class to create an immutable view of a worker An executor keeps a set of internal worker threads to run tasks. A worker view provides users an immutable interface to observe @@ -166,7 +195,103 @@ inline size_t WorkerView::queue_capacity() const { return static_cast(_worker._wsq.capacity()); } +// ---------------------------------------------------------------------------- +// Class Definition: WorkerInterface +// ---------------------------------------------------------------------------- + +/** +@class WorkerInterface + +@brief class to configure worker behavior in an executor + +The tf::WorkerInterface class allows users to customize worker properties when creating an executor. +Examples include binding workers to specific CPU cores or +invoking custom methods before and after a worker enters or leaves the work-stealing loop. +When you create an executor, it spawns a set of workers to execute tasks +with the following logic: + +@code{.cpp} +for(size_t n=0; nscheduler_prologue(worker); + + try { + while(1) { + perform_work_stealing_algorithm(); + if(stop) { + break; + } + } + } catch(...) 
{ + exception_ptr = std::current_exception(); + } + + // leaves the scheduling loop and joins this worker thread + // Here, WorkerInterface::scheduler_epilogue is invoked, if any + worker_interface->scheduler_epilogue(worker, exception_ptr); + ); +} +@endcode + +@attention +tf::WorkerInterface::scheduler_prologue and tf::WorkerInterface::scheduler_eiplogue +are invoked by each worker simultaneously. + +*/ +class WorkerInterface { + + public: + + /** + @brief default destructor + */ + virtual ~WorkerInterface() = default; + + /** + @brief method to call before a worker enters the scheduling loop + @param worker a reference to the worker + + The method is called by the constructor of an executor. + */ + virtual void scheduler_prologue(Worker& worker) = 0; + + /** + @brief method to call after a worker leaves the scheduling loop + @param worker a reference to the worker + @param ptr an pointer to the exception thrown by the scheduling loop + + The method is called by the constructor of an executor. + */ + virtual void scheduler_epilogue(Worker& worker, std::exception_ptr ptr) = 0; + +}; + +/** +@brief helper function to create an instance derived from tf::WorkerInterface + +@tparam T type derived from tf::WorkerInterface +@tparam ArgsT argument types to construct @c T + +@param args arguments to forward to the constructor of @c T +*/ +template +std::unique_ptr make_worker_interface(ArgsT&&... args) { + static_assert( + std::is_base_of_v, + "T must be derived from WorkerInterface" + ); + return std::make_unique(std::forward(args)...); +} + -} // end of namespact tf ----------------------------------------------------- + + +} // end of namespact tf ------------------------------------------------------ diff --git a/taskflow/cuda/algorithm/for_each.hpp b/taskflow/cuda/algorithm/for_each.hpp index 38a6f8597..551cca178 100644 --- a/taskflow/cuda/algorithm/for_each.hpp +++ b/taskflow/cuda/algorithm/for_each.hpp @@ -14,12 +14,12 @@ namespace detail { /** @private */ -template +template __global__ void cuda_for_each_kernel(I first, unsigned count, C c) { - auto tid = threadIdx.x; - auto bid = blockIdx.x; - auto tile = cuda_get_tile(bid, nt*vt, count); - cuda_strided_iterate( + auto tid = threadIdx.x; + auto bid = blockIdx.x; + auto tile = cuda_get_tile(bid, E::nv, count); + cuda_strided_iterate( [=](auto, auto j) { c(*(first + tile.begin + j)); }, @@ -28,12 +28,12 @@ __global__ void cuda_for_each_kernel(I first, unsigned count, C c) { } /** @private */ -template +template __global__ void cuda_for_each_index_kernel(I first, I inc, unsigned count, C c) { auto tid = threadIdx.x; auto bid = blockIdx.x; - auto tile = cuda_get_tile(bid, nt*vt, count); - cuda_strided_iterate( + auto tile = cuda_get_tile(bid, E::nv, count); + cuda_strided_iterate( [=]__device__(auto, auto j) { c(first + inc*(tile.begin+j)); }, @@ -43,268 +43,62 @@ __global__ void cuda_for_each_index_kernel(I first, I inc, unsigned count, C c) } // end of namespace detail ------------------------------------------------- -// ---------------------------------------------------------------------------- -// cuda standard algorithms: single_task/for_each/for_each_index -// ---------------------------------------------------------------------------- - -/** -@brief runs a callable asynchronously using one kernel thread - -@tparam P execution policy type -@tparam C closure type - -@param p execution policy -@param c closure to run by one kernel thread - -The function launches a single kernel thread to run the given callable -through the stream in 
the execution policy object. -*/ -template -void cuda_single_task(P&& p, C c) { - cuda_kernel<<<1, 1, 0, p.stream()>>>( - [=]__device__(auto, auto) mutable { c(); } - ); -} - -/** -@brief performs asynchronous parallel iterations over a range of items - -@tparam P execution policy type -@tparam I input iterator type -@tparam C unary operator type - -@param p execution policy object -@param first iterator to the beginning of the range -@param last iterator to the end of the range -@param c unary operator to apply to each dereferenced iterator - -This function is equivalent to a parallel execution of the following loop -on a GPU: - -@code{.cpp} -for(auto itr = first; itr != last; itr++) { - c(*itr); -} -@endcode -*/ -template -void cuda_for_each(P&& p, I first, I last, C c) { - - using E = std::decay_t
<P>
      ; - - unsigned count = std::distance(first, last); - - if(count == 0) { - return; - } - - detail::cuda_for_each_kernel<<>>( - first, count, c - ); -} - -/** -@brief performs asynchronous parallel iterations over - an index-based range of items - -@tparam P execution policy type -@tparam I input index type -@tparam C unary operator type - -@param p execution policy object -@param first index to the beginning of the range -@param last index to the end of the range -@param inc step size between successive iterations -@param c unary operator to apply to each index - -This function is equivalent to a parallel execution of -the following loop on a GPU: - -@code{.cpp} -// step is positive [first, last) -for(auto i=first; ilast; i+=step) { - c(i); -} -@endcode -*/ -template -void cuda_for_each_index(P&& p, I first, I last, I inc, C c) { - - using E = std::decay_t
<P>
      ; - - unsigned count = distance(first, last, inc); - - if(count == 0) { - return; - } - - detail::cuda_for_each_index_kernel<<>>( - first, inc, count, c - ); -} - -// ---------------------------------------------------------------------------- -// single_task -// ---------------------------------------------------------------------------- - -/** @private */ -template -__global__ void cuda_single_task(C callable) { - callable(); -} - -// Function: single_task -template -cudaTask cudaFlow::single_task(C c) { - return kernel(1, 1, 0, cuda_single_task, c); -} - -// Function: single_task -template -void cudaFlow::single_task(cudaTask task, C c) { - return kernel(task, 1, 1, 0, cuda_single_task, c); -} - -// Function: single_task -template -cudaTask cudaFlowCapturer::single_task(C callable) { - return on([=] (cudaStream_t stream) mutable { - cuda_single_task(cudaDefaultExecutionPolicy(stream), callable); - }); -} - -// Function: single_task -template -void cudaFlowCapturer::single_task(cudaTask task, C callable) { - on(task, [=] (cudaStream_t stream) mutable { - cuda_single_task(cudaDefaultExecutionPolicy(stream), callable); - }); -} - // ---------------------------------------------------------------------------- // cudaFlow: for_each, for_each_index // ---------------------------------------------------------------------------- // Function: for_each -template -cudaTask cudaFlow::for_each(I first, I last, C c) { +template +template +cudaTask cudaGraphBase::for_each(I first, I last, C c) { - using E = cudaDefaultExecutionPolicy; - unsigned count = std::distance(first, last); - // TODO: - //if(count == 0) { - // return; - //} - return kernel( E::num_blocks(count), E::nt, 0, - detail::cuda_for_each_kernel, first, count, c + detail::cuda_for_each_kernel, first, count, c ); } // Function: for_each -template -void cudaFlow::for_each(cudaTask task, I first, I last, C c) { - - using E = cudaDefaultExecutionPolicy; +template +template +void cudaGraphExecBase::for_each(cudaTask task, I first, I last, C c) { unsigned count = std::distance(first, last); - // TODO: - //if(count == 0) { - // return; - //} - kernel(task, E::num_blocks(count), E::nt, 0, - detail::cuda_for_each_kernel, first, count, c + detail::cuda_for_each_kernel, first, count, c ); } // Function: for_each_index -template -cudaTask cudaFlow::for_each_index(I first, I last, I inc, C c) { - - using E = cudaDefaultExecutionPolicy; +template +template +cudaTask cudaGraphBase::for_each_index(I first, I last, I inc, C c) { unsigned count = distance(first, last, inc); - // TODO: - //if(count == 0) { - // return; - //} - return kernel( E::num_blocks(count), E::nt, 0, - detail::cuda_for_each_index_kernel, first, inc, count, c + detail::cuda_for_each_index_kernel, first, inc, count, c ); } // Function: for_each_index -template -void cudaFlow::for_each_index(cudaTask task, I first, I last, I inc, C c) { +template +template +void cudaGraphExecBase::for_each_index(cudaTask task, I first, I last, I inc, C c) { - using E = cudaDefaultExecutionPolicy; - unsigned count = distance(first, last, inc); - - // TODO: - //if(count == 0) { - // return; - //} return kernel(task, E::num_blocks(count), E::nt, 0, - detail::cuda_for_each_index_kernel, first, inc, count, c + detail::cuda_for_each_index_kernel, first, inc, count, c ); } -// ---------------------------------------------------------------------------- -// cudaFlowCapturer: for_each, for_each_index -// ---------------------------------------------------------------------------- - -// Function: for_each 
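As a side note to the migration above, the new @c cudaGraphBase::for_each path can be driven end-to-end with the plain CUDA Graph API, mirroring the instantiation calls used elsewhere in this patch. A sketch, in which @c data, @c N, and @c stream are assumed to exist and @c tf::cudaGraph is the alias over @c cudaGraphBase:

@code{.cpp}
tf::cudaGraph cg;

// one kernel task that doubles each element of a device buffer
cg.for_each(data, data + N, [] __device__ (float& x) { x *= 2.0f; });

// instantiate and launch through the native CUDA Graph API
cudaGraphExec_t exec;
cudaGraphInstantiate(&exec, cg.get(), nullptr, nullptr, 0);
cudaGraphLaunch(exec, stream);
cudaStreamSynchronize(stream);
cudaGraphExecDestroy(exec);
@endcode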
-template -cudaTask cudaFlowCapturer::for_each(I first, I last, C c) { - return on([=](cudaStream_t stream) mutable { - cuda_for_each(cudaDefaultExecutionPolicy(stream), first, last, c); - }); -} - -// Function: for_each_index -template -cudaTask cudaFlowCapturer::for_each_index(I beg, I end, I inc, C c) { - return on([=] (cudaStream_t stream) mutable { - cuda_for_each_index(cudaDefaultExecutionPolicy(stream), beg, end, inc, c); - }); -} - -// Function: for_each -template -void cudaFlowCapturer::for_each(cudaTask task, I first, I last, C c) { - on(task, [=](cudaStream_t stream) mutable { - cuda_for_each(cudaDefaultExecutionPolicy(stream), first, last, c); - }); -} - -// Function: for_each_index -template -void cudaFlowCapturer::for_each_index( - cudaTask task, I beg, I end, I inc, C c -) { - on(task, [=] (cudaStream_t stream) mutable { - cuda_for_each_index(cudaDefaultExecutionPolicy(stream), beg, end, inc, c); - }); -} - - } // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/cuda/algorithm/reduce.hpp b/taskflow/cuda/algorithm/reduce.hpp index d6ba33244..5a5de0a80 100644 --- a/taskflow/cuda/algorithm/reduce.hpp +++ b/taskflow/cuda/algorithm/reduce.hpp @@ -17,9 +17,9 @@ namespace tf::detail { template struct cudaBlockReduce { - static const unsigned group_size = std::min(nt, CUDA_WARP_SIZE); - static const unsigned num_passes = log2(group_size); - static const unsigned num_items = nt / group_size; + static constexpr unsigned group_size = (std::min)(nt, CUDA_WARP_SIZE); + static constexpr unsigned num_passes = static_floor_log2(); + static constexpr unsigned num_items = nt / group_size; static_assert( nt && (0 == nt % CUDA_WARP_SIZE), diff --git a/taskflow/cuda/algorithm/scan.hpp b/taskflow/cuda/algorithm/scan.hpp index bce0d6341..223d683cf 100644 --- a/taskflow/cuda/algorithm/scan.hpp +++ b/taskflow/cuda/algorithm/scan.hpp @@ -42,9 +42,9 @@ struct cudaScanResult { template struct cudaBlockScan { - const static unsigned num_warps = nt / CUDA_WARP_SIZE; - const static unsigned num_passes = log2(nt); - const static unsigned capacity = nt + num_warps; + static constexpr unsigned num_warps = nt / CUDA_WARP_SIZE; + static constexpr unsigned num_passes = static_floor_log2(); + static constexpr unsigned capacity = nt + num_warps; /** @private */ union storage_t { diff --git a/taskflow/cuda/algorithm/single_task.hpp b/taskflow/cuda/algorithm/single_task.hpp new file mode 100644 index 000000000..4177ff38e --- /dev/null +++ b/taskflow/cuda/algorithm/single_task.hpp @@ -0,0 +1,36 @@ +#pragma once + +/** +@file taskflow/cuda/algorithm/single_task.hpp +@brief cuda single-task algorithms include file +*/ + +namespace tf { + +/** @private */ +template +__global__ void cuda_single_task(C callable) { + callable(); +} + +// Function: single_task +template +template +cudaTask cudaGraphBase::single_task(C c) { + return kernel(1, 1, 0, cuda_single_task, c); +} + +// Function: single_task +template +template +void cudaGraphExecBase::single_task(cudaTask task, C c) { + return kernel(task, 1, 1, 0, cuda_single_task, c); +} + +} // end of namespace tf ----------------------------------------------------- + + + + + + diff --git a/taskflow/cuda/algorithm/sort.hpp b/taskflow/cuda/algorithm/sort.hpp index 3cc01d5ae..97695f877 100644 --- a/taskflow/cuda/algorithm/sort.hpp +++ b/taskflow/cuda/algorithm/sort.hpp @@ -150,7 +150,7 @@ template struct cudaBlockSort { static constexpr bool has_values = !std::is_same::value; - static constexpr unsigned num_passes = log2(nt); + 
static constexpr unsigned num_passes = static_floor_log2(); /** @private */ union Storage { @@ -226,7 +226,7 @@ void cuda_merge_sort_partitions( unsigned coop, unsigned spacing, C comp, unsigned* buf ) { - // bufer size is num_partitions + 1 + // buffer size is num_partitions + 1 unsigned num_partitions = (count + spacing - 1) / spacing + 1; const unsigned nt = 128; diff --git a/taskflow/cuda/algorithm/transform.hpp b/taskflow/cuda/algorithm/transform.hpp index b1146bdd7..e8fc386e4 100644 --- a/taskflow/cuda/algorithm/transform.hpp +++ b/taskflow/cuda/algorithm/transform.hpp @@ -18,12 +18,12 @@ namespace detail { /** @private */ -template +template __global__ void cuda_transform_kernel(I first, unsigned count, O output, C op) { auto tid = threadIdx.x; auto bid = blockIdx.x; - auto tile = cuda_get_tile(bid, nt*vt, count); - cuda_strided_iterate( + auto tile = cuda_get_tile(bid, E::nv, count); + cuda_strided_iterate( [=]__device__(auto, auto j) { auto offset = j + tile.begin; *(output + offset) = op(*(first+offset)); @@ -36,14 +36,14 @@ __global__ void cuda_transform_kernel(I first, unsigned count, O output, C op) { /** @private */ -template +template __global__ void cuda_transform_kernel( I1 first1, I2 first2, unsigned count, O output, C op ) { auto tid = threadIdx.x; auto bid = blockIdx.x; - auto tile = cuda_get_tile(bid, nt*vt, count); - cuda_strided_iterate( + auto tile = cuda_get_tile(bid, E::nv, count); + cuda_strided_iterate( [=]__device__(auto, auto j) { auto offset = j + tile.begin; *(output + offset) = op(*(first1+offset), *(first2+offset)); @@ -55,224 +55,68 @@ __global__ void cuda_transform_kernel( } // end of namespace detail ------------------------------------------------- -// ---------------------------------------------------------------------------- -// CUDA standard algorithms: transform -// ---------------------------------------------------------------------------- - -/** -@brief performs asynchronous parallel transforms over a range of items - -@tparam P execution policy type -@tparam I input iterator type -@tparam O output iterator type -@tparam C unary operator type - -@param p execution policy -@param first iterator to the beginning of the range -@param last iterator to the end of the range -@param output iterator to the beginning of the output range -@param op unary operator to apply to transform each item - -This method is equivalent to the parallel execution of the following loop on a GPU: - -@code{.cpp} -while (first != last) { - *output++ = op(*first++); -} -@endcode - -*/ -template -void cuda_transform(P&& p, I first, I last, O output, C op) { - - using E = std::decay_t
<P>
      ; - - unsigned count = std::distance(first, last); - - if(count == 0) { - return; - } - - detail::cuda_transform_kernel - <<>> ( - first, count, output, op - ); -} - -/** -@brief performs asynchronous parallel transforms over two ranges of items - -@tparam P execution policy type -@tparam I1 first input iterator type -@tparam I2 second input iterator type -@tparam O output iterator type -@tparam C binary operator type - -@param p execution policy -@param first1 iterator to the beginning of the first range -@param last1 iterator to the end of the first range -@param first2 iterator to the beginning of the second range -@param output iterator to the beginning of the output range -@param op binary operator to apply to transform each pair of items - -This method is equivalent to the parallel execution of the following loop on a GPU: - -@code{.cpp} -while (first1 != last1) { - *output++ = op(*first1++, *first2++); -} -@endcode -*/ -template -void cuda_transform( - P&& p, I1 first1, I1 last1, I2 first2, O output, C op -) { - - using E = std::decay_t
<P>
      ; - - unsigned count = std::distance(first1, last1); - - if(count == 0) { - return; - } - - detail::cuda_transform_kernel - <<>> ( - first1, first2, count, output, op - ); -} - // ---------------------------------------------------------------------------- // cudaFlow // ---------------------------------------------------------------------------- // Function: transform -template -cudaTask cudaFlow::transform(I first, I last, O output, C c) { +template +template +cudaTask cudaGraphBase::transform(I first, I last, O output, C c) { - using E = cudaDefaultExecutionPolicy; - unsigned count = std::distance(first, last); - // TODO: - //if(count == 0) { - // return; - //} - return kernel( E::num_blocks(count), E::nt, 0, - detail::cuda_transform_kernel, + detail::cuda_transform_kernel, first, count, output, c ); } // Function: transform -template -cudaTask cudaFlow::transform(I1 first1, I1 last1, I2 first2, O output, C c) { +template +template +cudaTask cudaGraphBase::transform(I1 first1, I1 last1, I2 first2, O output, C c) { - using E = cudaDefaultExecutionPolicy; - unsigned count = std::distance(first1, last1); - // TODO: - //if(count == 0) { - // return; - //} - return kernel( E::num_blocks(count), E::nt, 0, - detail::cuda_transform_kernel, + detail::cuda_transform_kernel, first1, first2, count, output, c ); } + // Function: update transform -template -void cudaFlow::transform(cudaTask task, I first, I last, O output, C c) { +template +template +void cudaGraphExecBase::transform(cudaTask task, I first, I last, O output, C c) { - using E = cudaDefaultExecutionPolicy; - unsigned count = std::distance(first, last); - // TODO: - //if(count == 0) { - // return; - //} - kernel(task, E::num_blocks(count), E::nt, 0, - detail::cuda_transform_kernel, + detail::cuda_transform_kernel, first, count, output, c ); } // Function: update transform -template -void cudaFlow::transform( +template +template +void cudaGraphExecBase::transform( cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c ) { - using E = cudaDefaultExecutionPolicy; - unsigned count = std::distance(first1, last1); - - // TODO: - //if(count == 0) { - // return; - //} kernel(task, E::num_blocks(count), E::nt, 0, - detail::cuda_transform_kernel, + detail::cuda_transform_kernel, first1, first2, count, output, c ); } -// ---------------------------------------------------------------------------- -// cudaFlowCapturer -// ---------------------------------------------------------------------------- - -// Function: transform -template -cudaTask cudaFlowCapturer::transform(I first, I last, O output, C op) { - return on([=](cudaStream_t stream) mutable { - cudaDefaultExecutionPolicy p(stream); - cuda_transform(p, first, last, output, op); - }); -} - -// Function: transform -template -cudaTask cudaFlowCapturer::transform( - I1 first1, I1 last1, I2 first2, O output, C op -) { - return on([=](cudaStream_t stream) mutable { - cudaDefaultExecutionPolicy p(stream); - cuda_transform(p, first1, last1, first2, output, op); - }); -} - -// Function: transform -template -void cudaFlowCapturer::transform( - cudaTask task, I first, I last, O output, C op -) { - on(task, [=] (cudaStream_t stream) mutable { - cudaDefaultExecutionPolicy p(stream); - cuda_transform(p, first, last, output, op); - }); -} - -// Function: transform -template -void cudaFlowCapturer::transform( - cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op -) { - on(task, [=] (cudaStream_t stream) mutable { - cudaDefaultExecutionPolicy p(stream); - cuda_transform(p, first1, 
last1, first2, output, op); - }); -} - } // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/cuda/cuda_capturer.hpp b/taskflow/cuda/cuda_capturer.hpp index 3b5daee9d..f0a431b8c 100644 --- a/taskflow/cuda/cuda_capturer.hpp +++ b/taskflow/cuda/cuda_capturer.hpp @@ -1,6 +1,5 @@ #pragma once -#include "cuda_task.hpp" #include "cuda_optimizer.hpp" /** @@ -79,7 +78,7 @@ class cudaFlowCapturer { public: /** - @brief constrcts a standalone cudaFlowCapturer + @brief constructs a standalone cudaFlowCapturer A standalone %cudaFlow capturer does not go through any taskflow and can be run by the caller thread using tf::cudaFlowCapturer::run. @@ -232,7 +231,7 @@ class cudaFlowCapturer { /** @brief initializes or sets GPU memory to the given value byte by byte - @param ptr pointer to GPU mempry + @param ptr pointer to GPU memory @param v value to set for each byte of the specified memory @param n size in bytes to set @@ -474,7 +473,7 @@ class cudaFlowCapturer { a native CUDA graph. */ template - OPT& make_optimizer(ArgsT&&... args); + void make_optimizer(ArgsT&&... args); /** @brief captures the cudaFlow and turns it into a CUDA Graph @@ -505,17 +504,15 @@ class cudaFlowCapturer { cudaGraph_t native_graph(); /** - @brief acquires a reference to the underlying CUDA graph executable + @brief instantiates an executable graph from this cudaflow capturer */ - cudaGraphExec_t native_executable(); + cudaGraphExec instantiate(); private: cudaFlowGraph _cfg; Optimizer _optimizer; - - cudaGraphExec _exe {nullptr}; }; // Function: empty @@ -530,7 +527,6 @@ inline size_t cudaFlowCapturer::num_tasks() const { // Procedure: clear inline void cudaFlowCapturer::clear() { - _exe.clear(); _cfg.clear(); } @@ -560,10 +556,6 @@ inline cudaTask cudaFlowCapturer::noop() { return on([](cudaStream_t){}); } -// Function: noop -inline void cudaFlowCapturer::noop(cudaTask task) { - on(task, [](cudaStream_t){}); -} // Function: memcpy inline cudaTask cudaFlowCapturer::memcpy( @@ -607,6 +599,12 @@ cudaTask cudaFlowCapturer::kernel( }); } +// Function: make_optimizer +template +void cudaFlowCapturer::make_optimizer(ArgsT&&... 
args) { + return _optimizer.emplace(std::forward(args)...); +} + // Function: capture inline cudaGraph_t cudaFlowCapturer::capture() { return std::visit( @@ -614,111 +612,121 @@ inline cudaGraph_t cudaFlowCapturer::capture() { ); } -// Procedure: run -inline void cudaFlowCapturer::run(cudaStream_t stream) { - - // If the topology got changed, we need to destroy the executable - // and create a new one - if(_cfg._state & cudaFlowGraph::CHANGED) { - _cfg._native_handle.reset(capture()); - _exe.instantiate(_cfg._native_handle); - } - // if the graph is just updated (i.e., topology does not change), - // we can skip part of the optimization and just update the executable - // with the new captured graph - else if(_cfg._state & cudaFlowGraph::UPDATED) { - // TODO: skip part of the optimization (e.g., levelization) - _cfg._native_handle.reset(capture()); - if(_exe.update(_cfg._native_handle) != cudaGraphExecUpdateSuccess) { - _exe.instantiate(_cfg._native_handle); - } - } +// Function: instantiate +inline cudaGraphExec cudaFlowCapturer::instantiate() { + + _cfg._native_handle.reset(capture()); - // run the executable (should exist) - _exe.launch(stream); + cudaGraphExec_t exec; + TF_CHECK_CUDA( + cudaGraphInstantiate(&exec, _cfg._native_handle, nullptr, nullptr, 0), + "failed to create an executable graph" + ); - _cfg._state = cudaFlowGraph::OFFLOADED; + return cudaGraphExec(exec); } +//// Procedure: run +//inline void cudaFlowCapturer::run(cudaStream_t stream) { +// +// // If the topology got changed, we need to destroy the executable +// // and create a new one +// if(_cfg._state & cudaFlowGraph::CHANGED) { +// _cfg._native_handle.reset(capture()); +// _exe.instantiate(_cfg._native_handle); +// } +// // if the graph is just updated (i.e., topology does not change), +// // we can skip part of the optimization and just update the executable +// // with the new captured graph +// else if(_cfg._state & cudaFlowGraph::UPDATED) { +// // TODO: skip part of the optimization (e.g., levelization) +// _cfg._native_handle.reset(capture()); +// if(_exe.update(_cfg._native_handle) != cudaGraphExecUpdateSuccess) { +// _exe.instantiate(_cfg._native_handle); +// } +// } +// +// // run the executable (should exist) +// _exe.run(stream); +// +// _cfg._state = cudaFlowGraph::OFFLOADED; +//} + // Function: native_graph inline cudaGraph_t cudaFlowCapturer::native_graph() { return _cfg._native_handle; } -// Function: native_executable -inline cudaGraphExec_t cudaFlowCapturer::native_executable() { - return _exe; -} - -// Function: on -template , void>* -> -void cudaFlowCapturer::on(cudaTask task, C&& callable) { - - if(task.type() != cudaTaskType::CAPTURE) { - TF_THROW("invalid cudaTask type (must be CAPTURE)"); - } - - _cfg._state |= cudaFlowGraph::UPDATED; - - std::get_if(&task._node->_handle)->work = - std::forward(callable); -} - -// Function: memcpy -inline void cudaFlowCapturer::memcpy( - cudaTask task, void* dst, const void* src, size_t count -) { - on(task, [dst, src, count](cudaStream_t stream) mutable { - TF_CHECK_CUDA( - cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream), - "failed to capture memcpy" - ); - }); -} +//// Function: on +//template , void>* +//> +//void cudaFlowCapturer::on(cudaTask task, C&& callable) { +// +// if(task.type() != cudaTaskType::CAPTURE) { +// TF_THROW("invalid cudaTask type (must be CAPTURE)"); +// } +// +// _cfg._state |= cudaFlowGraph::UPDATED; +// +// std::get_if(&task._node->_handle)->work = +// std::forward(callable); +//} +// +//// Function: noop +//inline void 
cudaFlowCapturer::noop(cudaTask task) { +// on(task, [](cudaStream_t){}); +//} +//// +//// Function: memcpy +//inline void cudaFlowCapturer::memcpy( +// cudaTask task, void* dst, const void* src, size_t count +//) { +// on(task, [dst, src, count](cudaStream_t stream) mutable { +// TF_CHECK_CUDA( +// cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream), +// "failed to capture memcpy" +// ); +// }); +//} +// +//// Function: copy +//template , void>* +//> +//void cudaFlowCapturer::copy( +// cudaTask task, T* tgt, const T* src, size_t num +//) { +// on(task, [tgt, src, num] (cudaStream_t stream) mutable { +// TF_CHECK_CUDA( +// cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream), +// "failed to capture copy" +// ); +// }); +//} +// +//// Function: memset +//inline void cudaFlowCapturer::memset( +// cudaTask task, void* ptr, int v, size_t n +//) { +// on(task, [ptr, v, n] (cudaStream_t stream) mutable { +// TF_CHECK_CUDA( +// cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset" +// ); +// }); +//} +// +//// Function: kernel +//template +//void cudaFlowCapturer::kernel( +// cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args +//) { +// on(task, [g, b, s, f, args...] (cudaStream_t stream) mutable { +// f<<>>(args...); +// }); +//} +// -// Function: copy -template , void>* -> -void cudaFlowCapturer::copy( - cudaTask task, T* tgt, const T* src, size_t num -) { - on(task, [tgt, src, num] (cudaStream_t stream) mutable { - TF_CHECK_CUDA( - cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream), - "failed to capture copy" - ); - }); -} - -// Function: memset -inline void cudaFlowCapturer::memset( - cudaTask task, void* ptr, int v, size_t n -) { - on(task, [ptr, v, n] (cudaStream_t stream) mutable { - TF_CHECK_CUDA( - cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset" - ); - }); -} - -// Function: kernel -template -void cudaFlowCapturer::kernel( - cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args -) { - on(task, [g, b, s, f, args...] (cudaStream_t stream) mutable { - f<<>>(args...); - }); -} - -// Function: make_optimizer -template -OPT& cudaFlowCapturer::make_optimizer(ArgsT&&... 
args) { - return _optimizer.emplace(std::forward(args)...); -} } // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/cuda/cuda_device.hpp b/taskflow/cuda/cuda_device.hpp index 016b2a6f6..0bf541d6a 100644 --- a/taskflow/cuda/cuda_device.hpp +++ b/taskflow/cuda/cuda_device.hpp @@ -76,19 +76,15 @@ inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) } os << '\n'; - os << "Maximum dimenstion of grid: "; + os << "Maximum dimension of grid: "; for (int i = 0; i < 3; ++i) { if(i) os << 'x'; os << p.maxGridSize[i];; } os << '\n'; - - os << "Clock rate: " << p.clockRate << '\n' - << "Total constant memory: " << p.totalConstMem << '\n' + os << "Total constant memory: " << p.totalConstMem << '\n' << "Texture alignment: " << p.textureAlignment << '\n' - << "Concurrent copy and execution: " << p.deviceOverlap << '\n' << "Number of multiprocessors: " << p.multiProcessorCount << '\n' - << "Kernel execution timeout: " << p.kernelExecTimeoutEnabled << '\n' << "GPU sharing Host Memory: " << p.integrated << '\n' << "Host page-locked mem mapping: " << p.canMapHostMemory << '\n' << "Alignment for Surfaces: " << p.surfaceAlignment << '\n' diff --git a/taskflow/cuda/cuda_error.hpp b/taskflow/cuda/cuda_error.hpp index c38e1324c..0e56e5ac7 100644 --- a/taskflow/cuda/cuda_error.hpp +++ b/taskflow/cuda/cuda_error.hpp @@ -24,3 +24,11 @@ if(TF_CUDA_GET_FIRST(__VA_ARGS__) != cudaSuccess) { \ throw std::runtime_error(oss.str()); \ } +#if __CUDACC_VER_MAJOR__ >= 13 +#define TF_CUDA_POST13(X) X +#define TF_CUDA_PRE13(X) +#else +#define TF_CUDA_PRE13(X) X +#define TF_CUDA_POST13(X) +#endif + diff --git a/taskflow/cuda/cuda_execution_policy.hpp b/taskflow/cuda/cuda_execution_policy.hpp index ae90d98aa..c33eaa1d5 100644 --- a/taskflow/cuda/cuda_execution_policy.hpp +++ b/taskflow/cuda/cuda_execution_policy.hpp @@ -42,25 +42,10 @@ class cudaExecutionPolicy { const static unsigned nv = NT*VT; /** - @brief constructs an execution policy object with default stream + @brief constructs an execution policy object */ cudaExecutionPolicy() = default; - /** - @brief constructs an execution policy object with the given stream - */ - explicit cudaExecutionPolicy(cudaStream_t s) : _stream{s} {} - - /** - @brief queries the associated stream - */ - cudaStream_t stream() noexcept { return _stream; }; - - /** - @brief assigns a stream - */ - void stream(cudaStream_t stream) noexcept { _stream = stream; } - /** @brief queries the number of blocks to accommodate N elements */ @@ -138,10 +123,6 @@ class cudaExecutionPolicy { tf::cuda_merge and tf::cuda_merge_by_key. 
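For example (a sketch; @c a_count and @c b_count denote the sizes of the two sorted input ranges):

@code{.cpp}
// bytes of temporary storage required by the merge
unsigned bytes = tf::cudaDefaultExecutionPolicy::merge_bufsz(a_count, b_count);

void* buffer = nullptr;
cudaMalloc(&buffer, bytes);
// ... pass buffer to tf::cuda_merge or tf::cuda_merge_by_key ...
cudaFree(buffer);
@endcode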
*/ inline static unsigned merge_bufsz(unsigned a_count, unsigned b_count); - - private: - - cudaStream_t _stream {0}; }; /** diff --git a/taskflow/cuda/cuda_graph.hpp b/taskflow/cuda/cuda_graph.hpp index a326aedea..285acdd1e 100644 --- a/taskflow/cuda/cuda_graph.hpp +++ b/taskflow/cuda/cuda_graph.hpp @@ -1,5 +1,7 @@ #pragma once +#include + #include "cuda_memory.hpp" #include "cuda_stream.hpp" #include "cuda_meta.hpp" @@ -147,17 +149,72 @@ inline size_t cuda_graph_get_num_nodes(cudaGraph_t graph) { } /** -@brief queries the number of edges in a native CUDA graph -*/ -inline size_t cuda_graph_get_num_edges(cudaGraph_t graph) { +@brief Handles compatibility with CUDA <= 12.x and CUDA == 13.x + */ +inline size_t cuda_graph_get_num_edges(cudaGraph_t graph, cudaGraphNode_t* from, cudaGraphNode_t* to) { size_t num_edges; TF_CHECK_CUDA( - cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges), - "failed to get native graph edges" + TF_CUDA_PRE13(cudaGraphGetEdges(graph, from, to, &num_edges)) + TF_CUDA_POST13(cudaGraphGetEdges(graph, from, to, nullptr, &num_edges)), + "failed to get native graph edges" ); return num_edges; } +/** +@brief Handles compatibility with CUDA <= 12.x and CUDA 13 +* @param node +* @param dependencies +* @return + */ +inline size_t cuda_graph_node_get_dependencies(cudaGraphNode_t node, cudaGraphNode_t* dependencies) { + size_t num_predecessors; + TF_CHECK_CUDA( + TF_CUDA_PRE13(cudaGraphNodeGetDependencies(node, dependencies, &num_predecessors)) + TF_CUDA_POST13(cudaGraphNodeGetDependencies(node, dependencies, nullptr, &num_predecessors)), + "Failed to get number of dependencies"); + return num_predecessors; +} + +/** +@brief Handles compatibility with CUDA <= 12.x and CUDA 13 +@param node +@param dependent_nodes +@return + */ +inline size_t cuda_graph_node_get_dependent_nodes(cudaGraphNode_t node, cudaGraphNode_t *dependent_nodes) { + size_t num_successors; + TF_CHECK_CUDA( + TF_CUDA_PRE13(cudaGraphNodeGetDependentNodes(node, dependent_nodes, &num_successors)) + TF_CUDA_POST13(cudaGraphNodeGetDependentNodes(node, dependent_nodes, nullptr, &num_successors)), + "Failed to get CUDA dependent nodes"); + return num_successors; +} + +/** +@brief Handles compatibility with CUDA <= 12.x and CUDA 13 +@param graph +@param from +@param to +@param numDependencies + */ +inline void cuda_graph_add_dependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies) { + TF_CHECK_CUDA( + TF_CUDA_PRE13(cudaGraphAddDependencies(graph, from, to, numDependencies)) + TF_CUDA_POST13(cudaGraphAddDependencies(graph, from, to, nullptr, numDependencies)), + "Failed to add CUDA graph node dependencies" + ); +} + +/** +@brief queries the number of edges in a native CUDA graph +*/ +inline size_t cuda_graph_get_num_edges(cudaGraph_t graph) { + return cuda_graph_get_num_edges(graph, nullptr, nullptr); +} + + + /** @brief acquires the nodes in a native CUDA graph */ @@ -191,10 +248,7 @@ inline std::vector> cuda_graph_get_edges(cudaGraph_t graph) { size_t num_edges = cuda_graph_get_num_edges(graph); std::vector froms(num_edges), tos(num_edges); - TF_CHECK_CUDA( - cudaGraphGetEdges(graph, froms.data(), tos.data(), &num_edges), - "failed to get native graph edges" - ); + num_edges = cuda_graph_get_num_edges(graph, froms.data(), tos.data()); std::vector> edges(num_edges); for(size_t i=0; i -void cuda_dump_graph(T& os, cudaGraph_t g) { - - os << "digraph cudaGraph {\n"; - - std::stack> stack; - stack.push(std::make_tuple(g, nullptr, 1)); - - int pl = 0; - - 
while(stack.empty() == false) { - - auto [graph, parent, l] = stack.top(); - stack.pop(); - - for(int i=0; i " << 'p' << to << ";\n"; - } - - for(auto& node : nodes) { - auto type = cuda_get_graph_node_type(node); - if(type == cudaGraphNodeTypeGraph) { - - cudaGraph_t child_graph; - TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &child_graph), ""); - stack.push(std::make_tuple(child_graph, node, l+1)); - - os << 'p' << node << "[" - << "shape=folder, style=filled, fontcolor=white, fillcolor=purple, " - << "label=\"cudaGraph-L" << l+1 - << "\"];\n"; - } - else { - os << 'p' << node << "[label=\"" - << cuda_graph_node_type_to_string(type) - << "\"];\n"; - } - } - - // precede to parent - if(parent != nullptr) { - std::unordered_set successors; - for(const auto& p : edges) { - successors.insert(p.first); - } - for(auto node : nodes) { - if(successors.find(node) == successors.end()) { - os << 'p' << node << " -> " << 'p' << parent << ";\n"; - } - } - } - - // set the previous level - pl = l; - } +class cudaTask { - for(int i=0; i<=pl; i++) { - os << "}\n"; - } + template + friend class cudaGraphBase; + + template + friend class cudaGraphExecBase; + + friend class cudaFlow; + friend class cudaFlowCapturer; + friend class cudaFlowCapturerBase; + + friend std::ostream& operator << (std::ostream&, const cudaTask&); + + public: + + /** + @brief constructs an empty cudaTask + */ + cudaTask() = default; + + /** + @brief copy-constructs a cudaTask + */ + cudaTask(const cudaTask&) = default; + + /** + @brief copy-assigns a cudaTask + */ + cudaTask& operator = (const cudaTask&) = default; + + /** + @brief adds precedence links from this to other tasks + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template + cudaTask& precede(Ts&&... tasks); + + /** + @brief adds precedence links from other tasks to this + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template + cudaTask& succeed(Ts&&... tasks); + + /** + @brief queries the number of successors + */ + size_t num_successors() const; + + /** + @brief queries the number of dependents + */ + size_t num_predecessors() const; + + /** + @brief queries the type of this task + */ + auto type() const; + + /** + @brief dumps the task through an output stream + + @param os an output stream target + */ + void dump(std::ostream& os) const; + + private: + + cudaTask(cudaGraph_t, cudaGraphNode_t); + + cudaGraph_t _native_graph {nullptr}; + cudaGraphNode_t _native_node {nullptr}; +}; + +// Constructor +inline cudaTask::cudaTask(cudaGraph_t native_graph, cudaGraphNode_t native_node) : + _native_graph {native_graph}, _native_node {native_node} { +} + +// Function: precede +template +cudaTask& cudaTask::precede(Ts&&... tasks) { + ( + cuda_graph_add_dependencies( + _native_graph, &_native_node, &(tasks._native_node), 1 + ), ... + ); + return *this; +} + +// Function: succeed +template +cudaTask& cudaTask::succeed(Ts&&... 
+  (tasks.precede(*this), ...);
+  return *this;
+}
+
+// Function: num_predecessors
+inline size_t cudaTask::num_predecessors() const {
+  return cuda_graph_node_get_dependencies(_native_node, nullptr);
+}
+
+// Function: num_successors
+inline size_t cudaTask::num_successors() const {
+  return cuda_graph_node_get_dependent_nodes(_native_node, nullptr);
+}
+
+// Function: type
+inline auto cudaTask::type() const {
+  cudaGraphNodeType type;
+  cudaGraphNodeGetType(_native_node, &type);
+  return type;
+}
+
+// Function: dump
+inline void cudaTask::dump(std::ostream& os) const {
+  os << "cudaTask [type=" << to_string(type()) << ']';
+}
+
+/**
+@brief overload of ostream inserter operator for cudaTask
+*/
+inline std::ostream& operator << (std::ostream& os, const cudaTask& ct) {
+  ct.dump(os);
+  return os;
 }
 
 // ----------------------------------------------------------------------------
 // cudaGraph
 // ----------------------------------------------------------------------------
-
+
 /**
-@private
+  @class cudaGraphCreator
+
+  @brief class to create functors that construct CUDA graphs
+
+  This class defines functors that create new CUDA graphs using `cudaGraphCreate`.
+
 */
-struct cudaGraphCreator {
-  cudaGraph_t operator () () const {
+class cudaGraphCreator {
+
+  public:
+
+  /**
+   * @brief creates a new CUDA graph
+   *
+   * Calls `cudaGraphCreate` to generate a CUDA native graph and returns it.
+   * If the graph creation fails, an error is reported.
+   *
+   * @return A newly created `cudaGraph_t` instance.
+   * @throws If CUDA graph creation fails, an error is logged.
+   */
+  cudaGraph_t operator () () const {
     cudaGraph_t g;
     TF_CHECK_CUDA(cudaGraphCreate(&g, 0), "failed to create a CUDA native graph");
-    return g;
+    return g;
+  }
+
+  /**
+  @brief returns the given CUDA graph
+  */
+  cudaGraph_t operator () (cudaGraph_t graph) const {
+    return graph;
   }
+
 };
 
 /**
-@private
+  @class cudaGraphDeleter
+
+  @brief class to create a functor that deletes a CUDA graph
+
+  This structure provides an overloaded function call operator to safely
+  destroy a CUDA graph using `cudaGraphDestroy`.
+
 */
-struct cudaGraphDeleter {
+class cudaGraphDeleter {
+
+  public:
+
+  /**
+   * @brief deletes a CUDA graph
+   *
+   * Calls `cudaGraphDestroy` to release the CUDA graph resource if it is valid.
+   *
+   * @param g the CUDA graph to be destroyed
+   */
   void operator () (cudaGraph_t g) const {
-    if(g) {
-      cudaGraphDestroy(g);
-    }
+    cudaGraphDestroy(g);
   }
 };
 
+
 /**
-@class cudaGraph
+@class cudaGraphBase
 
-@brief class to create an RAII-styled wrapper over a CUDA executable graph
+@brief class to create a CUDA graph with unique ownership
 
-A cudaGraph object is an RAII-styled wrapper over
-a native CUDA graph (@c cudaGraph_t).
-A cudaGraph object is move-only.
+@tparam Creator functor to create the CUDA graph (used in constructor)
+@tparam Deleter functor to delete the CUDA graph (used in destructor)
+
+This class wraps a `cudaGraph_t` handle with std::unique_ptr to ensure proper
+resource management and automatic cleanup.
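+
+For example, the following sketch (assuming the default `tf::cudaGraph`
+alias over this class and a user-defined @c __global__ kernel named
+`my_kernel`) builds a one-task graph and visualizes it:
+
+@code{.cpp}
+tf::cudaGraph g;
+tf::cudaTask task = g.kernel(dim3(1), dim3(1), 0, my_kernel);
+g.dump(std::cout);  // writes the graph in DOT format to std::cout
+@endcode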
 */
-class cudaGraph :
-  public cudaObject<cudaGraph_t, cudaGraphCreator, cudaGraphDeleter> {
+template <typename Creator, typename Deleter>
+class cudaGraphBase : public std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter> {
+
+  static_assert(std::is_pointer_v<cudaGraph_t>, "cudaGraph_t is not a pointer type");
 
   public:
+
+  /**
+  @brief base std::unique_ptr type
+  */
+  using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter>;
 
   /**
-  @brief constructs an RAII-styled object from the given CUDA exec
+  @brief constructs a `cudaGraph` object by passing the given arguments to the CUDA graph creator
 
-  Constructs a cudaGraph object from the given CUDA graph @c native.
+  Constructs a `cudaGraph` object by passing the given arguments to the CUDA graph creator
+
+  @param args arguments to pass to the CUDA graph creator
   */
-  explicit cudaGraph(cudaGraph_t native) : cudaObject(native) { }
+  template <typename... ArgsT>
+  explicit cudaGraphBase(ArgsT&& ... args) : base_type(
+    Creator{}(std::forward<ArgsT>(args)...), Deleter()
+  ) {
+  }
 
   /**
-  @brief constructs a cudaGraph object with a new CUDA graph
+  @brief constructs a `cudaGraph` from the given rhs using move semantics
   */
-  cudaGraph() = default;
-};
+  cudaGraphBase(cudaGraphBase&&) = default;
 
-// ----------------------------------------------------------------------------
-// cudaGraphExec
-// ----------------------------------------------------------------------------
+  /**
+  @brief assign the rhs to `*this` using move semantics
+  */
+  cudaGraphBase& operator = (cudaGraphBase&&) = default;
+
+  /**
+  @brief queries the number of nodes in a native CUDA graph
+  */
+  size_t num_nodes() const;
 
-/**
-@private
-*/
-struct cudaGraphExecCreator {
-  cudaGraphExec_t operator () () const { return nullptr; }
-};
+  /**
+  @brief queries the number of edges in a native CUDA graph
+  */
+  size_t num_edges() const;
 
-/**
-@private
-*/
-struct cudaGraphExecDeleter {
-  void operator () (cudaGraphExec_t executable) const {
-    if(executable) {
-      cudaGraphExecDestroy(executable);
-    }
-  }
-};
+  /**
+  @brief queries if the graph is empty
+  */
+  bool empty() const;
 
-/**
-@class cudaGraphExec
+  /**
+  @brief dumps the CUDA graph to a DOT format through the given output stream
+
+  @param os target output stream
+  */
+  void dump(std::ostream& os);
 
-@brief class to create an RAII-styled wrapper over a CUDA executable graph
+  // ------------------------------------------------------------------------
+  // Graph building routines
+  // ------------------------------------------------------------------------
 
-A cudaGraphExec object is an RAII-styled wrapper over
-a native CUDA executable graph (@c cudaGraphExec_t).
-A cudaGraphExec object is move-only.
-*/
-class cudaGraphExec :
-  public cudaObject<cudaGraphExec_t, cudaGraphExecCreator, cudaGraphExecDeleter> {
+  /**
+  @brief creates a no-operation task
 
-  public:
+  @return a tf::cudaTask handle
+
+  An empty node performs no operation during execution,
+  but can be used for transitive ordering.
+  For example, a phased execution graph with 2 groups of @c n nodes
+  with a barrier between them can be represented using an empty node
+  and @c 2*n dependency edges,
+  rather than no empty node and @c n^2 dependency edges.
+  */
+  cudaTask noop();
 
   /**
-  @brief constructs an RAII-styled object from the given CUDA exec
+  @brief creates a host task that runs a callable on the host
+
+  @tparam C callable type
+
+  @param callable a callable object with neither arguments nor return
+  (i.e., constructible from @c std::function<void()>)
+  @param user_data a pointer to the user data
 
-  Constructs a cudaGraphExec object which owns @c exec.
+ @return a tf::cudaTask handle + + A host task can only execute CPU-specific functions and cannot do any CUDA calls + (e.g., @c cudaMalloc). */ - explicit cudaGraphExec(cudaGraphExec_t exec) : cudaObject(exec) { } - + template + cudaTask host(C&& callable, void* user_data); + /** - @brief default constructor + @brief creates a kernel task + + @tparam F kernel function type + @tparam ArgsT kernel function parameters type + + @param g configured grid + @param b configured block + @param s configured shared memory size in bytes + @param f kernel function + @param args arguments to forward to the kernel function by copy + + @return a tf::cudaTask handle */ - cudaGraphExec() = default; - + template + cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args); + /** - @brief instantiates the exexutable from the given CUDA graph + @brief creates a memset task that fills untyped data with a byte value + + @param dst pointer to the destination device memory area + @param v value to set for each byte of specified memory + @param count size in bytes to set + + @return a tf::cudaTask handle + + A memset task fills the first @c count bytes of device memory area + pointed by @c dst with the byte value @c v. */ - void instantiate(cudaGraph_t graph) { - cudaGraphExecDeleter {} (object); - TF_CHECK_CUDA( - cudaGraphInstantiate(&object, graph, nullptr, nullptr, 0), - "failed to create an executable graph" - ); - } - + cudaTask memset(void* dst, int v, size_t count); + /** - @brief updates the exexutable from the given CUDA graph + @brief creates a memcpy task that copies untyped data in bytes + + @param tgt pointer to the target memory block + @param src pointer to the source memory block + @param bytes bytes to copy + + @return a tf::cudaTask handle + + A memcpy task transfers @c bytes of data from a source location + to a target location. Direction can be arbitrary among CPUs and GPUs. */ - cudaGraphExecUpdateResult update(cudaGraph_t graph) { - cudaGraphNode_t error_node; - cudaGraphExecUpdateResult error_result; - cudaGraphExecUpdate(object, graph, &error_node, &error_result); - return error_result; - } - + cudaTask memcpy(void* tgt, const void* src, size_t bytes); + /** - @brief launchs the executable graph via the given stream - */ - void launch(cudaStream_t stream) { - TF_CHECK_CUDA( - cudaGraphLaunch(object, stream), "failed to launch a CUDA executable graph" - ); - } -}; + @brief creates a memset task that sets a typed memory block to zero -// ---------------------------------------------------------------------------- -// cudaFlowGraph class -// ---------------------------------------------------------------------------- + @tparam T element type (size of @c T must be either 1, 2, or 4) + @param dst pointer to the destination device memory area + @param count number of elements -// class: cudaFlowGraph -class cudaFlowGraph { + @return a tf::cudaTask handle - friend class cudaFlowNode; - friend class cudaTask; - friend class cudaFlowCapturer; - friend class cudaFlow; - friend class cudaFlowOptimizerBase; - friend class cudaFlowSequentialOptimizer; - friend class cudaFlowLinearOptimizer; - friend class cudaFlowRoundRobinOptimizer; - friend class Taskflow; - friend class Executor; + A zero task zeroes the first @c count elements of type @c T + in a device memory area pointed by @c dst. 
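+
+  For example, the following sketch (assuming `gpu` is a `tf::cudaGraph` and
+  `data` points to 1024 device-side integers) zeroes the whole block:
+
+  @code{.cpp}
+  tf::cudaTask task = gpu.zero(data, 1024);  // data[0..1023] become 0
+  @endcode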
+ */ + template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + cudaTask zero(T* dst, size_t count); - constexpr static int OFFLOADED = 0x01; - constexpr static int CHANGED = 0x02; - constexpr static int UPDATED = 0x04; + /** + @brief creates a memset task that fills a typed memory block with a value - public: + @tparam T element type (size of @c T must be either 1, 2, or 4) - cudaFlowGraph() = default; - ~cudaFlowGraph() = default; + @param dst pointer to the destination device memory area + @param value value to fill for each element of type @c T + @param count number of elements - cudaFlowGraph(const cudaFlowGraph&) = delete; - cudaFlowGraph(cudaFlowGraph&&) = default; + @return a tf::cudaTask handle - cudaFlowGraph& operator = (const cudaFlowGraph&) = delete; - cudaFlowGraph& operator = (cudaFlowGraph&&) = default; + A fill task fills the first @c count elements of type @c T with @c value + in a device memory area pointed by @c dst. + The value to fill is interpreted in type @c T rather than byte. + */ + template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + cudaTask fill(T* dst, T value, size_t count); - template - cudaFlowNode* emplace_back(ArgsT&&...); + /** + @brief creates a memcopy task that copies typed data - bool empty() const; + @tparam T element type (non-void) - void clear(); - void dump(std::ostream&, const void*, const std::string&) const ; + @param tgt pointer to the target memory block + @param src pointer to the source memory block + @param num number of elements to copy - private: + @return a tf::cudaTask handle - int _state{CHANGED}; - cudaGraph _native_handle {nullptr}; - std::vector> _nodes; -}; + A copy task transfers num*sizeof(T) bytes of data from a source location + to a target location. Direction can be arbitrary among CPUs and GPUs. 
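+
+  For example, the following sketch (assuming `gpu` is a `tf::cudaGraph`,
+  `hres` is a host array, and `dres` is a device array of @c N floats)
+  copies the result back to the host:
+
+  @code{.cpp}
+  tf::cudaTask d2h = gpu.copy(hres, dres, N);  // transfers N*sizeof(float) bytes
+  @endcode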
+  */
+  template <typename T,
+    std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
+  >
+  cudaTask copy(T* tgt, const T* src, size_t num);
+
+  // ------------------------------------------------------------------------
+  // generic algorithms
+  // ------------------------------------------------------------------------
 
-// ----------------------------------------------------------------------------
-// cudaFlowNode class
-// ----------------------------------------------------------------------------
+  /**
+  @brief runs a callable with only a single kernel thread
 
-/**
-@private
-@class: cudaFlowNode
-*/
-class cudaFlowNode {
+  @tparam C callable type
 
-  friend class cudaFlowGraph;
-  friend class cudaTask;
-  friend class cudaFlow;
-  friend class cudaFlowCapturer;
-  friend class cudaFlowOptimizerBase;
-  friend class cudaFlowSequentialOptimizer;
-  friend class cudaFlowLinearOptimizer;
-  friend class cudaFlowRoundRobinOptimizer;
-  friend class Taskflow;
-  friend class Executor;
+  @param c callable to run by a single kernel thread
 
-  // Empty handle
-  struct Empty {
-  };
+  @return a tf::cudaTask handle
+  */
+  template <typename C>
+  cudaTask single_task(C c);
+
+  /**
+  @brief applies a callable to each dereferenced element of the data array
 
-  // Host handle
-  struct Host {
+  @tparam I iterator type
+  @tparam C callable type
+  @tparam E execution policy (default tf::cudaDefaultExecutionPolicy)
 
-    template <typename C>
-    Host(C&&);
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+  @param callable a callable object to apply to the dereferenced iterator
 
-    std::function<void()> func;
+  @return a tf::cudaTask handle
 
-    static void callback(void*);
-  };
+  This method is equivalent to the parallel execution of the following loop on a GPU:
 
-  // Memset handle
-  struct Memset {
-  };
+  @code{.cpp}
+  for(auto itr = first; itr != last; itr++) {
+    callable(*itr);
+  }
+  @endcode
+  */
+  template <typename I, typename C>
+  cudaTask for_each(I first, I last, C callable);
+
+  /**
+  @brief applies a callable to each index in the range with the step size
+
+  @tparam I index type
+  @tparam C callable type
+  @tparam E execution policy (default tf::cudaDefaultExecutionPolicy)
 
-  // Memcpy handle
-  struct Memcpy {
-  };
+  @param first beginning index
+  @param last last index
+  @param step step size
+  @param callable the callable to apply to each element in the data array
 
-  // Kernel handle
-  struct Kernel {
+  @return a tf::cudaTask handle
 
-    template <typename F>
-    Kernel(F&& f);
+  This method is equivalent to the parallel execution of the following loop on a GPU:
 
-    void* func {nullptr};
-  };
+  @code{.cpp}
+  // step is positive [first, last)
+  for(auto i=first; i<last; i+=step) {
+    callable(i);
+  }
+
+  // step is negative [first, last)
+  for(auto i=first; i>last; i+=step) {
+    callable(i);
+  }
+  @endcode
+  */
+  template <typename I, typename C>
+  cudaTask for_each_index(I first, I last, I step, C callable);
+
+  /**
+  @brief applies a callable to a source range and stores the result in a target range
 
-  // Capture
-  struct Capture {
+  @tparam I input iterator type
+  @tparam O output iterator type
+  @tparam C unary operator type
+  @tparam E execution policy (default tf::cudaDefaultExecutionPolicy)
 
-    template <typename C>
-    Capture(C&&);
+  @param first iterator to the beginning of the input range
+  @param last iterator to the end of the input range
+  @param output iterator to the beginning of the output range
+  @param op the operator to apply to transform each element in the range
 
-    std::function<void(cudaStream_t)> work;
+  @return a tf::cudaTask handle
 
-    cudaEvent_t event;
-    size_t level;
-    size_t lid;
-    size_t idx;
-  };
+  This method is equivalent to the parallel execution of the following loop on a GPU:
 
-  using handle_t = std::variant<
-    Empty,
-    Host,
-    Memset,
-    Memcpy,
-    Kernel,
-    Subflow,
-    Capture
-  >;
+  @code{.cpp}
+  while (first != last) {
+    *output++ = op(*first++);
+  }
+  @endcode
+  */
+  template <typename I, typename O, typename C>
+  cudaTask transform(I first, I last, O output, C op);
+
+  /**
+  @brief creates a task to perform parallel transforms over two ranges of items
 
-  public:
+  @tparam I1 first input iterator type
+  @tparam I2 second input iterator type
+  @tparam O output iterator type
+  @tparam C binary operator type
+  @tparam E execution policy (default tf::cudaDefaultExecutionPolicy)
 
-  // variant index
-  constexpr static auto EMPTY = get_index_v<Empty, handle_t>;
-  constexpr static auto HOST = get_index_v<Host, handle_t>;
-  constexpr static auto MEMSET = get_index_v<Memset, handle_t>;
-  constexpr static auto MEMCPY = get_index_v<Memcpy, handle_t>;
-  constexpr static auto KERNEL = get_index_v<Kernel, handle_t>;
-  constexpr static auto SUBFLOW = get_index_v<Subflow, handle_t>;
-  constexpr static auto CAPTURE = get_index_v<Capture, handle_t>;
+  @param first1 iterator to the beginning of the first input range
+  @param last1 iterator to the end of the first input range
+  @param first2 iterator to the beginning of the second input range
+  @param output iterator to the beginning of the output range
+  @param op binary operator to apply to transform each pair of items in the
+            two input ranges
 
-  cudaFlowNode() = delete;
+  @return a tf::cudaTask handle
 
-  template <typename... ArgsT>
-  cudaFlowNode(cudaFlowGraph&, ArgsT&&...);
+  This method is equivalent to the parallel execution of the following loop on a GPU:
+
+  @code{.cpp}
+  while (first1 != last1) {
+    *output++ = op(*first1++, *first2++);
+  }
+  @endcode
+  */
+  template <typename I1, typename I2, typename O, typename C>
+  cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op);
 
   private:
 
-  cudaFlowGraph& _cfg;
+  cudaGraphBase(const cudaGraphBase&) = delete;
+  cudaGraphBase& operator = (const cudaGraphBase&) = delete;
+};
 
-  std::string _name;
+// query the number of nodes
+template <typename Creator, typename Deleter>
+size_t cudaGraphBase<Creator, Deleter>::num_nodes() const {
+  size_t n;
+  TF_CHECK_CUDA(
+    cudaGraphGetNodes(this->get(), nullptr, &n),
+    "failed to get native graph nodes"
+  );
+  return n;
+}
 
-  handle_t _handle;
+// query the emptiness
+template <typename Creator, typename Deleter>
+bool cudaGraphBase<Creator, Deleter>::empty() const {
+  return num_nodes() == 0;
+}
 
-  cudaGraphNode_t _native_handle {nullptr};
+// query the number of edges
+template <typename Creator, typename Deleter>
+size_t cudaGraphBase<Creator, Deleter>::num_edges() const {
+  return cuda_graph_get_num_edges(this->get());
+}
 
-  SmallVector<cudaFlowNode*> _successors;
-  SmallVector<cudaFlowNode*> _dependents;
+//// dump the graph
+//inline void cudaGraph::dump(std::ostream& os) {
+//
+//  // acquire the native handle
+//  auto g = this->get();
+//
+//  os << "digraph cudaGraph {\n";
+//
+//  std::stack<std::tuple<cudaGraph_t, cudaGraphNode_t, int>> stack;
+//  stack.push(std::make_tuple(g, nullptr, 1));
+//
+//  int pl = 0;
+//
+//  while(stack.empty() == false) {
+//
+//    auto [graph, parent, l] = stack.top();
+//    stack.pop();
+//
+//    for(int i=0; i<pl-l+1; i++) {
+//      os << "}\n";
+//    }
+//
+//    os << "subgraph cluster_p" << graph << " {\n"
+//       << "label=\"cudaGraph-L" << l << "\";\n"
+//       << "color=\"purple\";\n";
+//
+//    auto nodes = cuda_graph_get_nodes(graph);
+//    auto edges = cuda_graph_get_edges(graph);
+//
+//    for(auto& [from, to] : edges) {
+//      os << 'p' << from << " -> " << 'p' << to << ";\n";
+//    }
+//
+//    for(auto& node : nodes) {
+//      auto type = cuda_get_graph_node_type(node);
+//      if(type == cudaGraphNodeTypeGraph) {
+//
+//        cudaGraph_t child_graph;
+//        TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &child_graph), "");
+//        stack.push(std::make_tuple(child_graph, node, l+1));
+//
+//        os << 'p' << node << "["
+//           << "shape=folder, style=filled, fontcolor=white, fillcolor=purple, "
+//           << "label=\"cudaGraph-L" << l+1
+//           << "\"];\n";
+//      }
+//      else {
+//        os << 'p' << node << "[label=\""
+//           << to_string(type)
+//           << "\"];\n";
+//      }
+//    }
+//
+//    // precede to parent
+//    if(parent != nullptr) {
+//      std::unordered_set<cudaGraphNode_t> successors;
+//      for(const auto& p : edges) {
+//        successors.insert(p.first);
+//      }
+//      for(auto node : nodes) {
+//        if(successors.find(node) == successors.end()) {
+//          os << 'p' << node << " -> " << 'p' << 
parent << ";\n"; +// } +// } +// } +// +// // set the previous level +// pl = l; +// } +// +// for(int i=0; i<=pl; i++) { +// os << "}\n"; +// } +//} + +// dump the graph +template +void cudaGraphBase::dump(std::ostream& os) { + + // Generate a unique temporary filename in the system's temp directory using filesystem + auto temp_path = std::filesystem::temp_directory_path() / "graph_"; + std::random_device rd; + std::uniform_int_distribution dist(100000, 999999); // Generates a random number + temp_path += std::to_string(dist(rd)) + ".dot"; + + // Call the original function with the temporary file + TF_CHECK_CUDA(cudaGraphDebugDotPrint(this->get(), temp_path.string().c_str(), 0), ""); + + // Read the file and write to the output stream + std::ifstream file(temp_path); + if (file) { + os << file.rdbuf(); // Copy file contents to the stream + file.close(); + std::filesystem::remove(temp_path); // Clean up the temporary file + } else { + TF_THROW("failed to open ", temp_path, " for dumping the CUDA graph"); + } +} - void _precede(cudaFlowNode*); -}; +// Function: noop +template +cudaTask cudaGraphBase::noop() { -// ---------------------------------------------------------------------------- -// cudaFlowNode definitions -// ---------------------------------------------------------------------------- + cudaGraphNode_t node; -// Host handle constructor + TF_CHECK_CUDA( + cudaGraphAddEmptyNode(&node, this->get(), nullptr, 0), + "failed to create a no-operation (empty) node" + ); + + return cudaTask(this->get(), node); +} + +// Function: host +template template -cudaFlowNode::Host::Host(C&& c) : func {std::forward(c)} { +cudaTask cudaGraphBase::host(C&& callable, void* user_data) { + + cudaGraphNode_t node; + cudaHostNodeParams p {callable, user_data}; + + TF_CHECK_CUDA( + cudaGraphAddHostNode(&node, this->get(), nullptr, 0, &p), + "failed to create a host node" + ); + + return cudaTask(this->get(), node); } -// Host callback -inline void cudaFlowNode::Host::callback(void* data) { - static_cast(data)->func(); -}; +// Function: kernel +template +template +cudaTask cudaGraphBase::kernel( + dim3 g, dim3 b, size_t s, F f, ArgsT... args +) { + + cudaGraphNode_t node; + cudaKernelNodeParams p; -// Kernel handle constructor -template -cudaFlowNode::Kernel::Kernel(F&& f) : - func {std::forward(f)} { + void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... }; + + p.func = (void*)f; + p.gridDim = g; + p.blockDim = b; + p.sharedMemBytes = s; + p.kernelParams = arguments; + p.extra = nullptr; + + TF_CHECK_CUDA( + cudaGraphAddKernelNode(&node, this->get(), nullptr, 0, &p), + "failed to create a kernel task" + ); + + return cudaTask(this->get(), node); } -// Capture handle constructor -template -cudaFlowNode::Capture::Capture(C&& c) : - work {std::forward(c)} { +// Function: zero +template +template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +cudaTask cudaGraphBase::zero(T* dst, size_t count) { + + cudaGraphNode_t node; + auto p = cuda_get_zero_parms(dst, count); + + TF_CHECK_CUDA( + cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p), + "failed to create a memset (zero) task" + ); + + return cudaTask(this->get(), node); } -// Constructor -template -cudaFlowNode::cudaFlowNode(cudaFlowGraph& graph, ArgsT&&... 
args) : - _cfg {graph}, - _handle {std::forward(args)...} { +// Function: fill +template +template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +cudaTask cudaGraphBase::fill(T* dst, T value, size_t count) { + + cudaGraphNode_t node; + auto p = cuda_get_fill_parms(dst, value, count); + TF_CHECK_CUDA( + cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p), + "failed to create a memset (fill) task" + ); + + return cudaTask(this->get(), node); } -// Procedure: _precede -inline void cudaFlowNode::_precede(cudaFlowNode* v) { +// Function: copy +template +template < + typename T, + std::enable_if_t, void>* +> +cudaTask cudaGraphBase::copy(T* tgt, const T* src, size_t num) { - _cfg._state |= cudaFlowGraph::CHANGED; + cudaGraphNode_t node; + auto p = cuda_get_copy_parms(tgt, src, num); - _successors.push_back(v); - v->_dependents.push_back(this); + TF_CHECK_CUDA( + cudaGraphAddMemcpyNode(&node, this->get(), nullptr, 0, &p), + "failed to create a memcpy (copy) task" + ); - // capture node doesn't have the native graph yet - if(_handle.index() != cudaFlowNode::CAPTURE) { - TF_CHECK_CUDA( - cudaGraphAddDependencies( - _cfg._native_handle, &_native_handle, &v->_native_handle, 1 - ), - "failed to add a preceding link ", this, "->", v - ); - } + return cudaTask(this->get(), node); } -// ---------------------------------------------------------------------------- -// cudaGraph definitions -// ---------------------------------------------------------------------------- +// Function: memset +template +cudaTask cudaGraphBase::memset(void* dst, int ch, size_t count) { -// Function: empty -inline bool cudaFlowGraph::empty() const { - return _nodes.empty(); -} - -// Procedure: clear -inline void cudaFlowGraph::clear() { - _state |= cudaFlowGraph::CHANGED; - _nodes.clear(); - _native_handle.clear(); -} - -// Function: emplace_back -template -cudaFlowNode* cudaFlowGraph::emplace_back(ArgsT&&... 
args) {
-
-  _state |= cudaFlowGraph::CHANGED;
-
-  auto node = std::make_unique<cudaFlowNode>(std::forward<ArgsT>(args)...);
-  _nodes.emplace_back(std::move(node));
-  return _nodes.back().get();
-
-  // TODO: use object pool to save memory
-  //auto node = new cudaFlowNode(std::forward<ArgsT>(args)...);
-  //_nodes.push_back(node);
-  //return node;
-}
-
-// Procedure: dump the graph to a DOT format
-inline void cudaFlowGraph::dump(
-  std::ostream& os, const void* root, const std::string& root_name
-) const {
-
-  // recursive dump with stack
-  std::stack<std::tuple<const cudaFlowGraph*, cudaFlowNode*, int>> stack;
-  stack.push(std::make_tuple(this, nullptr, 1));
-
-  int pl = 0;
-
-  while(!stack.empty()) {
-
-    auto [graph, parent, l] = stack.top();
-    stack.pop();
-
-    for(int i=0; i<pl-l+1; i++) {
-      os << "}\n";
-    }
-
-    if(parent == nullptr) {
-      os << "subgraph cluster_p" << root << " {\nlabel=\"";
-      if(root_name.empty()) os << 'p' << root;
-      else os << root_name;
-      os << "\";\n" << "color=\"purple\"\n";
-    }
-    else {
-      os << "subgraph cluster_p" << parent << " {\nlabel=\"";
-      if(parent->_name.empty()) os << 'p' << parent;
-      else os << parent->_name;
-      os << "\";\n" << "color=\"purple\"\n";
-    }
-
-    for(auto& node : graph->_nodes) {
-
-      auto v = node.get();
-
-      os << 'p' << v << "[label=\"";
-      if(v->_name.empty()) {
-        os << 'p' << v << "\"";
-      }
-      else {
-        os << v->_name << "\"";
-      }
-
-      switch(v->_handle.index()) {
-        case cudaFlowNode::KERNEL:
-          os << " style=\"filled\""
-             << " color=\"white\" fillcolor=\"black\""
-             << " fontcolor=\"white\""
-             << " shape=\"box3d\"";
-        break;
-
-        case cudaFlowNode::SUBFLOW:
-          stack.push(std::make_tuple(
-            &(std::get_if<cudaFlowNode::Subflow>(&v->_handle)->cfg), v, l+1)
-          );
-          os << " style=\"filled\""
-             << " color=\"black\" fillcolor=\"purple\""
-             << " fontcolor=\"white\""
-             << " shape=\"folder\"";
-        break;
-
-        default:
-        break;
-      }
-
-      os << "];\n";
-
-      for(const auto s : v->_successors) {
-        os << 'p' << v << " -> " << 'p' << s << ";\n";
-      }
-
-      if(v->_successors.size() == 0) {
-        if(parent == nullptr) {
-          if(root) {
-            os << 'p' << v << " -> p" << root << ";\n";
-          }
-        }
-        else {
-          os << 'p' << v << " -> p" << parent << ";\n";
-        }
-      }
-    }
-
-    // set the previous level
-    pl = l;
-  }
+  cudaGraphNode_t node;
+  auto p = cuda_get_memset_parms(dst, ch, count);
 
-  for(int i=0; i<=pl; i++) {
-    os << "}\n";
-  }
-}
+  TF_CHECK_CUDA(
+    cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p),
+    "failed to create a memset task"
+  );
+
+  return cudaTask(this->get(), node);
+}
+
+// Function: memcpy
+template <typename Creator, typename Deleter>
+cudaTask cudaGraphBase<Creator, Deleter>::memcpy(void* tgt, const void* src, size_t bytes) {
+
+  cudaGraphNode_t node;
+  auto p = cuda_get_memcpy_parms(tgt, src, bytes);
+
+  TF_CHECK_CUDA(
+    cudaGraphAddMemcpyNode(&node, this->get(), nullptr, 0, &p),
+    "failed to create a memcpy task"
+  );
+
+  return cudaTask(this->get(), node);
 }
+
+
+
 } // end of namespace tf -----------------------------------------------------
diff --git a/taskflow/cuda/cuda_graph_exec.hpp b/taskflow/cuda/cuda_graph_exec.hpp
new file mode 100644
index 000000000..912c9f6c6
--- /dev/null
+++ b/taskflow/cuda/cuda_graph_exec.hpp
@@ -0,0 +1,384 @@
+#pragma once
+
+#include "cuda_graph.hpp"
+
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// cudaGraphExec
+// ----------------------------------------------------------------------------
+
+/**
+@class cudaGraphExecCreator
+@brief class to create functors for constructing executable CUDA graphs
+
+This class provides an overloaded function call operator to create a
+new executable CUDA graph using `cudaGraphInstantiate`.
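+
+For example, the following sketch (assuming the default `tf::cudaGraph` and
+`tf::cudaGraphExec` aliases over these templates) instantiates and runs an
+executable graph:
+
+@code{.cpp}
+tf::cudaGraph g;
+g.noop();                    // populate the graph with tasks
+tf::cudaGraphExec exec(g);   // instantiates an executable graph from g
+tf::cudaStream stream;
+stream.run(exec).synchronize();
+@endcode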
+*/
+class cudaGraphExecCreator {
+
+  public:
+
+  /**
+  @brief returns a null executable CUDA graph
+  */
+  cudaGraphExec_t operator () () const {
+    return nullptr;
+  }
+
+  /**
+  @brief returns the given executable graph
+  */
+  cudaGraphExec_t operator () (cudaGraphExec_t exec) const {
+    return exec;
+  }
+
+  /**
+  @brief returns a newly instantiated executable graph from the given CUDA graph
+  */
+  cudaGraphExec_t operator () (cudaGraph_t graph) const {
+    cudaGraphExec_t exec;
+    TF_CHECK_CUDA(
+      cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0),
+      "failed to create an executable graph"
+    );
+    return exec;
+  }
+
+  /**
+  @brief returns a newly instantiated executable graph from the given CUDA graph
+  */
+  template <typename Creator, typename Deleter>
+  cudaGraphExec_t operator () (const cudaGraphBase<Creator, Deleter>& graph) const {
+    return this->operator()(graph.get());
+  }
+};
+
+/**
+@class cudaGraphExecDeleter
+@brief class to create a functor for deleting an executable CUDA graph
+
+This class provides an overloaded function call operator to safely
+destroy an executable CUDA graph using `cudaGraphExecDestroy`.
+*/
+class cudaGraphExecDeleter {
+
+  public:
+
+  /**
+  @brief deletes an executable CUDA graph
+
+  Calls `cudaGraphExecDestroy` to release the executable CUDA graph resource if it is valid.
+
+  @param executable the executable CUDA graph to be destroyed
+  */
+  void operator () (cudaGraphExec_t executable) const {
+    cudaGraphExecDestroy(executable);
+  }
+};
+
+/**
+@class cudaGraphExecBase
+
+@brief class to create an executable CUDA graph with unique ownership
+
+@tparam Creator functor to create the executable CUDA graph (used in constructor)
+@tparam Deleter functor to delete the executable CUDA graph (used in destructor)
+
+This class wraps a `cudaGraphExec_t` handle with `std::unique_ptr` to ensure proper
+resource management and automatic cleanup.
+*/
+template <typename Creator, typename Deleter>
+class cudaGraphExecBase : public std::unique_ptr<std::remove_pointer_t<cudaGraphExec_t>, Deleter> {
+
+  static_assert(std::is_pointer_v<cudaGraphExec_t>, "cudaGraphExec_t is not a pointer type");
+
+  public:
+
+  /**
+  @brief base std::unique_ptr type
+  */
+  using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraphExec_t>, Deleter>;
+
+  /**
+  @brief constructs a `cudaGraphExec` object by passing the given arguments to the executable CUDA graph creator
+
+  Constructs a `cudaGraphExec` object by passing the given arguments to the executable CUDA graph creator
+
+  @param args arguments to pass to the executable CUDA graph creator
+  */
+  template <typename... ArgsT>
+  explicit cudaGraphExecBase(ArgsT&& ... args) : base_type(
+    Creator{}(std::forward<ArgsT>(args)...), Deleter()
+  ) {}
+
+  /**
+  @brief constructs a `cudaGraphExec` from the given rhs using move semantics
+  */
+  cudaGraphExecBase(cudaGraphExecBase&&) = default;
+
+  /**
+  @brief assign the rhs to `*this` using move semantics
+  */
+  cudaGraphExecBase& operator = (cudaGraphExecBase&&) = default;
+
+  // ----------------------------------------------------------------------------------------------
+  // Update Methods
+  // ----------------------------------------------------------------------------------------------
+
+  /**
+  @brief updates parameters of a host task
+
+  This method updates the parameter of the given host task (similar to tf::cudaFlow::host).
+  */
+  template <typename C>
+  void host(cudaTask task, C&& callable, void* user_data);
+
+  /**
+  @brief updates parameters of a kernel task
+
+  The method is similar to tf::cudaFlow::kernel but operates on a task
+  of type tf::cudaTaskType::KERNEL.
+  The kernel function name must NOT change.
+  */
+  template <typename F, typename... ArgsT>
+  void kernel(
+    cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... 
args + ); + + /** + @brief updates parameters of a memset task + + The method is similar to tf::cudaFlow::memset but operates on a task + of type tf::cudaTaskType::MEMSET. + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + void memset(cudaTask task, void* dst, int ch, size_t count); + + /** + @brief updates parameters of a memcpy task + + The method is similar to tf::cudaFlow::memcpy but operates on a task + of type tf::cudaTaskType::MEMCPY. + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + void memcpy(cudaTask task, void* tgt, const void* src, size_t bytes); + + /** + @brief updates parameters of a memset task to a zero task + + The method is similar to tf::cudaFlow::zero but operates on + a task of type tf::cudaTaskType::MEMSET. + + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + void zero(cudaTask task, T* dst, size_t count); + + /** + @brief updates parameters of a memset task to a fill task + + The method is similar to tf::cudaFlow::fill but operates on a task + of type tf::cudaTaskType::MEMSET. + + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + void fill(cudaTask task, T* dst, T value, size_t count); + + /** + @brief updates parameters of a memcpy task to a copy task + + The method is similar to tf::cudaFlow::copy but operates on a task + of type tf::cudaTaskType::MEMCPY. + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + template , void>* = nullptr + > + void copy(cudaTask task, T* tgt, const T* src, size_t num); + + //--------------------------------------------------------------------------- + // Algorithm Primitives + //--------------------------------------------------------------------------- + + /** + @brief updates a single-threaded kernel task + + This method is similar to cudaFlow::single_task but operates + on an existing task. 
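+
+  For example, the following sketch (assuming `exec` is a `tf::cudaGraphExec`,
+  `task` was created by `single_task` on the originating graph, and `data` is
+  a device pointer) redirects the task to a new callable:
+
+  @code{.cpp}
+  exec.single_task(task, [data] __device__ () { data[0] = 123; });
+  @endcode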
+ */ + template + void single_task(cudaTask task, C c); + + /** + @brief updates parameters of a `for_each` kernel task created from the CUDA graph of `*this` + */ + template + void for_each(cudaTask task, I first, I last, C callable); + + /** + @brief updates parameters of a `for_each_index` kernel task created from the CUDA graph of `*this` + */ + template + void for_each_index(cudaTask task, I first, I last, I step, C callable); + + /** + @brief updates parameters of a `transform` kernel task created from the CUDA graph of `*this` + */ + template + void transform(cudaTask task, I first, I last, O output, C c); + + /** + @brief updates parameters of a `transform` kernel task created from the CUDA graph of `*this` + */ + template + void transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c); + + + private: + + cudaGraphExecBase(const cudaGraphExecBase&) = delete; + + cudaGraphExecBase& operator = (const cudaGraphExecBase&) = delete; +}; + +// ------------------------------------------------------------------------------------------------ +// update methods +// ------------------------------------------------------------------------------------------------ + +// Function: host +template +template +void cudaGraphExecBase::host(cudaTask task, C&& func, void* user_data) { + cudaHostNodeParams p {func, user_data}; + TF_CHECK_CUDA( + cudaGraphExecHostNodeSetParams(this->get(), task._native_node, &p), + "failed to update kernel parameters on ", task + ); +} + +// Function: update kernel parameters +template +template +void cudaGraphExecBase::kernel( + cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT... args +) { + cudaKernelNodeParams p; + + void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... }; + p.func = (void*)f; + p.gridDim = g; + p.blockDim = b; + p.sharedMemBytes = s; + p.kernelParams = arguments; + p.extra = nullptr; + + TF_CHECK_CUDA( + cudaGraphExecKernelNodeSetParams(this->get(), task._native_node, &p), + "failed to update kernel parameters on ", task + ); +} + +// Function: update copy parameters +template +template , void>*> +void cudaGraphExecBase::copy(cudaTask task, T* tgt, const T* src, size_t num) { + auto p = cuda_get_copy_parms(tgt, src, num); + TF_CHECK_CUDA( + cudaGraphExecMemcpyNodeSetParams(this->get(), task._native_node, &p), + "failed to update memcpy parameters on ", task + ); +} + +// Function: update memcpy parameters +template +void cudaGraphExecBase::memcpy( + cudaTask task, void* tgt, const void* src, size_t bytes +) { + auto p = cuda_get_memcpy_parms(tgt, src, bytes); + + TF_CHECK_CUDA( + cudaGraphExecMemcpyNodeSetParams(this->get(), task._native_node, &p), + "failed to update memcpy parameters on ", task + ); +} + +// Procedure: memset +template +void cudaGraphExecBase::memset(cudaTask task, void* dst, int ch, size_t count) { + auto p = cuda_get_memset_parms(dst, ch, count); + TF_CHECK_CUDA( + cudaGraphExecMemsetNodeSetParams(this->get(), task._native_node, &p), + "failed to update memset parameters on ", task + ); +} + +// Procedure: fill +template +template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +void cudaGraphExecBase::fill(cudaTask task, T* dst, T value, size_t count) { + auto p = cuda_get_fill_parms(dst, value, count); + TF_CHECK_CUDA( + cudaGraphExecMemsetNodeSetParams(this->get(), task._native_node, &p), + "failed to update memset parameters on ", task + ); +} + +// Procedure: zero +template +template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +void cudaGraphExecBase::zero(cudaTask task, T* 
dst, size_t count) { + auto p = cuda_get_zero_parms(dst, count); + TF_CHECK_CUDA( + cudaGraphExecMemsetNodeSetParams(this->get(), task._native_node, &p), + "failed to update memset parameters on ", task + ); +} + +//------------------------------------------------------------------------------------------------- +// forward declaration +//------------------------------------------------------------------------------------------------- + +/** +@private +*/ +template +cudaStreamBase& cudaStreamBase::run(cudaGraphExec_t exec) { + TF_CHECK_CUDA( + cudaGraphLaunch(exec, this->get()), "failed to launch a CUDA executable graph" + ); + return *this; +} + +/** +@private +*/ +template +template +cudaStreamBase& cudaStreamBase::run(const cudaGraphExecBase& exec) { + return run(exec.get()); +} + + + +} // end of namespace tf ------------------------------------------------------------------------- diff --git a/taskflow/cuda/cuda_memory.hpp b/taskflow/cuda/cuda_memory.hpp index 44648683e..76aa10167 100644 --- a/taskflow/cuda/cuda_memory.hpp +++ b/taskflow/cuda/cuda_memory.hpp @@ -142,7 +142,7 @@ inline void cuda_memcpy_async( @brief initializes or sets GPU memory to the given value byte by byte @param stream stream identifier -@param devPtr pointer to GPU mempry +@param devPtr pointer to GPU memory @param value value to set for each byte of the specified memory @param count size in bytes to set @@ -379,15 +379,7 @@ struct cudaSharedMemory // ---------------------------------------------------------------------------- /** -@class cudaDeviceAllocator - -@brief class to create a CUDA device allocator - -@tparam T element type - -A %cudaDeviceAllocator enables device-specific allocation for -standard library containers. It is typically passed as template parameter -when declaring standard library containers (e.g. std::vector). +@private */ template class cudaDeviceAllocator { @@ -529,7 +521,7 @@ class cudaDeviceAllocator { A call to member allocate with the value returned by this function can still fail to allocate the requested storage. - @return the nubmer of elements that might be allcoated as maximum + @return the number of elements that might be allocated as maximum by a call to member allocate */ size_type max_size() const noexcept { return size_type {-1}; } @@ -575,15 +567,7 @@ class cudaDeviceAllocator { // ---------------------------------------------------------------------------- /** -@class cudaUSMAllocator - -@brief class to create a unified shared memory (USM) allocator - -@tparam T element type - -A %cudaUSMAllocator enables using unified shared memory (USM) allocation for -standard library containers. It is typically passed as template parameter -when declaring standard library containers (e.g. std::vector). +@private */ template class cudaUSMAllocator { @@ -725,7 +709,7 @@ class cudaUSMAllocator { A call to member allocate with the value returned by this function can still fail to allocate the requested storage. 
- @return the nubmer of elements that might be allcoated as maximum + @return the number of elements that might be allocated as maximum by a call to member allocate */ size_type max_size() const noexcept { return size_type {-1}; } diff --git a/taskflow/cuda/cuda_object.hpp b/taskflow/cuda/cuda_object.hpp deleted file mode 100644 index e30d3a52d..000000000 --- a/taskflow/cuda/cuda_object.hpp +++ /dev/null @@ -1,287 +0,0 @@ -#pragma once - -#include "cuda_error.hpp" - -namespace tf { - -/** -@brief per-thread object pool to manage CUDA device object - -@tparam H object type -@tparam C function object to create a library object -@tparam D function object to delete a library object - -A CUDA device object has a lifetime associated with a device, -for example, @c cudaStream_t, @c cublasHandle_t, etc. -Creating a device object is typically expensive (e.g., 10-200 ms) -and destroying it may trigger implicit device synchronization. -For applications tha intensively make use of device objects, -it is desirable to reuse them as much as possible. - -There exists an one-to-one relationship between CUDA devices in CUDA Runtime API -and CUcontexts in the CUDA Driver API within a process. -The specific context which the CUDA Runtime API uses for a device -is called the device's primary context. -From the perspective of the CUDA Runtime API, -a device and its primary context are synonymous. - -We design the device object pool in a decentralized fashion by keeping -(1) a global pool to keep track of potentially usable objects and -(2) a per-thread pool to footprint objects with shared ownership. -The global pool does not own the object and therefore does not destruct any of them. -The per-thread pool keeps the footprints of objects with shared ownership -and will destruct them if the thread holds the last reference count after it joins. -The motivation of this decentralized control is to avoid device objects -from being destroyed while the context had been destroyed due to driver shutdown. - -*/ -template -class cudaPerThreadDeviceObjectPool { - - public: - - /** - @brief structure to store a context object - */ - struct Object { - - int device; - H value; - - Object(int); - ~Object(); - - Object(const Object&) = delete; - Object(Object&&) = delete; - }; - - private: - - // Master thread hold the storage to the pool. - // Due to some ordering, cuda context may be destroyed when the master - // program thread destroys the cuda object. - // Therefore, we use a decentralized approach to let child thread - // destroy cuda objects while the master thread only keeps a weak reference - // to those objects for reuse. 
- struct cudaGlobalDeviceObjectPool { - - std::shared_ptr acquire(int); - void release(int, std::weak_ptr); - - std::mutex mutex; - std::unordered_map>> pool; - }; - - public: - - /** - @brief default constructor - */ - cudaPerThreadDeviceObjectPool() = default; - - /** - @brief acquires a device object with shared ownership - */ - std::shared_ptr acquire(int); - - /** - @brief releases a device object with moved ownership - */ - void release(std::shared_ptr&&); - - /** - @brief queries the number of device objects with shared ownership - */ - size_t footprint_size() const; - - private: - - inline static cudaGlobalDeviceObjectPool _shared_pool; - - std::unordered_set> _footprint; -}; - -// ---------------------------------------------------------------------------- -// cudaPerThreadDeviceObject::cudaHanale definition -// ---------------------------------------------------------------------------- - -template -cudaPerThreadDeviceObjectPool::Object::Object(int d) : - device {d} { - cudaScopedDevice ctx(device); - value = C{}(); -} - -template -cudaPerThreadDeviceObjectPool::Object::~Object() { - cudaScopedDevice ctx(device); - D{}(value); -} - -// ---------------------------------------------------------------------------- -// cudaPerThreadDeviceObject::cudaHanaldePool definition -// ---------------------------------------------------------------------------- - -template -std::shared_ptr::Object> -cudaPerThreadDeviceObjectPool::cudaGlobalDeviceObjectPool::acquire(int d) { - std::scoped_lock lock(mutex); - if(auto itr = pool.find(d); itr != pool.end()) { - while(!itr->second.empty()) { - auto sptr = itr->second.back().lock(); - itr->second.pop_back(); - if(sptr) { - return sptr; - } - } - } - return nullptr; -} - -template -void cudaPerThreadDeviceObjectPool::cudaGlobalDeviceObjectPool::release( - int d, std::weak_ptr ptr -) { - std::scoped_lock lock(mutex); - pool[d].push_back(ptr); -} - -// ---------------------------------------------------------------------------- -// cudaPerThreadDeviceObject definition -// ---------------------------------------------------------------------------- - -template -std::shared_ptr::Object> -cudaPerThreadDeviceObjectPool::acquire(int d) { - - auto ptr = _shared_pool.acquire(d); - - if(!ptr) { - ptr = std::make_shared(d); - } - - return ptr; -} - -template -void cudaPerThreadDeviceObjectPool::release( - std::shared_ptr&& ptr -) { - _shared_pool.release(ptr->device, ptr); - _footprint.insert(std::move(ptr)); -} - -template -size_t cudaPerThreadDeviceObjectPool::footprint_size() const { - return _footprint.size(); -} - -// ---------------------------------------------------------------------------- -// cudaObject -// ---------------------------------------------------------------------------- - -/** -@class cudaObject - -@brief class to create an RAII-styled and move-only wrapper for CUDA objects -*/ -template -class cudaObject { - - public: - - /** - @brief constructs a CUDA object from the given one - */ - explicit cudaObject(T obj) : object(obj) {} - - /** - @brief constructs a new CUDA object - */ - cudaObject() : object{ C{}() } {} - - /** - @brief disabled copy constructor - */ - cudaObject(const cudaObject&) = delete; - - /** - @brief move constructor - */ - cudaObject(cudaObject&& rhs) : object{rhs.object} { - rhs.object = nullptr; - } - - /** - @brief destructs the CUDA object - */ - ~cudaObject() { D{}(object); } - - /** - @brief disabled copy assignment - */ - cudaObject& operator = (const cudaObject&) = delete; - - /** - @brief move assignment - */ 
-  cudaObject& operator = (cudaObject&& rhs) {
-    D {} (object);
-    object = rhs.object;
-    rhs.object = nullptr;
-    return *this;
-  }
-
-  /**
-  @brief implicit conversion to the native CUDA stream (cudaObject_t)
-
-  Returns the underlying stream of type @c cudaObject_t.
-  */
-  operator T () const {
-    return object;
-  }
-
-  /**
-  @brief deletes the current CUDA object (if any) and creates a new one
-  */
-  void create() {
-    D {} (object);
-    object = C{}();
-  }
-
-  /**
-  @brief resets this CUDA object to the given one
-  */
-  void reset(T new_obj) {
-    D {} (object);
-    object = new_obj;
-  }
-
-  /**
-  @brief deletes the current CUDA object
-  */
-  void clear() {
-    reset(nullptr);
-  }
-
-  /**
-  @brief releases the ownership of the CUDA object
-  */
-  T release() {
-    auto tmp = object;
-    object = nullptr;
-    return tmp;
-  }
-
-  protected:
-
-  /**
-  @brief the CUDA object
-  */
-  T object;
-};
-
-} // end of namespace tf -----------------------------------------------------
-
-
-
diff --git a/taskflow/cuda/cuda_stream.hpp b/taskflow/cuda/cuda_stream.hpp
index 1e312605b..cbcb7fd45 100644
--- a/taskflow/cuda/cuda_stream.hpp
+++ b/taskflow/cuda/cuda_stream.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "cuda_object.hpp"
+#include "cuda_error.hpp"
 
 /**
 @file cuda_stream.hpp
@@ -10,217 +10,334 @@
 
 namespace tf {
 
-// ----------------------------------------------------------------------------
-// cudaStream
+// ----------------------------------------------------------------------------
+// cudaEventBase
 // ----------------------------------------------------------------------------
-
+
 /**
-@private
+@class cudaEventCreator
+
+@brief class to create functors that construct CUDA events
 */
-struct cudaStreamCreator {
-  cudaStream_t operator () () const {
-    cudaStream_t stream;
-    TF_CHECK_CUDA(cudaStreamCreate(&stream), "failed to create a CUDA stream");
-    return stream;
+class cudaEventCreator {
+
+  public:
+
+  /**
+  @brief creates a new `cudaEvent_t` object using `cudaEventCreate`
+  */
+  cudaEvent_t operator () () const {
+    cudaEvent_t event;
+    TF_CHECK_CUDA(cudaEventCreate(&event), "failed to create a CUDA event");
+    return event;
+  }
+
+  /**
+  @brief creates a new `cudaEvent_t` object using `cudaEventCreateWithFlags` with the given `flag`
+  */
+  cudaEvent_t operator () (unsigned int flag) const {
+    cudaEvent_t event;
+    TF_CHECK_CUDA(
+      cudaEventCreateWithFlags(&event, flag),
+      "failed to create a CUDA event with flag=", flag
+    );
+    return event;
+  }
+
+  /**
+  @brief returns the given `cudaEvent_t` object
+  */
+  cudaEvent_t operator () (cudaEvent_t event) const {
+    return event;
   }
 };
 
 /**
-@private
+@class cudaEventDeleter
+
+@brief class to create a functor that deletes a CUDA event
 */
-struct cudaStreamDeleter {
-  void operator () (cudaStream_t stream) const {
-    if(stream) {
-      cudaStreamDestroy(stream);
-    }
+class cudaEventDeleter {
+  public:
+  /**
+  @brief deletes the given `cudaEvent_t` object using `cudaEventDestroy`
+  */
+  void operator () (cudaEvent_t event) const {
+    cudaEventDestroy(event);
   }
 };
 
 /**
-@class cudaStream
+@class cudaEventBase
 
-@brief class to create an RAII-styled wrapper over a native CUDA stream
+@brief class to create a CUDA event with unique ownership
 
-A cudaStream object is an RAII-styled wrapper over a native CUDA stream
-(@c cudaStream_t).
-A cudaStream object is move-only.
+@tparam Creator functor to create the event (used in constructor)
+@tparam Deleter functor to delete the event (used in destructor)
+
+The `cudaEventBase` class encapsulates a `cudaEvent_t` using `std::unique_ptr`, ensuring that
+CUDA events are properly created and destroyed with unique ownership.
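+
+For example, the following sketch (assuming the default `tf::cudaEvent` alias
+over this class and an existing `tf::cudaStream` named `stream`) records and
+waits on an event:
+
+@code{.cpp}
+tf::cudaEvent event;           // calls cudaEventCreate under the hood
+stream.record(event.get());    // record the event on the stream
+stream.wait(event.get());      // subsequent stream work waits on the event
+@endcode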
*/ -class cudaStream : +template +class cudaEventBase : public std::unique_ptr, Deleter> { - public cudaObject { - - public: + static_assert(std::is_pointer_v, "cudaEvent_t is not a pointer type"); - /** - @brief constructs an RAII-styled object from the given CUDA stream + public: + + /** + @brief base type for the underlying unique pointer + + This alias provides a shorthand for the underlying `std::unique_ptr` type that manages + CUDA event resources with an associated deleter. + */ + using base_type = std::unique_ptr, Deleter>; + + /** + @brief constructs a `cudaEvent` object by passing the given arguments to the event creator + + Constructs a `cudaEvent` object by passing the given arguments to the event creator + + @param args arguments to pass to the event creator + */ + template + explicit cudaEventBase(ArgsT&& ... args) : base_type( + Creator{}(std::forward(args)...), Deleter() + ) { + } + + /** + @brief constructs a `cudaEvent` from the given rhs using move semantics + */ + cudaEventBase(cudaEventBase&&) = default; + + /** + @brief assign the rhs to `*this` using move semantics + */ + cudaEventBase& operator = (cudaEventBase&&) = default; + + private: - Constructs a cudaStream object which owns @c stream. - */ - explicit cudaStream(cudaStream_t stream) : cudaObject(stream) { - } - - /** - @brief default constructor - */ - cudaStream() = default; - - /** - @brief synchronizes the associated stream - - Equivalently calling @c cudaStreamSynchronize to block - until this stream has completed all operations. - */ - void synchronize() const { - TF_CHECK_CUDA( - cudaStreamSynchronize(object), "failed to synchronize a CUDA stream" - ); - } - - /** - @brief begins graph capturing on the stream - - When a stream is in capture mode, all operations pushed into the stream - will not be executed, but will instead be captured into a graph, - which will be returned via cudaStream::end_capture. - - A thread's mode can be one of the following: - + @c cudaStreamCaptureModeGlobal: This is the default mode. - If the local thread has an ongoing capture sequence that was not initiated - with @c cudaStreamCaptureModeRelaxed at @c cuStreamBeginCapture, - or if any other thread has a concurrent capture sequence initiated with - @c cudaStreamCaptureModeGlobal, this thread is prohibited from potentially - unsafe API calls. - - + @c cudaStreamCaptureModeThreadLocal: If the local thread has an ongoing capture - sequence not initiated with @c cudaStreamCaptureModeRelaxed, - it is prohibited from potentially unsafe API calls. - Concurrent capture sequences in other threads are ignored. - - + @c cudaStreamCaptureModeRelaxed: The local thread is not prohibited - from potentially unsafe API calls. Note that the thread is still prohibited - from API calls which necessarily conflict with stream capture, for example, - attempting @c cudaEventQuery on an event that was last recorded - inside a capture sequence. - */ - void begin_capture(cudaStreamCaptureMode m = cudaStreamCaptureModeGlobal) const { - TF_CHECK_CUDA( - cudaStreamBeginCapture(object, m), - "failed to begin capture on stream ", object, " with thread mode ", m - ); - } - - /** - @brief ends graph capturing on the stream - - Equivalently calling @c cudaStreamEndCapture to - end capture on stream and returning the captured graph. - Capture must have been initiated on stream via a call to cudaStream::begin_capture. - If capture was invalidated, due to a violation of the rules of stream capture, - then a NULL graph will be returned. 
-  */
-  cudaGraph_t end_capture() const {
-    cudaGraph_t native_g;
-    TF_CHECK_CUDA(
-      cudaStreamEndCapture(object, &native_g),
-      "failed to end capture on stream ", object
-    );
-    return native_g;
-  }
-
-  /**
-  @brief records an event on the stream
-
-  Equivalently calling @c cudaEventRecord to record an event on this stream,
-  both of which must be on the same CUDA context.
-  */
-  void record(cudaEvent_t event) const {
-    TF_CHECK_CUDA(
-      cudaEventRecord(event, object),
-      "failed to record event ", event, " on stream ", object
-    );
-  }
-
-  /**
-  @brief waits on an event
-
-  Equivalently calling @c cudaStreamWaitEvent to make all future work
-  submitted to stream wait for all work captured in event.
-  */
-  void wait(cudaEvent_t event) const {
-    TF_CHECK_CUDA(
-      cudaStreamWaitEvent(object, event, 0),
-      "failed to wait for event ", event, " on stream ", object
-    );
-  }
+  cudaEventBase(const cudaEventBase&) = delete;
+  cudaEventBase& operator = (const cudaEventBase&) = delete;
 };
 
+/**
+@brief default smart pointer type to manage a `cudaEvent_t` object with unique ownership
+*/
+using cudaEvent = cudaEventBase<cudaEventCreator, cudaEventDeleter>;
+
 // ----------------------------------------------------------------------------
-// cudaEvent
+// cudaStream
 // ----------------------------------------------------------------------------
-
+
 /**
-@private
+@class cudaStreamCreator
+
+@brief class to create functors that construct CUDA streams
 */
-struct cudaEventCreator {
+class cudaStreamCreator {
+
+  public:
 
-  cudaEvent_t operator () () const {
-    cudaEvent_t event;
-    TF_CHECK_CUDA(cudaEventCreate(&event), "failed to create a CUDA event");
-    return event;
+  /**
+  @brief constructs a new `cudaStream_t` object using `cudaStreamCreate`
+  */
+  cudaStream_t operator () () const {
+    cudaStream_t stream;
+    TF_CHECK_CUDA(cudaStreamCreate(&stream), "failed to create a CUDA stream");
+    return stream;
   }
 
-  cudaEvent_t operator () (unsigned int flag) const {
-    cudaEvent_t event;
-    TF_CHECK_CUDA(
-      cudaEventCreateWithFlags(&event, flag),
-      "failed to create a CUDA event with flag=", flag
-    );
-    return event;
+  /**
+  @brief returns the given `cudaStream_t` object
+  */
+  cudaStream_t operator () (cudaStream_t stream) const {
+    return stream;
+  }
 };
 
 /**
-@private
+@class cudaStreamDeleter
+
+@brief class to create a functor that deletes a CUDA stream
 */
-struct cudaEventDeleter {
-  void operator () (cudaEvent_t event) const {
-    if (event != nullptr) {
-      cudaEventDestroy(event);
-    }
+class cudaStreamDeleter {
+
+  public:
+
+  /**
+  @brief deletes the given `cudaStream_t` object
+  */
+  void operator () (cudaStream_t stream) const {
+    cudaStreamDestroy(stream);
   }
 };
 
 /**
-@class cudaEvent
+@class cudaStreamBase
+
+@brief class to create a CUDA stream with unique ownership
 
-@brief class to create an RAII-styled wrapper over a native CUDA event
+@tparam Creator functor to create the stream (used in constructor)
+@tparam Deleter functor to delete the stream (used in destructor)
 
-A cudaEvent object is an RAII-styled wrapper over a native CUDA event
-(@c cudaEvent_t).
-A cudaEvent object is move-only.
+The `cudaStreamBase` class encapsulates a `cudaStream_t` using `std::unique_ptr`, ensuring that
+CUDA streams are properly created and destroyed with unique ownership.
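+
+For example, the following sketch (assuming the default `tf::cudaStream` alias
+over this class and an existing executable graph `exec`) launches the graph
+and blocks until it completes:
+
+@code{.cpp}
+tf::cudaStream stream;           // calls cudaStreamCreate under the hood
+stream.run(exec).synchronize();  // launch the executable graph, then block
+@endcode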
*/ -class cudaEvent : - public cudaObject { +template +class cudaStreamBase : public std::unique_ptr, Deleter> { + static_assert(std::is_pointer_v, "cudaStream_t is not a pointer type"); + public: + + /** + @brief base type for the underlying unique pointer + + This alias provides a shorthand for the underlying `std::unique_ptr` type that manages + CUDA stream resources with an associated deleter. + */ + using base_type = std::unique_ptr, Deleter>; + + /** + @brief constructs a `cudaStream` object by passing the given arguments to the stream creator + + Constructs a `cudaStream` object by passing the given arguments to the stream creator + + @param args arguments to pass to the stream creator + */ + template + explicit cudaStreamBase(ArgsT&& ... args) : base_type( + Creator{}(std::forward(args)...), Deleter() + ) { + } + + /** + @brief constructs a `cudaStream` from the given rhs using move semantics + */ + cudaStreamBase(cudaStreamBase&&) = default; + + /** + @brief assign the rhs to `*this` using move semantics + */ + cudaStreamBase& operator = (cudaStreamBase&&) = default; + + /** + @brief synchronizes the associated stream - /** - @brief constructs an RAII-styled CUDA event object from the given CUDA event - */ - explicit cudaEvent(cudaEvent_t event) : cudaObject(event) { } + Equivalently calling @c cudaStreamSynchronize to block + until this stream has completed all operations. + */ + cudaStreamBase& synchronize() { + TF_CHECK_CUDA( + cudaStreamSynchronize(this->get()), "failed to synchronize a CUDA stream" + ); + return *this; + } + + /** + @brief begins graph capturing on the stream + + When a stream is in capture mode, all operations pushed into the stream + will not be executed, but will instead be captured into a graph, + which will be returned via cudaStream::end_capture. + + A thread's mode can be one of the following: + + @c cudaStreamCaptureModeGlobal: This is the default mode. + If the local thread has an ongoing capture sequence that was not initiated + with @c cudaStreamCaptureModeRelaxed at @c cuStreamBeginCapture, + or if any other thread has a concurrent capture sequence initiated with + @c cudaStreamCaptureModeGlobal, this thread is prohibited from potentially + unsafe API calls. + + + @c cudaStreamCaptureModeThreadLocal: If the local thread has an ongoing capture + sequence not initiated with @c cudaStreamCaptureModeRelaxed, + it is prohibited from potentially unsafe API calls. + Concurrent capture sequences in other threads are ignored. + + + @c cudaStreamCaptureModeRelaxed: The local thread is not prohibited + from potentially unsafe API calls. Note that the thread is still prohibited + from API calls which necessarily conflict with stream capture, for example, + attempting @c cudaEventQuery on an event that was last recorded + inside a capture sequence. + */ + void begin_capture(cudaStreamCaptureMode m = cudaStreamCaptureModeGlobal) const { + TF_CHECK_CUDA( + cudaStreamBeginCapture(this->get(), m), + "failed to begin capture on stream ", this->get(), " with thread mode ", m + ); + } - /** - @brief constructs an RAII-styled CUDA event object - */ - cudaEvent() = default; - - /** - @brief constructs an RAII-styled CUDA event object with the given flag - */ - explicit cudaEvent(unsigned int flag) : cudaObject(cudaEventCreator{}(flag)) { } + /** + @brief ends graph capturing on the stream + + Equivalently calling @c cudaStreamEndCapture to + end capture on stream and returning the captured graph. 
+ Capture must have been initiated on stream via a call to cudaStream::begin_capture.
+ If capture was invalidated, due to a violation of the rules of stream capture,
+ then a NULL graph will be returned.
+ */
+ cudaGraph_t end_capture() const {
+   cudaGraph_t native_g;
+   TF_CHECK_CUDA(
+     cudaStreamEndCapture(this->get(), &native_g),
+     "failed to end capture on stream ", this->get()
+   );
+   return native_g;
+ }
+
+ /**
+ @brief records an event on the stream
+
+ Equivalently calling @c cudaEventRecord to record an event on this stream,
+ both of which must be on the same CUDA context.
+ */
+ void record(cudaEvent_t event) const {
+   TF_CHECK_CUDA(
+     cudaEventRecord(event, this->get()),
+     "failed to record event ", event, " on stream ", this->get()
+   );
+ }
+
+ /**
+ @brief waits on an event
+
+ Equivalently calling @c cudaStreamWaitEvent to make all future work
+ submitted to stream wait for all work captured in event.
+ */
+ void wait(cudaEvent_t event) const {
+   TF_CHECK_CUDA(
+     cudaStreamWaitEvent(this->get(), event, 0),
+     "failed to wait for event ", event, " on stream ", this->get()
+   );
+ }
+
+ /**
+ @brief runs the given executable CUDA graph
+
+ @param exec the given `cudaGraphExec`
+ */
+ template <typename C, typename D>
+ cudaStreamBase& run(const cudaGraphExecBase<C, D>& exec);
+
+ /**
+ @brief runs the given executable CUDA graph
+
+ @param exec the given `cudaGraphExec_t`
+ */
+ cudaStreamBase& run(cudaGraphExec_t exec);
+
+ private:
+
+ cudaStreamBase(const cudaStreamBase&) = delete;
+ cudaStreamBase& operator = (const cudaStreamBase&) = delete;
};

+/**
+@brief default smart pointer type to manage a `cudaStream_t` object with unique ownership
+*/
+using cudaStream = cudaStreamBase<cudaStreamCreator, cudaStreamDeleter>;

} // end of namespace tf -----------------------------------------------------

diff --git a/taskflow/cuda/cuda_task.hpp b/taskflow/cuda/cuda_task.hpp
deleted file mode 100644
index 92fac9ccc..000000000
--- a/taskflow/cuda/cuda_task.hpp
+++ /dev/null
@@ -1,274 +0,0 @@
-#pragma once
-
-#include "cuda_graph.hpp"
-
-/**
-@file cuda_task.hpp
-@brief cudaTask include file
-*/
-
-namespace tf {
-
-// ----------------------------------------------------------------------------
-// cudaTask Types
-// ----------------------------------------------------------------------------
-
-/**
-@enum cudaTaskType
-
-@brief enumeration of all %cudaTask types
-*/
-enum class cudaTaskType : int {
-  /** @brief empty task type */
-  EMPTY = 0,
-  /** @brief host task type */
-  HOST,
-  /** @brief memory set task type */
-  MEMSET,
-  /** @brief memory copy task type */
-  MEMCPY,
-  /** @brief memory copy task type */
-  KERNEL,
-  /** @brief subflow (child graph) task type */
-  SUBFLOW,
-  /** @brief capture task type */
-  CAPTURE,
-  /** @brief undefined task type */
-  UNDEFINED
-};
-
-/**
-@brief convert a cuda_task type to a human-readable string
-*/
-constexpr const char* to_string(cudaTaskType type) {
-  switch(type) {
-    case cudaTaskType::EMPTY: return "empty";
-    case cudaTaskType::HOST: return "host";
-    case cudaTaskType::MEMSET: return "memset";
-    case cudaTaskType::MEMCPY: return "memcpy";
-    case cudaTaskType::KERNEL: return "kernel";
-    case cudaTaskType::SUBFLOW: return "subflow";
-    case cudaTaskType::CAPTURE: return "capture";
-    default: return "undefined";
-  }
-}
-
-// ----------------------------------------------------------------------------
-// cudaTask
-// ----------------------------------------------------------------------------
-
-/**
-@class cudaTask
-
-@brief class to create a task handle over an internal node of a %cudaFlow graph
-*/
-class 
cudaTask { - - friend class cudaFlow; - friend class cudaFlowCapturer; - friend class cudaFlowCapturerBase; - - friend std::ostream& operator << (std::ostream&, const cudaTask&); - - public: - - /** - @brief constructs an empty cudaTask - */ - cudaTask() = default; - - /** - @brief copy-constructs a cudaTask - */ - cudaTask(const cudaTask&) = default; - - /** - @brief copy-assigns a cudaTask - */ - cudaTask& operator = (const cudaTask&) = default; - - /** - @brief adds precedence links from this to other tasks - - @tparam Ts parameter pack - - @param tasks one or multiple tasks - - @return @c *this - */ - template - cudaTask& precede(Ts&&... tasks); - - /** - @brief adds precedence links from other tasks to this - - @tparam Ts parameter pack - - @param tasks one or multiple tasks - - @return @c *this - */ - template - cudaTask& succeed(Ts&&... tasks); - - /** - @brief assigns a name to the task - - @param name a @std_string acceptable string - - @return @c *this - */ - cudaTask& name(const std::string& name); - - /** - @brief queries the name of the task - */ - const std::string& name() const; - - /** - @brief queries the number of successors - */ - size_t num_successors() const; - - /** - @brief queries the number of dependents - */ - size_t num_dependents() const; - - /** - @brief queries if the task is associated with a cudaFlowNode - */ - bool empty() const; - - /** - @brief queries the task type - */ - cudaTaskType type() const; - - /** - @brief dumps the task through an output stream - - @tparam T output stream type with insertion operator (<<) defined - @param ostream an output stream target - */ - template - void dump(T& ostream) const; - - /** - @brief applies an visitor callable to each successor of the task - */ - template - void for_each_successor(V&& visitor) const; - - /** - @brief applies an visitor callable to each dependents of the task - */ - template - void for_each_dependent(V&& visitor) const; - - private: - - cudaTask(cudaFlowNode*); - - cudaFlowNode* _node {nullptr}; -}; - -// Constructor -inline cudaTask::cudaTask(cudaFlowNode* node) : _node {node} { -} - -// Function: precede -template -cudaTask& cudaTask::precede(Ts&&... tasks) { - (_node->_precede(tasks._node), ...); - return *this; -} - -// Function: succeed -template -cudaTask& cudaTask::succeed(Ts&&... 
tasks) { - (tasks._node->_precede(_node), ...); - return *this; -} - -// Function: empty -inline bool cudaTask::empty() const { - return _node == nullptr; -} - -// Function: name -inline cudaTask& cudaTask::name(const std::string& name) { - _node->_name = name; - return *this; -} - -// Function: name -inline const std::string& cudaTask::name() const { - return _node->_name; -} - -// Function: num_successors -inline size_t cudaTask::num_successors() const { - return _node->_successors.size(); -} - -// Function: num_dependents -inline size_t cudaTask::num_dependents() const { - return _node->_dependents.size(); -} - -// Function: type -inline cudaTaskType cudaTask::type() const { - switch(_node->_handle.index()) { - case cudaFlowNode::EMPTY: return cudaTaskType::EMPTY; - case cudaFlowNode::HOST: return cudaTaskType::HOST; - case cudaFlowNode::MEMSET: return cudaTaskType::MEMSET; - case cudaFlowNode::MEMCPY: return cudaTaskType::MEMCPY; - case cudaFlowNode::KERNEL: return cudaTaskType::KERNEL; - case cudaFlowNode::SUBFLOW: return cudaTaskType::SUBFLOW; - case cudaFlowNode::CAPTURE: return cudaTaskType::CAPTURE; - default: return cudaTaskType::UNDEFINED; - } -} - -// Procedure: dump -template -void cudaTask::dump(T& os) const { - os << "cudaTask "; - if(_node->_name.empty()) os << _node; - else os << _node->_name; - os << " [type=" << to_string(type()) << ']'; -} - -// Function: for_each_successor -template -void cudaTask::for_each_successor(V&& visitor) const { - for(size_t i=0; i<_node->_successors.size(); ++i) { - visitor(cudaTask(_node->_successors[i])); - } -} - -// Function: for_each_dependent -template -void cudaTask::for_each_dependent(V&& visitor) const { - for(size_t i=0; i<_node->_dependents.size(); ++i) { - visitor(cudaTask(_node->_dependents[i])); - } -} - -// ---------------------------------------------------------------------------- -// global ostream -// ---------------------------------------------------------------------------- - -/** -@brief overload of ostream inserter operator for cudaTask -*/ -inline std::ostream& operator << (std::ostream& os, const cudaTask& ct) { - ct.dump(os); - return os; -} - -} // end of namespace tf ----------------------------------------------------- - - - diff --git a/taskflow/cuda/cudaflow.hpp b/taskflow/cuda/cudaflow.hpp index 61d5c84dc..770de9c15 100644 --- a/taskflow/cuda/cudaflow.hpp +++ b/taskflow/cuda/cudaflow.hpp @@ -1,8 +1,9 @@ #pragma once #include "../taskflow.hpp" -#include "cuda_task.hpp" -#include "cuda_capturer.hpp" +#include "cuda_graph.hpp" +#include "cuda_graph_exec.hpp" +#include "algorithm/single_task.hpp" /** @file taskflow/cuda/cudaflow.hpp @@ -11,1013 +12,15 @@ namespace tf { -// ---------------------------------------------------------------------------- -// class definition: cudaFlow -// ---------------------------------------------------------------------------- - /** -@class cudaFlow - -@brief class to create a %cudaFlow task dependency graph - -A %cudaFlow is a high-level interface over CUDA Graph to perform GPU operations -using the task dependency graph model. -The class provides a set of methods for creating and launch different tasks -on one or multiple CUDA devices, -for instance, kernel tasks, data transfer tasks, and memory operation tasks. -The following example creates a %cudaFlow of two kernel tasks, @c task1 and -@c task2, where @c task1 runs before @c task2. 
- -@code{.cpp} -tf::Taskflow taskflow; -tf::Executor executor; - -taskflow.emplace([&](tf::cudaFlow& cf){ - // create two kernel tasks - tf::cudaTask task1 = cf.kernel(grid1, block1, shm_size1, kernel1, args1); - tf::cudaTask task2 = cf.kernel(grid2, block2, shm_size2, kernel2, args2); - - // kernel1 runs before kernel2 - task1.precede(task2); -}); - -executor.run(taskflow).wait(); -@endcode - -A %cudaFlow is a task (tf::Task) created from tf::Taskflow -and will be run by @em one worker thread in the executor. -That is, the callable that describes a %cudaFlow -will be executed sequentially. -Inside a %cudaFlow task, different GPU tasks (tf::cudaTask) may run -in parallel scheduled by the CUDA runtime. - -Please refer to @ref GPUTaskingcudaFlow for details. +@brief default smart pointer type to manage a `cudaGraph_t` object with unique ownership */ -class cudaFlow { - - public: - - /** - @brief constructs a %cudaFlow - */ - cudaFlow(); - - /** - @brief destroys the %cudaFlow and its associated native CUDA graph - and executable graph - */ - ~cudaFlow() = default; - - /** - @brief default move constructor - */ - cudaFlow(cudaFlow&&) = default; - - /** - @brief default move assignment operator - */ - cudaFlow& operator = (cudaFlow&&) = default; - - /** - @brief queries the emptiness of the graph - */ - bool empty() const; - - /** - @brief queries the number of tasks - */ - size_t num_tasks() const; - - /** - @brief clears the %cudaFlow object - */ - void clear(); - - /** - @brief dumps the %cudaFlow graph into a DOT format through an - output stream - */ - void dump(std::ostream& os) const; - - /** - @brief dumps the native CUDA graph into a DOT format through an - output stream - - The native CUDA graph may be different from the upper-level %cudaFlow - graph when flow capture is involved. - */ - void dump_native_graph(std::ostream& os) const; - - // ------------------------------------------------------------------------ - // Graph building routines - // ------------------------------------------------------------------------ - - /** - @brief creates a no-operation task - - @return a tf::cudaTask handle - - An empty node performs no operation during execution, - but can be used for transitive ordering. - For example, a phased execution graph with 2 groups of @c n nodes - with a barrier between them can be represented using an empty node - and @c 2*n dependency edges, - rather than no empty node and @c n^2 dependency edges. - */ - cudaTask noop(); - - /** - @brief creates a host task that runs a callable on the host - - @tparam C callable type - - @param callable a callable object with neither arguments nor return - (i.e., constructible from @c std::function) - - @return a tf::cudaTask handle - - A host task can only execute CPU-specific functions and cannot do any CUDA calls - (e.g., @c cudaMalloc). - */ - template - cudaTask host(C&& callable); - - /** - @brief updates parameters of a host task - - The method is similar to tf::cudaFlow::host but operates on a task - of type tf::cudaTaskType::HOST. - */ - template - void host(cudaTask task, C&& callable); - - /** - @brief creates a kernel task - - @tparam F kernel function type - @tparam ArgsT kernel function parameters type - - @param g configured grid - @param b configured block - @param s configured shared memory size in bytes - @param f kernel function - @param args arguments to forward to the kernel function by copy - - @return a tf::cudaTask handle - */ - template - cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... 
args); - - /** - @brief updates parameters of a kernel task - - The method is similar to tf::cudaFlow::kernel but operates on a task - of type tf::cudaTaskType::KERNEL. - The kernel function name must NOT change. - */ - template - void kernel( - cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args - ); - - /** - @brief creates a memset task that fills untyped data with a byte value - - @param dst pointer to the destination device memory area - @param v value to set for each byte of specified memory - @param count size in bytes to set - - @return a tf::cudaTask handle - - A memset task fills the first @c count bytes of device memory area - pointed by @c dst with the byte value @c v. - */ - cudaTask memset(void* dst, int v, size_t count); - - /** - @brief updates parameters of a memset task - - The method is similar to tf::cudaFlow::memset but operates on a task - of type tf::cudaTaskType::MEMSET. - The source/destination memory may have different address values but - must be allocated from the same contexts as the original - source/destination memory. - */ - void memset(cudaTask task, void* dst, int ch, size_t count); - - /** - @brief creates a memcpy task that copies untyped data in bytes - - @param tgt pointer to the target memory block - @param src pointer to the source memory block - @param bytes bytes to copy - - @return a tf::cudaTask handle - - A memcpy task transfers @c bytes of data from a source location - to a target location. Direction can be arbitrary among CPUs and GPUs. - */ - cudaTask memcpy(void* tgt, const void* src, size_t bytes); - - /** - @brief updates parameters of a memcpy task - - The method is similar to tf::cudaFlow::memcpy but operates on a task - of type tf::cudaTaskType::MEMCPY. - The source/destination memory may have different address values but - must be allocated from the same contexts as the original - source/destination memory. - */ - void memcpy(cudaTask task, void* tgt, const void* src, size_t bytes); - - /** - @brief creates a memset task that sets a typed memory block to zero - - @tparam T element type (size of @c T must be either 1, 2, or 4) - @param dst pointer to the destination device memory area - @param count number of elements - - @return a tf::cudaTask handle - - A zero task zeroes the first @c count elements of type @c T - in a device memory area pointed by @c dst. - */ - template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr - > - cudaTask zero(T* dst, size_t count); - - /** - @brief updates parameters of a memset task to a zero task - - The method is similar to tf::cudaFlow::zero but operates on - a task of type tf::cudaTaskType::MEMSET. - - The source/destination memory may have different address values but - must be allocated from the same contexts as the original - source/destination memory. - */ - template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr - > - void zero(cudaTask task, T* dst, size_t count); - - /** - @brief creates a memset task that fills a typed memory block with a value - - @tparam T element type (size of @c T must be either 1, 2, or 4) - - @param dst pointer to the destination device memory area - @param value value to fill for each element of type @c T - @param count number of elements - - @return a tf::cudaTask handle - - A fill task fills the first @c count elements of type @c T with @c value - in a device memory area pointed by @c dst. - The value to fill is interpreted in type @c T rather than byte. 
- */ - template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr - > - cudaTask fill(T* dst, T value, size_t count); - - /** - @brief updates parameters of a memset task to a fill task - - The method is similar to tf::cudaFlow::fill but operates on a task - of type tf::cudaTaskType::MEMSET. - - The source/destination memory may have different address values but - must be allocated from the same contexts as the original - source/destination memory. - */ - template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr - > - void fill(cudaTask task, T* dst, T value, size_t count); - - /** - @brief creates a memcopy task that copies typed data - - @tparam T element type (non-void) - - @param tgt pointer to the target memory block - @param src pointer to the source memory block - @param num number of elements to copy - - @return a tf::cudaTask handle - - A copy task transfers num*sizeof(T) bytes of data from a source location - to a target location. Direction can be arbitrary among CPUs and GPUs. - */ - template , void>* = nullptr - > - cudaTask copy(T* tgt, const T* src, size_t num); - - /** - @brief updates parameters of a memcpy task to a copy task - - The method is similar to tf::cudaFlow::copy but operates on a task - of type tf::cudaTaskType::MEMCPY. - The source/destination memory may have different address values but - must be allocated from the same contexts as the original - source/destination memory. - */ - template , void>* = nullptr - > - void copy(cudaTask task, T* tgt, const T* src, size_t num); - - // ------------------------------------------------------------------------ - // run method - // ------------------------------------------------------------------------ - /** - @brief offloads the %cudaFlow onto a GPU asynchronously via a stream - - @param stream stream for performing this operation - - Offloads the present %cudaFlow onto a GPU asynchronously via - the given stream. - - An offloaded %cudaFlow forces the underlying graph to be instantiated. - After the instantiation, you should not modify the graph topology - but update node parameters. - */ - void run(cudaStream_t stream); - - /** - @brief acquires a reference to the underlying CUDA graph - */ - cudaGraph_t native_graph(); - - /** - @brief acquires a reference to the underlying CUDA graph executable - */ - cudaGraphExec_t native_executable(); - - // ------------------------------------------------------------------------ - // generic algorithms - // ------------------------------------------------------------------------ - - /** - @brief runs a callable with only a single kernel thread - - @tparam C callable type - - @param c callable to run by a single kernel thread - - @return a tf::cudaTask handle - */ - template - cudaTask single_task(C c); - - /** - @brief updates a single-threaded kernel task - - This method is similar to cudaFlow::single_task but operates - on an existing task. 
- */ - template - void single_task(cudaTask task, C c); - - /** - @brief applies a callable to each dereferenced element of the data array - - @tparam I iterator type - @tparam C callable type - - @param first iterator to the beginning (inclusive) - @param last iterator to the end (exclusive) - @param callable a callable object to apply to the dereferenced iterator - - @return a tf::cudaTask handle - - This method is equivalent to the parallel execution of the following loop on a GPU: - - @code{.cpp} - for(auto itr = first; itr != last; itr++) { - callable(*itr); - } - @endcode - */ - template - cudaTask for_each(I first, I last, C callable); - - /** - @brief updates parameters of a kernel task created from - tf::cudaFlow::for_each - - The type of the iterators and the callable must be the same as - the task created from tf::cudaFlow::for_each. - */ - template - void for_each(cudaTask task, I first, I last, C callable); - - /** - @brief applies a callable to each index in the range with the step size - - @tparam I index type - @tparam C callable type - - @param first beginning index - @param last last index - @param step step size - @param callable the callable to apply to each element in the data array - - @return a tf::cudaTask handle - - This method is equivalent to the parallel execution of the following loop on a GPU: - - @code{.cpp} - // step is positive [first, last) - for(auto i=first; ilast; i+=step) { - callable(i); - } - @endcode - */ - template - cudaTask for_each_index(I first, I last, I step, C callable); - - /** - @brief updates parameters of a kernel task created from - tf::cudaFlow::for_each_index - - The type of the iterators and the callable must be the same as - the task created from tf::cudaFlow::for_each_index. - */ - template - void for_each_index( - cudaTask task, I first, I last, I step, C callable - ); - - /** - @brief applies a callable to a source range and stores the result in a target range - - @tparam I input iterator type - @tparam O output iterator type - @tparam C unary operator type - - @param first iterator to the beginning of the input range - @param last iterator to the end of the input range - @param output iterator to the beginning of the output range - @param op the operator to apply to transform each element in the range - - @return a tf::cudaTask handle - - This method is equivalent to the parallel execution of the following loop on a GPU: - - @code{.cpp} - while (first != last) { - *output++ = callable(*first++); - } - @endcode - */ - template - cudaTask transform(I first, I last, O output, C op); - - /** - @brief updates parameters of a kernel task created from - tf::cudaFlow::transform - - The type of the iterators and the callable must be the same as - the task created from tf::cudaFlow::for_each. 
- */ - template - void transform(cudaTask task, I first, I last, O output, C c); - - /** - @brief creates a task to perform parallel transforms over two ranges of items - - @tparam I1 first input iterator type - @tparam I2 second input iterator type - @tparam O output iterator type - @tparam C unary operator type - - @param first1 iterator to the beginning of the input range - @param last1 iterator to the end of the input range - @param first2 iterato - @param output iterator to the beginning of the output range - @param op binary operator to apply to transform each pair of items in the - two input ranges - - @return cudaTask handle - - This method is equivalent to the parallel execution of the following loop on a GPU: - - @code{.cpp} - while (first1 != last1) { - *output++ = op(*first1++, *first2++); - } - @endcode - */ - template - cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op); - - /** - @brief updates parameters of a kernel task created from - tf::cudaFlow::transform +using cudaGraph = cudaGraphBase; - The type of the iterators and the callable must be the same as - the task created from tf::cudaFlow::for_each. - */ - template - void transform( - cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c - ); - - // ------------------------------------------------------------------------ - // subflow - // ------------------------------------------------------------------------ - - /** - @brief constructs a subflow graph through tf::cudaFlowCapturer - - @tparam C callable type constructible from - @c std::function - - @param callable the callable to construct a capture flow - - @return a tf::cudaTask handle - - A captured subflow forms a sub-graph to the %cudaFlow and can be used to - capture custom (or third-party) kernels that cannot be directly constructed - from the %cudaFlow. - - Example usage: - - @code{.cpp} - taskflow.emplace([&](tf::cudaFlow& cf){ - - tf::cudaTask my_kernel = cf.kernel(my_arguments); - - // create a flow capturer to capture custom kernels - tf::cudaTask my_subflow = cf.capture([&](tf::cudaFlowCapturer& capturer){ - capturer.on([&](cudaStream_t stream){ - invoke_custom_kernel_with_stream(stream, custom_arguments); - }); - }); - - my_kernel.precede(my_subflow); - }); - @endcode - */ - template - cudaTask capture(C&& callable); - - /** - @brief updates the captured child graph - - The method is similar to tf::cudaFlow::capture but operates on a task - of type tf::cudaTaskType::SUBFLOW. - The new captured graph must be topologically identical to the original - captured graph. 
- */ - template - void capture(cudaTask task, C callable); - - private: - - cudaFlowGraph _cfg; - cudaGraphExec _exe {nullptr}; -}; - -// Construct a standalone cudaFlow -inline cudaFlow::cudaFlow() { - _cfg._native_handle.create(); -} - -// Procedure: clear -inline void cudaFlow::clear() { - _exe.clear(); - _cfg.clear(); - _cfg._native_handle.create(); -} - -// Function: empty -inline bool cudaFlow::empty() const { - return _cfg._nodes.empty(); -} - -// Function: num_tasks -inline size_t cudaFlow::num_tasks() const { - return _cfg._nodes.size(); -} - -// Procedure: dump -inline void cudaFlow::dump(std::ostream& os) const { - _cfg.dump(os, nullptr, ""); -} - -// Procedure: dump -inline void cudaFlow::dump_native_graph(std::ostream& os) const { - cuda_dump_graph(os, _cfg._native_handle); -} - -// ---------------------------------------------------------------------------- -// Graph building methods -// ---------------------------------------------------------------------------- - -// Function: noop -inline cudaTask cudaFlow::noop() { - - auto node = _cfg.emplace_back( - _cfg, std::in_place_type_t{} - ); - - TF_CHECK_CUDA( - cudaGraphAddEmptyNode( - &node->_native_handle, _cfg._native_handle, nullptr, 0 - ), - "failed to create a no-operation (empty) node" - ); - - return cudaTask(node); -} - -// Function: host -template -cudaTask cudaFlow::host(C&& c) { - - auto node = _cfg.emplace_back( - _cfg, std::in_place_type_t{}, std::forward(c) - ); - - auto h = std::get_if(&node->_handle); - - cudaHostNodeParams p; - p.fn = cudaFlowNode::Host::callback; - p.userData = h; - - TF_CHECK_CUDA( - cudaGraphAddHostNode( - &node->_native_handle, _cfg._native_handle, nullptr, 0, &p - ), - "failed to create a host node" - ); - - return cudaTask(node); -} - -// Function: kernel -template -cudaTask cudaFlow::kernel( - dim3 g, dim3 b, size_t s, F f, ArgsT... args -) { - - auto node = _cfg.emplace_back( - _cfg, std::in_place_type_t{}, (void*)f - ); - - cudaKernelNodeParams p; - void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... 
}; - p.func = (void*)f; - p.gridDim = g; - p.blockDim = b; - p.sharedMemBytes = s; - p.kernelParams = arguments; - p.extra = nullptr; - - TF_CHECK_CUDA( - cudaGraphAddKernelNode( - &node->_native_handle, _cfg._native_handle, nullptr, 0, &p - ), - "failed to create a kernel task" - ); - - return cudaTask(node); -} - -// Function: zero -template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* -> -cudaTask cudaFlow::zero(T* dst, size_t count) { - - auto node = _cfg.emplace_back( - _cfg, std::in_place_type_t{} - ); - - auto p = cuda_get_zero_parms(dst, count); - - TF_CHECK_CUDA( - cudaGraphAddMemsetNode( - &node->_native_handle, _cfg._native_handle, nullptr, 0, &p - ), - "failed to create a memset (zero) task" - ); - - return cudaTask(node); -} - -// Function: fill -template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* -> -cudaTask cudaFlow::fill(T* dst, T value, size_t count) { - - auto node = _cfg.emplace_back( - _cfg, std::in_place_type_t{} - ); - - auto p = cuda_get_fill_parms(dst, value, count); - - TF_CHECK_CUDA( - cudaGraphAddMemsetNode( - &node->_native_handle, _cfg._native_handle, nullptr, 0, &p - ), - "failed to create a memset (fill) task" - ); - - return cudaTask(node); -} - -// Function: copy -template < - typename T, - std::enable_if_t, void>* -> -cudaTask cudaFlow::copy(T* tgt, const T* src, size_t num) { - - auto node = _cfg.emplace_back( - _cfg, std::in_place_type_t{} - ); - - auto p = cuda_get_copy_parms(tgt, src, num); - - TF_CHECK_CUDA( - cudaGraphAddMemcpyNode( - &node->_native_handle, _cfg._native_handle, nullptr, 0, &p - ), - "failed to create a memcpy (copy) task" - ); - - return cudaTask(node); -} - -// Function: memset -inline cudaTask cudaFlow::memset(void* dst, int ch, size_t count) { - - auto node = _cfg.emplace_back( - _cfg, std::in_place_type_t{} - ); - - auto p = cuda_get_memset_parms(dst, ch, count); - - TF_CHECK_CUDA( - cudaGraphAddMemsetNode( - &node->_native_handle, _cfg._native_handle, nullptr, 0, &p - ), - "failed to create a memset task" - ); - - return cudaTask(node); -} - -// Function: memcpy -inline cudaTask cudaFlow::memcpy(void* tgt, const void* src, size_t bytes) { - - auto node = _cfg.emplace_back( - _cfg, std::in_place_type_t{} - ); - - auto p = cuda_get_memcpy_parms(tgt, src, bytes); - - TF_CHECK_CUDA( - cudaGraphAddMemcpyNode( - &node->_native_handle, _cfg._native_handle, nullptr, 0, &p - ), - "failed to create a memcpy task" - ); - - return cudaTask(node); -} - -// ------------------------------------------------------------------------ -// update methods -// ------------------------------------------------------------------------ - -// Function: host -template -void cudaFlow::host(cudaTask task, C&& c) { - - if(task.type() != cudaTaskType::HOST) { - TF_THROW(task, " is not a host task"); - } - - auto h = std::get_if(&task._node->_handle); - - h->func = std::forward(c); -} - -// Function: update kernel parameters -template -void cudaFlow::kernel( - cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT... args -) { - - if(task.type() != cudaTaskType::KERNEL) { - TF_THROW(task, " is not a kernel task"); - } - - cudaKernelNodeParams p; - - void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... 
}; - p.func = (void*)f; - p.gridDim = g; - p.blockDim = b; - p.sharedMemBytes = s; - p.kernelParams = arguments; - p.extra = nullptr; - - TF_CHECK_CUDA( - cudaGraphExecKernelNodeSetParams(_exe, task._node->_native_handle, &p), - "failed to update kernel parameters on ", task - ); -} - -// Function: update copy parameters -template , void>*> -void cudaFlow::copy(cudaTask task, T* tgt, const T* src, size_t num) { - - if(task.type() != cudaTaskType::MEMCPY) { - TF_THROW(task, " is not a memcpy task"); - } - - auto p = cuda_get_copy_parms(tgt, src, num); - - TF_CHECK_CUDA( - cudaGraphExecMemcpyNodeSetParams(_exe, task._node->_native_handle, &p), - "failed to update memcpy parameters on ", task - ); -} - -// Function: update memcpy parameters -inline void cudaFlow::memcpy( - cudaTask task, void* tgt, const void* src, size_t bytes -) { - - if(task.type() != cudaTaskType::MEMCPY) { - TF_THROW(task, " is not a memcpy task"); - } - - auto p = cuda_get_memcpy_parms(tgt, src, bytes); - - TF_CHECK_CUDA( - cudaGraphExecMemcpyNodeSetParams(_exe, task._node->_native_handle, &p), - "failed to update memcpy parameters on ", task - ); -} - -// Procedure: memset -inline void cudaFlow::memset(cudaTask task, void* dst, int ch, size_t count) { - - if(task.type() != cudaTaskType::MEMSET) { - TF_THROW(task, " is not a memset task"); - } - - auto p = cuda_get_memset_parms(dst, ch, count); - - TF_CHECK_CUDA( - cudaGraphExecMemsetNodeSetParams(_exe, task._node->_native_handle, &p), - "failed to update memset parameters on ", task - ); -} - -// Procedure: fill -template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* -> -void cudaFlow::fill(cudaTask task, T* dst, T value, size_t count) { - - if(task.type() != cudaTaskType::MEMSET) { - TF_THROW(task, " is not a memset task"); - } - - auto p = cuda_get_fill_parms(dst, value, count); - - TF_CHECK_CUDA( - cudaGraphExecMemsetNodeSetParams(_exe, task._node->_native_handle, &p), - "failed to update memset parameters on ", task - ); -} - -// Procedure: zero -template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* -> -void cudaFlow::zero(cudaTask task, T* dst, size_t count) { - - if(task.type() != cudaTaskType::MEMSET) { - TF_THROW(task, " is not a memset task"); - } - - auto p = cuda_get_zero_parms(dst, count); - - TF_CHECK_CUDA( - cudaGraphExecMemsetNodeSetParams(_exe, task._node->_native_handle, &p), - "failed to update memset parameters on ", task - ); -} - -// Function: capture -template -void cudaFlow::capture(cudaTask task, C c) { - - if(task.type() != cudaTaskType::SUBFLOW) { - TF_THROW(task, " is not a subflow task"); - } - - // insert a subflow node - // construct a captured flow from the callable - auto node_handle = std::get_if(&task._node->_handle); - //node_handle->graph.clear(); - - cudaFlowCapturer capturer; - c(capturer); - - // obtain the optimized captured graph - capturer._cfg._native_handle.reset(capturer.capture()); - node_handle->cfg = std::move(capturer._cfg); - - TF_CHECK_CUDA( - cudaGraphExecChildGraphNodeSetParams( - _exe, - task._node->_native_handle, - node_handle->cfg._native_handle - ), - "failed to update a captured child graph" - ); -} - -// ---------------------------------------------------------------------------- -// captured flow -// ---------------------------------------------------------------------------- - -// Function: capture -template -cudaTask cudaFlow::capture(C&& c) { - - // insert a subflow node - auto node = _cfg.emplace_back( - _cfg, std::in_place_type_t{} - ); - - // construct a captured flow from the 
callable
-  auto node_handle = std::get_if<cudaFlowNode::Subflow>(&node->_handle);
-
-  // perform capturing
-  cudaFlowCapturer capturer;
-  c(capturer);
-
-  // obtain the optimized captured graph
-  capturer._cfg._native_handle.reset(capturer.capture());
-
-  // move capturer's cudaFlow graph into node
-  node_handle->cfg = std::move(capturer._cfg);
-
-  TF_CHECK_CUDA(
-    cudaGraphAddChildGraphNode(
-      &node->_native_handle,
-      _cfg._native_handle,
-      nullptr,
-      0,
-      node_handle->cfg._native_handle
-    ),
-    "failed to add a cudaFlow capturer task"
-  );
-
-  return cudaTask(node);
-}
-
-// ----------------------------------------------------------------------------
-// run method
-// ----------------------------------------------------------------------------
-
-// Procedure: run
-inline void cudaFlow::run(cudaStream_t stream) {
-  if(!_exe) {
-    _exe.instantiate(_cfg._native_handle);
-  }
-  _exe.launch(stream);
-  _cfg._state = cudaFlowGraph::OFFLOADED;
-}
-
-// Function: native_cfg
-inline cudaGraph_t cudaFlow::native_graph() {
-  return _cfg._native_handle;
-}
-
-// Function: native_executable
-inline cudaGraphExec_t cudaFlow::native_executable() {
-  return _exe;
-}

+/**
+@brief default smart pointer type to manage a `cudaGraphExec_t` object with unique ownership
+*/
+using cudaGraphExec = cudaGraphExecBase<cudaGraphExecCreator, cudaGraphExecDeleter>;

} // end of namespace tf -----------------------------------------------------

diff --git a/taskflow/taskflow.hpp b/taskflow/taskflow.hpp
index c2403f81f..66462b989 100644
--- a/taskflow/taskflow.hpp
+++ b/taskflow/taskflow.hpp
@@ -1,8 +1,16 @@
 #pragma once

+// Feature macros for fine-tuning the performance of Taskflow at compile time
+//
+// Disabled features by default:
+//   + TF_ENABLE_TASK_POOL : enable task pool optimization
+//   + TF_ENABLE_ATOMIC_NOTIFIER : enable atomic notifier (requires C++20)
+//
 #include "core/executor.hpp"
+#include "core/runtime.hpp"
 #include "core/async.hpp"
-#include "algorithm/critical.hpp"
+#include "algorithm/algorithm.hpp"

 /**
 @dir taskflow
@@ -29,17 +37,44 @@
 @brief main taskflow include file
 */

-// TF_VERSION % 100 is the patch level
-// TF_VERSION / 100 % 1000 is the minor version
-// TF_VERSION / 100000 is the major version
-// current version: 3.7.0
-#define TF_VERSION 300700
+/**
+@def TF_VERSION
+
+@brief version of %Taskflow (currently 3.11.0)
+
+The version system is made of a major version number, a minor version number,
+and a patch number:
+  + TF_VERSION % 100 is the patch level
+  + TF_VERSION / 100 % 1000 is the minor version
+  + TF_VERSION / 100000 is the major version
+*/
+#define TF_VERSION 301100
+
+/**
+@def TF_MAJOR_VERSION
+
+@brief major version of %Taskflow, which is equal to `TF_VERSION/100000`
+*/
 #define TF_MAJOR_VERSION TF_VERSION/100000
+
+/**
+@def TF_MINOR_VERSION
+
+@brief minor version of %Taskflow, which is equal to `TF_VERSION / 100 % 1000`
+*/
 #define TF_MINOR_VERSION TF_VERSION/100%1000
+
+/**
+@def TF_PATCH_VERSION
+
+@brief patch version of %Taskflow, which is equal to `TF_VERSION % 100`
+*/
 #define TF_PATCH_VERSION TF_VERSION%100
+
+
 /**
 @brief taskflow namespace
 */
@@ -57,7 +92,7 @@ namespace detail { }
 Release notes are available here: https://taskflow.github.io/taskflow/Releases.html */
 constexpr const char* version() {
-  return "3.7.0";
+  return "3.11.0";
 }

diff --git a/taskflow/utility/iterator.hpp b/taskflow/utility/iterator.hpp
index 8636a3bcc..b861a2077 100644
--- a/taskflow/utility/iterator.hpp
+++ b/taskflow/utility/iterator.hpp
@@ -5,18 +5,204 @@

 namespace tf {

-template <typename T>
-constexpr std::enable_if_t<std::is_integral<std::decay_t<T>>::value, bool>
-is_range_invalid(T beg, T end, T step) {
+/**
+ * @brief checks if the given index range is invalid
+ *
+ * @tparam B type of the beginning index
+ * @tparam E type of the ending index
+ * @tparam S type of the step size
+ *
+ * @param beg starting index of the range
+ * @param end ending index of the range
+ * @param step step size to traverse the range
+ *
+ * @return returns @c true if the range is invalid; @c false otherwise
+ *
+ * A range is considered invalid under the following conditions:
+ *   + The step is zero and the begin and end values are not equal.
+ *   + A positive range (begin < end) with a non-positive step.
+ *   + A negative range (begin > end) with a non-negative step.
+ */
+template <typename B, typename E, typename S>
+constexpr std::enable_if_t<std::is_integral_v<std::decay_t<B>> &&
+                           std::is_integral_v<std::decay_t<E>> &&
+                           std::is_integral_v<std::decay_t<S>>, bool>
+is_index_range_invalid(B beg, E end, S step) {
   return ((step == 0 && beg != end) ||
           (beg < end && step <= 0) ||  // positive range
           (beg > end && step >= 0));   // negative range
 }

-template <typename T>
-constexpr std::enable_if_t<std::is_integral<std::decay_t<T>>::value, size_t>
-distance(T beg, T end, T step) {
+/**
+ * @brief calculates the number of iterations in the given index range
+ *
+ * @tparam B type of the beginning index
+ * @tparam E type of the ending index
+ * @tparam S type of the step size
+ *
+ * @param beg starting index of the range
+ * @param end ending index of the range
+ * @param step step size to traverse the range
+ *
+ * @return returns the number of required iterations to traverse the range
+ *
+ * The distance of a range represents the number of required iterations to traverse the range
+ * from the beginning index to the ending index (exclusive) with the given step size.
+ *
+ * Example 1:
+ * @code{.cpp}
+ * // Range: 0 to 10 with step size 2
+ * size_t dist = distance(0, 10, 2);  // Returns 5, the sequence is [0, 2, 4, 6, 8]
+ * @endcode
+ *
+ * Example 2:
+ * @code{.cpp}
+ * // Range: 10 to 0 with step size -2
+ * size_t dist = distance(10, 0, -2);  // Returns 5, the sequence is [10, 8, 6, 4, 2]
+ * @endcode
+ *
+ * Example 3:
+ * @code{.cpp}
+ * // Range: 5 to 20 with step size 5
+ * size_t dist = distance(5, 20, 5);  // Returns 3, the sequence is [5, 10, 15]
+ * @endcode
+ *
+ * @attention
+ * It is the user's responsibility to ensure the given index range is valid.
+ */
+template <typename B, typename E, typename S>
+constexpr std::enable_if_t<std::is_integral_v<std::decay_t<B>> &&
+                           std::is_integral_v<std::decay_t<E>> &&
+                           std::is_integral_v<std::decay_t<S>>, size_t>
+distance(B beg, E end, S step) {
   return (end - beg + step + (step > 0 ? -1 : 1)) / step;
 }

+/**
+ * @class IndexRange
+ *
+ * @brief class to create an index range of integral indices with a step size
+ *
+ * This class provides functionality for managing a range of indices, where the range
+ * is defined by a starting index, an ending index, and a step size. The indices must
+ * be of an integral type.
+ * For example, the range [0, 10) with a step size 2 represents the five elements,
+ * 0, 2, 4, 6, and 8.
+ *
+ * @tparam T the integral type of the indices
+ *
+ * @attention
+ * It is the user's responsibility to ensure the given range is valid.
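+ *
+ * A short sketch of the intended usage (illustrative only):
+ *
+ * @code{.cpp}
+ * tf::IndexRange<int> range(0, 10, 2);
+ * size_t n = range.size();  // 5 elements: 0, 2, 4, 6, 8
+ * @endcode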
+ */
+template <typename T>
+class IndexRange {

+  static_assert(std::is_integral_v<T>, "index type must be integral");

+public:

+  /**
+  @brief alias for the index type used in the range
+  */
+  using index_type = T;

+  /**
+  @brief constructs an index range object without any initialization
+  */
+  IndexRange() = default;

+  /**
+   * @brief constructs an IndexRange object
+   * @param beg starting index of the range
+   * @param end ending index of the range (exclusive)
+   * @param step_size step size between consecutive indices in the range
+   */
+  explicit IndexRange(T beg, T end, T step_size)
+    : _beg{beg}, _end{end}, _step_size{step_size} {}

+  /**
+   * @brief queries the starting index of the range
+   */
+  T begin() const { return _beg; }

+  /**
+   * @brief queries the ending index of the range
+   */
+  T end() const { return _end; }

+  /**
+   * @brief queries the step size of the range
+   */
+  T step_size() const { return _step_size; }

+  /**
+   * @brief updates the range with the new starting index, ending index, and step size
+   */
+  IndexRange& reset(T begin, T end, T step_size) {
+    _beg = begin;
+    _end = end;
+    _step_size = step_size;
+    return *this;
+  }

+  /**
+   * @brief updates the starting index of the range
+   */
+  IndexRange& begin(T new_begin) { _beg = new_begin; return *this; }

+  /**
+   * @brief updates the ending index of the range
+   */
+  IndexRange& end(T new_end) { _end = new_end; return *this; }

+  /**
+   * @brief updates the step size of the range
+   */
+  IndexRange& step_size(T new_step_size) { _step_size = new_step_size; return *this; }

+  /**
+   * @brief queries the number of elements in the range
+   *
+   * The number of elements is equivalent to the number of iterations in the range.
+   * For instance, the range [0, 10) with step size of 2 will iterate five elements,
+   * 0, 2, 4, 6, and 8.
+   */
+  size_t size() const { return distance(_beg, _end, _step_size); }

+  /**
+   * @brief returns a range from the given discrete domain
+   * @param part_beg starting index of the discrete domain
+   * @param part_end ending index of the discrete domain
+   * @return a new IndexRange object representing the given discrete domain
+   *
+   * The discrete domain of a range refers to a counter-based sequence indexed from 0
+   * to @c N, where @c N is the size (i.e., number of iterated elements) of the range.
+   * For example, a discrete domain of the range [0, 10) with a step size of 2 corresponds
+   * to the sequence 0, 1, 2, 3, and 4, which map to the range elements 0, 2, 4, 6, and 8.
+   *
+   * For a partitioned domain [@c part_beg, @c part_end), this function returns
+   * the corresponding range. For instance, the partitioned domain [2, 5) for the
+   * above example returns the range [4, 10) with the same step size of 2.
+   *
+   * @attention
+   * Users must ensure the specified domain is valid with respect to the range.
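+   *
+   * The example below restates the prose above as code (illustrative only):
+   *
+   * @code{.cpp}
+   * tf::IndexRange<int> range(0, 10, 2);     // elements 0, 2, 4, 6, 8
+   * auto sub = range.discrete_domain(2, 5);  // yields the range [4, 10) with step 2
+   * @endcode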
+   */
+  IndexRange discrete_domain(size_t part_beg, size_t part_end) const {
+    return IndexRange(
+      static_cast<T>(part_beg) * _step_size + _beg,
+      static_cast<T>(part_end) * _step_size + _beg,
+      _step_size
+    );
+  }

+  private:

+  T _beg;
+  T _end;
+  T _step_size;

+};

} // end of namespace tf -----------------------------------------------------

diff --git a/taskflow/utility/latch.hpp b/taskflow/utility/latch.hpp
new file mode 100644
index 000000000..af292e75a
--- /dev/null
+++ b/taskflow/utility/latch.hpp
@@ -0,0 +1,77 @@
+#pragma once
+
+// use tf::Latch
+#include <condition_variable>
+#include <mutex>
+#include <limits>
+
+namespace tf {
+
+class Latch {
+
+private:
+
+  std::ptrdiff_t _counter;
+  mutable std::condition_variable _cv;
+  mutable std::mutex _mutex;
+
+public:
+
+  static constexpr ptrdiff_t (max)() noexcept
+  {
+    return (std::numeric_limits<std::ptrdiff_t>::max)();
+  }
+
+  explicit Latch(std::ptrdiff_t expected)
+    : _counter(expected)
+  {
+    assert(0 <= expected && expected < (max)());
+  }
+
+  ~Latch() = default;
+
+  Latch(const Latch&) = delete;
+  Latch& operator=(const Latch&) = delete;
+
+  void count_down(std::ptrdiff_t update = 1)
+  {
+    std::lock_guard lk(_mutex);
+    assert(0 <= update && update <= _counter);
+    _counter -= update;
+    if (_counter == 0) {
+      _cv.notify_all();
+    }
+  }
+
+  bool try_wait() const noexcept
+  {
+    std::lock_guard lk(_mutex);
+    // no spurious failure
+    return (_counter == 0);
+  }
+
+  void wait() const
+  {
+    std::unique_lock lk(_mutex);
+    while (_counter != 0) {
+      _cv.wait(lk);
+    }
+  }
+
+  void arrive_and_wait(std::ptrdiff_t update = 1)
+  {
+    std::unique_lock lk(_mutex);
+    // equivalent to { count_down(update); wait(); }
+    assert(0 <= update && update <= _counter);
+    _counter -= update;
+    if (_counter == 0) {
+      _cv.notify_all();
+    }
+    while (_counter != 0) {
+      _cv.wait(lk);
+    }
+  }
+};
+
+} // namespace tf -------------------------------------------------------------

diff --git a/taskflow/utility/lazy_string.hpp b/taskflow/utility/lazy_string.hpp
new file mode 100644
index 000000000..dce2340f6
--- /dev/null
+++ b/taskflow/utility/lazy_string.hpp
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <memory>
+#include <string>
+
+namespace tf {
+
+class LazyString {
+
+  public:
+
+  LazyString() = default;
+
+  LazyString(const std::string& str) :
+    _str(str.empty() ? nullptr : std::make_unique<std::string>(str)) {
+  }
+
+  LazyString(std::string&& str) :
+    _str(str.empty() ? nullptr : std::make_unique<std::string>(std::move(str))) {
+  }
+
+  LazyString(const char* str) :
+    _str((!str || str[0] == '\0') ? nullptr : std::make_unique<std::string>(str)) {
+  }
+
+  // Modify the operator to return a const reference
+  operator const std::string& () const noexcept {
+    static const std::string empty_string;
+    return _str ? *_str : empty_string;
+  }
+
+  LazyString& operator = (const std::string& str) {
+    if(_str == nullptr) {
+      _str = std::make_unique<std::string>(str);
+    }
+    else {
+      *_str = str;
+    }
+    return *this;
+  }
+
+  LazyString& operator = (std::string&& str) {
+    if(_str == nullptr) {
+      _str = std::make_unique<std::string>(std::move(str));
+    }
+    else {
+      *_str = std::move(str);
+    }
+    return *this;
+  }
+
+  bool empty() const noexcept {
+    return !_str || _str->empty();
+  }
+
+  size_t size() const noexcept {
+    return _str ? _str->size() : 0;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const LazyString& ls) {
+    os << (ls._str ? *ls._str : "");
+    return os;
+  }
+
+  private:
+
+  std::unique_ptr<std::string> _str;
+
+};
+
+} // end of namespace tf -------------------------------------------------------------------------

diff --git a/taskflow/utility/macros.hpp b/taskflow/utility/macros.hpp
index f184468c5..0aaa6fd87 100644
--- a/taskflow/utility/macros.hpp
+++ b/taskflow/utility/macros.hpp
@@ -1,5 +1,18 @@
 #pragma once

+// ============================================================================
+// C++ Versions
+// ============================================================================
+#define TF_CPP98 199711L
+#define TF_CPP11 201103L
+#define TF_CPP14 201402L
+#define TF_CPP17 201703L
+#define TF_CPP20 202002L
+
+// ============================================================================
+// inline and no-inline
+// ============================================================================
+
 #if defined(_MSC_VER)
   #define TF_FORCE_INLINE __forceinline
 #elif defined(__GNUC__) && __GNUC__ > 3
@@ -16,18 +29,30 @@
   #define TF_NO_INLINE
 #endif

-// ----------------------------------------------------------------------------
+// ============================================================================
+// likely and unlikely
+// ============================================================================

-#ifdef TF_DISABLE_EXCEPTION_HANDLING
-  #define TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, code_block) \
-    code_block;
+#if defined(__GNUC__)
+  #define TF_LIKELY(x) (__builtin_expect((x), 1))
+  #define TF_UNLIKELY(x) (__builtin_expect((x), 0))
 #else
-  #define TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, code_block) \
-    try { \
-      code_block; \
-    } catch(...) { \
-      _process_exception(worker, node); \
-    }
+  #define TF_LIKELY(x) (x)
+  #define TF_UNLIKELY(x) (x)
 #endif
+
+// ----------------------------------------------------------------------------
+
+#define TF_FWD(T, x) std::forward<T>(x)
+
+

diff --git a/taskflow/utility/math.hpp b/taskflow/utility/math.hpp
index f80053e40..2b8ea7dc7 100644
--- a/taskflow/utility/math.hpp
+++ b/taskflow/utility/math.hpp
@@ -1,43 +1,59 @@
 #pragma once

 #include <atomic>
+#include <chrono>

 namespace tf {

-// rounds the given 64-bit unsigned integer to the nearest power of 2
+/**
+ * @brief rounds the given 64-bit unsigned integer up to the next power of 2
+ */
 template <typename T, std::enable_if_t<
-  (std::is_unsigned_v<std::decay_t<T>> && sizeof(T) == 8) , void
+  (std::is_unsigned_v<std::decay_t<T>> && sizeof(T) == 8), void
 >* = nullptr>
 constexpr T next_pow2(T x) {
   if(x == 0) return 1;
   x--;
-  x |= x>>1;
-  x |= x>>2;
-  x |= x>>4;
-  x |= x>>8;
-  x |= x>>16;
-  x |= x>>32;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  x |= x >> 32;
   x++;
   return x;
 }

-// rounds the given 32-bit unsigned integer to the nearest power of 2
+/**
+ * @brief rounds the given 32-bit unsigned integer up to the next power of 2
+ */
 template <typename T, std::enable_if_t<
   (std::is_unsigned_v<std::decay_t<T>> && sizeof(T) == 4), void
 >* = nullptr>
-constexpr T next_pow2(T x) {
-  if(x == 0) return 1;
-  x--;
-  x |= x>>1;
-  x |= x>>2;
-  x |= x>>4;
-  x |= x>>8;
-  x |= x>>16;
-  x++;
-  return x;
+constexpr T next_pow2(T y) {
+  if(y == 0) return 1;
+  y--;
+  y |= y >> 1;
+  y |= y >> 2;
+  y |= y >> 4;
+  y |= y >> 8;
+  y |= y >> 16;
+  y++;
+  return y;
 }

-// checks if the given number if a power of 2
+/**
+ * @brief checks if the given number is a power of 2
+ *
+ * This function determines if the given integer is a power of 2.
+ *
+ * @tparam T The type of the input. Must be an integral type.
+ * @param x The integer to check.
+ * @return `true` if `x` is a power of 2, otherwise `false`.
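+ *
+ * A couple of sample evaluations (illustrative only):
+ * @code{.cpp}
+ * static_assert(tf::is_pow2(64u) == true);
+ * static_assert(tf::is_pow2(48u) == false);
+ * @endcode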
+ *
+ * @attention This function is constexpr and can be evaluated at compile time.
+ *
+ */
 template <typename T, std::enable_if_t<
   std::is_integral_v<std::decay_t<T>>, void>* = nullptr
 >
@@ -45,31 +61,72 @@ constexpr bool is_pow2(const T& x) {
   return x && (!(x&(x-1)));
 }

-//// finds the ceil of x divided by b
-//template <typename T, std::enable_if_t<
-//  std::is_integral_v<std::decay_t<T>>, void>* = nullptr
-//>
-//constexpr T ceil(const T& x, const T& y) {
-//  //return (x + y - 1) / y;
-//  return (x-1) / y + 1;
-//}
-
 /**
-@brief returns floor(log2(n)), assumes n > 0
-*/
-template <typename T>
-constexpr int log2(T n) {
-  int log = 0;
+ * @brief computes the floor of the base-2 logarithm of a number using count-leading-zeros (CLZ)
+ *
+ * This function efficiently calculates the floor of `log2(n)` for both 32-bit and 64-bit integers.
+ *
+ * @tparam T integer type (uint32_t or uint64_t).
+ * @param n input number.
+ * @return floor of `log2(n)`
+ */
+template <typename T>
+constexpr size_t floor_log2(T n) {
+
+  static_assert(std::is_unsigned_v<T>, "log2 only supports unsigned integer types");
+
+#if defined(_MSC_VER)
+  unsigned long index;
+  if constexpr (sizeof(T) == 8) {
+    _BitScanReverse64(&index, n);
+  } else {
+    _BitScanReverse(&index, static_cast<unsigned long>(n));
+  }
+  return static_cast<size_t>(index);
+#elif defined(__GNUC__) || defined(__clang__)
+  if constexpr (sizeof(T) == 8) {
+    return 63 - __builtin_clzll(n);
+  } else {
+    return 31 - __builtin_clz(n);
+  }
+#else
+  // Portable fallback: Uses bit shifts to count leading zeros manually
+  size_t log = 0;
   while (n >>= 1) {
     ++log;
   }
   return log;
+#endif
 }

 /**
-@brief finds the median of three numbers of dereferenced iterators using
-       the given comparator
+@brief returns the floor of `log2(N)` at compile time
 */
+template <size_t N>
+constexpr size_t static_floor_log2() {
+  return (N < 2) ? 0 : 1 + static_floor_log2<(N >> 1)>();
+  //auto log = 0;
+  //while (N >>= 1) {
+  //  ++log;
+  //}
+  //return log;
+}
+
+/**
+ * @brief finds the median of three numbers pointed to by iterators using the given comparator
+ *
+ * This function determines the median value of the elements pointed to by
+ * three random-access iterators using the provided comparator.
+ *
+ * @tparam RandItr The type of the random-access iterator.
+ * @tparam C The type of the comparator.
+ * @param l Iterator to the first element.
+ * @param m Iterator to the second element.
+ * @param r Iterator to the third element.
+ * @param cmp The comparator used to compare the dereferenced iterator values.
+ * @return The iterator pointing to the median value among the three elements.
+ *
+ */
 template <typename RandItr, typename C>
 RandItr median_of_three(RandItr l, RandItr m, RandItr r, C cmp) {
   return cmp(*l, *m) ? (cmp(*m, *r) ? m : (cmp(*l, *r) ? r : l ))
@@ -77,8 +134,22 @@ RandItr median_of_three(RandItr l, RandItr m, RandItr r, C cmp) {
 }

 /**
-@brief finds the pseudo median of a range of items using spreaded
-       nine numbers
+ * @brief finds the pseudo median of a range of items using a spread of nine numbers
+ *
+ * This function computes an approximate median of a range of items by sampling
+ * nine values spread across the range and finding their median. It uses a
+ * combination of the `median_of_three` function to determine the pseudo median.
+ *
+ * @tparam RandItr The type of the random-access iterator.
+ * @tparam C The type of the comparator.
+ * @param beg Iterator to the beginning of the range.
+ * @param end Iterator to the end of the range.
+ * @param cmp The comparator used to compare the dereferenced iterator values.
+ * @return The iterator pointing to the pseudo median of the range.
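+ *
+ * A minimal sketch of the call (illustrative only):
+ * @code{.cpp}
+ * std::vector<int> v = {9, 1, 8, 2, 7, 3, 6, 4, 5};
+ * auto m = tf::pseudo_median_of_nine(v.begin(), v.end(), std::less<int>{});
+ * @endcode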
+ *
+ * @attention The pseudo median is an approximation of the true median and may not
+ *            be the exact middle value of the range.
+ *
+ */
 template <typename RandItr, typename C>
 RandItr pseudo_median_of_nine(RandItr beg, RandItr end, C cmp) {
@@ -93,18 +164,38 @@ RandItr pseudo_median_of_nine(RandItr beg, RandItr end, C cmp) {
 }

 /**
-@brief sorts two elements of dereferenced iterators using the given
-       comparison function
-*/
+ * @brief sorts two elements of dereferenced iterators using the given comparison function
+ *
+ * This function compares two elements pointed to by iterators and swaps them
+ * if they are out of order according to the provided comparator.
+ *
+ * @tparam Iter The type of the iterator.
+ * @tparam Compare The type of the comparator.
+ * @param a Iterator to the first element.
+ * @param b Iterator to the second element.
+ * @param comp The comparator used to compare the dereferenced iterator values.
+ *
+ */
 template <typename Iter, typename Compare>
 void sort2(Iter a, Iter b, Compare comp) {
   if (comp(*b, *a)) std::iter_swap(a, b);
 }

 /**
-@brief sorts three elements of dereferenced iterators using the given
-       comparison function
-*/
+ * @brief Sorts three elements of dereferenced iterators using the given comparison function.
+ *
+ * This function sorts three elements pointed to by iterators in ascending order
+ * according to the provided comparator. The sorting is performed using a sequence
+ * of calls to the `sort2` function to ensure the correct order of elements.
+ *
+ * @tparam Iter The type of the iterator.
+ * @tparam Compare The type of the comparator.
+ * @param a Iterator to the first element.
+ * @param b Iterator to the second element.
+ * @param c Iterator to the third element.
+ * @param comp The comparator used to compare the dereferenced iterator values.
+ *
+ */
 template <typename Iter, typename Compare>
 void sort3(Iter a, Iter b, Iter c, Compare comp) {
   sort2(a, b, comp);
@@ -113,8 +204,19 @@ void sort3(Iter a, Iter b, Iter c, Compare comp) {
 }

 /**
-@brief generates a program-wise unique id of the give type (thread-safe)
-*/
+ * @brief generates a program-wide unique ID of the given type in a thread-safe manner
+ *
+ * This function provides a globally unique identifier of the specified integral type.
+ * It uses a static `std::atomic` counter to ensure thread safety and increments the
+ * counter in a relaxed memory ordering for efficiency.
+ *
+ * @tparam T The type of the ID to generate. Must be an integral type.
+ * @return A unique ID of type `T`.
+ *
+ * @attention The uniqueness of the ID is guaranteed only within the program's lifetime.
+ * @attention The function does not throw exceptions.
+ *
+ */
 template <typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr>
 T unique_id() {
   static std::atomic<T> counter{0};
@@ -122,8 +224,20 @@ T unique_id() {
 }

 /**
-@brief updates an atomic variable with a maximum value
-*/
+ * @brief updates an atomic variable with the maximum value
+ *
+ * This function atomically updates the provided atomic variable `v` to hold
+ * the maximum of its current value and `max_v`. The update is performed using
+ * a relaxed memory ordering for efficiency in non-synchronizing contexts.
+ *
+ * @tparam T The type of the atomic variable. Must be trivially copyable and comparable.
+ * @param v The atomic variable to update.
+ * @param max_v The value to compare with the current value of `v`.
+ *
+ * @attention If multiple threads call this function concurrently, the value of `v`
+ *            will be the maximum value seen across all threads.
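+ *
+ * A minimal sketch of concurrent maximum tracking (`local_result` is a
+ * hypothetical per-thread value, used only for illustration):
+ * @code{.cpp}
+ * std::atomic<int> best{0};
+ * // called from several worker threads
+ * tf::atomic_max(best, local_result);
+ * @endcode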
+ *
+ */
 template <typename T>
 inline void atomic_max(std::atomic<T>& v, const T& max_v) noexcept {
   T prev = v.load(std::memory_order_relaxed);
@@ -134,8 +248,20 @@ inline void atomic_max(std::atomic<T>& v, const T& max_v) noexcept {
 }

 /**
-@brief updates an atomic variable with a minimum value
-*/
+ * @brief updates an atomic variable with the minimum value
+ *
+ * This function atomically updates the provided atomic variable `v` to hold
+ * the minimum of its current value and `min_v`. The update is performed using
+ * a relaxed memory ordering for efficiency in non-synchronizing contexts.
+ *
+ * @tparam T The type of the atomic variable. Must be trivially copyable and comparable.
+ * @param v The atomic variable to update.
+ * @param min_v The value to compare with the current value of `v`.
+ *
+ * @attention If multiple threads call this function concurrently, the value of `v`
+ *            will be the minimum value seen across all threads.
+ *
+ */
 template <typename T>
 inline void atomic_min(std::atomic<T>& v, const T& min_v) noexcept {
   T prev = v.load(std::memory_order_relaxed);
@@ -145,6 +271,167 @@ inline void atomic_min(std::atomic<T>& v, const T& min_v) noexcept {
   }
 }

+/**
+ * @brief generates a random seed based on the current system clock
+ *
+ * This function returns a seed value derived from the number of clock ticks
+ * since the epoch as measured by the system clock. The seed can be used
+ * to initialize random number generators.
+ *
+ * @tparam T The type of the returned seed. Must be an integral type.
+ * @return A seed value based on the system clock.
+ *
+ */
+template <typename T>
+inline T seed() noexcept {
+  return std::chrono::system_clock::now().time_since_epoch().count();
+}
+
+/**
+ * @brief counts the number of trailing zeros in an integer.
+ *
+ * This function provides a portable implementation for counting the number of
+ * trailing zeros across different platforms and integer sizes (32-bit and 64-bit).
+ *
+ * @tparam T integer type (32-bit or 64-bit).
+ * @param x non-zero integer to count trailing zeros from
+ * @return the number of trailing zeros in @c x
+ *
+ * @attention
+ * The behavior is undefined when @c x is 0.
+ */
+template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
+auto ctz(T x) {
+
+  #if defined(_MSC_VER)
+  unsigned long index;
+  if constexpr (sizeof(T) == 8) {
+    _BitScanForward64(&index, x);
+  } else {
+    _BitScanForward(&index, (unsigned long)x);
+  }
+  return index;
+  #elif defined(__GNUC__) || defined(__clang__)
+  if constexpr (sizeof(T) == 8) {
+    return __builtin_ctzll(x);
+  } else {
+    return __builtin_ctz(x);
+  }
+  #else
+  size_t r = 0;
+  while ((x & 1) == 0) {
+    x >>= 1;
+    r++;
+  }
+  return r;
+  #endif
+}
+
+// ------------------------------------------------------------------------------------------------
+// coprime
+// ------------------------------------------------------------------------------------------------
+
+/**
+ * @brief computes a coprime of a given number
+ *
+ * This function finds the largest number less than N that is coprime (i.e., has a greatest
+ * common divisor of 1) with @c N.
+ * If @c N is less than 3, it returns 1 as a default coprime.
+ *
+ * @param N input number for which a coprime is to be found.
+ * @return the largest number < @c N that is coprime to N
+ */
+constexpr size_t coprime(size_t N) {
+  if(N < 3) {
+    return 1;
+  }
+  for (size_t x = N; --x > 0;) {
+    if (std::gcd(x, N) == 1) {
+      return x;
+    }
+  }
+  return 1;
+}
+
+/**
+ * @brief generates a compile-time array of coprimes for numbers from 0 to N-1
+ *
+ * This function constructs a constexpr array where each element at index `i` contains a coprime of `i`
+ * (the largest number less than `i` that is coprime to it).
+ *
+ * @tparam N the size of the array to generate (must be greater than 0).
+ * @return a constexpr array of size @c N where each index holds a coprime of its value.
+ */
+template <size_t N>
+constexpr std::array<size_t, N> make_coprime_lut() {
+  static_assert(N > 0, "N must be greater than 0");
+  std::array<size_t, N> coprimes{};
+  for (size_t n = 0; n < N; ++n) {
+    coprimes[n] = coprime(n);
+  }
+  return coprimes;
+}
+
+
+//class XorShift64 {
+//
+//  public:
+//
+//    explicit XorShift64(uint64_t seed) : _state(seed) {}
+//
+//    uint64_t next() {
+//      _state ^= _state >> 12;
+//      _state ^= _state << 25;
+//      _state ^= _state >> 27;
+//      return _state * 0x2545F4914F6CDD1DULL;  // Scramble for better randomness
+//    }
+//
+//    size_t random_range(size_t min, size_t max) {
+//      return min + (next() % (max - min + 1));
+//    }
+//
+//  private:
+//
+//    uint64_t _state;
+//};
+
+//inline int generate_random_excluding(int worker_id, int W, XorShift64& rng) {
+//  int random_number = rng.random_range(0, 2 * W - 2);   // Range: [0, 2W-2]
+//  return random_number + (random_number >= worker_id);  // Skip worker_id
+//}
+//
+//
+//class Xoroshiro128Plus {
+//
+//  public:
+//
+//    explicit Xoroshiro128Plus(uint64_t seed1, uint64_t seed2) : _state{seed1, seed2} {}
+//
+//    uint64_t next() {
+//      uint64_t s0 = _state[0];
+//      uint64_t s1 = _state[1];
+//      uint64_t result = s0 + s1;
+//
+//      s1 ^= s0;
+//      _state[0] = _rotl(s0, 55) ^ s1 ^ (s1 << 14);  // Scramble _state
+//      _state[1] = _rotl(s1, 36);
+//
+//      return result;
+//    }
+//
+//    int random_range(int min, int max) {
+//      return min + (next() % (max - min + 1));
+//    }
+//
+//  private:
+//
+//    std::array<uint64_t, 2> _state;
+//
+//    static uint64_t _rotl(uint64_t x, int k) {
+//      return (x << k) | (x >> (64 - k));
+//    }
+//};
+
+
 }  // end of namespace tf -----------------------------------------------------
diff --git a/taskflow/utility/mpmc.hpp b/taskflow/utility/mpmc.hpp
new file mode 100644
index 000000000..f9e53ca6d
--- /dev/null
+++ b/taskflow/utility/mpmc.hpp
@@ -0,0 +1,508 @@
+#pragma once
+
+#include <atomic>
+#include <optional>
+#include <array>
+
+#include "os.hpp"
+
+namespace tf {
+
+/**
+ * A 'lockless' bounded multi-producer, multi-consumer queue
+ *
+ * Has the caveat that the queue can *appear* empty even if there are
+ * returned items within it, as a single thread can block progression
+ * of the queue.
+ */
+template <typename T, size_t LogSize>
+class MPMC {
+
+  constexpr static uint64_t BufferSize = 1ull << LogSize;
+  constexpr static uint64_t BufferMask = (BufferSize - 1);
+
+  static_assert((BufferSize >= 2) && ((BufferSize & (BufferSize - 1)) == 0));
+
+public:
+
+  /**
+   * Constructs a bounded multi-producer, multi-consumer queue
+   *
+   * Note: Due to the algorithm used, the buffer size (2^LogSize, fixed at
+   * compile time by the template parameter) must be a power of two and
+   * must be greater than or equal to two.
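+   *
+   * A usage sketch (illustrative; with LogSize = 10 the queue holds 1024 items):
+   *
+   * @code{.cpp}
+   * tf::MPMC<int, 10> queue;
+   * if(queue.try_enqueue(42)) {
+   *   std::optional<int> item = queue.try_dequeue();
+   *   assert(item && *item == 42);
+   * }
+   * @endcode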
+   */
+  explicit MPMC() {
+    for (size_t i = 0; i < _buffer.size(); i++) {
+      _buffer[i].sequence.store(i, std::memory_order_relaxed);
+    }
+    _enqueue_pos.store(0, std::memory_order_relaxed);
+    _dequeue_pos.store(0, std::memory_order_relaxed);
+  }
+
+
+  /**
+   * Enqueues an item into the queue
+   *
+   * @param data Argument to place into the array
+   * @return false if the queue was full (and enqueuing failed),
+   *         true otherwise
+   */
+  bool try_enqueue(T data) {
+    Cell *cell;
+    auto pos = _enqueue_pos.load(std::memory_order_relaxed);
+    for (; ;) {
+      cell = &_buffer[pos & BufferMask];
+      auto seq = cell->sequence.load(std::memory_order_acquire);
+      if (seq == pos) {
+        if (_enqueue_pos.compare_exchange_weak(pos, pos + 1,
+                                               std::memory_order_relaxed)) {
+          break;
+        }
+      } else if (seq < pos) {
+        return false;
+      } else {
+        pos = _enqueue_pos.load(std::memory_order_relaxed);
+      }
+    }
+
+    cell->data = data;
+    cell->sequence.store(pos + 1, std::memory_order_release);
+
+    return true;
+  }
+
+  /**
+   * Enqueues an item into the queue, spinning until a slot is available
+   *
+   * @param data Argument to place into the array
+   */
+  void enqueue(T data) {
+
+    Cell *cell;
+    auto pos = _enqueue_pos.load(std::memory_order_relaxed);
+
+    for (; ;) {
+      cell = &_buffer[pos & BufferMask];
+      auto seq = cell->sequence.load(std::memory_order_acquire);
+      if (seq == pos) {
+        if (_enqueue_pos.compare_exchange_weak(pos, pos + 1,
+                                               std::memory_order_relaxed)) {
+          break;
+        }
+      }
+      else {
+        pos = _enqueue_pos.load(std::memory_order_relaxed);
+      }
+    }
+
+    cell->data = data;
+    cell->sequence.store(pos + 1, std::memory_order_release);
+  }
+
+  /**
+   * Dequeues an item from the queue
+   *
+   * @return a std::optional holding the dequeued item, or std::nullopt
+   *         if the queue was empty (and dequeuing failed)
+   */
+  std::optional<T> try_dequeue() {
+    Cell *cell;
+    auto pos = _dequeue_pos.load(std::memory_order_relaxed);
+    for (; ;) {
+      cell = &_buffer[pos & BufferMask];
+      auto seq = cell->sequence.load(std::memory_order_acquire);
+      if (seq == pos + 1) {
+        if (_dequeue_pos.compare_exchange_weak(pos, pos + 1,
+                                               std::memory_order_relaxed)) {
+          break;
+        }
+      } else if (seq < (pos + 1)) {
+        return std::nullopt;
+      } else {
+        pos = _dequeue_pos.load(std::memory_order_relaxed);
+      }
+    }
+
+    T data = cell->data;
+    cell->sequence.store(pos + BufferMask + 1, std::memory_order_release);
+
+    return data;
+  }
+
+  bool empty() const {
+    auto beg = _dequeue_pos.load(std::memory_order_relaxed);
+    auto end = _enqueue_pos.load(std::memory_order_relaxed);
+    return beg >= end;
+  }
+
+  size_t capacity() const {
+    return BufferSize;
+  }
+
+private:
+
+  struct Cell {
+    T data;
+    std::atomic<uint64_t> sequence;
+  };
+
+  //static const size_t cacheline_size = 64;
+
+  alignas(2*TF_CACHELINE_SIZE) std::array<Cell, BufferSize> _buffer;
+  alignas(2*TF_CACHELINE_SIZE) std::atomic<uint64_t> _enqueue_pos;
+  alignas(2*TF_CACHELINE_SIZE) std::atomic<uint64_t> _dequeue_pos;
+};
+
+// ------------------------------------------------------------------------------------------------
+// specialization for pointer type
+// ------------------------------------------------------------------------------------------------
+
+template <typename T, size_t LogSize>
+class MPMC<T*, LogSize> {
+
+  constexpr static uint64_t BufferSize = 1ull << LogSize;
+  constexpr static uint64_t BufferMask = (BufferSize - 1);
+
+  static_assert((BufferSize >= 2) && ((BufferSize & (BufferSize - 1)) == 0));
+
+public:
+
+  /**
+   * Constructs a bounded multi-producer, multi-consumer queue
+   *
+   * Note: Due to the algorithm used, the buffer size (2^LogSize, fixed at
+   * compile time by the template parameter) must be a power of two and
+   * must be greater than or equal to two.
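+   *
+   * A usage sketch for the pointer specialization (illustrative; `Task` is a
+   * placeholder for any user type):
+   *
+   * @code{.cpp}
+   * tf::MPMC<Task*, 8> queue;       // 256 slots of Task*
+   * Task task;
+   * queue.enqueue(&task);           // spins until a slot is free
+   * Task* t = queue.try_dequeue();  // nullptr if the queue was empty
+   * @endcode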
+   */
+  explicit MPMC() {
+    for (size_t i = 0; i < _buffer.size(); i++) {
+      _buffer[i].sequence.store(i, std::memory_order_relaxed);
+    }
+    _enqueue_pos.store(0, std::memory_order_relaxed);
+    _dequeue_pos.store(0, std::memory_order_relaxed);
+  }
+
+
+  /**
+   * Enqueues an item into the queue
+   *
+   * @param data Argument to place into the array
+   * @return false if the queue was full (and enqueuing failed),
+   *         true otherwise
+   */
+  bool try_enqueue(T* data) {
+    Cell *cell;
+    auto pos = _enqueue_pos.load(std::memory_order_relaxed);
+    for (; ;) {
+      cell = &_buffer[pos & BufferMask];
+      auto seq = cell->sequence.load(std::memory_order_acquire);
+      if (seq == pos) {
+        if (_enqueue_pos.compare_exchange_weak(pos, pos + 1,
+                                               std::memory_order_relaxed)) {
+          break;
+        }
+      } else if (seq < pos) {
+        return false;
+      } else {
+        pos = _enqueue_pos.load(std::memory_order_relaxed);
+      }
+    }
+
+    cell->data = data;
+    cell->sequence.store(pos + 1, std::memory_order_release);
+
+    return true;
+  }
+
+  /**
+   * Enqueues an item into the queue, spinning until a slot is available
+   *
+   * @param data Argument to place into the array
+   */
+  void enqueue(T* data) {
+
+    Cell *cell;
+    auto pos = _enqueue_pos.load(std::memory_order_relaxed);
+
+    for (; ;) {
+      cell = &_buffer[pos & BufferMask];
+      auto seq = cell->sequence.load(std::memory_order_acquire);
+      if (seq == pos) {
+        if (_enqueue_pos.compare_exchange_weak(pos, pos + 1,
+                                               std::memory_order_relaxed)) {
+          break;
+        }
+      }
+      else {
+        pos = _enqueue_pos.load(std::memory_order_relaxed);
+      }
+    }
+
+    cell->data = data;
+    cell->sequence.store(pos + 1, std::memory_order_release);
+  }
+
+  /**
+   * Dequeues an item from the queue
+   *
+   * @return a pointer to the dequeued item, or nullptr if the queue
+   *         was empty (and dequeuing failed)
+   */
+  T* try_dequeue() {
+    Cell *cell;
+    auto pos = _dequeue_pos.load(std::memory_order_relaxed);
+    for (; ;) {
+      cell = &_buffer[pos & BufferMask];
+      auto seq = cell->sequence.load(std::memory_order_acquire);
+      if (seq == pos + 1) {
+        if (_dequeue_pos.compare_exchange_weak(pos, pos + 1,
+                                               std::memory_order_relaxed)) {
+          break;
+        }
+      } else if (seq < (pos + 1)) {
+        return nullptr;
+      } else {
+        pos = _dequeue_pos.load(std::memory_order_relaxed);
+      }
+    }
+
+    auto data = cell->data;
+    cell->sequence.store(pos + BufferMask + 1, std::memory_order_release);
+
+    return data;
+  }
+
+  bool empty() const {
+    auto beg = _dequeue_pos.load(std::memory_order_relaxed);
+    auto end = _enqueue_pos.load(std::memory_order_relaxed);
+    return beg >= end;
+  }
+
+  size_t capacity() const {
+    return BufferSize;
+  }
+
+private:
+
+  struct Cell {
+    T* data;
+    std::atomic<uint64_t> sequence;
+  };
+
+  //static const size_t cacheline_size = 64;
+
+  alignas(2*TF_CACHELINE_SIZE) std::array<Cell, BufferSize> _buffer;
+  alignas(2*TF_CACHELINE_SIZE) std::atomic<uint64_t> _enqueue_pos;
+  alignas(2*TF_CACHELINE_SIZE) std::atomic<uint64_t> _dequeue_pos;
+};
+
+/**
+ * RunQueue is a fixed-size, partially non-blocking deque of Work items.
+ * Operations on front of the queue must be done by a single thread (owner),
+ * operations on back of the queue can be done by multiple threads concurrently.
+ *
+ * Algorithm outline:
+ * All remote threads operating on the queue back are serialized by a mutex.
+ * This ensures that at most two threads access state: owner and one remote
+ * thread (Size aside). The algorithm ensures that the occupied region of the
+ * underlying array is logically continuous (can wraparound, but no stray
+ * occupied elements). Owner operates on one end of this region, remote thread
+ * operates on the other end.
+ * Synchronization between these threads
+ * (potential consumption of the last element and take up of the last empty
+ * element) happens by means of a state variable in each element. States are:
+ * empty, busy (in process of insertion or removal) and ready. Threads claim
+ * elements (empty->busy and ready->busy transitions) by means of a CAS
+ * operation. The finishing transitions (busy->empty and busy->ready) are done
+ * with a plain store, as the element is exclusively owned by the current thread.
+ *
+ * Note: we could permit only pointers as elements, then we would not need a
+ * separate state variable, as a null/non-null pointer value would serve as
+ * state, but that would require malloc/free per operation for large, complex
+ * values (and this is designed to store std::function<void()>).
+template <typename Work, unsigned kSize>
+class RunQueue {
+ public:
+  RunQueue() : front_(0), back_(0) {
+    // require power-of-two for fast masking
+    eigen_plain_assert((kSize & (kSize - 1)) == 0);
+    eigen_plain_assert(kSize > 2);            // why would you do this?
+    eigen_plain_assert(kSize <= (64 << 10));  // leave enough space for counter
+    for (unsigned i = 0; i < kSize; i++) array_[i].state.store(kEmpty, std::memory_order_relaxed);
+  }
+
+  ~RunQueue() { eigen_plain_assert(Size() == 0); }
+
+  // PushFront inserts w at the beginning of the queue.
+  // If queue is full returns w, otherwise returns default-constructed Work.
+  Work PushFront(Work w) {
+    unsigned front = front_.load(std::memory_order_relaxed);
+    Elem* e = &array_[front & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kEmpty || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) return w;
+    front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
+    e->w = std::move(w);
+    e->state.store(kReady, std::memory_order_release);
+    return Work();
+  }
+
+  // PopFront removes and returns the first element in the queue.
+  // If the queue was empty returns default-constructed Work.
+  Work PopFront() {
+    unsigned front = front_.load(std::memory_order_relaxed);
+    Elem* e = &array_[(front - 1) & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kReady || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) return Work();
+    Work w = std::move(e->w);
+    e->state.store(kEmpty, std::memory_order_release);
+    front = ((front - 1) & kMask2) | (front & ~kMask2);
+    front_.store(front, std::memory_order_relaxed);
+    return w;
+  }
+
+  // PushBack adds w at the end of the queue.
+  // If queue is full returns w, otherwise returns default-constructed Work.
+  Work PushBack(Work w) {
+    EIGEN_MUTEX_LOCK lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    Elem* e = &array_[(back - 1) & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kEmpty || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) return w;
+    back = ((back - 1) & kMask2) | (back & ~kMask2);
+    back_.store(back, std::memory_order_relaxed);
+    e->w = std::move(w);
+    e->state.store(kReady, std::memory_order_release);
+    return Work();
+  }
+
+  // PopBack removes and returns the last element in the queue.
+  Work PopBack() {
+    if (Empty()) return Work();
+    EIGEN_MUTEX_LOCK lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    Elem* e = &array_[back & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kReady || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) return Work();
+    Work w = std::move(e->w);
+    e->state.store(kEmpty, std::memory_order_release);
+    back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
+    return w;
+  }
+
+  // PopBackHalf removes and returns the last half of the elements in the queue.
+  // Returns number of elements removed.
+  unsigned PopBackHalf(std::vector<Work>* result) {
+    if (Empty()) return 0;
+    EIGEN_MUTEX_LOCK lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    unsigned size = Size();
+    unsigned mid = back;
+    if (size > 1) mid = back + (size - 1) / 2;
+    unsigned n = 0;
+    unsigned start = 0;
+    for (; static_cast<int>(mid - back) >= 0; mid--) {
+      Elem* e = &array_[mid & kMask];
+      uint8_t s = e->state.load(std::memory_order_relaxed);
+      if (n == 0) {
+        if (s != kReady || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) continue;
+        start = mid;
+      } else {
+        // Note: no need to store a temporary kBusy, we exclusively own these
+        // elements.
+        eigen_plain_assert(s == kReady);
+      }
+      result->push_back(std::move(e->w));
+      e->state.store(kEmpty, std::memory_order_release);
+      n++;
+    }
+    if (n != 0) back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
+    return n;
+  }
+
+  // Size returns current queue size.
+  // Can be called by any thread at any time.
+  unsigned Size() const { return SizeOrNotEmpty<true>(); }
+
+  // Empty tests whether container is empty.
+  // Can be called by any thread at any time.
+  bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
+
+  // Delete all the elements from the queue.
+  void Flush() {
+    while (!Empty()) {
+      PopFront();
+    }
+  }
+
+ private:
+  static const unsigned kMask = kSize - 1;
+  static const unsigned kMask2 = (kSize << 1) - 1;
+
+  enum State {
+    kEmpty,
+    kBusy,
+    kReady,
+  };
+
+  struct Elem {
+    std::atomic<uint8_t> state;
+    Work w;
+  };
+
+  // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
+  // front/back, respectively. The remaining bits contain modification counters
+  // that are incremented on Push operations. This allows us to (1) distinguish
+  // between empty and full conditions (if we would use log(kSize) bits for
+  // position, these conditions would be indistinguishable); (2) obtain
+  // consistent snapshot of front_/back_ for Size operation using the
+  // modification counters.
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> front_;
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> back_;
+  EIGEN_MUTEX mutex_;  // guards `PushBack` and `PopBack` (accesses `back_`)
+
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING Elem array_[kSize];
+
+  // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
+  // only whether the size is 0 is guaranteed to be correct.
+  // Can be called by any thread at any time.
+  template <bool NeedSizeEstimate>
+  unsigned SizeOrNotEmpty() const {
+    // Emptiness plays critical role in thread pool blocking. So we go to great
+    // effort to not produce false positives (claim non-empty queue as empty).
+    unsigned front = front_.load(std::memory_order_acquire);
+    for (;;) {
+      // Capture a consistent snapshot of front/tail.
+      unsigned back = back_.load(std::memory_order_acquire);
+      unsigned front1 = front_.load(std::memory_order_relaxed);
+      if (front != front1) {
+        front = front1;
+        std::atomic_thread_fence(std::memory_order_acquire);
+        continue;
+      }
+      if (NeedSizeEstimate) {
+        return CalculateSize(front, back);
+      } else {
+        // This value will be 0 if the queue is empty, and undefined otherwise.
+        unsigned maybe_zero = ((front ^ back) & kMask2);
+        // Queue size estimate must agree with maybe zero check on the queue
+        // empty/non-empty state.
+        eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
+        return maybe_zero;
+      }
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE unsigned CalculateSize(unsigned front, unsigned back) const {
+    int size = (front & kMask2) - (back & kMask2);
+    // Fix overflow.
+    if (EIGEN_PREDICT_FALSE(size < 0)) size += 2 * kSize;
+    // Order of modification in push/pop is crafted to make the queue look
+    // larger than it is during concurrent modifications. E.g. push can
+    // increment size before the corresponding pop has decremented it.
+    // So the computed size can be up to kSize + 1, fix it.
+    if (EIGEN_PREDICT_FALSE(size > static_cast<int>(kSize))) size = kSize;
+    return static_cast<unsigned>(size);
+  }
+
+  RunQueue(const RunQueue&) = delete;
+  void operator=(const RunQueue&) = delete;
+};
+*/
+
+
+}  // end of namespace tf -----------------------------------------------------
+
diff --git a/taskflow/utility/object_pool.hpp b/taskflow/utility/object_pool.hpp
index 34d60fb80..d9f225494 100644
--- a/taskflow/utility/object_pool.hpp
+++ b/taskflow/utility/object_pool.hpp
@@ -32,7 +32,7 @@ namespace tf {
 // Different from the normal memory allocator, object pool allocates
 // only one object at a time.
 //
-// Internall, we use the following variables to maintain blocks and heaps:
+// Internally, we use the following variables to maintain blocks and heaps:
 // X: size in byte of a item slot
 // M: number of items per block
 // F: emptiness threshold
@@ -356,7 +356,7 @@ template <typename P, typename Q>
 constexpr P* ObjectPool<T, S>::_parent_class_of(
   Q* ptr, const Q P::*member
 ) {
-  return (P*)( (char*)ptr - _offset_in_class(member));
+  return reinterpret_cast<P*>(reinterpret_cast<char*>(ptr) - _offset_in_class(member));
 }
 
 // Function: _parent_class_of
@@ -365,7 +365,7 @@ template <typename P, typename Q>
 constexpr P* ObjectPool<T, S>::_parent_class_of(
   const Q* ptr, const Q P::*member
 ) const {
-  return (P*)( (char*)ptr - _offset_in_class(member));
+  return reinterpret_cast<P*>(reinterpret_cast<char*>(const_cast<Q*>(ptr)) - _offset_in_class(member));
 }
 
 // Function: _block_of
@@ -625,10 +625,6 @@ T* ObjectPool<T, S>::animate(ArgsT&&... args) {
   //s = static_cast<Block*>(std::malloc(sizeof(Block)));
   s = new Block();
 
-  if(s == nullptr) {
-    throw std::bad_alloc();
-  }
-
   s->heap = &h;
   s->i = 0;
   s->u = 0;
diff --git a/taskflow/utility/os.hpp b/taskflow/utility/os.hpp
index 23ac3011d..c910fd08e 100644
--- a/taskflow/utility/os.hpp
+++ b/taskflow/utility/os.hpp
@@ -3,6 +3,7 @@
 #include <cstdlib>
 #include <cstdio>
 #include <string>
+#include <thread>
 
 #define TF_OS_LINUX 0
 #define TF_OS_DRAGONFLY 0
@@ -96,7 +97,6 @@
 #if defined(__i386__) || defined(__x86_64__)
   #define TF_CACHELINE_SIZE 64
 #elif defined(__powerpc64__)
-  // TODO
   // This is the L1 D-cache line size of our Power7 machines.
   // Need to check if this is appropriate for other PowerPC64 systems.
   #define TF_CACHELINE_SIZE 128
@@ -120,24 +120,67 @@
 
 
-//-----------------------------------------------------------------------------
-// pause
-//-----------------------------------------------------------------------------
-//#if __has_include (<immintrin.h>)
-//  #define TF_HAS_MM_PAUSE 1
-//  #include <immintrin.h>
-//#endif
-
 namespace tf {
 
-// Struct: CachelineAligned
-// Due to prefetch, we typically do 2x cacheline for the alignment.
+/**
+  @class CachelineAligned
+
+  @brief class to ensure cacheline-aligned storage for an object
+
+  @tparam T The type of the stored object.
+
+  This utility class aligns the stored object `data` to twice the size of a cacheline.
+  The alignment improves performance by optimizing data access in cache-sensitive scenarios.
+
+  @code{.cpp}
+  // create two integers on two separate cachelines to avoid false sharing
+  tf::CachelineAligned<int> counter1;
+  tf::CachelineAligned<int> counter2;
+
+  // two threads access the two counters without false sharing
+  std::thread t1([&]{ counter1.get() = 1; });
+  std::thread t2([&]{ counter2.get() = 2; });
+  t1.join();
+  t2.join();
+  @endcode
+*/
 template <typename T>
-struct CachelineAligned {
+class CachelineAligned {
+  public:
+  /**
+   * @brief The stored object, aligned to twice the cacheline size.
+   */
   alignas (2*TF_CACHELINE_SIZE) T data;
+
+  /**
+   * @brief accesses the underlying object
+   *
+   * @return a reference to the underlying object.
+   */
+  T& get() { return data; }
+
+  /**
+   * @brief accesses the underlying object as a constant reference
+   *
+   * @return a constant reference to the underlying object.
+   */
+  const T& get() const { return data; }
 };
 
-// Function: get_env
+/**
+ * @brief retrieves the value of an environment variable
+ *
+ * This function fetches the value of an environment variable by name.
+ * If the variable is not found, it returns an empty string.
+ *
+ * @param str The name of the environment variable to retrieve.
+ * @return The value of the environment variable as a string, or an empty string if not found.
+ *
+ * @attention The implementation differs between Windows and POSIX platforms:
+ *            - On Windows, it uses `_dupenv_s` to fetch the value.
+ *            - On POSIX, it uses `std::getenv`.
+ *
+ */
 inline std::string get_env(const std::string& str) {
 #ifdef _MSC_VER
   char *ptr = nullptr;
@@ -156,7 +199,19 @@ inline std::string get_env(const std::string& str) {
 #endif
 }
 
-// Function: has_env
+/**
+ * @brief checks whether an environment variable is defined
+ *
+ * This function determines if a specific environment variable exists in the current environment.
+ *
+ * @param str The name of the environment variable to check.
+ * @return `true` if the environment variable exists, `false` otherwise.
+ *
+ * @attention The implementation differs between Windows and POSIX platforms:
+ *            - On Windows, it uses `_dupenv_s` to check for the variable's presence.
+ *            - On POSIX, it uses `std::getenv` to check for the variable's presence.
+ *
+ */
 inline bool has_env(const std::string& str) {
 #ifdef _MSC_VER
   char *ptr = nullptr;
@@ -175,12 +230,84 @@ inline bool has_env(const std::string& str) {
 #endif
 }
 
-// Procedure: relax_cpu
-//inline void relax_cpu() {
-//#ifdef TF_HAS_MM_PAUSE
-//  _mm_pause();
-//#endif
-//}
+/**
+ * @fn pause
+ *
+ * This function is used in spin-wait loops to hint to the CPU that the current
+ * thread is in a busy-wait state.
+ * It helps reduce power consumption and improve performance on hyper-threaded
+ * processors by preventing the CPU from consuming unnecessary cycles while waiting.
+ * It is particularly useful in low-contention scenarios, where the thread is
+ * likely to quickly acquire the lock or condition it is waiting for, avoiding
+ * an expensive context switch.
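+ *
+ * A minimal usage sketch (illustrative):
+ *
+ * @code{.cpp}
+ * // assuming flag is a std::atomic<bool> set by another thread
+ * while(!flag.load(std::memory_order_acquire)) {
+ *   tf::pause();  // back off inside the spin loop
+ * }
+ * @endcode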
+ * On modern x86 processors, this instruction can be invoked using
+ * @c __builtin_ia32_pause() in GCC/Clang or @c _mm_pause() in MSVC.
+ * On non-x86 architectures, alternative mechanisms such as yielding the CPU
+ * may be used instead.
+ *
+ */
+inline void pause() {
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
+  // x86 and x86_64: Use the PAUSE instruction
+  #if defined(_MSC_VER)
+    // Microsoft Visual C++
+    _mm_pause();
+  #elif defined(__GNUC__) || defined(__clang__)
+    // GCC and Clang
+    __builtin_ia32_pause();
+  #else
+    asm volatile("pause" ::: "memory");
+  #endif
+
+#elif defined(__aarch64__) || defined(__arm__)
+  // ARM and AArch64: Use the YIELD instruction
+  #if defined(__GNUC__) || defined(__clang__)
+    asm volatile("yield" ::: "memory");
+  #endif
+
+#else
+  // Fallback: Portable yield for unknown architectures
+  std::this_thread::yield();
+#endif
+}
+
+/**
+@brief pauses the CPU for a specified number of iterations
+*/
+inline void pause(size_t count) {
+  while(count-- > 0) pause();
+}
+
+/**
+ * @brief spins until the given predicate becomes true
+ *
+ * @tparam P the type of the predicate function or callable.
+ * @param predicate the callable that returns a boolean value, which is checked in the loop.
+ *
+ * This function repeatedly checks the provided predicate in a spin-wait loop,
+ * using a backoff strategy to keep the wait cheap: for the first 100 iterations
+ * it invokes `pause()` to hint to the CPU that the thread is waiting, reducing
+ * power consumption; after that, it calls `std::this_thread::yield()` to
+ * relinquish the CPU so other threads can run and system responsiveness improves.
+ *
+ * @attention This function is useful when you need to wait for a condition to
+ * become true and want a busy wait that does not monopolize the CPU.
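+ *
+ * A usage sketch (illustrative):
+ *
+ * @code{.cpp}
+ * std::atomic<bool> ready{false};
+ * // ... another thread eventually stores true into ready ...
+ * tf::spin_until([&]{ return ready.load(std::memory_order_acquire); });
+ * @endcode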
+ *
+ */
+template <typename P>
+void spin_until(P&& predicate) {
+  size_t num_pauses = 0;
+  while(!predicate()) {
+    (num_pauses++ < 100) ? pause() : std::this_thread::yield();
+  }
+}
diff --git a/taskflow/utility/serializer.hpp b/taskflow/utility/serializer.hpp
index aab00f23f..5ede84a27 100644
--- a/taskflow/utility/serializer.hpp
+++ b/taskflow/utility/serializer.hpp
@@ -1126,7 +1126,7 @@ SizeType Deserializer::_load(T&& t) {
   return t.load(*this);
 }
 
-}  // ned of namespace tf -----------------------------------------------------
+}  // end of namespace tf -----------------------------------------------------
 
diff --git a/taskflow/utility/small_vector.hpp b/taskflow/utility/small_vector.hpp
index a42c2646a..1fe107a03 100644
--- a/taskflow/utility/small_vector.hpp
+++ b/taskflow/utility/small_vector.hpp
@@ -2,6 +2,8 @@
 
 #pragma once
 
+#include "macros.hpp"
+
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -11,13 +13,6 @@
 #include <iterator>
 #include <memory>
 
-#if defined(__GNUC__)
-  #define TF_LIKELY(x) (__builtin_expect((x), 1))
-  #define TF_UNLIKELY(x) (__builtin_expect((x), 0))
-#else
-  #define TF_LIKELY(x) (x)
-  #define TF_UNLIKELY(x) (x)
-#endif
 
 /**
 @file small_vector.hpp
@@ -119,9 +114,15 @@ class SmallVectorTemplateCommon : public SmallVectorBase {
 private:
   template <typename, unsigned> friend struct SmallVectorStorage;
 
+  //template <typename X>
+  //struct AlignedUnionType {
+  //  alignas(X) std::byte buff[std::max(sizeof(std::byte), sizeof(X))];
+  //};
+
   template <typename X>
   struct AlignedUnionType {
-    alignas(X) std::byte buff[std::max(sizeof(std::byte), sizeof(X))];
+    static constexpr std::size_t max_size = (sizeof(std::byte) > sizeof(X)) ? sizeof(std::byte) : sizeof(X);
+    alignas(X) std::byte buff[max_size];
   };
 
   // Allocate raw space for N elements of type T. If T has a ctor or dtor, we
diff --git a/taskflow/utility/traits.hpp b/taskflow/utility/traits.hpp
index dd3953bd4..c7addcbaf 100644
--- a/taskflow/utility/traits.hpp
+++ b/taskflow/utility/traits.hpp
@@ -1,7 +1,11 @@
 #pragma once
 
 #if __has_include()
-# include
+#include
+#endif
+
+#if __has_include()
+#include
 #endif
 
 #include
@@ -296,6 +300,17 @@ using all_same = all_true<std::is_same_v<T, Ts>...>;
 template <typename T, typename... Ts>
 constexpr bool all_same_v = all_same<T, Ts...>::value;
 
+// ----------------------------------------------------------------------------
+// Iterator
+// ----------------------------------------------------------------------------
+
+template <typename Iterator>
+using deref_t = std::decay_t<decltype(*std::declval<Iterator>())>;
+
+template <typename Iterator>
+constexpr auto is_random_access_iterator = std::is_same_v<
+  typename std::iterator_traits<Iterator>::iterator_category, std::random_access_iterator_tag
+>;
 
 }  // end of namespace tf.
---------------------------------------------------- diff --git a/tfprof/server/CMakeLists.txt b/tfprof/server/CMakeLists.txt index 7fee76b4d..48a570758 100644 --- a/tfprof/server/CMakeLists.txt +++ b/tfprof/server/CMakeLists.txt @@ -2,7 +2,7 @@ add_executable(tfprof tfprof.cpp) target_link_libraries( - tfprof ${PROJECT_NAME} tf::default_settings + tfprof ${PROJECT_NAME} ${ATOMIC_LIBRARY} tf::default_settings ) target_include_directories(tfprof PRIVATE ${TF_3RD_PARTY_DIR}) diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index e19c765b6..30cab8fd9 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -4,9 +4,9 @@ include(${TF_3RD_PARTY_DIR}/doctest/doctest.cmake) list(APPEND TF_UNITTESTS test_utility + test_queue test_work_stealing #test_serializer - test_priorities test_basics test_asyncs test_dependent_asyncs @@ -21,16 +21,19 @@ list(APPEND TF_UNITTESTS test_sort test_scan test_find - test_compositions + test_modules test_traversals test_pipelines test_scalable_pipelines test_deferred_pipelines test_deferred_scalable_pipelines - test_runtimes test_data_pipelines + test_runtimes + test_workers + #test_exceptions ) +# we only do exception tests if sanitizer is not enabled string(FIND '${CMAKE_CXX_FLAGS}' "-fsanitize" sanitize) #message("sanitize='${sanitize}'") @@ -42,7 +45,7 @@ endif() foreach(unittest IN LISTS TF_UNITTESTS) add_executable(${unittest} ${unittest}.cpp) - target_link_libraries(${unittest} ${PROJECT_NAME} tf::default_settings) + target_link_libraries(${unittest} ${PROJECT_NAME} ${ATOMIC_LIBRARY} tf::default_settings) target_include_directories(${unittest} PRIVATE ${TF_3RD_PARTY_DIR}/doctest) doctest_discover_tests(${unittest}) endforeach() diff --git a/unittests/cuda/CMakeLists.txt b/unittests/cuda/CMakeLists.txt index 45c08b026..ae26b3516 100644 --- a/unittests/cuda/CMakeLists.txt +++ b/unittests/cuda/CMakeLists.txt @@ -1,21 +1,20 @@ list(APPEND TF_CUDA_UNITTESTS test_cuda_objects - test_cuda_memory test_cuda_basics + test_cuda_updates test_cuda_matrix test_cuda_kmeans test_cuda_for_each - test_cuda_for_each_index test_cuda_transform - test_cuda_reduce - test_cuda_scan - test_cuda_find - test_cuda_min_max_element - test_cuda_merge - test_cuda_basic_updates - test_cuda_capturer_optimizer - test_cuda_capture + #test_cuda_reduce + #test_cuda_scan + #test_cuda_find + #test_cuda_min_max_element + #test_cuda_merge + + #test_cuda_capturer_optimizer + #test_cuda_capture #cuda_algorithms #cuda_algorithm_updates @@ -23,7 +22,7 @@ list(APPEND TF_CUDA_UNITTESTS foreach(cudatest IN LISTS TF_CUDA_UNITTESTS) add_executable(${cudatest} ${cudatest}.cu) - target_link_libraries(${cudatest} ${PROJECT_NAME} tf::default_settings) + target_link_libraries(${cudatest} ${PROJECT_NAME} ${ATOMIC_LIBRARY} tf::default_settings) target_include_directories(${cudatest} PRIVATE ${TF_3RD_PARTY_DIR}/doctest) # avoid cmake 3.18+ warning diff --git a/unittests/cuda/test_cuda_basic_updates.cu b/unittests/cuda/test_cuda_basic_updates.cu deleted file mode 100644 index 7c4ede69f..000000000 --- a/unittests/cuda/test_cuda_basic_updates.cu +++ /dev/null @@ -1,848 +0,0 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN - -#include - -#include -#include -#include -#include -#include - -template -void run_and_wait(T& cf) { - tf::cudaStream stream; - cf.run(stream); - stream.synchronize(); -} - -//verify -template -__global__ -void verify(const T* a, const T* b, bool* check, size_t size) { - size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - for(;tid < size; tid += gridDim.x * blockDim.x) 
{ - if(a[tid] != b[tid]) { - *check = false; - return; - } - } -} - -template -__global__ void k_add(T* ptr, size_t N, T value) { - int i = blockIdx.x*blockDim.x + threadIdx.x; - if (i < N) { - ptr[i] += value; - } -} - -//add -template -__global__ -void add(const T* a, const T* b, T* c, size_t size) { - size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - for(;tid < size; tid += gridDim.x * blockDim.x) { - c[tid] = a[tid] + b[tid]; - } -} - -//multiply -template -__global__ -void multiply(const T* a, const T* b, T* c, size_t size) { - size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - for(;tid < size; tid += gridDim.x * blockDim.x) { - c[tid] = a[tid] * b[tid]; - } -} - -// ---------------------------------------------------------------------------- -// Incrementality -// ---------------------------------------------------------------------------- -TEST_CASE("cudaFlowCapturer.Incrementality") { - - unsigned N = 1024; - - tf::cudaFlowCapturer cf; - - // construct a cudaflow of three tasks - auto cpu = static_cast(std::calloc(N, sizeof(int))); - auto gpu = tf::cuda_malloc_device(N); - dim3 g = {(N+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, N); - auto kernel = cf.kernel(g, b, 0, k_add, gpu, N, 17); - auto d2h = cf.copy(cpu, gpu, N); - h2d.precede(kernel); - kernel.precede(d2h); - - REQUIRE(cf.num_tasks() == 3); - REQUIRE(cf.empty() == false); - REQUIRE(cf.native_executable() == nullptr); - - // run - cf.run(0); - cudaStreamSynchronize(0); - - auto native_graph = cf.native_graph(); - auto native_executable = cf.native_executable(); - - REQUIRE(native_graph != nullptr); - REQUIRE(native_executable != nullptr); - REQUIRE(cf.num_tasks() == 3); - REQUIRE(cf.empty() == false); - REQUIRE(cf.native_graph() != nullptr); - REQUIRE(cf.native_executable() != nullptr); - REQUIRE(tf::cuda_graph_get_num_nodes(cf.native_graph()) == cf.num_tasks()); - - for(unsigned i=0; i, gpu, N, j); - cf.run(0); - cudaStreamSynchronize(0); - - auto updated_native_graph = cf.native_graph(); - auto updated_native_executable = cf.native_executable(); - - REQUIRE(updated_native_graph != native_graph); - REQUIRE(updated_native_executable == native_executable); - REQUIRE(cf.num_tasks() == 3); - REQUIRE(cf.empty() == false); - REQUIRE(cf.native_graph() != nullptr); - REQUIRE(cf.native_executable() != nullptr); - REQUIRE(tf::cuda_graph_get_num_nodes(cf.native_graph()) == cf.num_tasks()); - - for(unsigned i=0; i -void rebind_kernel() { - tf::Executor executor; - - for(size_t N = 1; N < 65529; N = N * 2 + 1) { - tf::Taskflow taskflow; - - std::vector operand(3, nullptr); - std::vector ans_operand(3, nullptr); - - std::vector ind(3); - std::generate_n(ind.data(), 3, [&](){ return ::rand() % 3; }); - - - bool* check {nullptr}; - - //allocate - auto allocate_t = taskflow.emplace([&]() { - for(int i = 0; i < 3; ++i) { - REQUIRE(cudaMallocManaged(&operand[i], N * sizeof(T)) == cudaSuccess); - REQUIRE(cudaMallocManaged(&ans_operand[i], N * sizeof(T)) == cudaSuccess); - } - - REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess); - }).name("allocate"); - - //initialize - auto initialize_t = taskflow.emplace([&](){ - for(int i = 0; i < 3; ++i) { - std::generate_n(operand[i], N, [&](){ return ::rand() % N - N / 2 + i; }); - std::memcpy(ans_operand[i], operand[i], N * sizeof(T)); - } - - *check = true; - }).name("initialize"); - - - //rebind_kernel - auto add_t = taskflow.emplace([&]() { - - F cf; - - auto multi_t = cf.kernel( - 32, 512, 0, - multiply, - operand[ind[0]], operand[ind[1]], 
operand[ind[2]], N - ); - - auto add_t = cf.kernel( - 32, 512, 0, - add, - operand[ind[1]], operand[ind[2]], operand[ind[0]], N - ); - - multi_t.precede(add_t); - - run_and_wait(cf); - - cf.kernel( - multi_t, - 64, 128, 0, - multiply, - operand[ind[2]], operand[ind[0]], operand[ind[1]], N - ); - - cf.kernel( - add_t, - 16, 256, 0, - add, - operand[ind[1]], operand[ind[0]], operand[ind[2]], N - ); - - run_and_wait(cf); - - cf.kernel( - multi_t, - 8, 1024, 0, - multiply, - operand[ind[0]], operand[ind[2]], operand[ind[1]], N - ); - - cf.kernel( - add_t, - 64, 64, 0, - add, - operand[ind[2]], operand[ind[1]], operand[ind[0]], N - ); - - run_and_wait(cf); - }).name("add"); - - //verify - auto verify_t = taskflow.emplace([&]() { - - F cf; - - //auto multi1_t = cf.transform( - // ans_operand[ind[2]], ans_operand[ind[2]]+ N, - // [] __device__ (T& v1, T& v2) { return v1 * v2; }, - // ans_operand[ind[0]], ans_operand[ind[1]] - //); - - auto multi1_t = cf.transform( - ans_operand[ind[0]], ans_operand[ind[0]] + N, ans_operand[ind[1]], - ans_operand[ind[2]], - [] __device__ (T& v1, T& v2) { return v1*v2; } - ); - - //auto add1_t = cf.transform( - // ans_operand[ind[0]], ans_operand[ind[0]]+ N, - // [] __device__ (T& v1, T& v2) { return v1 + v2; }, - // ans_operand[ind[1]], ans_operand[ind[2]] - //); - - auto add1_t = cf.transform( - ans_operand[ind[1]], ans_operand[ind[1]]+N, ans_operand[ind[2]], - ans_operand[ind[0]], - [] __device__ (T& v1, T& v2) { return v1 + v2; } - ); - - //auto multi2_t = cf.transform( - // ans_operand[ind[1]], ans_operand[ind[1]]+ N, - // [] __device__ (T& v1, T& v2) { return v1 * v2; }, - // ans_operand[ind[2]], ans_operand[ind[0]] - //); - - auto multi2_t = cf.transform( - ans_operand[ind[2]], ans_operand[ind[2]] + N, ans_operand[ind[0]], - ans_operand[ind[1]], - [] __device__ (T& v1, T& v2) { return v1 * v2; } - ); - - //auto add2_t = cf.transform( - // ans_operand[ind[2]], ans_operand[ind[2]]+ N, - // [] __device__ (T& v1, T& v2) { return v1 + v2; }, - // ans_operand[ind[1]], ans_operand[ind[0]] - //); - - auto add2_t = cf.transform( - ans_operand[ind[1]], ans_operand[ind[1]] + N, ans_operand[ind[0]], - ans_operand[ind[2]], - [] __device__ (T& v1, T& v2) { return v1 + v2; } - ); - - auto multi3_t = cf.transform( - ans_operand[ind[0]], ans_operand[ind[0]] + N, ans_operand[ind[2]], - ans_operand[ind[1]], - [] __device__ (T& v1, T& v2) { return v1 * v2; } - ); - - auto add3_t = cf.transform( - ans_operand[ind[2]], ans_operand[ind[2]] + N, ans_operand[ind[1]], - ans_operand[ind[0]], - [] __device__ (T& v1, T& v2) { return v1 + v2; } - ); - - auto verify1_t = cf.kernel( - 32, 512, 0, - verify, - operand[ind[0]], ans_operand[ind[0]], check, N - ); - - auto verify2_t = cf.kernel( - 32, 512, 0, - verify, - operand[ind[1]], ans_operand[ind[1]], check, N - ); - - auto verify3_t = cf.kernel( - 32, 512, 0, - verify, - operand[ind[2]], ans_operand[ind[2]], check, N - ); - - multi1_t.precede(add1_t); - add1_t.precede(multi2_t); - multi2_t.precede(add2_t); - add2_t.precede(multi3_t); - multi3_t.precede(add3_t); - add3_t.precede(verify1_t).precede(verify2_t).precede(verify3_t); - - run_and_wait(cf); - REQUIRE(*check); - - }).name("verify"); - - //free memory - auto deallocate_t = taskflow.emplace([&]() { - for(int i = 0; i < 3; ++i) { - REQUIRE(cudaFree(operand[i]) == cudaSuccess); - REQUIRE(cudaFree(ans_operand[i]) == cudaSuccess); - } - - REQUIRE(cudaFree(check) == cudaSuccess); - }).name("deallocate"); - - allocate_t.precede(initialize_t); - initialize_t.precede(add_t); - 
add_t.precede(verify_t); - verify_t.precede(deallocate_t); - - executor.run(taskflow).wait(); - - } - -} - -// cudaflow -TEST_CASE("cudaFlow.rebind.kernel.int" * doctest::timeout(300)) { - rebind_kernel(); -} - -TEST_CASE("cudaFlow.rebind.kernel.float" * doctest::timeout(300)) { - rebind_kernel(); -} - -TEST_CASE("cudaFlow.rebind.kernel.double" * doctest::timeout(300)) { - rebind_kernel(); -} - -// capturer -TEST_CASE("cudaFlowCapturer.rebind.kernel.int" * doctest::timeout(300)) { - rebind_kernel(); -} - -TEST_CASE("cudaFlowCapturer.rebind.kernel.float" * doctest::timeout(300)) { - rebind_kernel(); -} - -TEST_CASE("cudaFlowCapturer.rebind.kernel.double" * doctest::timeout(300)) { - rebind_kernel(); -} - -//---------------------------------------------------------------------- -//rebind copy -//---------------------------------------------------------------------- -template -void rebind_copy() { - tf::Executor executor; - - for(int N = 1; N < 65459; N = N * 2 + 1) { - tf::Taskflow taskflow; - - std::vector ha(N, N + 5); - std::vector hb(N, N - 31); - std::vector hc(N, N - 47); - std::vector hz(N); - - T* da {nullptr}; - T* db {nullptr}; - T* dc {nullptr}; - T* dz {nullptr}; - - - //allocate - auto allocate_t = taskflow.emplace([&]() { - REQUIRE(cudaMalloc(&da, N * sizeof(T)) == cudaSuccess); - REQUIRE(cudaMalloc(&db, N * sizeof(T)) == cudaSuccess); - REQUIRE(cudaMalloc(&dc, N * sizeof(T)) == cudaSuccess); - REQUIRE(cudaMalloc(&dz, N * sizeof(T)) == cudaSuccess); - }).name("allocate"); - - - //rebind_copy - auto h2d_t = taskflow.emplace([&]() { - - F cf; - - auto h2d_t = cf.copy(da, ha.data(), N).name("h2d"); - run_and_wait(cf); - - cf.copy(h2d_t, db, hb.data(), N); - run_and_wait(cf); - - cf.copy(h2d_t, dc, hc.data(), N); - run_and_wait(cf); - }); - - auto kernel_t = taskflow.emplace([&]() { - F cf; - //auto add1_t = cf.transform( - // dz, dz + N, - // [] __device__ (T& v1, T& v2) { return v1 + v2; }, - // da, db - //); - - auto add1_t = cf.transform( - da, da+N, db, - dz, - [] __device__ (T& v1, T& v2) { return v1 + v2; } - ); - - //auto add2_t = cf.transform( - // dc, dc + N, - // [] __device__ (T& v1, T& v2) { return v1 - v2; }, - // dc, dz - //); - - auto add2_t = cf.transform( - dc, dc + N, dz, - dc, - [] __device__ (T& v1, T& v2) { return v1 - v2; } - ); - - add1_t.precede(add2_t); - - run_and_wait(cf); - }); - - auto d2h_t = taskflow.emplace([&]() { - - F cf; - - auto d2h_t = cf.copy(hc.data(), dc, N).name("d2h"); - run_and_wait(cf); - - cf.copy(d2h_t, hz.data(), dz, N); - run_and_wait(cf); - }); - - //verify - auto verify_t = taskflow.emplace([&]() { - for(auto& c: hc) { - REQUIRE(c == -21 - N); - } - - for(auto& z: hz) { - REQUIRE(z == 2 * N - 26); - } - }); - - //free memory - auto deallocate_t = taskflow.emplace([&]() { - REQUIRE(cudaFree(da) == cudaSuccess); - REQUIRE(cudaFree(db) == cudaSuccess); - REQUIRE(cudaFree(dc) == cudaSuccess); - REQUIRE(cudaFree(dz) == cudaSuccess); - }).name("deallocate"); - - allocate_t.precede(h2d_t); - h2d_t.precede(kernel_t); - kernel_t.precede(d2h_t); - d2h_t.precede(verify_t); - verify_t.precede(deallocate_t); - - executor.run(taskflow).wait(); - - } -} - -// cudaFlow -TEST_CASE("cudaFlow.rebind.copy.int" * doctest::timeout(300)) { - rebind_copy(); -} - -TEST_CASE("cudaFlow.rebind.copy.float" * doctest::timeout(300)) { - rebind_copy(); -} - -TEST_CASE("cudaFlow.rebind.copy.double" * doctest::timeout(300)) { - rebind_copy(); -} - -// cudaFlowCapturer -TEST_CASE("cudaFlowCapturer.rebind.copy.int" * doctest::timeout(300)) { - rebind_copy(); -} - 
-TEST_CASE("cudaFlowCapturer.rebind.copy.float" * doctest::timeout(300)) { - rebind_copy(); -} - -TEST_CASE("cudaFlowCapturer.rebind.copy.double" * doctest::timeout(300)) { - rebind_copy(); -} - - -//---------------------------------------------------------------------- -// rebind memcpy -//---------------------------------------------------------------------- -template -void rebind_memcpy() { - tf::Executor executor; - - for(int N = 1; N < 65459; N = N * 2 + 1) { - tf::Taskflow taskflow; - - std::vector ha(N, N + 5); - std::vector hb(N, N - 31); - std::vector hc(N, N - 47); - std::vector hz(N); - - T* da {nullptr}; - T* db {nullptr}; - T* dc {nullptr}; - T* dz {nullptr}; - - - //allocate - auto allocate_t = taskflow.emplace([&]() { - REQUIRE(cudaMalloc(&da, N * sizeof(T)) == cudaSuccess); - REQUIRE(cudaMalloc(&db, N * sizeof(T)) == cudaSuccess); - REQUIRE(cudaMalloc(&dc, N * sizeof(T)) == cudaSuccess); - REQUIRE(cudaMalloc(&dz, N * sizeof(T)) == cudaSuccess); - }).name("allocate"); - - - //rebind_memcpy - auto h2d_t = taskflow.emplace([&]() { - - F cf; - - auto h2d_t = cf.memcpy(da, ha.data(), sizeof(T) * N).name("h2d"); - run_and_wait(cf); - - cf.memcpy(h2d_t, db, hb.data(), sizeof(T) * N); - run_and_wait(cf); - - cf.memcpy(h2d_t, dc, hc.data(), sizeof(T) * N); - run_and_wait(cf); - - }); - - auto kernel_t = taskflow.emplace([&]() { - F cf; - - auto add1_t = cf.transform( - da, da + N, db, - dz, - [] __device__ (T& v1, T& v2) { return v1 + v2; } - ); - - auto add2_t = cf.transform( - dc, dc + N, dz, - dc, - [] __device__ (T& v1, T& v2) { return v1 - v2; } - ); - - add1_t.precede(add2_t); - run_and_wait(cf); - }); - - auto d2h_t = taskflow.emplace([&]() { - F cf; - auto d2h_t = cf.memcpy(hc.data(), dc, sizeof(T) * N).name("d2h"); - run_and_wait(cf); - cf.memcpy(d2h_t, hz.data(), dz, sizeof(T) * N); - run_and_wait(cf); - }); - - //verify - auto verify_t = taskflow.emplace([&]() { - for(auto& c: hc) { - REQUIRE(c == -21 - N); - } - - for(auto& z: hz) { - REQUIRE(z == 2 * N - 26); - } - }); - - //free memory - auto deallocate_t = taskflow.emplace([&]() { - REQUIRE(cudaFree(da) == cudaSuccess); - REQUIRE(cudaFree(db) == cudaSuccess); - REQUIRE(cudaFree(dc) == cudaSuccess); - REQUIRE(cudaFree(dz) == cudaSuccess); - }).name("deallocate"); - - allocate_t.precede(h2d_t); - h2d_t.precede(kernel_t); - kernel_t.precede(d2h_t); - d2h_t.precede(verify_t); - verify_t.precede(deallocate_t); - - executor.run(taskflow).wait(); - - } -} - -// cudaflow -TEST_CASE("cudaFlow.rebind.memcpy.int" * doctest::timeout(300)) { - rebind_memcpy(); -} - -TEST_CASE("cudaFlow.rebind.memcpy.float" * doctest::timeout(300)) { - rebind_memcpy(); -} - -TEST_CASE("cudaFlow.rebind.memcpy.double" * doctest::timeout(300)) { - rebind_memcpy(); -} - -// capturer -TEST_CASE("cudaFlowCapturer.rebind.memcpy.int" * doctest::timeout(300)) { - rebind_memcpy(); -} - -TEST_CASE("cudaFlowCapturer.rebind.memcpy.float" * doctest::timeout(300)) { - rebind_memcpy(); -} - -TEST_CASE("cudaFlowCapturer.rebind.memcpy.double" * doctest::timeout(300)) { - rebind_memcpy(); -} - -//---------------------------------------------------------------------- -//rebind memset -//---------------------------------------------------------------------- -template -void rebind_memset() { - - tf::Executor executor; - tf::Taskflow taskflow; - - for(size_t N = 1; N < 65199; N = N * 2 + 1) { - - taskflow.clear(); - - T* a {nullptr}; - T* b {nullptr}; - - T* ans_a {nullptr}; - T* ans_b {nullptr}; - - bool* check {nullptr}; - - //allocate - auto allocate_t = 
taskflow.emplace([&]() { - REQUIRE(cudaMallocManaged(&a, N * sizeof(T)) == cudaSuccess); - REQUIRE(cudaMallocManaged(&b, (N + 37) * sizeof(T)) == cudaSuccess); - - REQUIRE(cudaMallocManaged(&ans_a, N * sizeof(T)) == cudaSuccess); - REQUIRE(cudaMallocManaged(&ans_b, (N + 37) * sizeof(T)) == cudaSuccess); - - REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess); - }).name("allocate"); - - //initialize - auto initialize_t = taskflow.emplace([&]() { - std::generate_n(a, N, [&](){ return ::rand() % N - N / 2; }); - std::generate_n(b, N + 37, [&](){ return ::rand() % N + N / 2; }); - - REQUIRE(cudaMemset(ans_a, 0, N * sizeof(T)) == cudaSuccess); - REQUIRE(cudaMemset(ans_b, 1, (N + 37) * sizeof(T)) == cudaSuccess); - - *check = true; - }).name("initialize"); - - //rebind_memset - auto memset_t = taskflow.emplace([&]() { - F cf; - auto memset_t = cf.memset(ans_a, 0, N * sizeof(T)); - run_and_wait(cf); - - cf.memset(memset_t, a, 0, N * sizeof(T)); - run_and_wait(cf); - - cf.memset(memset_t, b, 1, (N + 37) * sizeof(T)); - run_and_wait(cf); - }).name("memset"); - - //verify - auto verify_t = taskflow.emplace([&]() { - F cf; - cf.kernel( - 32, 512, 0, - verify, - a, ans_a, check, N - ); - - cf.kernel( - 32, 512, 0, - verify, - b, ans_b, check, N + 37 - ); - - run_and_wait(cf); - - REQUIRE(*check); - }).name("verify"); - - //free memory - auto deallocate_t = taskflow.emplace([&]() { - REQUIRE(cudaFree(a) == cudaSuccess); - REQUIRE(cudaFree(b) == cudaSuccess); - REQUIRE(cudaFree(ans_a) == cudaSuccess); - REQUIRE(cudaFree(ans_b) == cudaSuccess); - REQUIRE(cudaFree(check) == cudaSuccess); - }).name("deallocate"); - - allocate_t.precede(initialize_t); - initialize_t.precede(memset_t); - memset_t.precede(verify_t); - verify_t.precede(deallocate_t); - - executor.run(taskflow).wait(); - } -} - -// cudaflow -TEST_CASE("cudaFlow.rebind.memset.int" * doctest::timeout(300)) { - rebind_memset(); -} - -TEST_CASE("cudaFlow.rebind.memset.float" * doctest::timeout(300)) { - rebind_memset(); -} - -TEST_CASE("cudaFlow.rebind.memset.double" * doctest::timeout(300)) { - rebind_memset(); -} - -// capturer -TEST_CASE("cudaFlowCapturer.rebind.memset.int" * doctest::timeout(300)) { - rebind_memset(); -} - -TEST_CASE("cudaFlowCapturer.rebind.memset.float" * doctest::timeout(300)) { - rebind_memset(); -} - -TEST_CASE("cudaFlowCapturer.rebind.memset.double" * doctest::timeout(300)) { - rebind_memset(); -} - -// ---------------------------------------------------------------------------- -// rebind algorithms -// ---------------------------------------------------------------------------- - -TEST_CASE("cudaFlowCapturer.rebind.algorithms") { - - tf::cudaFlowCapturer capturer; - - auto data = tf::cuda_malloc_shared(10000); - auto res = tf::cuda_malloc_shared(1); - - auto task = capturer.for_each( - data, data+10000, []__device__(int& i) { - i = 10; - } - ); - - run_and_wait(capturer); - - for(int i=0; i<10000; i++) { - REQUIRE(data[i] == 10); - } - REQUIRE(capturer.num_tasks() == 1); - - // rebind to single task - capturer.single_task(task, [=] __device__ () {*data = 2;}); - - run_and_wait(capturer); - - REQUIRE(*data == 2); - for(int i=1; i<10000; i++) { - REQUIRE(data[i] == 10); - } - REQUIRE(capturer.num_tasks() == 1); - - // rebind to for each index - capturer.for_each_index(task, 0, 10000, 1, - [=] __device__ (int i) { - data[i] = -23; - } - ); - - run_and_wait(capturer); - - for(int i=0; i<10000; i++) { - REQUIRE(data[i] == -23); - } - REQUIRE(capturer.num_tasks() == 1); - - // rebind to single task - 
capturer.single_task(task, [res]__device__(){ *res = 999; }); - - run_and_wait(capturer); - REQUIRE(*res == 999); - REQUIRE(capturer.num_tasks() == 1); - - // clear the capturer - capturer.clear(); - REQUIRE(capturer.num_tasks() == 0); - - run_and_wait(capturer); - REQUIRE(*res == 999); - for(int i=0; i<10000; i++) { - REQUIRE(data[i] == -23); - } - - // clear the memory - tf::cuda_free(data); - tf::cuda_free(res); -} diff --git a/unittests/cuda/test_cuda_basics.cu b/unittests/cuda/test_cuda_basics.cu index 93f5a5261..69dd3f29d 100644 --- a/unittests/cuda/test_cuda_basics.cu +++ b/unittests/cuda/test_cuda_basics.cu @@ -33,177 +33,20 @@ __global__ void k_single_add(T* ptr, int i, T value) { ptr[i] += value; } -template -void run_and_wait(T& cf) { +void run_and_wait(tf::cudaGraph& cg) { tf::cudaStream stream; - cf.run(stream); - stream.synchronize(); -} - -// -------------------------------------------------------- -// Testcase: Empty -// -------------------------------------------------------- - -template -void empty() { - std::atomic counter{0}; - - tf::Taskflow taskflow; - tf::Executor executor; - - taskflow.emplace([&](){ - T tf; - ++counter; - }); - - taskflow.emplace([&](){ - T tf; - ++counter; - }); - - taskflow.emplace([&](){ - T tf; - ++counter; - }); - - executor.run_n(taskflow, 100).wait(); - - REQUIRE(counter == 300); -} - -TEST_CASE("Empty" * doctest::timeout(300)) { - empty(); -} - -TEST_CASE("EmptyCapture" * doctest::timeout(300)) { - empty(); -} - -// ---------------------------------------------------------------------------- -// Move Semantics -// ---------------------------------------------------------------------------- - -template -void move_semantics() { - - unsigned N = 1024; - - F rhs; - - REQUIRE(rhs.num_tasks() == 0); - REQUIRE(rhs.empty()); - REQUIRE(rhs.native_executable() == nullptr); - - // construct a cudaflow of three tasks - auto cpu = static_cast(std::calloc(N, sizeof(int))); - auto gpu = tf::cuda_malloc_device(N); - dim3 g = {(N+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto h2d = rhs.copy(gpu, cpu, N); - auto kernel = rhs.kernel(g, b, 0, k_add, gpu, N, 17); - auto d2h = rhs.copy(cpu, gpu, N); - h2d.precede(kernel); - kernel.precede(d2h); - - REQUIRE(rhs.num_tasks() == 3); - REQUIRE(rhs.empty() == false); - REQUIRE(rhs.native_executable() == nullptr); - - // construct a rhs - F lhs( std::move(rhs) ); - - REQUIRE(rhs.num_tasks() == 0); - REQUIRE(rhs.empty()); - REQUIRE(rhs.native_executable() == nullptr); - - REQUIRE(lhs.num_tasks() == 3); - REQUIRE(lhs.empty() == false); - REQUIRE(lhs.native_executable() == nullptr); - - // assign lhs to rhs using move semantics - rhs = std::move(lhs); - - REQUIRE(lhs.num_tasks() == 0); - REQUIRE(lhs.empty()); - REQUIRE(lhs.native_executable() == nullptr); - - REQUIRE(rhs.num_tasks() == 3); - REQUIRE(rhs.empty() == false); - REQUIRE(rhs.native_executable() == nullptr); - - // run - rhs.run(0); - cudaStreamSynchronize(0); - - auto native_graph = rhs.native_graph(); - auto native_executable = rhs.native_executable(); - - REQUIRE(native_graph != nullptr); - REQUIRE(native_executable != nullptr); - REQUIRE(rhs.num_tasks() == 3); - REQUIRE(rhs.empty() == false); - REQUIRE(rhs.native_graph() != nullptr); - REQUIRE(rhs.native_executable() != nullptr); - REQUIRE(tf::cuda_graph_get_num_nodes(rhs.native_graph()) == rhs.num_tasks()); - - for(unsigned i=0; i(); -} - -TEST_CASE("cudaFlowCapturer.MoveSemantics" * doctest::timeout(300)) { - move_semantics(); + tf::cudaGraphExec exec(cg); + stream.run(exec).synchronize(); } // 
---------------------------------------------------------------------------- -// Standalone +// standalone add // ---------------------------------------------------------------------------- -template -void standalone() { +TEST_CASE("cudaGraph.Standalone") { - T cf; + tf::cudaGraph cg; tf::cudaStream stream; - REQUIRE(cf.empty()); + REQUIRE(cg.empty()); unsigned N = 1024; @@ -212,9 +55,9 @@ void standalone() { dim3 g = {(N+255)/256, 1, 1}; dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, N); - auto kernel = cf.kernel(g, b, 0, k_add, gpu, N, 17); - auto d2h = cf.copy(cpu, gpu, N); + auto h2d = cg.copy(gpu, cpu, N); + auto kernel = cg.kernel(g, b, 0, k_add, gpu, N, 17); + auto d2h = cg.copy(cpu, gpu, N); h2d.precede(kernel); kernel.precede(d2h); @@ -222,14 +65,15 @@ void standalone() { REQUIRE(cpu[i] == 0); } - cf.run(stream); - stream.synchronize(); + tf::cudaGraphExec exec(cg); + + stream.run(exec).synchronize(); for(unsigned i=0; i(); -} - -TEST_CASE("Standalone.cudaCapturer") { - standalone(); -} - - - // -------------------------------------------------------- // Testcase: Set // -------------------------------------------------------- @@ -273,15 +107,15 @@ void set() { }); auto gputask = taskflow.emplace([&]() { - tf::cudaFlow cf; - auto h2d = cf.copy(gpu, cpu, n); - auto kernel = cf.kernel((n+255)/256, 256, 0, k_set, gpu, n, (T)17); - auto d2h = cf.copy(cpu, gpu, n); + tf::cudaGraph cg; + auto h2d = cg.copy(gpu, cpu, n); + auto kernel = cg.kernel((n+255)/256, 256, 0, k_set, gpu, n, (T)17); + auto d2h = cg.copy(cpu, gpu, n); h2d.precede(kernel); kernel.precede(d2h); - run_and_wait(cf); + run_and_wait(cg); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); + REQUIRE(cg.num_nodes() == 3); }); cputask.precede(gputask); @@ -297,15 +131,15 @@ void set() { } } -TEST_CASE("Set.i8" * doctest::timeout(300)) { +TEST_CASE("cudaGraph.Set.i8" * doctest::timeout(300)) { set(); } -TEST_CASE("Set.i16" * doctest::timeout(300)) { +TEST_CASE("cudaGraph.Set.i16" * doctest::timeout(300)) { set(); } -TEST_CASE("Set.i32" * doctest::timeout(300)) { +TEST_CASE("cudaGraph.Set.i32" * doctest::timeout(300)) { set(); } @@ -329,22 +163,22 @@ void add() { }); auto gputask = taskflow.emplace([&](){ - tf::cudaFlow cf; + tf::cudaGraph cg; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto ad1 = cf.kernel(g, b, 0, k_add, gpu, n, 1); - auto ad2 = cf.kernel(g, b, 0, k_add, gpu, n, 2); - auto ad3 = cf.kernel(g, b, 0, k_add, gpu, n, 3); - auto ad4 = cf.kernel(g, b, 0, k_add, gpu, n, 4); - auto d2h = cf.copy(cpu, gpu, n); + auto h2d = cg.copy(gpu, cpu, n); + auto ad1 = cg.kernel(g, b, 0, k_add, gpu, n, 1); + auto ad2 = cg.kernel(g, b, 0, k_add, gpu, n, 2); + auto ad3 = cg.kernel(g, b, 0, k_add, gpu, n, 3); + auto ad4 = cg.kernel(g, b, 0, k_add, gpu, n, 4); + auto d2h = cg.copy(cpu, gpu, n); h2d.precede(ad1); ad1.precede(ad2); ad2.precede(ad3); ad3.precede(ad4); ad4.precede(d2h); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); + run_and_wait(cg); + REQUIRE(cg.num_nodes() == 6); }); cputask.precede(gputask); @@ -360,28 +194,28 @@ void add() { } } -TEST_CASE("Add.i8" * doctest::timeout(300)) { +TEST_CASE("cudaGraph.Add.i8" * doctest::timeout(300)) { add(); } -TEST_CASE("Add.i16" * doctest::timeout(300)) { +TEST_CASE("cudaGraph.Add.i16" * doctest::timeout(300)) { add(); } -TEST_CASE("Add.i32" * doctest::timeout(300)) { +TEST_CASE("cudaGraph.Add.i32" * doctest::timeout(300)) { add(); } + // TODO: 64-bit 
fail? //TEST_CASE("Add.i64" * doctest::timeout(300)) { // add(); //} - // -------------------------------------------------------- // Testcase: Binary Set // -------------------------------------------------------- -template +template void bset() { const unsigned n = 10000; @@ -398,16 +232,16 @@ void bset() { }); auto gputask = taskflow.emplace([&]() { - F cf; + tf::cudaGraph cg; dim3 g = {1, 1, 1}; dim3 b = {1, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto d2h = cf.copy(cpu, gpu, n); + auto h2d = cg.copy(gpu, cpu, n); + auto d2h = cg.copy(cpu, gpu, n); std::vector tasks(n+1); for(unsigned i=1; i<=n; ++i) { - tasks[i] = cf.kernel(g, b, 0, k_single_set, gpu, i-1, (T)17); + tasks[i] = cg.kernel(g, b, 0, k_single_set, gpu, i-1, (T)17); auto p = i/2; if(p != 0) { @@ -418,8 +252,8 @@ void bset() { h2d.precede(tasks[i]); } - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); + run_and_wait(cg); + REQUIRE(cg.num_nodes() == n + 2); }); cputask.precede(gputask); @@ -434,37 +268,24 @@ void bset() { REQUIRE(cudaFree(gpu) == cudaSuccess); } -TEST_CASE("BSet.i8" * doctest::timeout(300)) { - bset(); -} - -TEST_CASE("BSet.i16" * doctest::timeout(300)) { - bset(); -} - -TEST_CASE("BSet.i32" * doctest::timeout(300)) { - bset(); -} - -TEST_CASE("CapturedBSet.i8" * doctest::timeout(300)) { - bset(); +TEST_CASE("cudaGraph.BSet.i8" * doctest::timeout(300)) { + bset(); } -TEST_CASE("CapturedBSet.i16" * doctest::timeout(300)) { - bset(); +TEST_CASE("cudaGraph.BSet.i16" * doctest::timeout(300)) { + bset(); } -TEST_CASE("CapturedBSet.i32" * doctest::timeout(300)) { - bset(); +TEST_CASE("cudaGraph.BSet.i32" * doctest::timeout(300)) { + bset(); } // -------------------------------------------------------- // Testcase: Memset // -------------------------------------------------------- -template -void memset() { - +TEST_CASE("cudaGraph.Memset" * doctest::timeout(300)) { + tf::Taskflow taskflow; tf::Executor executor; @@ -484,16 +305,16 @@ void memset() { } taskflow.emplace([&](){ - F cf; + tf::cudaGraph cg; dim3 g = {(unsigned)(N+255)/256, 1, 1}; dim3 b = {256, 1, 1}; - auto kset = cf.kernel(g, b, 0, k_set, gpu, N, 123); - auto copy = cf.copy(cpu, gpu, N); - auto zero = cf.memset(gpu+start, 0x3f, (N-start)*sizeof(int)); + auto kset = cg.kernel(g, b, 0, k_set, gpu, N, 123); + auto copy = cg.copy(cpu, gpu, N); + auto zero = cg.memset(gpu+start, 0x3f, (N-start)*sizeof(int)); kset.precede(zero); zero.precede(copy); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); + run_and_wait(cg); + REQUIRE(cg.num_nodes() == 3); }); executor.run(taskflow).wait(); @@ -510,109 +331,10 @@ void memset() { REQUIRE(cudaFree(gpu) == cudaSuccess); } -TEST_CASE("Memset" * doctest::timeout(300)) { - memset(); -} - -TEST_CASE("CapturedMemset" * doctest::timeout(300)) { - memset(); -} - -// -------------------------------------------------------- -// Testcase: Memset0 -// -------------------------------------------------------- -template -void memset0() { - - tf::Taskflow taskflow; - tf::Executor executor; - - const int N = 97; - - T* cpu = new T [N]; - T* gpu = nullptr; - - REQUIRE(cudaMalloc(&gpu, N*sizeof(T)) == cudaSuccess); - - for(int r=1; r<=100; ++r) { - - int start = ::rand() % N; - - for(int i=0; i, gpu, N, (T)123); - auto zero = cf.memset(gpu+start, (T)0, (N-start)*sizeof(T)); - auto copy = cf.copy(cpu, gpu, N); - kset.precede(zero); - zero.precede(copy); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == 
tf::cuda_graph_get_num_nodes(cf.native_graph())); - }); - - executor.run(taskflow).wait(); - - for(int i=0; i(); -} - -TEST_CASE("Memset0.i16") { - memset0(); -} - -TEST_CASE("Memset0.i32") { - memset0(); -} - -TEST_CASE("Memset0.f32") { - memset0(); -} - -TEST_CASE("Memset0.f64") { - memset0(); -} - -TEST_CASE("CapturedMemset0.i8") { - memset0(); -} - -TEST_CASE("CapturedMemset0.i16") { - memset0(); -} - -TEST_CASE("CapturedMemset0.i32") { - memset0(); -} - -TEST_CASE("CapturedMemset0.f32") { - memset0(); -} - -TEST_CASE("CapturedMemset0.f64") { - memset0(); -} - // -------------------------------------------------------- // Testcase: Memcpy // -------------------------------------------------------- -template +template void memcpy() { tf::Taskflow taskflow; @@ -634,16 +356,16 @@ void memcpy() { } taskflow.emplace([&](){ - F cf; + tf::cudaGraph cg; dim3 g = {(unsigned)(N+255)/256, 1, 1}; dim3 b = {256, 1, 1}; - auto kset = cf.kernel(g, b, 0, k_set, gpu, N, (T)123); - auto zero = cf.memset(gpu+start, (T)0, (N-start)*sizeof(T)); - auto copy = cf.memcpy(cpu, gpu, N*sizeof(T)); + auto kset = cg.kernel(g, b, 0, k_set, gpu, N, (T)123); + auto zero = cg.memset(gpu+start, (T)0, (N-start)*sizeof(T)); + auto copy = cg.memcpy(cpu, gpu, N*sizeof(T)); kset.precede(zero); zero.precede(copy); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); + run_and_wait(cg); + REQUIRE(cg.num_nodes() == 3); }); executor.run(taskflow).wait(); @@ -660,44 +382,24 @@ void memcpy() { REQUIRE(cudaFree(gpu) == cudaSuccess); } -TEST_CASE("Memcpy.i8") { - memcpy(); -} - -TEST_CASE("Memcpy.i16") { - memcpy(); -} - -TEST_CASE("Memcpy.i32") { - memcpy(); -} - -TEST_CASE("Memcpy.f32") { - memcpy(); +TEST_CASE("cudaGraph.Memcpy.i8") { + memcpy(); } -TEST_CASE("Memcpy.f64") { - memcpy(); +TEST_CASE("cudaGraph.Memcpy.i16") { + memcpy(); } -TEST_CASE("CapturedMemcpy.i8") { - memcpy(); +TEST_CASE("cudaGraph.Memcpy.i32") { + memcpy(); } -TEST_CASE("CapturedMemcpy.i16") { - memcpy(); +TEST_CASE("cudaGraph.Memcpy.f32") { + memcpy(); } -TEST_CASE("CapturedMemcpy.i32") { - memcpy(); -} - -TEST_CASE("CapturedMemcpy.f32") { - memcpy(); -} - -TEST_CASE("CapturedMemcpy.f64") { - memcpy(); +TEST_CASE("cudaGraph.Memcpy.f64") { + memcpy(); } // -------------------------------------------------------- @@ -726,18 +428,18 @@ void fill(T value) { taskflow.emplace([&](){ - tf::cudaFlow cf; + tf::cudaGraph cg; dim3 g = {(unsigned)(N+255)/256, 1, 1}; dim3 b = {256, 1, 1}; - auto kset = cf.kernel(g, b, 0, k_set, gpu, N, (T)123); - auto fill = cf.fill(gpu+start, value, (N-start)); - auto copy = cf.copy(cpu, gpu, N); + auto kset = cg.kernel(g, b, 0, k_set, gpu, N, (T)123); + auto fill = cg.fill(gpu+start, value, (N-start)); + auto copy = cg.copy(cpu, gpu, N); kset.precede(fill); fill.precede(copy); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); + run_and_wait(cg); + REQUIRE(cg.num_nodes() == 3); }); executor.run(taskflow).wait(); @@ -754,22 +456,22 @@ void fill(T value) { REQUIRE(cudaFree(gpu) == cudaSuccess); } -TEST_CASE("Fill.i8") { +TEST_CASE("cudaGraph.Fill.i8") { fill(+123); fill(-123); } -TEST_CASE("Fill.i16") { +TEST_CASE("cudaGraph.Fill.i16") { fill(+12345); fill(-12345); } -TEST_CASE("Fill.i32") { +TEST_CASE("cudaGraph.Fill.i32") { fill(+123456789); fill(-123456789); } -TEST_CASE("Fill.f32") { +TEST_CASE("cudaGraph.Fill.f32") { fill(+123456789.0f); fill(-123456789.0f); } @@ -800,18 +502,18 @@ void zero() { taskflow.emplace([&](){ - tf::cudaFlow cf; + 
tf::cudaGraph cg; dim3 g = {(unsigned)(N+255)/256, 1, 1}; dim3 b = {256, 1, 1}; - auto kset = cf.kernel(g, b, 0, k_set, gpu, N, (T)123); - auto zero = cf.zero(gpu+start, (N-start)); - auto copy = cf.copy(cpu, gpu, N); + auto kset = cg.kernel(g, b, 0, k_set, gpu, N, (T)123); + auto zero = cg.zero(gpu+start, (N-start)); + auto copy = cg.copy(cpu, gpu, N); kset.precede(zero); zero.precede(copy); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); + run_and_wait(cg); + REQUIRE(cg.num_nodes() == 3); }); executor.run(taskflow).wait(); @@ -828,19 +530,19 @@ void zero() { REQUIRE(cudaFree(gpu) == cudaSuccess); } -TEST_CASE("Zero.i8") { +TEST_CASE("cudaGraph.Zero.i8") { zero(); } -TEST_CASE("Zero.i16") { +TEST_CASE("cudaGraph.Zero.i16") { zero(); } -TEST_CASE("Zero.i32") { +TEST_CASE("cudaGraph.Zero.i32") { zero(); } -TEST_CASE("Zero.f32") { +TEST_CASE("cudaGraph.Zero.f32") { zero(); } @@ -865,32 +567,32 @@ void barrier() { auto gputask = taskflow.emplace([&]() { - tf::cudaFlow cf; + tf::cudaGraph cg; dim3 g = {1, 1, 1}; dim3 b = {1, 1, 1}; - auto br1 = cf.noop(); - auto br2 = cf.noop(); - auto br3 = cf.noop(); - auto h2d = cf.copy(gpu, cpu, n); - auto d2h = cf.copy(cpu, gpu, n); + auto br1 = cg.noop(); + auto br2 = cg.noop(); + auto br3 = cg.noop(); + auto h2d = cg.copy(gpu, cpu, n); + auto d2h = cg.copy(cpu, gpu, n); h2d.precede(br1); for(unsigned i=0; i, gpu, i, (T)17); + auto k1 = cg.kernel(g, b, 0, k_single_set, gpu, i, (T)17); k1.succeed(br1) .precede(br2); - auto k2 = cf.kernel(g, b, 0, k_single_add, gpu, i, (T)3); + auto k2 = cg.kernel(g, b, 0, k_single_add, gpu, i, (T)3); k2.succeed(br2) .precede(br3); } br3.precede(d2h); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); + run_and_wait(cg); + REQUIRE(cg.num_nodes() == 5 + 2*n); }); cputask.precede(gputask); @@ -905,596 +607,23 @@ void barrier() { REQUIRE(cudaFree(gpu) == cudaSuccess); } -TEST_CASE("Barrier.i8" * doctest::timeout(300)) { +TEST_CASE("cudaGraph.Barrier.i8" * doctest::timeout(300)) { barrier(); } -TEST_CASE("Barrier.i16" * doctest::timeout(300)) { +TEST_CASE("cudaGraph.Barrier.i16" * doctest::timeout(300)) { barrier(); } -TEST_CASE("Barrier.i32" * doctest::timeout(300)) { +TEST_CASE("cudaGraph.Barrier.i32" * doctest::timeout(300)) { barrier(); } -// ---------------------------------------------------------------------------- -// NestedRuns -// ---------------------------------------------------------------------------- - -template -void nested_runs() { - - int* cpu = nullptr; - int* gpu = nullptr; - - constexpr unsigned n = 1000; - - cpu = static_cast(std::calloc(n, sizeof(int))); - REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); - - struct A { - - tf::Executor executor; - tf::Taskflow taskflow; - - void run(int* cpu, int* gpu, unsigned n) { - taskflow.clear(); - - auto A1 = taskflow.emplace([&]() { - F cf; - cf.copy(gpu, cpu, n); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); - }); - - auto A2 = taskflow.emplace([&]() { - F cf; - dim3 g = {(n+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - cf.kernel(g, b, 0, k_add, gpu, n, 1); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); - }); - - auto A3 = taskflow.emplace([&] () { - F cf; - cf.copy(cpu, gpu, n); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); - }); - - A1.precede(A2); - A2.precede(A3); - - executor.run_n(taskflow, 10).wait(); 
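// Editorial note: every hunk in this file applies the same mechanical
// migration — a tf::cudaFlow that both built and launched the CUDA graph is
// split into a tf::cudaGraph (construction only) and a tf::cudaGraphExec
// (instantiation), which is then launched through a tf::cudaStream. A minimal
// sketch of the new pattern, using only the copy/kernel/precede calls and the
// templated k_add test kernel that already appear in this diff:
//
//   tf::cudaGraph cg;
//   auto h2d    = cg.copy(gpu, cpu, n);                        // H2D copy node
//   auto kernel = cg.kernel((n+255)/256, 256, 0, k_add<int>, gpu, n, 1);
//   auto d2h    = cg.copy(cpu, gpu, n);                        // D2H copy node
//   h2d.precede(kernel);
//   kernel.precede(d2h);
//
//   tf::cudaGraphExec exec(cg);      // instantiate an executable graph once
//   tf::cudaStream stream;
//   stream.run(exec).synchronize();  // run() returns the stream for chaining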
- } - - }; - - struct B { - - tf::Taskflow taskflow; - tf::Executor executor; - - A a; - - void run(int* cpu, int* gpu, unsigned n) { - - taskflow.clear(); - - auto B0 = taskflow.emplace([] () {}); - auto B1 = taskflow.emplace([&] () { - F cf; - dim3 g = {(n+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); - auto d2h = cf.copy(cpu, gpu, n); - h2d.precede(kernel); - kernel.precede(d2h); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); - }); - auto B2 = taskflow.emplace([&] () { a.run(cpu, gpu, n); }); - auto B3 = taskflow.emplace([&] () { - for(unsigned i=0; i(); -} - -TEST_CASE("CapturedNestedRuns" * doctest::timeout(300)) { - nested_runs(); -} - -/* -// ---------------------------------------------------------------------------- -// WorkerID -// ---------------------------------------------------------------------------- - -void worker_id(unsigned N, unsigned M) { - - tf::Taskflow taskflow; - tf::Executor executor(N + M); - - REQUIRE(executor.num_workers() == (N + M)); - - const unsigned s = 100; - - for(unsigned k=0; k= 0); - REQUIRE(id < N+M); - }); - - auto gputask = taskflow.emplace([&](tf::cudaFlow&) { - auto id = executor.this_worker_id(); - REQUIRE(id >= 0); - REQUIRE(id < N+M); - }); - - auto chktask = taskflow.emplace([&] () { - auto id = executor.this_worker_id(); - REQUIRE(id >= 0); - REQUIRE(id < N+M); - }); - - taskflow.emplace([&](tf::cudaFlow&) { - auto id = executor.this_worker_id(); - REQUIRE(id >= 0); - REQUIRE(id < N+M); - }); - - taskflow.emplace([&]() { - auto id = executor.this_worker_id(); - REQUIRE(id >= 0); - REQUIRE(id < N+M); - }); - - auto subflow = taskflow.emplace([&](tf::Subflow& sf){ - auto id = executor.this_worker_id(); - REQUIRE(id >= 0); - REQUIRE(id < N+M); - auto t1 = sf.emplace([&](){ - auto id = executor.this_worker_id(); - REQUIRE(id >= 0); - REQUIRE(id < N+M); - }); - auto t2 = sf.emplace([&](tf::cudaFlow&){ - auto id = executor.this_worker_id(); - REQUIRE(id >= 0); - REQUIRE(id < N+M); - }); - t1.precede(t2); - }); - - cputask.precede(gputask); - gputask.precede(chktask); - chktask.precede(subflow); - } - - executor.run_n(taskflow, 10).wait(); -} - -TEST_CASE("WorkerID.1C1G") { - worker_id(1, 1); -} - -TEST_CASE("WorkerID.1C2G") { - worker_id(1, 2); -} - -TEST_CASE("WorkerID.1C3G") { - worker_id(1, 3); -} - -TEST_CASE("WorkerID.1C4G") { - worker_id(1, 4); -} - -TEST_CASE("WorkerID.2C1G") { - worker_id(2, 1); -} - -TEST_CASE("WorkerID.2C2G") { - worker_id(2, 2); -} - -TEST_CASE("WorkerID.2C3G") { - worker_id(2, 3); -} - -TEST_CASE("WorkerID.2C4G") { - worker_id(2, 4); -} - -TEST_CASE("WorkerID.3C1G") { - worker_id(3, 1); -} - -TEST_CASE("WorkerID.3C2G") { - worker_id(3, 2); -} - -TEST_CASE("WorkerID.3C3G") { - worker_id(3, 3); -} - -TEST_CASE("WorkerID.3C4G") { - worker_id(3, 4); -} - -TEST_CASE("WorkerID.4C1G") { - worker_id(4, 1); -} - -TEST_CASE("WorkerID.4C2G") { - worker_id(4, 2); -} - -TEST_CASE("WorkerID.4C3G") { - worker_id(4, 3); -} - -TEST_CASE("WorkerID.4C4G") { - worker_id(4, 4); -} */ - -// ---------------------------------------------------------------------------- -// Multiruns -// ---------------------------------------------------------------------------- - -void multiruns(unsigned N, unsigned M) { - - tf::Taskflow taskflow; - tf::Executor executor(N + M); - - const unsigned n = 1000; - const unsigned s = 100; - - int *cpu[s] = {0}; - int *gpu[s] = {0}; - - for(unsigned k=0; k(std::calloc(n, 
sizeof(int))); - REQUIRE(cudaMalloc(&gpu[k], n*sizeof(int)) == cudaSuccess); - }); - - auto gputask = taskflow.emplace([&, k, number]() { - tf::cudaFlow cf; - dim3 g = {(n+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu[k], cpu[k], n); - auto kernel = cf.kernel(g, b, 0, k_add, gpu[k], n, number); - auto d2h = cf.copy(cpu[k], gpu[k], n); - h2d.precede(kernel); - kernel.precede(d2h); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); - }); - - auto chktask = taskflow.emplace([&, k, number] () { - for(unsigned i=0; i -void subflow() { - tf::Taskflow taskflow; - tf::Executor executor; - - int* cpu = nullptr; - int* gpu = nullptr; - - const unsigned n = 1000; - - auto partask = taskflow.emplace([&](tf::Subflow& sf){ - - auto cputask = sf.emplace([&](){ - cpu = static_cast(std::calloc(n, sizeof(int))); - REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); - }); - - auto gputask = sf.emplace([&]() { - F cf; - dim3 g = {(n+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); - auto d2h = cf.copy(cpu, gpu, n); - h2d.precede(kernel); - kernel.precede(d2h); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); - }); - - cputask.precede(gputask); - }); - - auto chktask = taskflow.emplace([&](){ - for(unsigned i=0; i(); -} - -TEST_CASE("CapturedSubflow" * doctest::timeout(300)) { - subflow(); -} - -// ---------------------------------------------------------------------------- -// NestedSubflow -// ---------------------------------------------------------------------------- - -template -void nested_subflow() { - - tf::Taskflow taskflow; - tf::Executor executor; - - int* cpu = nullptr; - int* gpu = nullptr; - - const unsigned n = 1000; - - auto cputask = taskflow.emplace([&](){ - cpu = static_cast(std::calloc(n, sizeof(int))); - REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); - }); - - auto partask = taskflow.emplace([&](tf::Subflow& sf){ - - auto gputask1 = sf.emplace([&]() { - F cf; - dim3 g = {(n+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); - auto d2h = cf.copy(cpu, gpu, n); - h2d.precede(kernel); - kernel.precede(d2h); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); - }); - - auto subtask1 = sf.emplace([&](tf::Subflow& sf2) { - auto gputask2 = sf2.emplace([&]() { - F cf; - dim3 g = {(n+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); - auto d2h = cf.copy(cpu, gpu, n); - h2d.precede(kernel); - kernel.precede(d2h); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); - }); - - auto subtask2 = sf2.emplace([&](tf::Subflow& sf3){ - sf3.emplace([&]() { - F cf; - dim3 g = {(n+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); - auto d2h = cf.copy(cpu, gpu, n); - h2d.precede(kernel); - kernel.precede(d2h); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); - }); - }); - - gputask2.precede(subtask2); - }); - - gputask1.precede(subtask1); - }); - - auto chktask = taskflow.emplace([&](){ - for(unsigned i=0; i(); -} - -TEST_CASE("CapturedNestedSubflow" * doctest::timeout(300) ) { - nested_subflow(); -} - - -// 
---------------------------------------------------------------------------- -// DetachedSubflow -// ---------------------------------------------------------------------------- - -template -void detached_subflow() { - - tf::Taskflow taskflow; - tf::Executor executor; - - int* cpu = nullptr; - int* gpu = nullptr; - - const unsigned n = 1000; - - taskflow.emplace([&](tf::Subflow& sf){ - - auto cputask = sf.emplace([&](){ - cpu = static_cast(std::calloc(n, sizeof(int))); - REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); - }); - - auto gputask = sf.emplace([&]() { - F cf; - dim3 g = {(n+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); - auto d2h = cf.copy(cpu, gpu, n); - h2d.precede(kernel); - kernel.precede(d2h); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); - }); - - cputask.precede(gputask); - - sf.detach(); - }); - - executor.run(taskflow).wait(); - - for(unsigned i=0; i(); -} - -TEST_CASE("CapturedDetachedSubflow" * doctest::timeout(300)) { - detached_subflow(); -} - // ---------------------------------------------------------------------------- // Conditional GPU tasking // ---------------------------------------------------------------------------- -template -void loop() { +TEST_CASE("cudaGraph.ConditionTask" * doctest::timeout(300)) { tf::Taskflow taskflow; tf::Executor executor; @@ -1510,16 +639,16 @@ void loop() { }); auto gputask = taskflow.emplace([&]() { - F cf; + tf::cudaGraph cg; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); - auto d2h = cf.copy(cpu, gpu, n); + auto h2d = cg.copy(gpu, cpu, n); + auto kernel = cg.kernel(g, b, 0, k_add, gpu, n, 1); + auto d2h = cg.copy(cpu, gpu, n); h2d.precede(kernel); kernel.precede(d2h); - run_and_wait(cf); - REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); + run_and_wait(cg); + REQUIRE(cg.num_nodes() == 3); }); auto condition = taskflow.emplace([&cpu, round=0] () mutable { @@ -1542,20 +671,12 @@ void loop() { executor.run(taskflow).wait(); } -TEST_CASE("Loop" * doctest::timeout(300)) { - loop(); -} - -TEST_CASE("CapturedLoop" * doctest::timeout(300)) { - loop(); -} - // ---------------------------------------------------------------------------- // Predicate // ---------------------------------------------------------------------------- -TEST_CASE("Predicate") { +TEST_CASE("cudaGraph.Loop") { tf::Taskflow taskflow; tf::Executor executor; @@ -1572,16 +693,17 @@ TEST_CASE("Predicate") { }); auto gputask = taskflow.emplace([&]() { - tf::cudaFlow cf; + tf::cudaGraph cg; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; - auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); - auto copy = cf.copy(cpu, gpu, n); + auto kernel = cg.kernel(g, b, 0, k_add, gpu, n, 1); + auto copy = cg.copy(cpu, gpu, n); kernel.precede(copy); tf::cudaStream stream; + tf::cudaGraphExec exec(cg); for(int i=0; i<100; i++) { - cf.run(stream); + stream.run(exec); } stream.synchronize(); }); @@ -1599,54 +721,3 @@ TEST_CASE("Predicate") { executor.run(taskflow).wait(); } - -// ---------------------------------------------------------------------------- -// Repeat -// ---------------------------------------------------------------------------- - -TEST_CASE("Repeat") { - - tf::Taskflow taskflow; - tf::Executor executor; - - const unsigned n = 1000; - - int* cpu = nullptr; - int* gpu = nullptr; - - auto cputask = 
taskflow.emplace([&](){ - cpu = static_cast(std::calloc(n, sizeof(int))); - REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); - REQUIRE(cudaMemcpy(gpu, cpu, n*sizeof(int), cudaMemcpyHostToDevice) == cudaSuccess); - }); - - auto gputask = taskflow.emplace([&]() { - tf::cudaFlow cf; - dim3 g = {(n+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); - auto copy = cf.copy(cpu, gpu, n); - kernel.precede(copy); - - tf::cudaStream stream; - for(int i=0; i<100; i++) { - cf.run(stream); - } - stream.synchronize(); - }); - - auto freetask = taskflow.emplace([&](){ - for(unsigned i=0; i -void run_and_wait(T& cf) { +void run_and_wait(tf::cudaGraphExec& exec) { tf::cudaStream stream; - cf.run(stream); - stream.synchronize(); + stream.run(exec).synchronize(); } // ---------------------------------------------------------------------------- @@ -19,57 +17,74 @@ void run_and_wait(T& cf) { // ---------------------------------------------------------------------------- template -void cuda_for_each() { - +void for_each() { + tf::Taskflow taskflow; tf::Executor executor; - - for(int n=0; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) { + for(int n=1; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) { + taskflow.emplace([n](){ - tf::cudaStream stream; - tf::cudaDefaultExecutionPolicy policy(stream); - auto g_data = tf::cuda_malloc_shared(n); + auto cpu = static_cast(std::calloc(n, sizeof(T))); + + T* gpu = nullptr; + REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess); + + tf::cudaGraph cg; + auto d2h = cg.copy(cpu, gpu, n); + auto h2d = cg.copy(gpu, cpu, n); + auto kernel = cg.for_each( + gpu, gpu+n, [] __device__ (T& val) { val = 65536; } + ); + h2d.precede(kernel); + d2h.succeed(kernel); + + tf::cudaGraphExec exec(cg); + + run_and_wait(exec); + for(int i=0; i(); +TEST_CASE("cudaGraph.for_each.int" * doctest::timeout(300)) { + for_each(); } -TEST_CASE("cuda_for_each.float" * doctest::timeout(300)) { - cuda_for_each(); +TEST_CASE("cudaGraph.for_each.float" * doctest::timeout(300)) { + for_each(); } -TEST_CASE("cuda_for_each.double" * doctest::timeout(300)) { - cuda_for_each(); +TEST_CASE("cudaGraph.for_each.double" * doctest::timeout(300)) { + for_each(); } // ---------------------------------------------------------------------------- -// for_each +// for_each_index // ---------------------------------------------------------------------------- -template -void cudaflow_for_each() { +template +void for_each_index() { tf::Taskflow taskflow; tf::Executor executor; @@ -83,30 +98,32 @@ void cudaflow_for_each() { T* gpu = nullptr; REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess); - F cf; - auto d2h = cf.copy(cpu, gpu, n); - auto h2d = cf.copy(gpu, cpu, n); - auto kernel = cf.for_each( - gpu, gpu+n, [] __device__ (T& val) { val = 65536; } + tf::cudaGraph cg; + auto d2h = cg.copy(cpu, gpu, n); + auto h2d = cg.copy(gpu, cpu, n); + auto kernel = cg.for_each_index( + 0, n, 1, [gpu] __device__ (int i) { gpu[i] = 65536; } ); h2d.precede(kernel); d2h.succeed(kernel); - run_and_wait(cf); + tf::cudaGraphExec exec(cg); + + run_and_wait(exec); for(int i=0; i(); +TEST_CASE("cudaGraph.for_each_index.int" * doctest::timeout(300)) { + for_each_index(); } -TEST_CASE("cudaFlow.for_each.float" * doctest::timeout(300)) { - cudaflow_for_each(); +TEST_CASE("cudaGraph.for_each_index.float" * doctest::timeout(300)) { + for_each_index(); } -TEST_CASE("cudaFlow.for_each.double" * doctest::timeout(300)) { - cudaflow_for_each(); +TEST_CASE("cudaGraph.for_each_index.double" * doctest::timeout(300)) 
{ + for_each_index(); } -TEST_CASE("cudaFlowCapturer.for_each.int" * doctest::timeout(300)) { - cudaflow_for_each(); -} - -TEST_CASE("cudaFlowCapturer.for_each.float" * doctest::timeout(300)) { - cudaflow_for_each(); -} -TEST_CASE("cudaFlowCapturer.for_each.double" * doctest::timeout(300)) { - cudaflow_for_each(); -} diff --git a/unittests/cuda/test_cuda_for_each_index.cu b/unittests/cuda/test_cuda_for_each_index.cu deleted file mode 100644 index a54a0f102..000000000 --- a/unittests/cuda/test_cuda_for_each_index.cu +++ /dev/null @@ -1,143 +0,0 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN - -#include -#include -#include -#include - -constexpr float eps = 0.0001f; - -template -void run_and_wait(T& cf) { - tf::cudaStream stream; - cf.run(stream); - stream.synchronize(); -} - -// ---------------------------------------------------------------------------- -// for_each_index -// ---------------------------------------------------------------------------- - -template -void cuda_for_each_index() { - - tf::Taskflow taskflow; - tf::Executor executor; - - for(int n=0; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) { - - taskflow.emplace([n](){ - tf::cudaStream stream; - tf::cudaDefaultExecutionPolicy policy(stream); - - auto g_data = tf::cuda_malloc_shared(n); - for(int i=0; i(); -} - -TEST_CASE("cuda_for_each_index.float" * doctest::timeout(300)) { - cuda_for_each_index(); -} - -TEST_CASE("cuda_for_each_index.double" * doctest::timeout(300)) { - cuda_for_each_index(); -} - -// ---------------------------------------------------------------------------- -// for_each_index -// ---------------------------------------------------------------------------- - -template -void cudaflow_for_each_index() { - - tf::Taskflow taskflow; - tf::Executor executor; - - for(int n=1; n<=1234567; n = (n<=100) ? 
n+1 : n*2 + 1) { - - taskflow.emplace([n](){ - - auto cpu = static_cast(std::calloc(n, sizeof(T))); - - T* gpu = nullptr; - REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess); - - F cf; - auto d2h = cf.copy(cpu, gpu, n); - auto h2d = cf.copy(gpu, cpu, n); - auto kernel = cf.for_each_index( - 0, n, 1, [gpu] __device__ (int i) { gpu[i] = 65536; } - ); - h2d.precede(kernel); - d2h.succeed(kernel); - - run_and_wait(cf); - - for(int i=0; i(); -} - -TEST_CASE("cudaFlow.for_each_index.float" * doctest::timeout(300)) { - cudaflow_for_each_index(); -} - -TEST_CASE("cudaFlow.for_each_index.double" * doctest::timeout(300)) { - cudaflow_for_each_index(); -} - -TEST_CASE("cudaFlowCapturer.for_each_index.int" * doctest::timeout(300)) { - cudaflow_for_each_index(); -} - -TEST_CASE("cudaFlowCapturer.for_each_index.float" * doctest::timeout(300)) { - cudaflow_for_each_index(); -} - -TEST_CASE("cudaFlowCapturer.for_each_index.double" * doctest::timeout(300)) { - cudaflow_for_each_index(); -} - diff --git a/unittests/cuda/test_cuda_kmeans.cu b/unittests/cuda/test_cuda_kmeans.cu index cd48579d1..798280ec7 100644 --- a/unittests/cuda/test_cuda_kmeans.cu +++ b/unittests/cuda/test_cuda_kmeans.cu @@ -4,15 +4,13 @@ #include #include #include -#include #define L2(x1, y1, x2, y2) ((x1-x2)*(x1-x2) + (y1-y2)*(y1-y2)) -template -void run_and_wait(T& cf) { +void run_and_wait(tf::cudaGraph& cg) { + tf::cudaGraphExec exec(cg); tf::cudaStream stream; - cf.run(stream); - stream.synchronize(); + stream.run(exec).synchronize(); } // Each point (thread) computes its distance to each centroid @@ -182,36 +180,36 @@ void kmeans(int N, int K, int M, size_t num_cpus, size_t num_gpus) { }).name("allocate_c"); auto h2d = taskflow.emplace([&](){ - tf::cudaFlow cf; - cf.copy(d_px, h_px.data(), N).name("h2d_px"); - cf.copy(d_py, h_py.data(), N).name("h2d_py"); - cf.copy(d_mx, h_mx.data(), K).name("h2d_mx"); - cf.copy(d_my, h_my.data(), K).name("h2d_my"); - run_and_wait(cf); + tf::cudaGraph cg; + cg.copy(d_px, h_px.data(), N); + cg.copy(d_py, h_py.data(), N); + cg.copy(d_mx, h_mx.data(), K); + cg.copy(d_my, h_my.data(), K); + run_and_wait(cg); }).name("h2d"); auto kmeans = taskflow.emplace([&](){ - tf::cudaFlow cf; + tf::cudaGraph cg; - auto zero_c = cf.zero(d_c, K).name("zero_c"); - auto zero_sx = cf.zero(d_sx, K).name("zero_sx"); - auto zero_sy = cf.zero(d_sy, K).name("zero_sy"); + auto zero_c = cg.zero(d_c, K); + auto zero_sx = cg.zero(d_sx, K); + auto zero_sy = cg.zero(d_sy, K); - auto cluster = cf.kernel( + auto cluster = cg.kernel( (N+1024-1) / 1024, 1024, 0, assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c - ).name("cluster"); + ); - auto new_centroid = cf.kernel( + auto new_centroid = cg.kernel( 1, K, 0, compute_new_means, d_mx, d_my, d_sx, d_sy, d_c - ).name("new_centroid"); + ); cluster.precede(new_centroid) .succeed(zero_c, zero_sx, zero_sy); - run_and_wait(cf); + run_and_wait(cg); }).name("update_means"); auto gpu_condition = taskflow.emplace([i=0, M] () mutable { @@ -219,10 +217,10 @@ void kmeans(int N, int K, int M, size_t num_cpus, size_t num_gpus) { }).name("converged?"); auto stop = taskflow.emplace([&](){ - tf::cudaFlow cf; - cf.copy(h_mx.data(), d_mx, K).name("d2h_mx"); - cf.copy(h_my.data(), d_my, K).name("d2h_my"); - run_and_wait(cf); + tf::cudaGraph cg; + cg.copy(h_mx.data(), d_mx, K); + cg.copy(h_my.data(), d_my, K); + run_and_wait(cg); }).name("stop"); auto free = taskflow.emplace([&](){ diff --git a/unittests/cuda/test_cuda_matrix.cu b/unittests/cuda/test_cuda_matrix.cu index 827ede021..59467db8c 
100644 --- a/unittests/cuda/test_cuda_matrix.cu +++ b/unittests/cuda/test_cuda_matrix.cu @@ -4,13 +4,6 @@ #include #include -template -void run_and_wait(T& cf) { - tf::cudaStream stream; - cf.run(stream); - stream.synchronize(); -} - // ---------------------------------------------------------------------------- // Matrix Multiplication Kernel // ---------------------------------------------------------------------------- @@ -74,19 +67,22 @@ TEST_CASE("multiply" * doctest::timeout(300)) { }).name("hc"); auto cuda = taskflow.emplace([&](){ - tf::cudaFlow cf; - auto pa = cf.copy(da, ha, m*n); - auto pb = cf.copy(db, hb, n*k); + tf::cudaGraph cg; + auto pa = cg.copy(da, ha, m*n); + auto pb = cg.copy(db, hb, n*k); - auto op = cf.kernel( + auto op = cg.kernel( grid, block, 0, k_multiplication, da, db, dc, m, n, k - ).name("op"); + ); - auto cc = cf.copy(hc, dc, m*k) - .name("cc"); + auto cc = cg.copy(hc, dc, m*k); op.precede(cc).succeed(pa, pb); - run_and_wait(cf); + + tf::cudaGraphExec exec(cg); + tf::cudaStream stream; + stream.run(exec) + .synchronize(); }); cuda.succeed(hosta, hostb, hostc); @@ -153,12 +149,14 @@ TEST_CASE("transpose" * doctest::timeout(300)) { }).name("ha"); auto op = taskflow.emplace([&](){ - tf::cudaFlow cf; - auto copyin = cf.copy(sin, ptr_in, m*n); - auto copyout = cf.copy(ptr_out, sout, m*n); - auto trans = cf.kernel(grid, block, 0, k_transpose, sin, sout, m, n); + tf::cudaGraph cg; + auto copyin = cg.copy(sin, ptr_in, m*n); + auto copyout = cg.copy(ptr_out, sout, m*n); + auto trans = cg.kernel(grid, block, 0, k_transpose, sin, sout, m, n); trans.succeed(copyin).precede(copyout); - run_and_wait(cf); + tf::cudaGraphExec exec(cg); + tf::cudaStream stream; + stream.run(exec).synchronize(); }); hin.precede(op); @@ -225,13 +223,16 @@ TEST_CASE("product" * doctest::timeout(300)) { }); auto kernel = taskflow.emplace([&, i](){ - tf::cudaFlow cf; - auto copyA = cf.copy(dA[i], hA[i], N); - auto copyB = cf.copy(dB[i], hB[i], N); - auto op = cf.kernel(grid, block, 0, k_product, dA[i], dB[i], dC[i], N); - auto copyC = cf.copy(hC[i], dC[i], N); + tf::cudaGraph cg; + auto copyA = cg.copy(dA[i], hA[i], N); + auto copyB = cg.copy(dB[i], hB[i], N); + auto op = cg.kernel(grid, block, 0, k_product, dA[i], dB[i], dC[i], N); + auto copyC = cg.copy(hC[i], dC[i], N); op.succeed(copyA, copyB).precede(copyC); - run_and_wait(cf); + tf::cudaStream stream; + tf::cudaGraphExec exec(cg); + stream.run(exec) + .synchronize(); }); auto deallocate = taskflow.emplace([&, i, v1, v2](){ diff --git a/unittests/cuda/test_cuda_memory.cu b/unittests/cuda/test_cuda_memory.cu deleted file mode 100644 index 1a1ed14b9..000000000 --- a/unittests/cuda/test_cuda_memory.cu +++ /dev/null @@ -1,99 +0,0 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN - -#include -#include -#include -#include - -// ---------------------------------------------------------------------------- -// USM Allocator -// ---------------------------------------------------------------------------- - -TEST_CASE("cudaUSMAllocator" * doctest::timeout(300)) { - - tf::cudaStream stream; - - std::vector> vec; - std::vector> rhs; - - REQUIRE(vec.size() == 0); - - vec.resize(100, 10); - REQUIRE(vec.size() == 100); - for(auto c : vec) { - REQUIRE(c == 10); - } - - rhs = std::move(vec); - - REQUIRE(vec.size() == 0); - REQUIRE(rhs.size() == 100); - for(auto c : rhs) { - REQUIRE(c == 10); - } - - for(int i=0; i<65536; i++) { - vec.push_back(-i); - } - for(int i=0; i<65536; i++) { - REQUIRE(vec[i] == -i); - } - - rhs = vec; - - for(int i=0; i<65536; i++) { - 
REQUIRE(vec[i] == rhs[i]); - } - - tf::cudaDefaultExecutionPolicy p(stream); - - tf::cuda_for_each(p, vec.data(), vec.data() + vec.size(), [] __device__ (int& v) { - v = -177; - }); - stream.synchronize(); - - rhs = vec; - for(size_t i=0; i, tf::cudaDeviceAllocator>> vec; - std::vector, tf::cudaDeviceAllocator>> rhs(N); - - REQUIRE(vec.size() == 0); - REQUIRE(rhs.size() == 10000); - - //tf::cudaStream stream; - //tf::cudaDefaultExecutionPolicy policy(stream); - // - //tf::cuda_for_each(policy, rhs.data(), rhs.data() + N, [] __device__ (tf::NoInit& v) { - // v = -177; - //}); - //stream.synchronize(); -} - - - - - - - - - - - - diff --git a/unittests/cuda/test_cuda_objects.cu b/unittests/cuda/test_cuda_objects.cu index 1eb6ba67c..ec2119df1 100644 --- a/unittests/cuda/test_cuda_objects.cu +++ b/unittests/cuda/test_cuda_objects.cu @@ -4,6 +4,7 @@ #include #include + TEST_CASE("cuda.version" * doctest::timeout(300) ) { REQUIRE(tf::cuda_get_driver_version() > 0); REQUIRE(tf::cuda_get_runtime_version() > 0); @@ -54,21 +55,33 @@ TEST_CASE("cudaStream" * doctest::timeout(300)) { cudaStreamCreate(&s2_source); tf::cudaStream s2(s2_source); - REQUIRE(s2 == s2_source); + REQUIRE(s2.get() == s2_source); - cudaStream_t s1_source = s1; - REQUIRE(s1 == s1_source); + cudaStream_t s1_source = s1.get(); + REQUIRE(s1.get() == s1_source); // query status - REQUIRE(cudaStreamQuery(s1) == cudaSuccess); - REQUIRE(cudaStreamQuery(s2) == cudaSuccess); + REQUIRE(cudaStreamQuery(s1.get()) == cudaSuccess); + REQUIRE(cudaStreamQuery(s2.get()) == cudaSuccess); s1 = std::move(s2); REQUIRE(s2 == nullptr); - REQUIRE(s1 == s2_source); - REQUIRE(cudaStreamQuery(s1) == cudaSuccess); + REQUIRE(s1.get() == s2_source); + REQUIRE(cudaStreamQuery(s1.get()) == cudaSuccess); + + // create a nullstream + tf::cudaStream s3(std::move(s1)); + + REQUIRE(s1 == nullptr); + REQUIRE(s3.get() == s2_source); + // create an empty stream + tf::cudaStream s4(nullptr); + REQUIRE(s4 == nullptr); + + s3 = std::move(s4); + REQUIRE(s3.get() == nullptr); } // ---------------------------------------------------------------------------- @@ -79,27 +92,30 @@ TEST_CASE("cudaEvent" * doctest::timeout(300)) { // create a new event e1 inside tf::cudaEvent e1; + + REQUIRE(e1 != nullptr); + REQUIRE(e1.get() != nullptr); // create another event e2 from the outside cudaEvent_t e2_source; cudaEventCreate(&e2_source); tf::cudaEvent e2(e2_source); - REQUIRE(e2 == e2_source); + REQUIRE(e2.get() == e2_source); - cudaEvent_t e1_source = e1; - REQUIRE(e1 == e1_source); + cudaEvent_t e1_source = e1.get(); + REQUIRE(e1.get() == e1_source); // query status - REQUIRE(cudaEventQuery(e1) == cudaSuccess); - REQUIRE(cudaEventQuery(e2) == cudaSuccess); + REQUIRE(cudaEventQuery(e1.get()) == cudaSuccess); + REQUIRE(cudaEventQuery(e2.get()) == cudaSuccess); e1 = std::move(e2); REQUIRE(e2 == nullptr); - REQUIRE(e1 == e2_source); - REQUIRE(cudaEventQuery(e1) == cudaSuccess); - REQUIRE(cudaEventQuery(e2) != cudaSuccess); + REQUIRE(e1.get() == e2_source); + REQUIRE(cudaEventQuery(e1.get()) == cudaSuccess); + REQUIRE(cudaEventQuery(e2.get()) != cudaSuccess); } // ---------------------------------------------------------------------------- @@ -111,32 +127,69 @@ TEST_CASE("cudaGraph" * doctest::timeout(300)) { // create a new graph g1 inside tf::cudaGraph g1; - cudaGraph_t g1_source = g1; - REQUIRE(g1 == g1_source); + cudaGraph_t g1_source = g1.get(); + REQUIRE(g1.get() == g1_source); // create another graph g2 from the outside cudaGraph_t g2_source; cudaGraphCreate(&g2_source, 0); 
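// Editorial note: the assertions in this hunk track an API change — the RAII
// wrappers (tf::cudaStream, tf::cudaEvent, tf::cudaGraph) no longer convert
// implicitly to their native CUDA handles; the raw handle is now obtained via
// get(). A short sketch of the ownership semantics these tests exercise,
// assuming only the get()/release()/reset() and move operations shown here:
//
//   tf::cudaGraph g1;                  // owns a freshly created cudaGraph_t
//   cudaGraph_t raw = g1.get();        // observe, but do not own, the handle
//   tf::cudaGraph g2(std::move(g1));   // transfer ownership; g1 becomes null
//   g1.reset(g2.release());            // hand the same handle back to g1
//   g1.reset();                        // destroy the graph; g1 becomes null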
tf::cudaGraph g2(g2_source); - REQUIRE(g2 == g2_source); + REQUIRE(g2.get() == g2_source); g1 = std::move(g2); REQUIRE(g2 == nullptr); - REQUIRE(g1 == g2_source); + REQUIRE(g1.get() == g2_source); // reassign g1 (now holding g2_source) to g2 g2.reset(g1.release()); REQUIRE(g1 == nullptr); - REQUIRE(g2 == g2_source); + REQUIRE(g2.get() == g2_source); - // clear - g2.clear(); - g1.clear(); + g1.reset(); + g2.reset(); REQUIRE(g1 == nullptr); REQUIRE(g2 == nullptr); } +// ---------------------------------------------------------------------------- +// CUDA Graph Exec +// ---------------------------------------------------------------------------- + +TEST_CASE("cudaGraphExec" * doctest::timeout(300)) { + + // create a new graph g1 inside + tf::cudaGraph g1, g2, g3; + tf::cudaGraphExec e1(g1), e2(g2), e3(g3); + + // create another graph g2 from the outside + REQUIRE(g1 != nullptr); + REQUIRE(g2 != nullptr); + REQUIRE(g3 != nullptr); + REQUIRE(e1 != nullptr); + REQUIRE(e2 != nullptr); + REQUIRE(e3 != nullptr); + + auto re1 = e1.get(); + auto re2 = e2.get(); + auto re3 = e3.get(); + + REQUIRE(re1 != nullptr); + REQUIRE(re2 != nullptr); + REQUIRE(re3 != nullptr); + + e1 = std::move(e2); + REQUIRE(e1.get() == re2); + REQUIRE(e2.get() == nullptr); + + e2 = std::move(e3); + REQUIRE(e2.get() == re3); + REQUIRE(e3.get() == nullptr); +} + + + + diff --git a/unittests/cuda/test_cuda_transform.cu b/unittests/cuda/test_cuda_transform.cu index 29eb84aa7..bc43bec64 100644 --- a/unittests/cuda/test_cuda_transform.cu +++ b/unittests/cuda/test_cuda_transform.cu @@ -7,92 +7,17 @@ constexpr float eps = 0.0001f; -template -void run_and_wait(T& cf) { +void run_and_wait(tf::cudaGraphExec& exec) { tf::cudaStream stream; - cf.run(stream); - stream.synchronize(); + stream.run(exec).synchronize(); } // ---------------------------------------------------------------------------- -// cuda transform +// cudaflow transform 1 // ---------------------------------------------------------------------------- template -void cuda_transform() { - - tf::Taskflow taskflow; - tf::Executor executor; - - for(int n=1; n<=1234567; n = (n<=100) ? 
n+1 : n*2 + 1) { - - taskflow.emplace([n](){ - - tf::cudaStream stream; - tf::cudaDefaultExecutionPolicy policy(stream); - - T v1 = ::rand() % 100; - T v2 = ::rand() % 100; - - T* dx = tf::cuda_malloc_shared(n); - T* dy = tf::cuda_malloc_shared(n); - - for(int i=0; i(); -} - -TEST_CASE("cuda_transform.float" * doctest::timeout(300)) { - cuda_transform(); -} - -TEST_CASE("cuda_transform.double" * doctest::timeout(300)) { - cuda_transform(); -} - -// ---------------------------------------------------------------------------- -// cudaflow transform -// ---------------------------------------------------------------------------- - -template -void cudaflow_transform() { +void transform1() { tf::Taskflow taskflow; tf::Executor executor; @@ -118,18 +43,19 @@ void cudaflow_transform() { REQUIRE(cudaMalloc(&dy, n*sizeof(T)) == cudaSuccess); // axpy - F cf; - auto h2d_x = cf.copy(dx, hx.data(), n).name("h2d_x"); - auto h2d_y = cf.copy(dy, hy.data(), n).name("h2d_y"); - auto d2h_x = cf.copy(hx.data(), dx, n).name("d2h_x"); - auto d2h_y = cf.copy(hy.data(), dy, n).name("d2h_y"); - auto kernel = cf.transform(dx, dx+n, dy, + tf::cudaGraph cg; + auto h2d_x = cg.copy(dx, hx.data(), n); + auto h2d_y = cg.copy(dy, hy.data(), n); + auto d2h_x = cg.copy(hx.data(), dx, n); + auto d2h_y = cg.copy(hy.data(), dy, n); + auto kernel = cg.transform(dx, dx+n, dy, [] __device__ (T x) { return x + 2; } ); kernel.succeed(h2d_x, h2d_y) .precede(d2h_x, d2h_y); - run_and_wait(cf); + tf::cudaGraphExec exec(cg); + run_and_wait(exec); // verify the result for (int i = 0; i < n; i++) { @@ -138,11 +64,11 @@ void cudaflow_transform() { } // update the kernel and run the cf again - cf.transform(kernel, dy, dy+n, dx, + exec.transform(kernel, dy, dy+n, dx, [] __device__ (T y) { return y - 4; } ); - run_and_wait(cf); + run_and_wait(exec); // verify the result for (int i = 0; i < n; i++) { @@ -159,101 +85,24 @@ void cudaflow_transform() { executor.run(taskflow).wait(); } -TEST_CASE("cudaFlow.transform.int" * doctest::timeout(300)) { - cudaflow_transform(); +TEST_CASE("cudaGraph.transform1.int" * doctest::timeout(300)) { + transform1(); } -TEST_CASE("cudaFlow.transform.float" * doctest::timeout(300)) { - cudaflow_transform(); +TEST_CASE("cudaGraph.transform1.float" * doctest::timeout(300)) { + transform1(); } -TEST_CASE("cudaFlow.transform.double" * doctest::timeout(300)) { - cudaflow_transform(); -} - -TEST_CASE("cudaFlowCapturer.transform.int" * doctest::timeout(300)) { - cudaflow_transform(); -} - -TEST_CASE("cudaFlowCapturer.transform.float" * doctest::timeout(300)) { - cudaflow_transform(); -} - -TEST_CASE("cudaFlowCapturer.transform.double" * doctest::timeout(300)) { - cudaflow_transform(); +TEST_CASE("cudaGraph.transform1.double" * doctest::timeout(300)) { + transform1(); } // ---------------------------------------------------------------------------- -// cuda transform2 +// cudaGraph transform2 // ---------------------------------------------------------------------------- template -void cuda_transform2() { - - tf::Taskflow taskflow; - tf::Executor executor; - - for(int n=1; n<=1234567; n = (n<=100) ? 
n+1 : n*2 + 1) { - - taskflow.emplace([n](){ - - tf::cudaStream stream; - tf::cudaDefaultExecutionPolicy policy(stream); - - T v1 = ::rand() % 100; - T v2 = ::rand() % 100; - T v3 = ::rand() % 1000; - - T* dx = tf::cuda_malloc_shared(n); - T* dy = tf::cuda_malloc_shared(n); - T* dz = tf::cuda_malloc_shared(n); - - for(int i=0; i(); -} - -TEST_CASE("cuda_transform2.float" * doctest::timeout(300)) { - cuda_transform2(); -} - -TEST_CASE("cuda_transform2.double" * doctest::timeout(300)) { - cuda_transform2(); -} - -// ---------------------------------------------------------------------------- -// cudaflow transform2 -// ---------------------------------------------------------------------------- - -template -void cudaflow_transform2() { +void transform2() { tf::Taskflow taskflow; tf::Executor executor; @@ -285,20 +134,22 @@ void cudaflow_transform2() { REQUIRE(cudaMalloc(&dz, n*sizeof(T)) == cudaSuccess); // axpy - F cf; - auto h2d_x = cf.copy(dx, hx.data(), n).name("h2d_x"); - auto h2d_y = cf.copy(dy, hy.data(), n).name("h2d_y"); - auto h2d_z = cf.copy(dz, hz.data(), n).name("h2d_z"); - auto d2h_x = cf.copy(hx.data(), dx, n).name("d2h_x"); - auto d2h_y = cf.copy(hy.data(), dy, n).name("d2h_y"); - auto d2h_z = cf.copy(hz.data(), dz, n).name("d2h_z"); - auto kernel = cf.transform(dx, dx+n, dy, dz, + tf::cudaGraph cg; + auto h2d_x = cg.copy(dx, hx.data(), n); + auto h2d_y = cg.copy(dy, hy.data(), n); + auto h2d_z = cg.copy(dz, hz.data(), n); + auto d2h_x = cg.copy(hx.data(), dx, n); + auto d2h_y = cg.copy(hy.data(), dy, n); + auto d2h_z = cg.copy(hz.data(), dz, n); + auto kernel = cg.transform(dx, dx+n, dy, dz, [] __device__ (T x, T y) { return x + y; } ); kernel.succeed(h2d_x, h2d_y, h2d_z) .precede(d2h_x, d2h_y, d2h_z); - run_and_wait(cf); + tf::cudaGraphExec exec(cg); + + run_and_wait(exec); // verify the result for (int i = 0; i < n; i++) { @@ -307,15 +158,15 @@ void cudaflow_transform2() { REQUIRE(std::fabs(hz[i] - v1 - v2) < eps); } - // update the kernel and run the cf again + // update the kernel and run the exec again // dz = v1 + v2 // dx = v1 // dy = v2 - cf.transform(kernel, dz, dz+n, dx, dy, + exec.transform(kernel, dz, dz+n, dx, dy, [] __device__ (T z, T x) { return z + x + T(10); } ); - run_and_wait(cf); + run_and_wait(exec); // verify the result for (int i = 0; i < n; i++) { @@ -332,26 +183,15 @@ void cudaflow_transform2() { executor.run(taskflow).wait(); } -TEST_CASE("cudaFlow.transform2.int" * doctest::timeout(300)) { - cudaflow_transform2(); -} - -TEST_CASE("cudaFlow.transform2.float" * doctest::timeout(300)) { - cudaflow_transform2(); +TEST_CASE("cudaGraph.transform2.int" * doctest::timeout(300)) { + transform2(); } -TEST_CASE("cudaFlow.transform2.double" * doctest::timeout(300)) { - cudaflow_transform2(); +TEST_CASE("cudaGraph.transform2.float" * doctest::timeout(300)) { + transform2(); } -TEST_CASE("cudaFlowCapturer.transform2.int" * doctest::timeout(300)) { - cudaflow_transform2(); +TEST_CASE("cudaGraph.transform2.double" * doctest::timeout(300)) { + transform2(); } -TEST_CASE("cudaFlowCapturer.transform2.float" * doctest::timeout(300)) { - cudaflow_transform2(); -} - -TEST_CASE("cudaFlowCapturer.transform2.double" * doctest::timeout(300)) { - cudaflow_transform2(); -} diff --git a/unittests/cuda/test_cuda_updates.cu b/unittests/cuda/test_cuda_updates.cu new file mode 100644 index 000000000..bd2f06851 --- /dev/null +++ b/unittests/cuda/test_cuda_updates.cu @@ -0,0 +1,211 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN + +#include + +#include +#include + +template +void 
run_and_wait(T& cf) { + tf::cudaStream stream; + cf.run(stream); + stream.synchronize(); +} + +//verify +template +__global__ +void verify(const T* a, const T* b, bool* check, size_t size) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + for(;tid < size; tid += gridDim.x * blockDim.x) { + if(a[tid] != b[tid]) { + *check = false; + return; + } + } +} + +template +__global__ void k_add(T* ptr, size_t N, T value) { + int i = blockIdx.x*blockDim.x + threadIdx.x; + if (i < N) { + ptr[i] += value; + } +} + +//add +template +__global__ +void add(const T* a, const T* b, T* c, size_t size) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + for(;tid < size; tid += gridDim.x * blockDim.x) { + c[tid] = a[tid] + b[tid]; + } +} + +//multiply +template +__global__ +void multiply(const T* a, const T* b, T* c, size_t size) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + for(;tid < size; tid += gridDim.x * blockDim.x) { + c[tid] = a[tid] * b[tid]; + } +} + +// CUDA on windows require the enclosing parent function of an extended lambda +// to not have internal or no linkage, so we have to add something that is not a +// lambda. +struct cuda_graph_update_single_task_assign_int { + int* var; + int to_set; + + __device__ void operator()() const { + *var = to_set; + } +}; + +// update single_task +TEST_CASE("cudaGraph.Update.SingleTask") { + + tf::cudaGraph cg; + + auto var = tf::cuda_malloc_shared(1); + *var = 1; + REQUIRE(*var == 1); + + auto task = cg.single_task(cuda_graph_update_single_task_assign_int{var, 2}); + + tf::cudaGraphExec exec(cg); + tf::cudaStream stream; + stream.run(exec).synchronize(); + + REQUIRE(*var == 2); + + exec.single_task(task, cuda_graph_update_single_task_assign_int{var, 10}); + + stream.run(exec).synchronize(); + + REQUIRE(*var == 10); + + tf::cuda_free(var); +} + + +// update kernel +TEST_CASE("cudaGraph.Update.Kernel") { + + const size_t N = 1024; + + tf::cudaGraph cg; + + auto vec = tf::cuda_malloc_shared(N); + + auto t1 = cg.zero(vec, N); + auto t2 = cg.kernel(2, 512, 0, k_add, vec, N, 10); + t1.precede(t2); + + tf::cudaGraphExec exec(cg); + tf::cudaStream stream; + + stream.run(exec).synchronize(); + + for(size_t i=0; i, vec, N, 20); + + stream.run(exec).synchronize(); + + for(size_t i=0; i(N); + + auto t1 = cg.memset(vec, 0x01, N*sizeof(int)); + + tf::cudaGraphExec exec(cg); + tf::cudaStream stream; + + stream.run(exec).synchronize(); + + for(size_t i=0; i(N); + auto vec2 = tf::cuda_malloc_shared(N); + auto vec3 = tf::cuda_malloc_shared(N); + + for(size_t i=0; i executors(N); + + std::atomic counter(0); + + auto check_wid = [&](size_t e){ + for(size_t i=0; i counter(0); - int N = 1000; + int N = 10000; for(int i=0; i counter{0}; - - auto A = taskflow.emplace( - [&](){ counter.fetch_add(1, std::memory_order_relaxed); } - ); - auto B = taskflow.emplace( - [&](){ counter.fetch_add(1, std::memory_order_relaxed); } - ); - - taskflow.emplace( - [&](){ counter.fetch_add(1, std::memory_order_relaxed); } - ); - - auto S1 = taskflow.emplace([&] (tf::Subflow& sf){ - for(int i=0; i<1000; i++) { - sf.async([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); - } - }); - - auto S2 = taskflow.emplace([&] (tf::Subflow& sf){ - sf.emplace([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); - for(int i=0; i<1000; i++) { - sf.async([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); - } - }); - - taskflow.emplace([&] (tf::Subflow& sf){ - sf.emplace([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); - for(int i=0; i<1000; i++) { - 
sf.async([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); - } - sf.join(); - }); - - taskflow.emplace([&] (tf::Subflow& sf){ - for(int i=0; i<1000; i++) { - sf.async([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); - } - sf.join(); - }); - - A.precede(S1, S2); - B.succeed(S1, S2); - - executor.run(taskflow).wait(); - - REQUIRE(counter == 4005); -} - -TEST_CASE("SubflowAsync.1thread") { - subflow_async(1); -} - -TEST_CASE("SubflowAsync.3threads") { - subflow_async(3); -} - -TEST_CASE("SubflowAsync.11threads") { - subflow_async(11); -} - -// -------------------------------------------------------- -// Testcase: NestedSubflowAsync -// -------------------------------------------------------- - -void nested_subflow_async(size_t W) { - - tf::Taskflow taskflow; - tf::Executor executor(W); - - std::atomic counter{0}; - - taskflow.emplace([&](tf::Subflow& sf1){ - - for(int i=0; i<100; i++) { - sf1.async([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); - } - - sf1.emplace([&](tf::Subflow& sf2){ - for(int i=0; i<100; i++) { - sf2.async([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); - sf1.async( - [&](){ counter.fetch_add(1, std::memory_order_relaxed); } - ); - } - - sf2.emplace([&](tf::Subflow& sf3){ - for(int i=0; i<100; i++) { - sf3.silent_async( - [&](){ counter.fetch_add(1, std::memory_order_relaxed); } - ); - sf2.silent_async([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); - sf1.silent_async([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); - } - }); - }); - - sf1.join(); - REQUIRE(counter == 600); - }); - - executor.run(taskflow).wait(); - REQUIRE(counter == 600); -} - -TEST_CASE("NestedSubflowAsync.1thread") { - nested_subflow_async(1); -} - -TEST_CASE("NestedSubflowAsync.3threads") { - nested_subflow_async(3); -} - -TEST_CASE("NestedSubflowAsync.11threads") { - nested_subflow_async(11); -} - // -------------------------------------------------------- // Testcase: RuntimeAsync // -------------------------------------------------------- @@ -354,7 +310,7 @@ void runtime_async(size_t W) { [&](){counter.fetch_add(1, std::memory_order_relaxed);} ); } - sf.corun_all(); + sf.corun(); }); auto S2 = taskflow.emplace([&] (tf::Runtime& sf){ @@ -362,7 +318,7 @@ void runtime_async(size_t W) { for(int i=0; i<1000; i++) { sf.silent_async([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); } - sf.corun_all(); + sf.corun(); }); taskflow.emplace([&] (tf::Runtime& sf){ @@ -372,14 +328,14 @@ void runtime_async(size_t W) { [&](){ counter.fetch_add(1, std::memory_order_relaxed); } ); } - sf.corun_all(); + sf.corun(); }); taskflow.emplace([&] (tf::Runtime& sf){ for(int i=0; i<1000; i++) { sf.async([&](){ counter.fetch_add(1, std::memory_order_relaxed); }); } - sf.corun_all(); + sf.corun(); }); A.precede(S1, S2); @@ -394,10 +350,34 @@ TEST_CASE("RuntimeAsync.1thread") { runtime_async(1); } +TEST_CASE("RuntimeAsync.2threads") { + runtime_async(2); +} + TEST_CASE("RuntimeAsync.3threads") { runtime_async(3); } +TEST_CASE("RuntimeAsync.4threads") { + runtime_async(4); +} + +TEST_CASE("RuntimeAsync.5threads") { + runtime_async(5); +} + +TEST_CASE("RuntimeAsync.6threads") { + runtime_async(6); +} + +TEST_CASE("RuntimeAsync.7threads") { + runtime_async(7); +} + +TEST_CASE("RuntimeAsync.8threads") { + runtime_async(8); +} + TEST_CASE("RuntimeAsync.11threads") { runtime_async(11); } diff --git a/unittests/test_basics.cpp b/unittests/test_basics.cpp index f7aa68636..6c0e2be3e 100644 --- a/unittests/test_basics.cpp +++ b/unittests/test_basics.cpp @@ 
-18,17 +18,29 @@ TEST_CASE("Type" * doctest::timeout(300)) { auto t4 = taskflow.composed_of(taskflow2); auto t5 = taskflow.emplace([](){ return tf::SmallVector{1, 2}; }); auto t6 = taskflow.emplace([](tf::Runtime&){}); - auto t7 = taskflow.emplace([](tf::Runtime&){ return 1; }); - auto t8 = taskflow.emplace([](tf::Runtime&){ return tf::SmallVector{1, 2}; }); REQUIRE(t1.type() == tf::TaskType::STATIC); REQUIRE(t2.type() == tf::TaskType::CONDITION); REQUIRE(t3.type() == tf::TaskType::SUBFLOW); REQUIRE(t4.type() == tf::TaskType::MODULE); REQUIRE(t5.type() == tf::TaskType::CONDITION); - REQUIRE(t6.type() == tf::TaskType::STATIC); - REQUIRE(t7.type() == tf::TaskType::CONDITION); - REQUIRE(t8.type() == tf::TaskType::CONDITION); + REQUIRE(t6.type() == tf::TaskType::RUNTIME); + + // static assert + auto task1 = [](){}; + auto task2 = [](){ return 1; }; + auto task3 = [](tf::Subflow&) {}; + auto task4 = [](tf::Subflow&) { return 1; }; + auto task5 = [](tf::Runtime&) {}; + auto task6 = [](tf::Runtime&) { return 1; }; + + static_assert(tf::is_static_task_v == true, ""); + static_assert(tf::is_static_task_v == false, ""); + static_assert(tf::is_condition_task_v == true, ""); + static_assert(tf::is_subflow_task_v == true, ""); + static_assert(tf::is_subflow_task_v == false, ""); + static_assert(tf::is_runtime_task_v == true, ""); + static_assert(tf::is_runtime_task_v == false, ""); } // -------------------------------------------------------- @@ -63,7 +75,7 @@ TEST_CASE("Builder" * doctest::timeout(300)) { for(size_t i=0; i count {0}; - tf::Taskflow f; - auto A = f.emplace([&](){ count ++; }); - auto B = f.emplace([&](tf::Subflow& subflow){ - count ++; - auto B1 = subflow.emplace([&](){ count++; }); - auto B2 = subflow.emplace([&](){ count++; }); - auto B3 = subflow.emplace([&](){ count++; }); - B1.precede(B3); B2.precede(B3); - }); - auto C = f.emplace([&](){ count ++; }); - auto D = f.emplace([&](){ count ++; }); - - A.precede(B, C); - B.precede(D); - C.precede(D); - - std::list> fu_list; - for(size_t i=0; i<500; i++) { - if(i == 499) { - executor.run(f).get(); // Synchronize the first 500 runs - executor.run_n(f, 500); // Run 500 times more - } - else if(i % 2) { - fu_list.push_back(executor.run(f)); - } - else { - fu_list.push_back(executor.run(f, [&, i=i](){ - REQUIRE(count == (i+1)*7); }) - ); - } - } - - executor.wait_for_all(); - - for(auto& fu: fu_list) { - REQUIRE(fu.valid()); - REQUIRE(fu.wait_for(std::chrono::seconds(1)) == std::future_status::ready); - } - - REQUIRE(count == 7000); - - } - - SUBCASE("RunWithChange") { - std::atomic count {0}; - tf::Taskflow f; - auto A = f.emplace([&](){ count ++; }); - auto B = f.emplace([&](tf::Subflow& subflow){ - count ++; - auto B1 = subflow.emplace([&](){ count++; }); - auto B2 = subflow.emplace([&](){ count++; }); - auto B3 = subflow.emplace([&](){ count++; }); - B1.precede(B3); B2.precede(B3); - }); - auto C = f.emplace([&](){ count ++; }); - auto D = f.emplace([&](){ count ++; }); - - A.precede(B, C); - B.precede(D); - C.precede(D); - - executor.run_n(f, 10).get(); - REQUIRE(count == 70); - - auto E = f.emplace([](){}); - D.precede(E); - executor.run_n(f, 10).get(); - REQUIRE(count == 140); - - auto F = f.emplace([](){}); - E.precede(F); - executor.run_n(f, 10); - executor.wait_for_all(); - REQUIRE(count == 210); - - } - - SUBCASE("RunWithPred") { - std::atomic count {0}; - tf::Taskflow f; - auto A = f.emplace([&](){ count ++; }); - auto B = f.emplace([&](tf::Subflow& subflow){ - count ++; - auto B1 = subflow.emplace([&](){ count++; }); - auto B2 
= subflow.emplace([&](){ count++; }); - auto B3 = subflow.emplace([&](){ count++; }); - B1.precede(B3); B2.precede(B3); - }); - auto C = f.emplace([&](){ count ++; }); - auto D = f.emplace([&](){ count ++; }); - - A.precede(B, C); - B.precede(D); - C.precede(D); - - executor.run_until(f, [run=10]() mutable { return run-- == 0; }, - [&](){ - REQUIRE(count == 70); - count = 0; - } - ).get(); - - - executor.run_until(f, [run=10]() mutable { return run-- == 0; }, - [&](){ - REQUIRE(count == 70); - count = 0; - }); - - executor.run_until(f, [run=10]() mutable { return run-- == 0; }, - [&](){ - REQUIRE(count == 70); - count = 0; - } - ).get(); - - } - SUBCASE("MultipleRuns") { std::atomic count(0); diff --git a/unittests/test_cancellation.cpp b/unittests/test_cancellation.cpp index 08534e6d1..e9fa5f7ce 100644 --- a/unittests/test_cancellation.cpp +++ b/unittests/test_cancellation.cpp @@ -152,12 +152,12 @@ TEST_CASE("CancelSubflow" * doctest::timeout(300)) { counter.fetch_add(1, std::memory_order_relaxed); }); } + + // test explicit join if(i % 2) { sf.join(); } - else { - sf.detach(); - } + // else test implicit join }); } @@ -285,13 +285,15 @@ TEST_CASE("CancelComposition") { auto f3_module_task = f4.composed_of(f3).name("module_of_f3"); auto f2_module_task = f4.composed_of(f2).name("module_of_f2"); f3_module_task.precede(f2_module_task); + + std::vector> futures; for(int r=0; r<100; r++) { size_t N = 100; size_t success = 0; - std::vector> futures; + futures.clear(); for(int i=0; i<100; i++) { futures.emplace_back(executor.run(f4)); diff --git a/unittests/test_compositions.cpp b/unittests/test_compositions.cpp deleted file mode 100644 index 4fd621b5b..000000000 --- a/unittests/test_compositions.cpp +++ /dev/null @@ -1,220 +0,0 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN - -#include -#include - -// -------------------------------------------------------- -// Testcase: Composition -// -------------------------------------------------------- -TEST_CASE("Composition-1" * doctest::timeout(300)) { - - for(unsigned w=1; w<=8; ++w) { - - tf::Executor executor(w); - - tf::Taskflow f0; - - int cnt {0}; - - auto A = f0.emplace([&cnt](){ ++cnt; }); - auto B = f0.emplace([&cnt](){ ++cnt; }); - auto C = f0.emplace([&cnt](){ ++cnt; }); - auto D = f0.emplace([&cnt](){ ++cnt; }); - auto E = f0.emplace([&cnt](){ ++cnt; }); - - A.precede(B); - B.precede(C); - C.precede(D); - D.precede(E); - - tf::Taskflow f1; - - // module 1 - std::tie(A, B, C, D, E) = f1.emplace( - [&cnt] () { ++cnt; }, - [&cnt] () { ++cnt; }, - [&cnt] () { ++cnt; }, - [&cnt] () { ++cnt; }, - [&cnt] () { ++cnt; } - ); - A.precede(B); - B.precede(C); - C.precede(D); - D.precede(E); - auto m1_1 = f1.composed_of(f0); - E.precede(m1_1); - - executor.run(f1).get(); - REQUIRE(cnt == 10); - - cnt = 0; - executor.run_n(f1, 100).get(); - REQUIRE(cnt == 10 * 100); - - auto m1_2 = f1.composed_of(f0); - m1_1.precede(m1_2); - - for(int n=0; n<100; n++) { - cnt = 0; - executor.run_n(f1, n).get(); - REQUIRE(cnt == 15*n); - } - - cnt = 0; - for(int n=0; n<100; n++) { - executor.run(f1); - } - - executor.wait_for_all(); - - REQUIRE(cnt == 1500); - } -} - -// TESTCASE: composition-2 -TEST_CASE("Composition-2" * doctest::timeout(300)) { - - for(unsigned w=1; w<=8; ++w) { - - tf::Executor executor(w); - - int cnt {0}; - - // level 0 (+5) - tf::Taskflow f0; - - auto A = f0.emplace([&cnt](){ ++cnt; }).name("f0A"); - auto B = f0.emplace([&cnt](){ ++cnt; }).name("f0B"); - auto C = f0.emplace([&cnt](){ ++cnt; }).name("f0C"); - auto D = f0.emplace([&cnt](){ ++cnt; 
}).name("f0D"); - auto E = f0.emplace([&cnt](){ ++cnt; }).name("f0E"); - - A.precede(B); - B.precede(C); - C.precede(D); - D.precede(E); - - // level 1 (+10) - tf::Taskflow f1; - auto m1_1 = f1.composed_of(f0).name("m1_1"); - auto m1_2 = f1.composed_of(f0).name("m1_2"); - m1_1.precede(m1_2); - - // level 2 (+20) - tf::Taskflow f2; - auto m2_1 = f2.composed_of(f1).name("m2_1"); - auto m2_2 = f2.composed_of(f1).name("m2_2"); - m2_1.precede(m2_2); - - //f2.dump(std::cout); - - // synchronous run - for(int n=0; n<100; n++) { - cnt = 0; - executor.run_n(f2, n).get(); - REQUIRE(cnt == 20*n); - } - - // asynchronous run - cnt = 0; - for(int n=0; n<100; n++) { - executor.run(f2); - } - executor.wait_for_all(); - REQUIRE(cnt == 100*20); - } -} - -// TESTCASE: composition-3 -TEST_CASE("Composition-3" * doctest::timeout(300)) { - - for(unsigned w=1; w<=8; ++w) { - - tf::Executor executor(w); - - int cnt {0}; - - // level 0 (+2) - tf::Taskflow f0; - - auto A = f0.emplace([&cnt](){ ++cnt; }); - auto B = f0.emplace([&cnt](){ ++cnt; }); - - A.precede(B); - - // level 1 (+4) - tf::Taskflow f1; - auto m1_1 = f1.composed_of(f0); - auto m1_2 = f1.composed_of(f0); - m1_1.precede(m1_2); - - // level 2 (+8) - tf::Taskflow f2; - auto m2_1 = f2.composed_of(f1); - auto m2_2 = f2.composed_of(f1); - m2_1.precede(m2_2); - - // level 3 (+16) - tf::Taskflow f3; - auto m3_1 = f3.composed_of(f2); - auto m3_2 = f3.composed_of(f2); - m3_1.precede(m3_2); - - // synchronous run - for(int n=0; n<100; n++) { - cnt = 0; - executor.run_n(f3, n).get(); - REQUIRE(cnt == 16*n); - } - - // asynchronous run - cnt = 0; - for(int n=0; n<100; n++) { - executor.run(f3); - } - executor.wait_for_all(); - REQUIRE(cnt == 16*100); - } -} - -// ---------------------------------------------------------------------------- -// ParallelCompositions -// ---------------------------------------------------------------------------- -TEST_CASE("ParallelCompositions") { - - std::vector taskflows(100); - - tf::Executor executor(4); - tf::Taskflow taskflow; - - std::atomic counter{0}; - - for(auto& tf : taskflows) { - for(size_t n=0; n<100; n++) { - auto [A, B, C, D, E, F, G, H] = tf.emplace( - [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, - [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, - [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, - [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, - [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, - [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, - [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, - [&](){ counter.fetch_add(1, std::memory_order_relaxed); } - ); - A.precede(B); - A.precede(C); - D.precede(E); - D.precede(F); - } - taskflow.composed_of(tf); - } - - executor.run(taskflow).wait(); - - REQUIRE(counter == 80000); -} - - - - - diff --git a/unittests/test_control_flow.cpp b/unittests/test_control_flow.cpp index ff5e48559..24fd6bc53 100644 --- a/unittests/test_control_flow.cpp +++ b/unittests/test_control_flow.cpp @@ -7,7 +7,7 @@ // Testcase: Conditional Tasking // -------------------------------------------------------- -TEST_CASE("Cond.Types") { +TEST_CASE("Cond.Types" * doctest::timeout(300)) { tf::Taskflow taskflow; @@ -80,13 +80,13 @@ void loop_cond(unsigned w) { A.precede(B); B.precede(B, C); - REQUIRE(A.num_strong_dependents() == 0); - REQUIRE(A.num_weak_dependents() == 0); - REQUIRE(A.num_dependents() == 0); + REQUIRE(A.num_strong_dependencies() == 0); + REQUIRE(A.num_weak_dependencies() == 0); + REQUIRE(A.num_predecessors() 
== 0); - REQUIRE(B.num_strong_dependents() == 1); - REQUIRE(B.num_weak_dependents() == 1); - REQUIRE(B.num_dependents() == 2); + REQUIRE(B.num_strong_dependencies() == 1); + REQUIRE(B.num_weak_dependencies() == 1); + REQUIRE(B.num_predecessors() == 2); executor.run(taskflow).wait(); REQUIRE(counter == 0); @@ -667,7 +667,6 @@ void condition_subflow(unsigned W) { REQUIRE(i #include #include +#include + +// ---------------------------------------------------------------------------- +// null dependent-async task +// ---------------------------------------------------------------------------- + +TEST_CASE("DependentAsync.NullDependency") { + + tf::Executor executor; + tf::AsyncTask dummy; + int v1, v2, v3; + auto t1 = executor.silent_dependent_async([&](){ v1 = 100; }, dummy); + auto t2 = executor.silent_dependent_async([&](){ v2 = 200; }, dummy); + auto [t3, fu3] = executor.dependent_async([&](){ v3 = v1 + v2; }, t1, t2); + fu3.wait(); + REQUIRE(v1 == 100); + REQUIRE(v2 == 200); + REQUIRE(v3 == v1 + v2); +} // ---------------------------------------------------------------------------- // embarrassing parallelism @@ -278,19 +297,19 @@ void simple_graph_2(unsigned W) { results.resize(count); auto t0 = executor.silent_dependent_async( - "t0", [&](){ + [&](){ results[0].data = 100 + id; } ); auto t1 = executor.silent_dependent_async( - "t1", [&](){ + [&](){ results[1].data = 6 * id; } ); auto t2 = executor.silent_dependent_async( - "t2", [&](){ + [&](){ results[2].data = results[0].data + results[1].data + id; }, t0, t1 ); @@ -298,27 +317,27 @@ void simple_graph_2(unsigned W) { tasks1.push_back(t2); auto [t3, fu3] = executor.dependent_async( - "t3", [&](){ + [&](){ results[3].data = results[2].data + id; return results[3].data; }, tasks1.begin(), tasks1.end() ); auto t4 = executor.silent_dependent_async( - "t4", [&](){ + [&](){ results[4].data = results[2].data + id; }, tasks1.begin(), tasks1.end() ); auto [t5, fu5] = executor.dependent_async( - "t5", [&](){ + [&](){ results[5].data = results[2].data + id; return results[5].data; }, tasks1.begin(), tasks1.end() ); auto t6 = executor.silent_dependent_async( - "t6", [&](){ + [&](){ results[6].data = results[2].data + id; }, tasks1.begin(), tasks1.end() ); @@ -329,14 +348,14 @@ void simple_graph_2(unsigned W) { tasks3.push_back(t6); auto [t7, fu7] = executor.dependent_async( - "t7", [&](){ + [&](){ results[7].data = results[3].data + results[4].data + id; return results[7].data; }, tasks2.begin(), tasks2.end() ); auto t8 = executor.silent_dependent_async( - "t8", [&](){ + [&](){ results[8].data = results[5].data + results[6].data + id; }, tasks3.begin(), tasks3.end() ); @@ -348,7 +367,7 @@ void simple_graph_2(unsigned W) { tasks4.push_back(t8); auto [t9, fu9] = executor.dependent_async( - "t9", [&](){ + [&](){ results[9].data = results[0].data + results[1].data + results[2].data + results[7].data + results[8].data + id; return results[9].data; @@ -478,7 +497,7 @@ auto make_complex_graph(tf::Executor& executor, int r) { // define task 0 auto task0 = executor.silent_dependent_async( - "0", [&results, r](){ + [&results, r](){ results[0].data = 100 + r; } ); @@ -532,7 +551,7 @@ auto make_complex_graph(tf::Executor& executor, int r) { // define task 10201 executor.dependent_async( - "10201", [&results, r](){ + [&results, r](){ int value = 0; for (int i = 10101; i <= 10200; ++i) { value += results[i].data; @@ -646,7 +665,7 @@ void binary_tree(unsigned W) { tf::Executor executor(W); - std::vector data(1< data((size_t{1}< tasks_p, tasks_c; std::array dep; 
@@ -654,7 +673,7 @@ void binary_tree(unsigned W) { // iterate all other tasks level by level for(size_t i=0; i(1<(1<> results(2*N); std::vector tasks; @@ -895,14 +914,14 @@ TEST_CASE("DependentAsync.ParallelGraphConstruction.16threads" * doctest::timeou // ---------------------------------------------------------------------------- // Iterative Fibonacci // ---------------------------------------------------------------------------- -std::vector fibonacci{0,1,1,2,3,5,8,13,21,34,55,89,144,233,377,610,987,1597,2584,4181,6765,10946,17711,28657,46368,75025,121393,196418,317811,514229,832040,1346269,2178309,3524578,5702887,9227465,14930352,24157817,39088169,63245986,102334155,165580141,267914296,433494437,701408733,1134903170,1836311903,2971215073,4807526976,7778742049,12586269025,20365011074,32951280099,53316291173,86267571272,139583862445,225851433717,365435296162,591286729879,956722026041,1548008755920,2504730781961,4052739537881,6557470319842,10610209857723,17167680177565,27777890035288,44945570212853,72723460248141,117669030460994,190392490709135,308061521170129,498454011879264,806515533049393,1304969544928657,2111485077978050,3416454622906707,5527939700884757,8944394323791464,14472334024676221,23416728348467685,37889062373143906,61305790721611591,99194853094755497,160500643816367088,259695496911122585,420196140727489673,679891637638612258,1100087778366101931,1779979416004714189,2880067194370816120,4660046610375530309,7540113804746346429}; +std::vector fibonacci{0,1,1,2,3,5,8,13,21,34,55,89,144,233,377,610,987,1597,2584,4181,6765,10946,17711,28657,46368,75025,121393,196418,317811,514229,832040,1346269,2178309,3524578,5702887,9227465,14930352,24157817,39088169,63245986,102334155,165580141,267914296,433494437,701408733,1134903170,1836311903,2971215073,4807526976,7778742049,12586269025,20365011074,32951280099,53316291173,86267571272,139583862445,225851433717,365435296162,591286729879,956722026041,1548008755920,2504730781961,4052739537881,6557470319842,10610209857723,17167680177565,27777890035288,44945570212853,72723460248141,117669030460994,190392490709135,308061521170129,498454011879264,806515533049393,1304969544928657,2111485077978050,3416454622906707,5527939700884757,8944394323791464,14472334024676221,23416728348467685,37889062373143906,61305790721611591,99194853094755497,160500643816367088,259695496911122585,420196140727489673,679891637638612258,1100087778366101931,1779979416004714189,2880067194370816120,4660046610375530309,7540113804746346429}; void iterative_fibonacci(unsigned W) { tf::Executor executor(W); std::vector tasks; - size_t val_n_1 = 0, val_n_2 = 0; + unsigned long long int val_n_1 = 0, val_n_2 = 0; for (int i = 0; i <= 92; ++i) { if (i < 2) { @@ -946,10 +965,10 @@ TEST_CASE("DependentAsync.IterativeFibonacci.8threads" * doctest::timeout(300)) void recursive_fibonacci(unsigned W) { tf::Executor executor(W); - + std::function fib; - fib = [&](int N){ + fib = [&](int N) -> int { if (N < 2) { return N; @@ -958,16 +977,16 @@ void recursive_fibonacci(unsigned W) { std::future fu1, fu2; tf::AsyncTask t1, t2; - std::tie(t1, fu1) = executor.dependent_async(std::bind(fib, N-1)); - std::tie(t2, fu2) = executor.dependent_async(std::bind(fib, N-2)); + std::tie(t1, fu1) = executor.dependent_async([=, &fib](){ return fib(N-1); }); + std::tie(t2, fu2) = executor.dependent_async([=, &fib](){ return fib(N-2); }); executor.corun_until([&](){ return t1.is_done() && t2.is_done(); }); return fu1.get() + fu2.get(); }; - for (size_t i = 0; i <= 11; ++i) { - auto [tn, fun] = 
executor.dependent_async(std::bind(fib, i)); + for (int i = 0; i <= 11; ++i) { + auto [tn, fun] = executor.dependent_async([=, &fib]() { return fib(i); }); REQUIRE(fun.get() == fibonacci[i]); } } @@ -989,10 +1008,10 @@ TEST_CASE("DependentAsync.RecursiveFibonacci.8threads" * doctest::timeout(300)) } // ---------------------------------------------------------------------------- -// Mixed algorithms +// Mixed Algorithm with Dependent Async // ---------------------------------------------------------------------------- -void mixed_algorithms(unsigned W) { +void mixed_algorithms_with_dependent_async(unsigned W) { size_t N = 65536; @@ -1002,70 +1021,73 @@ void mixed_algorithms(unsigned W) { std::vector data(N), data1(N), data2(N), data3(N), data4(N); // initialize data to 10 - tf::AsyncTask A = executor.silent_dependent_async(tf::make_for_each_task( + auto [A, fuA] = executor.dependent_async(tf::make_for_each_task( data.begin(), data.begin() + N/2, [](int& d){ d = 10; } )); - tf::AsyncTask B = executor.silent_dependent_async(tf::make_for_each_index_task( + auto [B, fuB] = executor.dependent_async(tf::make_for_each_index_task( N/2, N, size_t{1}, [&] (size_t i) { data[i] = 10; } )); // data1[i] = [11, 11, 11, ...] - tf::AsyncTask T1 = executor.silent_dependent_async(tf::make_transform_task( + auto [T1, fuT1] = executor.dependent_async(tf::make_transform_task( data.begin(), data.end(), data1.begin(), [](int& d) { return d+1; } ), A, B); // data2[i] = [12, 12, 12, ...] - tf::AsyncTask T2 = executor.silent_dependent_async(tf::make_transform_task( + auto [T2, fuT2] = executor.dependent_async(tf::make_transform_task( data.begin(), data.end(), data2.begin(), [](int& d) { return d+2; } ), A, B); // data3[i] = [13, 13, 13, ...] - tf::AsyncTask T3 = executor.silent_dependent_async(tf::make_transform_task( + auto [T3, fuT3] = executor.dependent_async(tf::make_transform_task( data.begin(), data.end(), data3.begin(), [](int& d) { return d+3; } ), A, B); // data4[i] = [1, 1, 1, ...] - tf::AsyncTask T4 = executor.silent_dependent_async(tf::make_transform_task( + auto [T4, fuT4] = executor.dependent_async(tf::make_transform_task( data1.begin(), data1.end(), data2.begin(), data4.begin(), [](int a, int b){ return b - a; } ), T1, T2); // sum1 = 1 + [-1-1-1-1...] - tf::AsyncTask T5 = executor.silent_dependent_async(tf::make_transform_reduce_task( + auto [T5, fuT5] = executor.dependent_async(tf::make_transform_reduce_task( data4.begin(), data4.end(), sum1, std::plus{}, [](int d){ return -d; } ), T4); - tf::AsyncTask T6 = executor.silent_dependent_async(tf::make_transform_reduce_task( + auto [T6, fuT6] = executor.dependent_async(tf::make_transform_reduce_task( data4.begin(), data4.end(), data3.begin(), sum2, std::plus{}, std::plus{} ), T3, T4); // inclusive scan over data1 [11, 22, 33, 44, ...] - tf::AsyncTask T7 = executor.silent_dependent_async(tf::make_inclusive_scan_task( - data1.begin(), data1.end(), data1.begin(), std::plus{} - ), T5, T6); + tf::Taskflow G7; + G7.inclusive_scan(data1.begin(), data1.end(), data1.begin(), std::plus{}); + auto [T7, fuT7] = executor.dependent_async(tf::make_module_task(G7), T5, T6); // exclusive scan over data2 [-1, 11, 23, 35, ...] 
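// [Aside] The hunk above moves the scan algorithms from standalone
// silent_dependent_async tasks to module tasks: each scan is composed into
// its own tf::Taskflow and launched through tf::make_module_task. A minimal,
// hedged sketch of that pattern (sizes and values are illustrative, and the
// composed graph must outlive the async task that runs it):
//
//   tf::Executor executor;
//   std::vector<int> data(8, 1);
//
//   tf::Taskflow scan_graph;
//   scan_graph.inclusive_scan(
//     data.begin(), data.end(), data.begin(), std::plus<int>{}
//   );
//
//   // launch the composed graph as a single dependent-async task
//   auto [task, fut] = executor.dependent_async(tf::make_module_task(scan_graph));
//   fut.wait();  // data == {1, 2, 3, 4, 5, 6, 7, 8}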
- tf::AsyncTask T8 = executor.silent_dependent_async(tf::make_exclusive_scan_task( - data2.begin(), data2.end(), data2.begin(), -1, std::plus{} - ), T5, T6); + tf::Taskflow G8; + G8.exclusive_scan(data2.begin(), data2.end(), data2.begin(), -1, std::plus{}); + auto [T8, fuT8] = executor.dependent_async(tf::make_module_task(G8), T5, T6); // transform inclusive scan over data3 [-13, -26, -39, ...] - tf::AsyncTask T9 = executor.silent_dependent_async(tf::make_transform_inclusive_scan_task( - data3.begin(), data3.end(), data3.begin(), std::plus{}, - [](int i){ return -i; } - ), T5, T6); + tf::Taskflow G9; + G9.transform_inclusive_scan( + data3.begin(), data3.end(), data3.begin(), std::plus{}, [](int i) {return -i;} + ); + auto [T9, fuT9] = executor.dependent_async(tf::make_module_task(G9), T5, T6); // transform exclusive scan over data4 [7, 6, 5, 4, ...] - tf::AsyncTask T10 = executor.silent_dependent_async(tf::make_transform_exclusive_scan_task( + tf::Taskflow G10; + G10.transform_exclusive_scan( data4.begin(), data4.end(), data4.begin(), 7, std::plus{}, [](int i){ return -i; } - ), T5, T6); + ); + auto [T10, fuT10] = executor.dependent_async(tf::make_module_task(G10), T5, T6); // sort data4 - tf::AsyncTask T11 = executor.silent_dependent_async(tf::make_sort_task( - data4.begin(), data4.end() - ), T10); + auto [T11, fuT11] = executor.dependent_async( + tf::make_sort_task(data4.begin(), data4.end()), T10 + ); executor.wait_for_all(); @@ -1078,43 +1100,169 @@ void mixed_algorithms(unsigned W) { REQUIRE(data2[i] == i*12 - 1); REQUIRE(data3[i] == (i+1)*-13); REQUIRE(data4[N-i-1] == 7-i); - //printf( - // "data 0|1|2|3|4 [%2zu]=%5d|%5d|%5d|%5d|%5d\n", - // i, data[i], data1[i], data2[i], data3[i], data4[i] - //); } } TEST_CASE("DependentAsync.MixedAlgorithms.1thread" * doctest::timeout(300)) { - mixed_algorithms(1); + mixed_algorithms_with_dependent_async(1); } TEST_CASE("DependentAsync.MixedAlgorithms.2threads" * doctest::timeout(300)) { - mixed_algorithms(2); + mixed_algorithms_with_dependent_async(2); } TEST_CASE("DependentAsync.MixedAlgorithms.3threads" * doctest::timeout(300)) { - mixed_algorithms(3); + mixed_algorithms_with_dependent_async(3); } TEST_CASE("DependentAsync.MixedAlgorithms.4threads" * doctest::timeout(300)) { - mixed_algorithms(4); + mixed_algorithms_with_dependent_async(4); } TEST_CASE("DependentAsync.MixedAlgorithms.5threads" * doctest::timeout(300)) { - mixed_algorithms(5); + mixed_algorithms_with_dependent_async(5); } TEST_CASE("DependentAsync.MixedAlgorithms.6threads" * doctest::timeout(300)) { - mixed_algorithms(6); + mixed_algorithms_with_dependent_async(6); } TEST_CASE("DependentAsync.MixedAlgorithms.7threads" * doctest::timeout(300)) { - mixed_algorithms(7); + mixed_algorithms_with_dependent_async(7); } TEST_CASE("DependentAsync.MixedAlgorithms.8threads" * doctest::timeout(300)) { - mixed_algorithms(8); + mixed_algorithms_with_dependent_async(8); } +// ---------------------------------------------------------------------------- +// Mixed Algorithm with Silent Dependent Async +// ---------------------------------------------------------------------------- + +void mixed_algorithms_with_silent_dependent_async(unsigned W) { + + size_t N = 65536; + + tf::Executor executor(W); + + int sum1{1}, sum2{1}; + std::vector data(N), data1(N), data2(N), data3(N), data4(N); + + // initialize data to 10 + auto A = executor.silent_dependent_async(tf::make_for_each_task( + data.begin(), data.begin() + N/2, [](int& d){ d = 10; } + )); + + auto B = 
executor.silent_dependent_async(tf::make_for_each_index_task( + N/2, N, size_t{1}, [&] (size_t i) { data[i] = 10; } + )); + + // data1[i] = [11, 11, 11, ...] + auto T1 = executor.silent_dependent_async(tf::make_transform_task( + data.begin(), data.end(), data1.begin(), [](int& d) { return d+1; } + ), A, B); + + // data2[i] = [12, 12, 12, ...] + auto T2 = executor.silent_dependent_async(tf::make_transform_task( + data.begin(), data.end(), data2.begin(), [](int& d) { return d+2; } + ), A, B); + + // data3[i] = [13, 13, 13, ...] + auto T3 = executor.silent_dependent_async(tf::make_transform_task( + data.begin(), data.end(), data3.begin(), [](int& d) { return d+3; } + ), A, B); + + // data4[i] = [1, 1, 1, ...] + auto T4 = executor.silent_dependent_async(tf::make_transform_task( + data1.begin(), data1.end(), data2.begin(), data4.begin(), + [](int a, int b){ return b - a; } + ), T1, T2); + + // sum1 = 1 + [-1-1-1-1...] + auto T5 = executor.silent_dependent_async(tf::make_transform_reduce_task( + data4.begin(), data4.end(), sum1, std::plus{}, [](int d){ return -d; } + ), T4); + + auto T6 = executor.silent_dependent_async(tf::make_transform_reduce_task( + data4.begin(), data4.end(), data3.begin(), sum2, std::plus{}, std::plus{} + ), T3, T4); + + // inclusive scan over data1 [11, 22, 33, 44, ...] + tf::Taskflow G7; + G7.inclusive_scan(data1.begin(), data1.end(), data1.begin(), std::plus{}); + auto T7 = executor.silent_dependent_async(tf::make_module_task(G7), T5, T6); + + // exclusive scan over data2 [-1, 11, 23, 35, ...] + tf::Taskflow G8; + G8.exclusive_scan(data2.begin(), data2.end(), data2.begin(), -1, std::plus{}); + auto T8 = executor.silent_dependent_async(tf::make_module_task(G8), T5, T6); + + // transform inclusive scan over data3 [-13, -26, -39, ...] + tf::Taskflow G9; + G9.transform_inclusive_scan( + data3.begin(), data3.end(), data3.begin(), std::plus{}, [](int i) {return -i;} + ); + auto T9 = executor.silent_dependent_async(tf::make_module_task(G9), T5, T6); + + // transform exclusive scan over data4 [7, 6, 5, 4, ...] 
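// [Aside] This function mirrors mixed_algorithms_with_dependent_async above,
// exercising the silent API instead. A hedged mini-sketch of the difference
// between the two launch styles (variable names are illustrative):
//
//   tf::Executor executor;
//   int x = 0;
//
//   // dependent_async returns a (task, future) pair; the future surfaces
//   // the result or any exception thrown by the callable
//   auto [tA, fuA] = executor.dependent_async([&](){ x = 1; });
//
//   // silent_dependent_async returns only the task handle; completion is
//   // observed through successor tasks or executor.wait_for_all()
//   tf::AsyncTask tB = executor.silent_dependent_async([&](){ x = 2; }, tA);
//
//   executor.wait_for_all();  // x == 2 here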
+ tf::Taskflow G10; + G10.transform_exclusive_scan( + data4.begin(), data4.end(), data4.begin(), 7, std::plus{}, + [](int i){ return -i; } + ); + auto T10 = executor.silent_dependent_async(tf::make_module_task(G10), T5, T6); + + // sort data4 + auto T11 = executor.silent_dependent_async( + tf::make_sort_task(data4.begin(), data4.end()), T10 + ); + + executor.wait_for_all(); + + REQUIRE(sum1 == 1-N); + REQUIRE(sum2 == 1+N*14); + + for(size_t i=0; i // -------------------------------------------------------- -// Testcase: static_task_exception +// Testcase: static_task // -------------------------------------------------------- -void static_task_exception(unsigned W) { +void static_task(unsigned W) { tf::Taskflow taskflow; tf::Executor executor(W); @@ -47,27 +47,27 @@ void static_task_exception(unsigned W) { } } -TEST_CASE("Exception.StaticTask.1thread") { - static_task_exception(1); +TEST_CASE("Exception.StaticTask.1thread" * doctest::timeout(300)) { + static_task(1); } -TEST_CASE("Exception.StaticTask.2threads") { - static_task_exception(2); +TEST_CASE("Exception.StaticTask.2threads" * doctest::timeout(300)) { + static_task(2); } -TEST_CASE("Exception.StaticTask.3threads") { - static_task_exception(3); +TEST_CASE("Exception.StaticTask.3threads" * doctest::timeout(300)) { + static_task(3); } -TEST_CASE("Exception.StaticTask.4threads") { - static_task_exception(4); +TEST_CASE("Exception.StaticTask.4threads" * doctest::timeout(300)) { + static_task(4); } // -------------------------------------------------------- -// Testcase: condition_task_exception +// Testcase: condition_task // -------------------------------------------------------- -void condition_task_exception(unsigned W) { +void condition_task(unsigned W) { tf::Taskflow taskflow; tf::Executor executor(W); @@ -125,27 +125,27 @@ void condition_task_exception(unsigned W) { } } -TEST_CASE("Exception.ConditionTask.1thread") { - condition_task_exception(1); +TEST_CASE("Exception.ConditionTask.1thread" * doctest::timeout(300)) { + condition_task(1); } -TEST_CASE("Exception.ConditionTask.2threads") { - condition_task_exception(2); +TEST_CASE("Exception.ConditionTask.2threads" * doctest::timeout(300)) { + condition_task(2); } -TEST_CASE("Exception.ConditionTask.3threads") { - condition_task_exception(3); +TEST_CASE("Exception.ConditionTask.3threads" * doctest::timeout(300)) { + condition_task(3); } -TEST_CASE("Exception.ConditionTask.4threads") { - condition_task_exception(4); +TEST_CASE("Exception.ConditionTask.4threads" * doctest::timeout(300)) { + condition_task(4); } // -------------------------------------------------------- -// Testcase: multicondition_task_exception +// Testcase: multicondition_task // -------------------------------------------------------- -void multicondition_task_exception(unsigned W) { +void multicondition_task(unsigned W) { tf::Taskflow taskflow; tf::Executor executor(W); @@ -209,27 +209,27 @@ void multicondition_task_exception(unsigned W) { } } -TEST_CASE("Exception.MultiConditionTask.1thread") { - multicondition_task_exception(1); +TEST_CASE("Exception.MultiConditionTask.1thread" * doctest::timeout(300)) { + multicondition_task(1); } -TEST_CASE("Exception.MultiConditionTask.2threads") { - multicondition_task_exception(2); +TEST_CASE("Exception.MultiConditionTask.2threads" * doctest::timeout(300)) { + multicondition_task(2); } -TEST_CASE("Exception.MultiConditionTask.3threads") { - multicondition_task_exception(3); +TEST_CASE("Exception.MultiConditionTask.3threads" * doctest::timeout(300)) { + 
multicondition_task(3); } -TEST_CASE("Exception.MultiConditionTask.4threads") { - multicondition_task_exception(4); +TEST_CASE("Exception.MultiConditionTask.4threads" * doctest::timeout(300)) { + multicondition_task(4); } // ---------------------------------------------------------------------------- // Subflow Task // ---------------------------------------------------------------------------- -void subflow_task_exception(unsigned W) { +void subflow_task(unsigned W) { tf::Taskflow taskflow; tf::Executor executor(W); @@ -256,255 +256,451 @@ void subflow_task_exception(unsigned W) { REQUIRE_THROWS_WITH_AS(executor.run(taskflow).get(), "y", std::runtime_error); } -TEST_CASE("Exception.SubflowTask.1thread") { - subflow_task_exception(1); +TEST_CASE("Exception.SubflowTask.1thread" * doctest::timeout(300)) { + subflow_task(1); } -TEST_CASE("Exception.SubflowTask.2threads") { - subflow_task_exception(2); +TEST_CASE("Exception.SubflowTask.2threads" * doctest::timeout(300)) { + subflow_task(2); } -TEST_CASE("Exception.SubflowTask.3threads") { - subflow_task_exception(3); +TEST_CASE("Exception.SubflowTask.3threads" * doctest::timeout(300)) { + subflow_task(3); } -TEST_CASE("Exception.SubflowTask.4threads") { - subflow_task_exception(4); +TEST_CASE("Exception.SubflowTask.4threads" * doctest::timeout(300)) { + subflow_task(4); } // ---------------------------------------------------------------------------- -// Exception.AsyncTask +// Joined Subflow // ---------------------------------------------------------------------------- -void async_task_exception(unsigned W) { +void joined_subflow_1(unsigned W) { - // executor async tf::Executor executor(W); + tf::Taskflow taskflow; - auto fu1 = executor.async([](){ - return 1; - }); - REQUIRE(fu1.get() == 1); - - auto fu2 = executor.async([](){ - throw std::runtime_error("x"); - }); - REQUIRE_THROWS_WITH_AS(fu2.get(), "x", std::runtime_error); + taskflow.emplace([&] (tf::Subflow& sf0) { + for (int i = 0; i < 100; ++i) { + sf0.emplace([&] (tf::Subflow& sf1) { + + for (int j = 0; j < 2; ++j) { + sf1.emplace([] () { + throw std::runtime_error("x"); + }).name(std::string("sf1-child-") + std::to_string(j)); + } + + sf1.join(); + // [NOTE]: We cannot guarantee post_join won't run since + // the exception also triggers cancellation which in turns + // bypasses the two tasks inside sf1. In this case, sf1.join + // will succeed and set post_join to true. 
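// [Aside] A minimal sketch of the behavior the note above describes, using a
// single-level subflow (it mirrors joined_subflow_2 below; the "boom" message
// is illustrative): an exception thrown inside a joined subflow cancels the
// remaining subflow tasks and rethrows from join(), or escapes to run().get()
// when not caught.
//
//   tf::Executor executor;
//   tf::Taskflow taskflow;
//   taskflow.emplace([](tf::Subflow& sf){
//     sf.emplace([](){ throw std::runtime_error("boom"); });
//     try {
//       sf.join();  // rethrows "boom" here
//     }
//     catch(const std::runtime_error& e) {
//       REQUIRE(std::strcmp(e.what(), "boom") == 0);
//     }
//   });
//   executor.run(taskflow).wait();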
+ + //post_join = true; + }).name(std::string("sf1-") + std::to_string(i)); + } + }).name("sf0"); - // exception is caught without any action - executor.silent_async([](){ - throw std::runtime_error("y"); - }); + REQUIRE_THROWS_WITH_AS(executor.run(taskflow).get(), "x", std::runtime_error); + //REQUIRE(post_join == false); - executor.wait_for_all(); } -TEST_CASE("Exception.AsyncTask.1thread") { - async_task_exception(1); +TEST_CASE("Exception.JoinedSubflow1.1thread" * doctest::timeout(300)) { + joined_subflow_1(1); } -TEST_CASE("Exception.AsyncTask.2threads") { - async_task_exception(2); +TEST_CASE("Exception.JoinedSubflow1.2threads" * doctest::timeout(300)) { + joined_subflow_1(2); } -TEST_CASE("Exception.AsyncTask.3threads") { - async_task_exception(3); +TEST_CASE("Exception.JoinedSubflow1.3threads" * doctest::timeout(300)) { + joined_subflow_1(3); } -TEST_CASE("Exception.AsyncTask.4threads") { - async_task_exception(4); +TEST_CASE("Exception.JoinedSubflow1.4threads" * doctest::timeout(300)) { + joined_subflow_1(4); } // ---------------------------------------------------------------------------- -// Runtime Async Task +// Joined Subflow 2 // ---------------------------------------------------------------------------- -void runtime_async_task_exception(unsigned W) { +void joined_subflow_2(unsigned W) { - // executor async tf::Executor executor(W); tf::Taskflow taskflow; - int flag = 0; - // runtime async - auto A = taskflow.emplace([](tf::Runtime& rt){ - auto fu1 = rt.async([](){ return 1; }); - REQUIRE(fu1.get() == 1); - auto fu2 = rt.async([](){ throw std::runtime_error("z"); }); - REQUIRE_THROWS_WITH_AS(fu2.get(), "z", std::runtime_error); - }); - auto B = taskflow.emplace([&](){ - flag = 1; + std::atomic post_join {false}; + + taskflow.emplace([&](tf::Subflow& sf0){ + for (int j = 0; j < 16; ++j) { + sf0.emplace([] () { + throw std::runtime_error("x"); + }); + } + try { + sf0.join(); + post_join = true; + } catch(const std::runtime_error& re) { + REQUIRE(std::strcmp(re.what(), "x") == 0); + } }); executor.run(taskflow).wait(); - REQUIRE(flag == 1); + REQUIRE(post_join == false); +} - // runtime silent async - flag = 0; - taskflow.clear(); - A = taskflow.emplace([&](tf::Runtime& rt){ - rt.silent_async([&](){ throw std::runtime_error("a"); }); - REQUIRE_THROWS_WITH_AS(rt.corun_all(), "a", std::runtime_error); - flag = 1; - }); - B = taskflow.emplace([&](){ - flag = 2; - }); - A.precede(B); - executor.run(taskflow).get(); - REQUIRE(flag == 2); - - // runtime silent async - flag = 0; - taskflow.clear(); - A = taskflow.emplace([&](tf::Runtime& rt){ - rt.silent_async([&](){ throw std::runtime_error("a"); }); - rt.corun_all(); - flag = 1; - }); - B = taskflow.emplace([&](){ - flag = 2; - }); - A.precede(B); - REQUIRE_THROWS_WITH_AS(executor.run(taskflow).get(), "a", std::runtime_error); - REQUIRE(flag == 0); +TEST_CASE("Exception.JoinedSubflow2.1thread" * doctest::timeout(300)) { + joined_subflow_2(1); } -TEST_CASE("Exception.RuntimeAsyncTask.2threads") { - runtime_async_task_exception(2); +TEST_CASE("Exception.JoinedSubflow2.2threads" * doctest::timeout(300)) { + joined_subflow_2(2); } -TEST_CASE("Exception.RuntimeAsyncTask.3threads") { - runtime_async_task_exception(3); +TEST_CASE("Exception.JoinedSubflow2.3threads" * doctest::timeout(300)) { + joined_subflow_2(3); } -TEST_CASE("Exception.RuntimeAsyncTask.4threads") { - runtime_async_task_exception(4); +TEST_CASE("Exception.JoinedSubflow2.4threads" * doctest::timeout(300)) { + joined_subflow_2(4); } // 
---------------------------------------------------------------------------- -// Exception.ThreadSafety +// Joined Subflow Exception 3 // ---------------------------------------------------------------------------- -void thread_safety(unsigned W) { +void joined_subflow_3(unsigned N) { - tf::Executor executor(W); + tf::Executor executor(N); tf::Taskflow taskflow; - for(int i=0; i<1000; i++) { - taskflow.emplace([&](){ throw std::runtime_error("x"); }); - } - - // thread sanitizer should not report any data race + size_t num_tasks = 0; + + // implicit join + taskflow.emplace([&](tf::Subflow& sf) { + tf::Task W = sf.emplace([&](){ ++num_tasks; }); + tf::Task X = sf.emplace([&](){ ++num_tasks; throw std::runtime_error("x"); }); + tf::Task Y = sf.emplace([&](){ ++num_tasks; }); + tf::Task Z = sf.emplace([&](){ ++num_tasks; }); + W.precede(X); + X.precede(Y); + Y.precede(Z); + }); + REQUIRE_THROWS_WITH_AS(executor.run(taskflow).get(), "x", std::runtime_error); + REQUIRE(num_tasks == 2); + + // explicit join + num_tasks = 0; + taskflow.clear(); + taskflow.emplace([&](tf::Subflow& sf) { + tf::Task W = sf.emplace([&](){ ++num_tasks; }); + tf::Task X = sf.emplace([&](){ ++num_tasks; throw std::runtime_error("y"); }); + tf::Task Y = sf.emplace([&](){ ++num_tasks; }); + tf::Task Z = sf.emplace([&](){ ++num_tasks; }); + W.precede(X); + X.precede(Y); + Y.precede(Z); + sf.join(); + }); + + REQUIRE_THROWS_WITH_AS(executor.run(taskflow).get(), "y", std::runtime_error); + REQUIRE(num_tasks == 2); } -TEST_CASE("Exception.ThreadSafety.1thread") { - thread_safety(1); +TEST_CASE("Exception.JoinedSubflow3.1thread" * doctest::timeout(300)) { + joined_subflow_3(1); } -TEST_CASE("Exception.ThreadSafety.2threads") { - thread_safety(2); +TEST_CASE("Exception.JoinedSubflow3.2threads" * doctest::timeout(300)) { + joined_subflow_3(2); } -TEST_CASE("Exception.ThreadSafety.3threads") { - thread_safety(3); +TEST_CASE("Exception.JoinedSubflow3.3threads" * doctest::timeout(300)) { + joined_subflow_3(3); } -TEST_CASE("Exception.ThreadSafety.4threads") { - thread_safety(4); +TEST_CASE("Exception.JoinedSubflow3.4threads" * doctest::timeout(300)) { + joined_subflow_3(4); } // ---------------------------------------------------------------------------- -// Subflow exception +// Nested Subflow // ---------------------------------------------------------------------------- -void joined_subflow_exception_1(unsigned W) { +void nested_subflow(unsigned N) { - tf::Executor executor(W); + tf::Executor executor(N); tf::Taskflow taskflow; - std::atomic post_join {false}; - - taskflow.emplace([&] (tf::Subflow& sf0) { - for (int i = 0; i < 16; ++i) { - sf0.emplace([&] (tf::Subflow& sf1) { - for (int j = 0; j < 16; ++j) { - sf1.emplace([] () { + size_t num_tasks = 0; + + // level 1 + taskflow.emplace([&](tf::Subflow& sf1) { + tf::Task V1 = sf1.emplace([&num_tasks](){ ++num_tasks; }).name("V1"); + tf::Task W1 = sf1.emplace([&num_tasks](){ ++num_tasks; }).name("W1"); + + // level 2 + tf::Task X1 = sf1.emplace([&num_tasks](tf::Subflow& sf2){ + ++num_tasks; + + tf::Task V2 = sf2.emplace([&num_tasks](){ ++num_tasks; }).name("V2"); + tf::Task W2 = sf2.emplace([&num_tasks](){ ++num_tasks; }).name("W2"); + + // level 3 + tf::Task X2 = sf2.emplace([&num_tasks](tf::Subflow& sf3) { + ++num_tasks; + + tf::Task V3 = sf3.emplace([&num_tasks](){ ++num_tasks; }).name("V3"); + tf::Task W3 = sf3.emplace([&num_tasks](){ ++num_tasks; }).name("W3"); + + // level 4 + tf::Task X3 = sf3.emplace([&num_tasks](tf::Subflow& sf4){ + ++num_tasks; + + tf::Task V4 = 
sf4.emplace([&num_tasks](){ ++num_tasks; }).name("V4"); + tf::Task W4 = sf4.emplace([&num_tasks](){ ++num_tasks; }).name("W4"); + tf::Task X4 = sf4.emplace([&num_tasks](){ + ++num_tasks; throw std::runtime_error("x"); - }); - } - sf1.join(); - post_join = true; - }); - } + }).name("X4 (throw)"); + tf::Task Y4 = sf4.emplace([&num_tasks](){ ++num_tasks; }).name("Y4"); + tf::Task Z4 = sf4.emplace([&num_tasks](){ ++num_tasks; }).name("Z4"); + + V4.precede(W4); + W4.precede(X4); + X4.precede(Y4); + Y4.precede(Z4); + }).name("sf-4"); + + tf::Task Y3 = sf3.emplace([&num_tasks](){ ++num_tasks; }).name("Y3"); + tf::Task Z3 = sf3.emplace([&num_tasks](){ ++num_tasks; }).name("Z3"); + + V3.precede(W3); + W3.precede(X3); + X3.precede(Y3); + Y3.precede(Z3); + }).name("sf3"); + + tf::Task Y2 = sf2.emplace([&num_tasks](){ ++num_tasks; }).name("Y2"); + tf::Task Z2 = sf2.emplace([&num_tasks](){ ++num_tasks; }).name("Z2"); + + V2.precede(W2); + W2.precede(X2); + X2.precede(Y2); + Y2.precede(Z2); + }).name("sf-2"); + + tf::Task Y1 = sf1.emplace([&num_tasks](){ ++num_tasks; }).name("Y1"); + tf::Task Z1 = sf1.emplace([&num_tasks](){ ++num_tasks; }).name("Z1"); + + V1.precede(W1); + W1.precede(X1); + X1.precede(Y1); + Y1.precede(Z1); + }).name("sf-1"); + + REQUIRE_THROWS_WITH_AS(executor.run_n(taskflow, 10).get(), "x", std::runtime_error); + REQUIRE(num_tasks == 12); + + //taskflow.dump(std::cout); + + // corun the nested subflow from an async task + num_tasks = 0; + executor.async([&](){ + REQUIRE_THROWS_WITH_AS(executor.corun(taskflow), "x", std::runtime_error); + }).get(); + REQUIRE(num_tasks == 12); + + // corun the nested subflow from an silent async task + num_tasks = 0; + executor.silent_async([&](){ + REQUIRE_THROWS_WITH_AS(executor.corun(taskflow), "x", std::runtime_error); }); + executor.wait_for_all(); + REQUIRE(num_tasks == 12); - REQUIRE_THROWS_WITH_AS(executor.run(taskflow).get(), "x", std::runtime_error); - REQUIRE(post_join == false); + // corun the nested subflow from an async task's runtime + num_tasks = 0; + executor.async([&](tf::Runtime& rt){ + REQUIRE_THROWS_WITH_AS(rt.corun(taskflow), "x", std::runtime_error); + }).get(); + REQUIRE(num_tasks == 12); + + // corun the nested subflow from an silent-async task's runtime + num_tasks = 0; + executor.silent_async([&](tf::Runtime& rt){ + REQUIRE_THROWS_WITH_AS(rt.corun(taskflow), "x", std::runtime_error); + }); + executor.wait_for_all(); + REQUIRE(num_tasks == 12); + } -TEST_CASE("Exception.JoinedSubflow1.1thread") { - joined_subflow_exception_1(1); +TEST_CASE("Exception.NestedSubflow.1thread" * doctest::timeout(300)) { + nested_subflow(1); } -TEST_CASE("Exception.JoinedSubflow1.2threads") { - joined_subflow_exception_1(2); +TEST_CASE("Exception.NestedSubflow.2threads" * doctest::timeout(300)) { + nested_subflow(2); } -TEST_CASE("Exception.JoinedSubflow1.3threads") { - joined_subflow_exception_1(3); +TEST_CASE("Exception.NestedSubflow.3threads" * doctest::timeout(300)) { + nested_subflow(3); } -TEST_CASE("Exception.JoinedSubflow1.4threads") { - joined_subflow_exception_1(4); +TEST_CASE("Exception.NestedSubflow.4threads" * doctest::timeout(300)) { + nested_subflow(4); } -void joined_subflow_exception_2(unsigned W) { +// ---------------------------------------------------------------------------- +// Nested Subflow 2 +// ---------------------------------------------------------------------------- - tf::Executor executor(W); +void nested_subflow_2(unsigned N) { + + tf::Executor executor(N); tf::Taskflow taskflow; - std::atomic post_join {false}; + 
size_t num_tasks = 0; + + // level 1 + taskflow.emplace([&](tf::Subflow& sf1) { + tf::Task V1 = sf1.emplace([&num_tasks](){ ++num_tasks; }).name("V1"); + tf::Task W1 = sf1.emplace([&num_tasks](){ ++num_tasks; }).name("W1"); + + // level 2 + tf::Task X1 = sf1.emplace([&num_tasks](tf::Subflow& sf2){ + ++num_tasks; + + tf::Task V2 = sf2.emplace([&num_tasks](){ ++num_tasks; }).name("V2"); + tf::Task W2 = sf2.emplace([&num_tasks](){ ++num_tasks; }).name("W2"); + + // level 3 + tf::Task X2 = sf2.emplace([&num_tasks](tf::Subflow& sf3) { + ++num_tasks; + + tf::Task V3 = sf3.emplace([&num_tasks](){ ++num_tasks; }).name("V3"); + tf::Task W3 = sf3.emplace([&num_tasks](){ ++num_tasks; }).name("W3"); + + // level 4 + tf::Task X3 = sf3.emplace([&num_tasks](tf::Subflow& sf4){ + ++num_tasks; + + tf::Task V4 = sf4.emplace([&num_tasks](){ ++num_tasks; }).name("V4"); + tf::Task W4 = sf4.emplace([&num_tasks](){ ++num_tasks; }).name("W4"); + tf::Task X4 = sf4.emplace([&num_tasks](){ + ++num_tasks; + throw std::runtime_error("x"); + }).name("X4 (throw)"); + tf::Task Y4 = sf4.emplace([&num_tasks](){ ++num_tasks; }).name("Y4"); + tf::Task Z4 = sf4.emplace([&num_tasks](){ ++num_tasks; }).name("Z4"); - taskflow.emplace([&](tf::Subflow& sf0){ - for (int j = 0; j < 16; ++j) { - sf0.emplace([] () { - throw std::runtime_error("x"); - }); - } - try { - sf0.join(); - post_join = true; - } catch(const std::runtime_error& re) { - REQUIRE(std::strcmp(re.what(), "x") == 0); - } + V4.precede(W4); + W4.precede(X4); + X4.precede(Y4); + Y4.precede(Z4); + + sf4.join(); + + }).name("sf-4"); + + tf::Task Y3 = sf3.emplace([&num_tasks](){ ++num_tasks; }).name("Y3"); + tf::Task Z3 = sf3.emplace([&num_tasks](){ ++num_tasks; }).name("Z3"); + + V3.precede(W3); + W3.precede(X3); + X3.precede(Y3); + Y3.precede(Z3); + + sf3.join(); + + }).name("sf3"); + + tf::Task Y2 = sf2.emplace([&num_tasks](){ ++num_tasks; }).name("Y2"); + tf::Task Z2 = sf2.emplace([&num_tasks](){ ++num_tasks; }).name("Z2"); + + V2.precede(W2); + W2.precede(X2); + X2.precede(Y2); + Y2.precede(Z2); + + sf2.join(); + + }).name("sf-2"); + + tf::Task Y1 = sf1.emplace([&num_tasks](){ ++num_tasks; }).name("Y1"); + tf::Task Z1 = sf1.emplace([&num_tasks](){ ++num_tasks; }).name("Z1"); + + V1.precede(W1); + W1.precede(X1); + X1.precede(Y1); + Y1.precede(Z1); + + sf1.join(); + + }).name("sf-1"); + + REQUIRE_THROWS_WITH_AS(executor.run_n(taskflow, 10).get(), "x", std::runtime_error); + REQUIRE(num_tasks == 12); + + //taskflow.dump(std::cout); + + // corun the nested subflow from an async task + num_tasks = 0; + executor.async([&](){ + REQUIRE_THROWS_WITH_AS(executor.corun(taskflow), "x", std::runtime_error); + }).get(); + REQUIRE(num_tasks == 12); + + // corun the nested subflow from an silent async task + num_tasks = 0; + executor.silent_async([&](){ + REQUIRE_THROWS_WITH_AS(executor.corun(taskflow), "x", std::runtime_error); }); - executor.run(taskflow).wait(); - REQUIRE(post_join == false); + executor.wait_for_all(); + REQUIRE(num_tasks == 12); + + // corun the nested subflow from an async task's runtime + num_tasks = 0; + executor.async([&](tf::Runtime& rt){ + REQUIRE_THROWS_WITH_AS(rt.corun(taskflow), "x", std::runtime_error); + }).get(); + REQUIRE(num_tasks == 12); + + // corun the nested subflow from an silent-async task's runtime + num_tasks = 0; + executor.silent_async([&](tf::Runtime& rt){ + REQUIRE_THROWS_WITH_AS(rt.corun(taskflow), "x", std::runtime_error); + }); + executor.wait_for_all(); + REQUIRE(num_tasks == 12); + } -TEST_CASE("Exception.JoinedSubflow2.1thread") { 
- joined_subflow_exception_2(1); +TEST_CASE("Exception.NestedSubflow2.1thread" * doctest::timeout(300)) { + nested_subflow_2(1); } -TEST_CASE("Exception.JoinedSubflow2.2threads") { - joined_subflow_exception_2(2); +TEST_CASE("Exception.NestedSubflow2.2threads" * doctest::timeout(300)) { + nested_subflow_2(2); } -TEST_CASE("Exception.JoinedSubflow2.3threads") { - joined_subflow_exception_2(3); +TEST_CASE("Exception.NestedSubflow2.3threads" * doctest::timeout(300)) { + nested_subflow_2(3); } -TEST_CASE("Exception.JoinedSubflow2.4threads") { - joined_subflow_exception_2(4); +TEST_CASE("Exception.NestedSubflow2.4threads" * doctest::timeout(300)) { + nested_subflow_2(4); } // ---------------------------------------------------------------------------- -// corun +// Executor Corun Exception 1 // ---------------------------------------------------------------------------- -void executor_corun_exception(unsigned W) { +void executor_corun_1(unsigned W) { tf::Executor executor(W); tf::Taskflow taskflow1; @@ -513,46 +709,111 @@ void executor_corun_exception(unsigned W) { taskflow1.emplace([](){ throw std::runtime_error("x"); }); + taskflow2.emplace([&](){ REQUIRE_THROWS_WITH_AS(executor.corun(taskflow1), "x", std::runtime_error); }); + executor.run(taskflow2).get(); - taskflow1.clear(); + taskflow2.clear(); + for(size_t i=0; i<100; i++) { taskflow1.emplace([](tf::Subflow& sf){ for(size_t j=0; j<100; j++) { sf.emplace([&](){ - throw std::runtime_error("x"); + throw std::runtime_error("y"); }); } }); } + + taskflow2.emplace([&](){ + REQUIRE_THROWS_WITH_AS(executor.corun(taskflow1), "y", std::runtime_error); + }); + executor.run(taskflow2).get(); } -TEST_CASE("Exception.ExecutorCorun.1thread") { - executor_corun_exception(1); +TEST_CASE("Exception.ExecutorCorun1.1thread" * doctest::timeout(300)) { + executor_corun_1(1); +} + +TEST_CASE("Exception.ExecutorCorun1.2threads" * doctest::timeout(300)) { + executor_corun_1(2); +} + +TEST_CASE("Exception.ExecutorCorun1.3threads" * doctest::timeout(300)) { + executor_corun_1(3); +} + +TEST_CASE("Exception.ExecutorCorun1.4threads" * doctest::timeout(300)) { + executor_corun_1(4); +} + +// ---------------------------------------------------------------------------- +// Executor Corun Exception 2 +// ---------------------------------------------------------------------------- + +void executor_corun_2(unsigned W) { + + tf::Taskflow taskflow; + tf::Executor executor(W); + + size_t counter = 0; + + auto A = taskflow.emplace([&](){ counter++; }); + auto B = taskflow.emplace([&](){ counter++; }); + auto C = taskflow.emplace([&](){ throw std::runtime_error("x"); }); + auto D = taskflow.emplace([&](){ counter++; }); + auto E = taskflow.emplace([&](){ counter++; }); + auto F = taskflow.emplace([&](){ counter++; }); + + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + E.precede(F); + + // uncaught corun exception propagates to the topology + tf::Taskflow taskflow2; + taskflow2.emplace([&](){ + executor.corun(taskflow); + }); + REQUIRE_THROWS_WITH_AS(executor.run(taskflow2).get(), "x", std::runtime_error); + REQUIRE(counter == 2); + + // catch corun exception directly + tf::Taskflow taskflow3; + taskflow3.emplace([&](){ + REQUIRE_THROWS_WITH_AS(executor.corun(taskflow), "x", std::runtime_error); + }); + executor.run(taskflow3).get(); + REQUIRE(counter == 4); +} + +TEST_CASE("Exception.ExecutorCorun2.1thread" * doctest::timeout(300)) { + executor_corun_2(1); } -TEST_CASE("Exception.ExecutorCorun.2threads") { - executor_corun_exception(2); 
+TEST_CASE("Exception.ExecutorCorun2.2threads" * doctest::timeout(300)) { + executor_corun_2(2); } -TEST_CASE("Exception.ExecutorCorun.3threads") { - executor_corun_exception(3); +TEST_CASE("Exception.ExecutorCorun2.3threads" * doctest::timeout(300)) { + executor_corun_2(3); } -TEST_CASE("Exception.ExecutorCorun.4threads") { - executor_corun_exception(4); +TEST_CASE("Exception.ExecutorCorun2.4threads" * doctest::timeout(300)) { + executor_corun_2(4); } // ---------------------------------------------------------------------------- -// runtime_corun_exception +// runtime_corun // ---------------------------------------------------------------------------- -void runtime_corun_exception(unsigned W) { +void runtime_corun_1(unsigned W) { tf::Executor executor(W); tf::Taskflow taskflow1; @@ -586,27 +847,85 @@ void runtime_corun_exception(unsigned W) { REQUIRE_THROWS_WITH_AS(executor.run(taskflow2).get(), "x", std::runtime_error); } -TEST_CASE("Exception.RuntimeCorun.1thread") { - runtime_corun_exception(1); +TEST_CASE("Exception.RuntimeCorun1.1thread" * doctest::timeout(300)) { + runtime_corun_1(1); } -TEST_CASE("Exception.RuntimeCorun.2threads") { - runtime_corun_exception(2); +TEST_CASE("Exception.RuntimeCorun1.2threads" * doctest::timeout(300)) { + runtime_corun_1(2); } -TEST_CASE("Exception.RuntimeCorun.3threads") { - runtime_corun_exception(3); +TEST_CASE("Exception.RuntimeCorun1.3threads" * doctest::timeout(300)) { + runtime_corun_1(3); } -TEST_CASE("Exception.RuntimeCorun.4threads") { - runtime_corun_exception(4); +TEST_CASE("Exception.RuntimeCorun1.4threads" * doctest::timeout(300)) { + runtime_corun_1(4); } // ---------------------------------------------------------------------------- -// module_task_exception +// Runtime Corun Exception 2 // ---------------------------------------------------------------------------- -void module_task_exception(unsigned W) { +void runtime_corun_2(unsigned W) { + + tf::Taskflow taskflow; + tf::Executor executor(W); + + size_t counter = 0; + + auto A = taskflow.emplace([&](){ counter++; }); + auto B = taskflow.emplace([&](){ counter++; }); + auto C = taskflow.emplace([&](){ throw std::runtime_error("x"); }); + auto D = taskflow.emplace([&](){ counter++; }); + auto E = taskflow.emplace([&](){ counter++; }); + auto F = taskflow.emplace([&](){ counter++; }); + + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + E.precede(F); + + // uncaught corun exception propagates to the topology + tf::Taskflow taskflow2; + taskflow2.emplace([&](tf::Runtime& rt){ + rt.corun(taskflow); + }); + REQUIRE_THROWS_WITH_AS(executor.run(taskflow2).get(), "x", std::runtime_error); + REQUIRE(counter == 2); + + // catch corun exception directly + tf::Taskflow taskflow3; + taskflow3.emplace([&](tf::Runtime& rt){ + REQUIRE_THROWS_WITH_AS(rt.corun(taskflow), "x", std::runtime_error); + }); + executor.run(taskflow3).get(); + REQUIRE(counter == 4); +} + +TEST_CASE("Exception.RuntimeCorun2.1thread" * doctest::timeout(300)) { + runtime_corun_2(1); +} + +TEST_CASE("Exception.RuntimeCorun2.2threads" * doctest::timeout(300)) { + runtime_corun_2(2); +} + +TEST_CASE("Exception.RuntimeCorun2.3threads" * doctest::timeout(300)) { + runtime_corun_2(3); +} + +TEST_CASE("Exception.RuntimeCorun2.4threads" * doctest::timeout(300)) { + runtime_corun_2(4); +} + + +// ---------------------------------------------------------------------------- +// module_task +// ---------------------------------------------------------------------------- + +void module_task(unsigned W) { tf::Executor 
executor(W);
 tf::Taskflow taskflow1;
@@ -618,28 +937,316 @@ void module_task_exception(unsigned W) {
 taskflow2.composed_of(taskflow1);
 REQUIRE_THROWS_WITH_AS(executor.run(taskflow2).get(), "x", std::runtime_error);
- taskflow1.clear();
- taskflow1.emplace([](tf::Subflow& sf){
- sf.emplace([](){
- throw std::runtime_error("y");
+ //taskflow1.clear();
+ //taskflow1.emplace([](tf::Subflow& sf){
+ // sf.emplace([](){
+ // throw std::runtime_error("y");
+ // });
+ //});
+ //REQUIRE_THROWS_WITH_AS(executor.run(taskflow2).get(), "y", std::runtime_error);
+}
+
+TEST_CASE("Exception.ModuleTask.1thread" * doctest::timeout(300)) {
+ module_task(1);
+}
+
+TEST_CASE("Exception.ModuleTask.2threads" * doctest::timeout(300)) {
+ module_task(2);
+}
+
+TEST_CASE("Exception.ModuleTask.3threads" * doctest::timeout(300)) {
+ module_task(3);
+}
+
+TEST_CASE("Exception.ModuleTask.4threads" * doctest::timeout(300)) {
+ module_task(4);
+}
+
+// ----------------------------------------------------------------------------
+// Exception.Async
+// ----------------------------------------------------------------------------
+
+void async_task(unsigned W) {
+
+ // executor async
+ tf::Executor executor(W);
+
+ auto fu1 = executor.async([](){
+ return 1;
+ });
+ REQUIRE(fu1.get() == 1);
+
+ auto fu2 = executor.async([](){
+ throw std::runtime_error("x");
+ });
+ REQUIRE_THROWS_WITH_AS(fu2.get(), "x", std::runtime_error);
+
+ // exception is caught without any action
+ executor.silent_async([](){
+ throw std::runtime_error("y");
+ });
+
+ executor.wait_for_all();
+}
+
+TEST_CASE("Exception.Async.1thread" * doctest::timeout(300)) {
+ async_task(1);
+}
+
+TEST_CASE("Exception.Async.2threads" * doctest::timeout(300)) {
+ async_task(2);
+}
+
+TEST_CASE("Exception.Async.3threads" * doctest::timeout(300)) {
+ async_task(3);
+}
+
+TEST_CASE("Exception.Async.4threads" * doctest::timeout(300)) {
+ async_task(4);
+}
+
+// ----------------------------------------------------------------------------
+// Async Task with Runtime
+// ----------------------------------------------------------------------------
+
+void async_with_runtime(unsigned W) {
+
+ tf::Executor executor(W);
+ std::vector<tf::Future<void>> futures;
+
+ for(size_t i=0; i<1024; i++) {
+ futures.emplace_back(executor.async([](tf::Runtime&){
+ throw std::runtime_error("x");
+ }));
+ }
+
+ for(auto& fu : futures) {
+ REQUIRE_THROWS_WITH_AS(fu.get(), "x", std::runtime_error);
+ }
+
+ // silently caught by the task
+ executor.silent_async([](tf::Runtime&){
+ throw std::runtime_error("x");
+ });
+
+ executor.wait_for_all();
+}
+
+TEST_CASE("Exception.Async.Runtime.1thread" * doctest::timeout(300)) {
+ async_with_runtime(1);
+}
+
+TEST_CASE("Exception.Async.Runtime.2threads" * doctest::timeout(300)) {
+ async_with_runtime(2);
+}
+
+TEST_CASE("Exception.Async.Runtime.3threads" * doctest::timeout(300)) {
+ async_with_runtime(3);
+}
+
+TEST_CASE("Exception.Async.Runtime.4threads" * doctest::timeout(300)) {
+ async_with_runtime(4);
+}
+
+// ----------------------------------------------------------------------------
+// Dependent Async Task with Runtime
+// ----------------------------------------------------------------------------
+
+void dependent_async_with_runtime(unsigned W) {
+
+ tf::Executor executor(W);
+ std::vector<tf::Future<void>> futures;
+
+ for(size_t i=0; i<1024; i++) {
+ auto [t, f] = executor.dependent_async([](tf::Runtime&){
+ throw std::runtime_error("x");
 });
+ futures.emplace_back(std::move(f));
+ }
+
+ for(auto& fu : futures) {
+ REQUIRE_THROWS_WITH_AS(fu.get(), "x", std::runtime_error);
+ }
+
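// [Aside] A tf::Future returned from async or dependent_async is the only
// channel through which the task's exception resurfaces, so the loop above
// must keep each future before calling get(). A minimal sketch of the
// keep-and-check pattern (assuming tf::Future<void>):
//
//   tf::Executor executor;
//   std::vector<tf::Future<void>> futs;
//   for(int i=0; i<4; ++i) {
//     futs.push_back(executor.async([](){ throw std::runtime_error("x"); }));
//   }
//   for(auto& fu : futs) {
//     REQUIRE_THROWS_WITH_AS(fu.get(), "x", std::runtime_error);
//   }
+ // silently caught by the task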
+ executor.silent_dependent_async([](tf::Runtime&){ + throw std::runtime_error("x"); }); - REQUIRE_THROWS_WITH_AS(executor.run(taskflow2).get(), "y", std::runtime_error); + + executor.wait_for_all(); +} + +TEST_CASE("Exception.DependentAsync.Runtime.1thread" * doctest::timeout(300)) { + dependent_async_with_runtime(1); +} + +TEST_CASE("Exception.DependentAsync.Runtime.2threads" * doctest::timeout(300)) { + dependent_async_with_runtime(2); +} + +TEST_CASE("Exception.DependentAsync.Runtime.3threads" * doctest::timeout(300)) { + dependent_async_with_runtime(3); +} + +TEST_CASE("Exception.DependentAsync.Runtime.4threads" * doctest::timeout(300)) { + dependent_async_with_runtime(4); +} + +/* +// ---------------------------------------------------------------------------- +// Runtime Async Task +// ---------------------------------------------------------------------------- + +void runtime_async_task(unsigned W) { + + // executor async + tf::Executor executor(W); + tf::Taskflow taskflow; + int flag = 0; + + // runtime async + auto A = taskflow.emplace([](tf::Runtime& rt){ + auto fu1 = rt.async([](){ return 1; }); + REQUIRE(fu1.get() == 1); + auto fu2 = rt.async([](){ throw std::runtime_error("z"); }); + REQUIRE_THROWS_WITH_AS(fu2.get(), "z", std::runtime_error); + }); + auto B = taskflow.emplace([&](){ + flag = 1; + }); + executor.run(taskflow).wait(); + REQUIRE(flag == 1); + + // runtime silent async + flag = 0; + taskflow.clear(); + A = taskflow.emplace([&](tf::Runtime& rt){ + rt.silent_async([&](){ throw std::runtime_error("a"); }); + REQUIRE_THROWS_WITH_AS(rt.corun(), "a", std::runtime_error); + flag = 1; + }); + B = taskflow.emplace([&](){ + flag = 2; + }); + A.precede(B); + executor.run(taskflow).get(); + REQUIRE(flag == 2); + + // runtime silent async + flag = 0; + taskflow.clear(); + A = taskflow.emplace([&](tf::Runtime& rt){ + rt.silent_async([&](){ throw std::runtime_error("a"); }); + std::this_thread::sleep_for(std::chrono::seconds(1)); + rt.corun(); + flag = 1; // can't guarantee since rt.silent_async can finish + // before corun finishes + }); + B = taskflow.emplace([&](){ + flag = 2; + }); + A.precede(B); + REQUIRE_THROWS_WITH_AS(executor.run(taskflow).get(), "a", std::runtime_error); + REQUIRE(flag == 0); +} + +TEST_CASE("Exception.RuntimeAsync.2threads" * doctest::timeout(300)) { + runtime_async_task(2); +} + +TEST_CASE("Exception.RuntimeAsync.3threads" * doctest::timeout(300)) { + runtime_async_task(3); +} + +TEST_CASE("Exception.RuntimeAsync.4threads" * doctest::timeout(300)) { + runtime_async_task(4); +} +*/ + +// ---------------------------------------------------------------------------- +// Exception.ThreadSafety +// ---------------------------------------------------------------------------- + +void thread_safety(unsigned W) { + + tf::Executor executor(W); + tf::Taskflow taskflow; + + for(int i=0; i<1000; i++) { + taskflow.emplace([&](){ throw std::runtime_error("x"); }); + } + + // thread sanitizer should not report any data race + REQUIRE_THROWS_WITH_AS(executor.run(taskflow).get(), "x", std::runtime_error); +} + +TEST_CASE("Exception.ThreadSafety.1thread" * doctest::timeout(300)) { + thread_safety(1); +} + +TEST_CASE("Exception.ThreadSafety.2threads" * doctest::timeout(300)) { + thread_safety(2); +} + +TEST_CASE("Exception.ThreadSafety.3threads" * doctest::timeout(300)) { + thread_safety(3); +} + +TEST_CASE("Exception.ThreadSafety.4threads" * doctest::timeout(300)) { + thread_safety(4); +} + + +// 
----------------------------------------------------------------------------
+// Semaphores
+// ----------------------------------------------------------------------------
+
+void semaphore1(unsigned W) {
+
+ tf::Executor executor(W);
+ tf::Taskflow taskflow;
+ tf::Semaphore semaphore(1);
+
+ tf::Task A = taskflow.emplace([](){});
+ tf::Task B = taskflow.emplace([](){ throw std::runtime_error("exception"); });
+ tf::Task C = taskflow.emplace([](){});
+ tf::Task D = taskflow.emplace([](){});
+
+ A.precede(B);
+ B.precede(C);
+ C.precede(D);
+
+ A.acquire(semaphore);
+ D.release(semaphore);
+
+ REQUIRE(semaphore.value() == 1);
+
+ // when B throws the exception, D will not run and thus the semaphore is not released
+ REQUIRE_THROWS_WITH_AS(executor.run(taskflow).get(), "exception", std::runtime_error);
+
+ REQUIRE(semaphore.value() == 0);
+
+ // reset the semaphore to a clean state before running the taskflow again
+ semaphore.reset();
+
+ REQUIRE(semaphore.value() == 1);
+
+ // run it again
+ REQUIRE_THROWS_WITH_AS(executor.run(taskflow).get(), "exception", std::runtime_error);
 }
-TEST_CASE("Exception.ModuleTask.1thread") {
- module_task_exception(1);
+TEST_CASE("Exception.Semaphore.1thread" * doctest::timeout(300)) {
+ semaphore1(1);
 }
-TEST_CASE("Exception.ModuleTask.2threads") {
- module_task_exception(2);
+TEST_CASE("Exception.Semaphore.2threads" * doctest::timeout(300)) {
+ semaphore1(2);
 }
-TEST_CASE("Exception.ModuleTask.3threads") {
- module_task_exception(3);
+TEST_CASE("Exception.Semaphore.3threads" * doctest::timeout(300)) {
+ semaphore1(3);
 }
-TEST_CASE("Exception.ModuleTask.4threads") {
- module_task_exception(4);
+TEST_CASE("Exception.Semaphore.4threads" * doctest::timeout(300)) {
+ semaphore1(4);
 }
diff --git a/unittests/test_find.cpp b/unittests/test_find.cpp
index dd275dc36..e29317936 100644
--- a/unittests/test_find.cpp
+++ b/unittests/test_find.cpp
@@ -873,5 +873,141 @@ TEST_CASE("ClosureWrapper.max_element.Dynamic" * doctest::timeout(300)) {
 }
 }
+// ----------------------------------------------------------------------------
+// silent async
+// ----------------------------------------------------------------------------
+
+void silent_async(unsigned W) {
+
+ tf::Executor executor(W);
+ std::vector<int> input;
+
+ for(size_t n = 0; n <= 65536; n <= 256 ? n++ : n=2*n+1) {
+
+ input.resize(n);
+
+ for(auto& i : input) {
+ i = ::rand() % (2 * n) + 1;
+ }
+
+ auto P1 = [] (int i) { return i == 5; };
+ auto P2 = [] (int i) { return i == 0; };
+
+ auto res1 = std::find_if(input.begin(), input.end(), P1);
+ auto res2 = std::find_if(input.begin(), input.end(), P2);
+
+ REQUIRE(res2 == input.end());
+
+ std::vector<int>::iterator itr1, itr2;
+
+ executor.silent_async(tf::make_find_if_task(input.begin(), input.end(), itr1, P1));
+ executor.silent_async(tf::make_find_if_task(input.begin(), input.end(), itr2, P2));
+
+ executor.wait_for_all();
+
+ REQUIRE(itr1 == res1);
+ REQUIRE(itr2 == res2);
+ }
+}
+
+TEST_CASE("FindIf.SilentAsync.1thread" * doctest::timeout(300)) {
+ silent_async(1);
+}
+
+TEST_CASE("FindIf.SilentAsync.2threads" * doctest::timeout(300)) {
+ silent_async(2);
+}
+
+TEST_CASE("FindIf.SilentAsync.3threads" * doctest::timeout(300)) {
+ silent_async(3);
+}
+
+TEST_CASE("FindIf.SilentAsync.4threads" * doctest::timeout(300)) {
+ silent_async(4);
+}
+
+TEST_CASE("FindIf.SilentAsync.5threads" * doctest::timeout(300)) {
+ silent_async(5);
+}
+
+TEST_CASE("FindIf.SilentAsync.6threads" * doctest::timeout(300)) {
+ silent_async(6);
+}
+
+TEST_CASE("FindIf.SilentAsync.7threads" * doctest::timeout(300)) {
+ silent_async(7);
+}
+
+TEST_CASE("FindIf.SilentAsync.8threads" * doctest::timeout(300)) {
+ silent_async(8);
+}
+// ----------------------------------------------------------------------------
+// silent dependent async
+// ----------------------------------------------------------------------------
+
+void silent_dependent_async(unsigned W) {
+
+ tf::Executor executor(W);
+ std::vector<int> input;
+
+ for(size_t n = 0; n <= 65536; n <= 256 ? n++ : n=2*n+1) {
+
+ input.resize(n);
+
+ for(auto& i : input) {
+ i = ::rand() % (2 * n) + 1;
+ }
+
+ auto P1 = [] (int i) { return i == 5; };
+ auto P2 = [] (int i) { return i == 0; };
+
+ auto res1 = std::find_if(input.begin(), input.end(), P1);
+ auto res2 = std::find_if(input.begin(), input.end(), P2);
+
+ REQUIRE(res2 == input.end());
+
+ std::vector<int>::iterator itr1, itr2;
+
+ executor.silent_dependent_async(tf::make_find_if_task(input.begin(), input.end(), itr1, P1));
+ executor.silent_dependent_async(tf::make_find_if_task(input.begin(), input.end(), itr2, P2));
+
+ executor.wait_for_all();
+
+ REQUIRE(itr1 == res1);
+ REQUIRE(itr2 == res2);
+ }
+}
+
+TEST_CASE("FindIf.SilentDependentAsync.1thread" * doctest::timeout(300)) {
+ silent_dependent_async(1);
+}
+
+TEST_CASE("FindIf.SilentDependentAsync.2threads" * doctest::timeout(300)) {
+ silent_dependent_async(2);
+}
+
+TEST_CASE("FindIf.SilentDependentAsync.3threads" * doctest::timeout(300)) {
+ silent_dependent_async(3);
+}
+
+TEST_CASE("FindIf.SilentDependentAsync.4threads" * doctest::timeout(300)) {
+ silent_dependent_async(4);
+}
+
+TEST_CASE("FindIf.SilentDependentAsync.5threads" * doctest::timeout(300)) {
+ silent_dependent_async(5);
+}
+
+TEST_CASE("FindIf.SilentDependentAsync.6threads" * doctest::timeout(300)) {
+ silent_dependent_async(6);
+}
+
+TEST_CASE("FindIf.SilentDependentAsync.7threads" * doctest::timeout(300)) {
+ silent_dependent_async(7);
+}
+
+TEST_CASE("FindIf.SilentDependentAsync.8threads" * doctest::timeout(300)) {
+ silent_dependent_async(8);
+}
diff --git a/unittests/test_for_each.cpp b/unittests/test_for_each.cpp
index c59887977..70b930c8d 100644
--- a/unittests/test_for_each.cpp
+++ b/unittests/test_for_each.cpp
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 // --------------------------------------------------------
 // Testcase: for_each
 // --------------------------------------------------------
@@ -588,7 +589,211 @@ TEST_CASE("ForEachIndex.InvalidRange" * doctest::timeout(300)) {
 counter.fetch_add(i, std::memory_order_relaxed);
 });
 ex.run(flow).wait();
- REQUIRE(counter == 0);
+ REQUIRE(counter == 0);
+}
+
+// ----------------------------------------------------------------------------
+// ForEachIndex.HeterogeneousRange
+// ----------------------------------------------------------------------------
+
+TEST_CASE("ForEachIndex.HeterogeneousRange" * doctest::timeout(300)) {
+ std::atomic<size_t> counter(0);
+ tf::Executor ex;
+ tf::Taskflow flow;
+
+ size_t from = 1;
+ size_t to = 10;
+ size_t step = 1;
+
+ flow.for_each_index(from, to, step, [&](size_t i) {
+ counter.fetch_add(i, std::memory_order_relaxed);
+ });
+ ex.run(flow).wait();
+ REQUIRE(counter == to * (to - 1) / 2);
+}
+
+// ----------------------------------------------------------------------------
+// range-based for_each_index
+// ----------------------------------------------------------------------------
+
+template <typename P>
+void range_based_for_each_index(unsigned w) {
+ tf::Executor executor(w);
+ tf::Taskflow taskflow;
+ std::atomic<size_t> counter {0};
+
+ for(int beg=10; beg>=-10; --beg) {
+ for(int end=beg; end>=-10; --end) {
+ for(int s=1; s<=beg-end; ++s) {
+
+ size_t n = tf::distance(beg, end, -s);
+
+ for(size_t c=0; c<10; c++) {
+ taskflow.clear();
+ counter = 0;
+
+ tf::IndexRange<int> range(beg, end, -s);
+ REQUIRE(range.size() == n);
+
+ taskflow.for_each_by_index(range, [&] (tf::IndexRange<int> lrange) {
+ size_t l = 0;
+ for(auto j=lrange.begin(); j>lrange.end(); j+=lrange.step_size()) {
+ l++;
+ }
+ REQUIRE(lrange.size() == l);
+ counter.fetch_add(l, std::memory_order_relaxed);
+ }, P(c));
+ executor.run(taskflow).wait();
+ REQUIRE(n == counter);
+ }
+ }
+ }
+ }
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Static.1thread" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::StaticPartitioner<>>(1);
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Static.2threads" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::StaticPartitioner<>>(2);
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Static.3threads" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::StaticPartitioner<>>(3);
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Static.4threads" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::StaticPartitioner<>>(4);
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Guided.1thread" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::GuidedPartitioner<>>(1);
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Guided.2threads" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::GuidedPartitioner<>>(2);
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Guided.3threads" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::GuidedPartitioner<>>(3);
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Guided.4threads" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::GuidedPartitioner<>>(4);
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Dynamic.1thread" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::DynamicPartitioner<>>(1);
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Dynamic.2threads" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::DynamicPartitioner<>>(2);
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Dynamic.3threads" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::DynamicPartitioner<>>(3);
+}
+
+TEST_CASE("ForEach.NegativeIndexRange.Dynamic.4threads" * doctest::timeout(300)) {
+ range_based_for_each_index<tf::DynamicPartitioner<>>(4);
+}
+
+// ----------------------------------------------------------------------------
+// stateful range-based for_each_index
+// ----------------------------------------------------------------------------
+
+template <typename P>
+void stateful_range_based_for_each_index(unsigned w) {
+
+ tf::Executor executor(w);
+ tf::Taskflow taskflow;
+ std::atomic<size_t> counter {0};
+
+ for(int beg=10; beg>=-10; --beg) {
+    for(int end=beg; end>=-10; --end) {
+      for(int s=1; s<=beg-end; ++s) {
+
+        size_t n = tf::distance(beg, end, -s);
+
+        for(size_t c=0; c<10; c++) {
+          taskflow.clear();
+          counter = 0;
+
+          tf::IndexRange<int> range(0, 0, 0);
+
+          auto set_range = taskflow.emplace([&](){
+            range.begin(beg)
+                 .end(end)
+                 .step_size(-s);
+            REQUIRE(range.size() == n);
+          });
+
+          auto loop_range = taskflow.for_each_by_index(std::ref(range), [&] (tf::IndexRange<int> lrange) {
+            size_t l = 0;
+            for(auto j=lrange.begin(); j>lrange.end(); j+=lrange.step_size()) {
+              l++;
+            }
+            REQUIRE(lrange.size() == l);
+            counter.fetch_add(l, std::memory_order_relaxed);
+          }, P(c));
+
+          set_range.precede(loop_range);
+
+          executor.run(taskflow).wait();
+          REQUIRE(n == counter);
+        }
+      }
+    }
+  }
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Static.1thread" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::StaticPartitioner<>>(1);
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Static.2threads" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::StaticPartitioner<>>(2);
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Static.3threads" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::StaticPartitioner<>>(3);
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Static.4threads" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::StaticPartitioner<>>(4);
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Dynamic.1thread" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::DynamicPartitioner<>>(1);
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Dynamic.2threads" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::DynamicPartitioner<>>(2);
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Dynamic.3threads" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::DynamicPartitioner<>>(3);
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Dynamic.4threads" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::DynamicPartitioner<>>(4);
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Guided.1thread" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::GuidedPartitioner<>>(1);
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Guided.2threads" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::GuidedPartitioner<>>(2);
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Guided.3threads" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::GuidedPartitioner<>>(3);
+}
+
+TEST_CASE("StatefulForEach.NegativeIndexRange.Guided.4threads" * doctest::timeout(300)) {
+  stateful_range_based_for_each_index<tf::GuidedPartitioner<>>(4);
 }
 
 // ----------------------------------------------------------------------------
@@ -761,9 +966,413 @@ TEST_CASE("ClosureWrapper.for_each.Dynamic" * doctest::timeout(300))
 // parallel_for_exception(4);
 //}
 
+// ----------------------------------------------------------------------------
+// Multiple For Each
+// ----------------------------------------------------------------------------
+
+template <typename P>
+void multiple_for_each(unsigned W) {
+
+  tf::Executor executor(W);
+  tf::Taskflow taskflow;
+
+  const int N = 1000;
+  const int M = 1000;
+
+  std::array<std::vector<int>, N> vectors;
+
+  for(auto& vec : vectors) {
+    vec.resize(M);
+  }
+
+  for(int i=0; i>(1);
+}
+
+TEST_CASE("MultipleParallelForEach.Static.2threads") {
+  multiple_for_each<tf::StaticPartitioner<>>(2);
+}
+
+TEST_CASE("MultipleParallelForEach.Static.3threads") {
+  multiple_for_each<tf::StaticPartitioner<>>(3);
+}
+
+TEST_CASE("MultipleParallelForEach.Static.4threads") {
+  multiple_for_each<tf::StaticPartitioner<>>(4);
+}
+
+TEST_CASE("MultipleParallelForEach.Dynamic.1thread") {
+  multiple_for_each<tf::DynamicPartitioner<>>(1);
+}
+
+TEST_CASE("MultipleParallelForEach.Dynamic.2threads") {
+  multiple_for_each<tf::DynamicPartitioner<>>(2);
+}
+
+TEST_CASE("MultipleParallelForEach.Dynamic.3threads") {
+  multiple_for_each<tf::DynamicPartitioner<>>(3);
+}
+
+TEST_CASE("MultipleParallelForEach.Dynamic.4threads") {
+  multiple_for_each<tf::DynamicPartitioner<>>(4);
+}
+
+TEST_CASE("MultipleParallelForEach.Guided.1thread") {
+  multiple_for_each<tf::GuidedPartitioner<>>(1);
+}
+
+TEST_CASE("MultipleParallelForEach.Guided.2threads") {
+  multiple_for_each<tf::GuidedPartitioner<>>(2);
+}
+
+TEST_CASE("MultipleParallelForEach.Guided.3threads") {
+  multiple_for_each<tf::GuidedPartitioner<>>(3);
+}
+
+TEST_CASE("MultipleParallelForEach.Guided.4threads") {
+  multiple_for_each<tf::GuidedPartitioner<>>(4);
+}
+
+
+// ----------------------------------------------------------------------------
+// Async
+// ----------------------------------------------------------------------------
+void async(unsigned W) {
+
+  tf::Executor executor(W);
+
+  std::vector<int> data;
+
+  for(size_t N=0; N<=65536; N =((N == 0) ? 1 : N << 1)) {
+
+    data.resize(N);
+
+    // initialize data to -10 and 10
+    executor.async(tf::make_for_each_task(
+      data.begin(), data.begin() + N/2, [](int& d){ d = -10; }
+    ));
+
+    executor.async(tf::make_for_each_index_task(
+      N/2, N, size_t{1}, [&] (size_t i) { data[i] = 10; }
+    ));
+
+    executor.wait_for_all();
+
+    for(size_t i=0; i data(N);
+
+    // initialize data to 10 and -10
+    executor.silent_async(tf::make_for_each_task(
+      data.begin(), data.begin() + N/2, [](int& d){ d = 10; }
+    ));
+
+    executor.silent_async(tf::make_for_each_index_task(
+      N/2, N, size_t{1}, [&] (size_t i) { data[i] = -10; }
+    ));
+
+    executor.wait_for_all();
+
+    for(size_t i=0; i data;
+
+  for(size_t N=0; N<=65536; N =((N == 0) ? 1 : N << 1)) {
+
+    data.resize(N);
+
+    // initialize data to -10 and 10
+    executor.dependent_async(tf::make_for_each_task(
+      data.begin(), data.begin() + N/2, [](int& d){ d = -10; }
+    ));
+
+    executor.dependent_async(tf::make_for_each_index_task(
+      N/2, N, size_t{1}, [&] (size_t i) { data[i] = 10; }
+    ));
+
+    executor.wait_for_all();
+
+    for(size_t i=0; i data(N);
+
+    // initialize data to 10 and -10
+    executor.silent_dependent_async(tf::make_for_each_task(
+      data.begin(), data.begin() + N/2, [](int& d){ d = 10; }
+    ));
+
+    executor.silent_dependent_async(tf::make_for_each_index_task(
+      N/2, N, size_t{1}, [&] (size_t i) { data[i] = -10; }
+    ));
+
+    executor.wait_for_all();
+
+    for(size_t i=0; i> data(N1);
+
+  for(int i=0; i
+#include 
+#include 
+
+// --------------------------------------------------------
+// Testcase: Module
+// --------------------------------------------------------
+void module1(unsigned W) {
+
+  tf::Executor executor(W);
+
+  tf::Taskflow f0;
+
+  int cnt {0};
+
+  auto A = f0.emplace([&cnt](){ ++cnt; });
+  auto B = f0.emplace([&cnt](){ ++cnt; });
+  auto C = f0.emplace([&cnt](){ ++cnt; });
+  auto D = f0.emplace([&cnt](){ ++cnt; });
+  auto E = f0.emplace([&cnt](){ ++cnt; });
+
+  A.precede(B);
+  B.precede(C);
+  C.precede(D);
+  D.precede(E);
+
+  tf::Taskflow f1;
+
+  // module 1
+  std::tie(A, B, C, D, E) = f1.emplace(
+    [&cnt] () { ++cnt; },
+    [&cnt] () { ++cnt; },
+    [&cnt] () { ++cnt; },
+    [&cnt] () { ++cnt; },
+    [&cnt] () { ++cnt; }
+  );
+  A.precede(B);
+  B.precede(C);
+  C.precede(D);
+  D.precede(E);
+  auto m1_1 = f1.composed_of(f0);
+  E.precede(m1_1);
+
+  executor.run(f1).get();
+  REQUIRE(cnt == 10);
+
+  cnt = 0;
+  executor.run_n(f1, 100).get();
+  REQUIRE(cnt == 10 * 100);
+
+  auto m1_2 = f1.composed_of(f0);
+  m1_1.precede(m1_2);
+
+  for(int n=0; n<100; n++) {
+    cnt = 0;
+    executor.run_n(f1, n).get();
+    REQUIRE(cnt == 15*n);
+  }
+
+  cnt = 0;
+  for(int n=0; n<100; n++) {
+    executor.run(f1);
+  }
+
+  executor.wait_for_all();
+
+  REQUIRE(cnt == 1500);
+}
+
+TEST_CASE("Module1.1thread" * doctest::timeout(300)) {
+  module1(1);
+}
+
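// ----------------------------------------------------------------------------
// [Editor's note] The counts asserted in module1() follow from composition
// semantics: f1 owns five tasks of its own plus one module of f0 (five more),
// so each run adds 10, and a second module of f0 raises that to 15 per run.
// Below is a minimal, self-contained sketch of the same pattern; it uses only
// the public API already exercised in this diff.
// ----------------------------------------------------------------------------
#include <taskflow/taskflow.hpp>

int main() {
  tf::Executor executor;
  tf::Taskflow child, parent;
  int cnt = 0;

  child.emplace([&]{ ++cnt; });             // one task in the child
  auto t = parent.emplace([&]{ ++cnt; });   // one task in the parent
  t.precede(parent.composed_of(child));     // the child module runs after t

  executor.run_n(parent, 3).wait();         // each run adds 2, so cnt == 6
  return cnt == 6 ? 0 : 1;
}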
+TEST_CASE("Module1.2threads" * doctest::timeout(300)) { + module1(2); +} + +TEST_CASE("Module1.3threads" * doctest::timeout(300)) { + module1(3); +} + +TEST_CASE("Module1.4threads" * doctest::timeout(300)) { + module1(4); +} + +TEST_CASE("Module1.5threads" * doctest::timeout(300)) { + module1(5); +} + +TEST_CASE("Module1.6threads" * doctest::timeout(300)) { + module1(6); +} + +TEST_CASE("Module1.7threads" * doctest::timeout(300)) { + module1(7); +} + +TEST_CASE("Module1.8threads" * doctest::timeout(300)) { + module1(8); +} + +// ---------------------------------------------------------------------------- +// Module 2 +// ---------------------------------------------------------------------------- + +// TESTCASE: module-2 +void module2(unsigned W) { + + tf::Executor executor(W); + + int cnt {0}; + + // level 0 (+5) + tf::Taskflow f0; + + auto A = f0.emplace([&cnt](){ ++cnt; }).name("f0A"); + auto B = f0.emplace([&cnt](){ ++cnt; }).name("f0B"); + auto C = f0.emplace([&cnt](){ ++cnt; }).name("f0C"); + auto D = f0.emplace([&cnt](){ ++cnt; }).name("f0D"); + auto E = f0.emplace([&cnt](){ ++cnt; }).name("f0E"); + + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + + // level 1 (+10) + tf::Taskflow f1; + auto m1_1 = f1.composed_of(f0).name("m1_1"); + auto m1_2 = f1.composed_of(f0).name("m1_2"); + m1_1.precede(m1_2); + + // level 2 (+20) + tf::Taskflow f2; + auto m2_1 = f2.composed_of(f1).name("m2_1"); + auto m2_2 = f2.composed_of(f1).name("m2_2"); + m2_1.precede(m2_2); + + //f2.dump(std::cout); + + // synchronous run + for(int n=0; n<100; n++) { + cnt = 0; + executor.run_n(f2, n).get(); + REQUIRE(cnt == 20*n); + } + + // asynchronous run + cnt = 0; + for(int n=0; n<100; n++) { + executor.run(f2); + } + executor.wait_for_all(); + REQUIRE(cnt == 100*20); + +} + +TEST_CASE("Module2.1thread" * doctest::timeout(300)) { + module2(1); +} + +TEST_CASE("Module2.2threads" * doctest::timeout(300)) { + module2(2); +} + +TEST_CASE("Module2.3threads" * doctest::timeout(300)) { + module2(3); +} + +TEST_CASE("Module2.4threads" * doctest::timeout(300)) { + module2(4); +} + +TEST_CASE("Module2.5threads" * doctest::timeout(300)) { + module2(5); +} + +TEST_CASE("Module2.6threads" * doctest::timeout(300)) { + module2(6); +} + +TEST_CASE("Module2.7threads" * doctest::timeout(300)) { + module2(7); +} + +TEST_CASE("Module2.8threads" * doctest::timeout(300)) { + module2(8); +} + +// ---------------------------------------------------------------------------- +// Module 3 +// ---------------------------------------------------------------------------- + +// TESTCASE: module-3 +void module3(unsigned W) { + + tf::Executor executor(W); + + int cnt {0}; + + // level 0 (+2) + tf::Taskflow f0; + + auto A = f0.emplace([&cnt](){ ++cnt; }); + auto B = f0.emplace([&cnt](){ ++cnt; }); + + A.precede(B); + + // level 1 (+4) + tf::Taskflow f1; + auto m1_1 = f1.composed_of(f0); + auto m1_2 = f1.composed_of(f0); + m1_1.precede(m1_2); + + // level 2 (+8) + tf::Taskflow f2; + auto m2_1 = f2.composed_of(f1); + auto m2_2 = f2.composed_of(f1); + m2_1.precede(m2_2); + + // level 3 (+16) + tf::Taskflow f3; + auto m3_1 = f3.composed_of(f2); + auto m3_2 = f3.composed_of(f2); + m3_1.precede(m3_2); + + // synchronous run + for(int n=0; n<100; n++) { + cnt = 0; + executor.run_n(f3, n).get(); + REQUIRE(cnt == 16*n); + } + + // asynchronous run + cnt = 0; + for(int n=0; n<100; n++) { + executor.run(f3); + } + executor.wait_for_all(); + REQUIRE(cnt == 16*100); + +} + +TEST_CASE("Module3.1thread" * doctest::timeout(300)) { + module3(1); +} 
+ +TEST_CASE("Module3.2threads" * doctest::timeout(300)) { + module3(2); +} + +TEST_CASE("Module3.3threads" * doctest::timeout(300)) { + module3(3); +} + +TEST_CASE("Module3.4threads" * doctest::timeout(300)) { + module3(4); +} + +TEST_CASE("Module3.5threads" * doctest::timeout(300)) { + module3(5); +} + +TEST_CASE("Module3.6threads" * doctest::timeout(300)) { + module3(6); +} + +TEST_CASE("Module3.7threads" * doctest::timeout(300)) { + module3(7); +} + +TEST_CASE("Module3.8threads" * doctest::timeout(300)) { + module3(8); +} + +// ---------------------------------------------------------------------------- +// Module Algorithm with Taskflow Launch +// ---------------------------------------------------------------------------- + +void module4(unsigned W) { + + tf::Executor executor(W); + + tf::Taskflow f0; + + int cnt {0}; + + auto A = f0.emplace([&cnt](){ ++cnt; }); + auto B = f0.emplace([&cnt](){ ++cnt; }); + auto C = f0.emplace([&cnt](){ ++cnt; }); + auto D = f0.emplace([&cnt](){ ++cnt; }); + auto E = f0.emplace([&cnt](){ ++cnt; }); + + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + + tf::Taskflow f1; + + // module 1 + std::tie(A, B, C, D, E) = f1.emplace( + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; } + ); + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + auto m1_1 = f1.emplace(tf::make_module_task(f0)); + E.precede(m1_1); + + executor.run(f1).get(); + REQUIRE(cnt == 10); + + cnt = 0; + executor.run_n(f1, 100).get(); + REQUIRE(cnt == 10 * 100); + + auto m1_2 = f1.emplace(tf::make_module_task(f0)); + m1_1.precede(m1_2); + + for(int n=0; n<100; n++) { + cnt = 0; + executor.run_n(f1, n).get(); + REQUIRE(cnt == 15*n); + } + + cnt = 0; + for(int n=0; n<100; n++) { + executor.run(f1); + } + + executor.wait_for_all(); + + REQUIRE(cnt == 1500); +} + +TEST_CASE("Module4.1thread" * doctest::timeout(300)) { + module4(1); +} + +TEST_CASE("Module4.2threads" * doctest::timeout(300)) { + module4(2); +} + +TEST_CASE("Module4.3threads" * doctest::timeout(300)) { + module4(3); +} + +TEST_CASE("Module4.4threads" * doctest::timeout(300)) { + module4(4); +} + +TEST_CASE("Module4.5threads" * doctest::timeout(300)) { + module4(5); +} + +TEST_CASE("Module4.6threads" * doctest::timeout(300)) { + module4(6); +} + +TEST_CASE("Module4.7threads" * doctest::timeout(300)) { + module4(7); +} + +TEST_CASE("Module4.8threads" * doctest::timeout(300)) { + module4(8); +} + +// ---------------------------------------------------------------------------- +// Parallel Modules +// ---------------------------------------------------------------------------- + +void parallel_modules(unsigned W) { + + std::vector taskflows(100); + + tf::Executor executor(W); + tf::Taskflow taskflow; + + std::atomic counter{0}; + + for(auto& tf : taskflows) { + for(size_t n=0; n<100; n++) { + auto [A, B, C, D, E, F, G, H] = tf.emplace( + [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, + [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, + [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, + [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, + [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, + [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, + [&](){ counter.fetch_add(1, std::memory_order_relaxed); }, + [&](){ counter.fetch_add(1, std::memory_order_relaxed); } + ); + A.precede(B); + A.precede(C); + D.precede(E); + D.precede(F); + } + taskflow.composed_of(tf); + } + + 
executor.run(taskflow).wait(); + + REQUIRE(counter == 80000); +} + +TEST_CASE("ParallelModules.1thread" * doctest::timeout(300)) { + parallel_modules(1); +} + +TEST_CASE("ParallelModules.2threads" * doctest::timeout(300)) { + parallel_modules(2); +} + +TEST_CASE("ParallelModules.3thread" * doctest::timeout(300)) { + parallel_modules(3); +} + +TEST_CASE("ParallelModules.4thread" * doctest::timeout(300)) { + parallel_modules(4); +} + + +// ---------------------------------------------------------------------------- +// Module with Async Launch +// ---------------------------------------------------------------------------- + +void module_with_async_launch(unsigned W) { + + tf::Executor executor(W); + + tf::Taskflow f0; + + int cnt {0}; + + auto A = f0.emplace([&cnt](){ ++cnt; }); + auto B = f0.emplace([&cnt](){ ++cnt; }); + auto C = f0.emplace([&cnt](){ ++cnt; }); + auto D = f0.emplace([&cnt](){ ++cnt; }); + auto E = f0.emplace([&cnt](){ ++cnt; }); + + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + + tf::Taskflow f1; + + // module 1 + std::tie(A, B, C, D, E) = f1.emplace( + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; } + ); + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + auto m1_1 = f1.composed_of(f0); + E.precede(m1_1); + + executor.async(tf::make_module_task(f1)).get(); + + REQUIRE(cnt == 10); +} + +TEST_CASE("Module.AsyncLaunch.1thread" * doctest::timeout(300)) { + module_with_async_launch(1); +} + +TEST_CASE("Module.AsyncLaunch.2threads" * doctest::timeout(300)) { + module_with_async_launch(2); +} + +TEST_CASE("Module.AsyncLaunch.3threads" * doctest::timeout(300)) { + module_with_async_launch(3); +} + +TEST_CASE("Module.AsyncLaunch.4threads" * doctest::timeout(300)) { + module_with_async_launch(4); +} + +TEST_CASE("Module.AsyncLaunch.5threads" * doctest::timeout(300)) { + module_with_async_launch(5); +} + +TEST_CASE("Module.AsyncLaunch.6threads" * doctest::timeout(300)) { + module_with_async_launch(6); +} + +TEST_CASE("Module.AsyncLaunch.7threads" * doctest::timeout(300)) { + module_with_async_launch(7); +} + +TEST_CASE("Module.AsyncLaunch.8threads" * doctest::timeout(300)) { + module_with_async_launch(8); +} + +// ---------------------------------------------------------------------------- +// Module with Silent Async Launch +// ---------------------------------------------------------------------------- + +void module_with_silent_async_launch(unsigned W) { + + tf::Executor executor(W); + + tf::Taskflow f0; + + int cnt {0}; + + auto A = f0.emplace([&cnt](){ ++cnt; }); + auto B = f0.emplace([&cnt](){ ++cnt; }); + auto C = f0.emplace([&cnt](){ ++cnt; }); + auto D = f0.emplace([&cnt](){ ++cnt; }); + auto E = f0.emplace([&cnt](){ ++cnt; }); + + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + + tf::Taskflow f1; + + // module 1 + std::tie(A, B, C, D, E) = f1.emplace( + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; } + ); + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + auto m1_1 = f1.composed_of(f0); + E.precede(m1_1); + + executor.silent_async(tf::make_module_task(f1)); + executor.wait_for_all(); + + REQUIRE(cnt == 10); +} + +TEST_CASE("Module.SilentAsyncLaunch.1thread" * doctest::timeout(300)) { + module_with_silent_async_launch(1); +} + +TEST_CASE("Module.SilentAsyncLaunch.2threads" * doctest::timeout(300)) { + module_with_silent_async_launch(2); +} + 
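// ----------------------------------------------------------------------------
// [Editor's note] The two launch styles tested above differ only in what the
// caller gets back: executor.async() returns a std::future to wait on, while
// silent_async() returns nothing and must be fenced with wait_for_all().
// A minimal sketch using only calls shown in this diff; the lambda-built
// taskflow stands in for any module task.
// ----------------------------------------------------------------------------
#include <taskflow/taskflow.hpp>

int main() {
  tf::Executor executor;
  tf::Taskflow flow;
  int cnt = 0;
  flow.emplace([&]{ ++cnt; });

  executor.async(tf::make_module_task(flow)).get();   // future-based fence
  executor.silent_async(tf::make_module_task(flow));  // fire-and-forget
  executor.wait_for_all();                            // fences the silent one
  return cnt == 2 ? 0 : 1;
}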
+TEST_CASE("Module.SilentAsyncLaunch.3threads" * doctest::timeout(300)) { + module_with_silent_async_launch(3); +} + +TEST_CASE("Module.SilentAsyncLaunch.4threads" * doctest::timeout(300)) { + module_with_silent_async_launch(4); +} + +TEST_CASE("Module.SilentAsyncLaunch.5threads" * doctest::timeout(300)) { + module_with_silent_async_launch(5); +} + +TEST_CASE("Module.SilentAsyncLaunch.6threads" * doctest::timeout(300)) { + module_with_silent_async_launch(6); +} + +TEST_CASE("Module.SilentAsyncLaunch.7threads" * doctest::timeout(300)) { + module_with_silent_async_launch(7); +} + +TEST_CASE("Module.SilentAsyncLaunch.8threads" * doctest::timeout(300)) { + module_with_silent_async_launch(8); +} + +// ---------------------------------------------------------------------------- +// Module with Dependent Async Launch +// ---------------------------------------------------------------------------- + +void module_with_dependent_async_launch(unsigned W) { + + tf::Executor executor(W); + + tf::Taskflow f0; + + int cnt {0}; + + auto A = f0.emplace([&cnt](){ ++cnt; }); + auto B = f0.emplace([&cnt](){ ++cnt; }); + auto C = f0.emplace([&cnt](){ ++cnt; }); + auto D = f0.emplace([&cnt](){ ++cnt; }); + auto E = f0.emplace([&cnt](){ ++cnt; }); + + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + + tf::Taskflow f1; + + // module 1 + std::tie(A, B, C, D, E) = f1.emplace( + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; } + ); + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + auto m1_1 = f1.composed_of(f0); + E.precede(m1_1); + + auto [task, future] = executor.dependent_async(tf::make_module_task(f1)); + + future.get(); + + REQUIRE(cnt == 10); +} + +TEST_CASE("Module.DependentAsyncLaunch.1thread" * doctest::timeout(300)) { + module_with_dependent_async_launch(1); +} + +TEST_CASE("Module.DependentAsyncLaunch.2threads" * doctest::timeout(300)) { + module_with_dependent_async_launch(2); +} + +TEST_CASE("Module.DependentAsyncLaunch.3threads" * doctest::timeout(300)) { + module_with_dependent_async_launch(3); +} + +TEST_CASE("Module.DependentAsyncLaunch.4threads" * doctest::timeout(300)) { + module_with_dependent_async_launch(4); +} + +TEST_CASE("Module.DependentAsyncLaunch.5threads" * doctest::timeout(300)) { + module_with_dependent_async_launch(5); +} + +TEST_CASE("Module.DependentAsyncLaunch.6threads" * doctest::timeout(300)) { + module_with_dependent_async_launch(6); +} + +TEST_CASE("Module.DependentAsyncLaunch.7threads" * doctest::timeout(300)) { + module_with_dependent_async_launch(7); +} + +TEST_CASE("Module.DependentAsyncLaunch.8threads" * doctest::timeout(300)) { + module_with_dependent_async_launch(8); +} + +// ---------------------------------------------------------------------------- +// Module with Silent Dependent Async Launch +// ---------------------------------------------------------------------------- + +void module_with_silent_dependent_async_launch(unsigned W) { + + tf::Executor executor(W); + + tf::Taskflow f0; + + int cnt {0}; + + auto A = f0.emplace([&cnt](){ ++cnt; }); + auto B = f0.emplace([&cnt](){ ++cnt; }); + auto C = f0.emplace([&cnt](){ ++cnt; }); + auto D = f0.emplace([&cnt](){ ++cnt; }); + auto E = f0.emplace([&cnt](){ ++cnt; }); + + A.precede(B); + B.precede(C); + C.precede(D); + D.precede(E); + + tf::Taskflow f1; + + // module 1 + std::tie(A, B, C, D, E) = f1.emplace( + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; }, + [&cnt] () { ++cnt; 
}
+  );
+  A.precede(B);
+  B.precede(C);
+  C.precede(D);
+  D.precede(E);
+  auto m1_1 = f1.composed_of(f0);
+  E.precede(m1_1);
+
+  auto task = executor.silent_dependent_async(tf::make_module_task(f1));
+
+  executor.wait_for_all();
+
+  REQUIRE(task.is_done() == true);
+  REQUIRE(cnt == 10);
+}
+
+TEST_CASE("Module.SilentDependentAsyncLaunch.1thread" * doctest::timeout(300)) {
+  module_with_silent_dependent_async_launch(1);
+}
+
+TEST_CASE("Module.SilentDependentAsyncLaunch.2threads" * doctest::timeout(300)) {
+  module_with_silent_dependent_async_launch(2);
+}
+
+TEST_CASE("Module.SilentDependentAsyncLaunch.3threads" * doctest::timeout(300)) {
+  module_with_silent_dependent_async_launch(3);
+}
+
+TEST_CASE("Module.SilentDependentAsyncLaunch.4threads" * doctest::timeout(300)) {
+  module_with_silent_dependent_async_launch(4);
+}
+
+TEST_CASE("Module.SilentDependentAsyncLaunch.5threads" * doctest::timeout(300)) {
+  module_with_silent_dependent_async_launch(5);
+}
+
+TEST_CASE("Module.SilentDependentAsyncLaunch.6threads" * doctest::timeout(300)) {
+  module_with_silent_dependent_async_launch(6);
+}
+
+TEST_CASE("Module.SilentDependentAsyncLaunch.7threads" * doctest::timeout(300)) {
+  module_with_silent_dependent_async_launch(7);
+}
+
+TEST_CASE("Module.SilentDependentAsyncLaunch.8threads" * doctest::timeout(300)) {
+  module_with_silent_dependent_async_launch(8);
+}
+
+
diff --git a/unittests/test_pipelines.cpp b/unittests/test_pipelines.cpp
index c1c1a705e..a7bdc0160 100644
--- a/unittests/test_pipelines.cpp
+++ b/unittests/test_pipelines.cpp
@@ -2401,7 +2401,7 @@ int ifelse_pipe_ans(int a) {
 }
 
 void ifelse_pipeline(size_t L, unsigned w) {
-  srand(time(NULL));
+  //srand(time(NULL));
 
   tf::Executor executor(w);
   size_t maxN = 200;
diff --git a/unittests/test_queue.cpp b/unittests/test_queue.cpp
new file mode 100644
index 000000000..8f2d9aff3
--- /dev/null
+++ b/unittests/test_queue.cpp
@@ -0,0 +1,668 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+
+#include <doctest.h>
+#include <taskflow/taskflow.hpp>
+#include 
+
+
+// ============================================================================
+// BoundedTaskQueue Test
+// ============================================================================
+
+// Procedure: test_wsq_owner
+template <size_t LogSize>
+void bounded_tsq_owner() {
+
+  tf::BoundedTaskQueue<void*, LogSize> queue;
+
+  constexpr size_t N = (1 << LogSize);
+
+  std::vector<void*> data;
+
+  for(size_t k=0; k();
+}
+
+TEST_CASE("BoundedTaskQueue.Owner.LogSize=3" * doctest::timeout(300)) {
+  bounded_tsq_owner<3>();
+}
+
+TEST_CASE("BoundedTaskQueue.Owner.LogSize=4" * doctest::timeout(300)) {
+  bounded_tsq_owner<4>();
+}
+
+TEST_CASE("BoundedTaskQueue.Owner.LogSize=5" * doctest::timeout(300)) {
+  bounded_tsq_owner<5>();
+}
+
+TEST_CASE("BoundedTaskQueue.Owner.LogSize=6" * doctest::timeout(300)) {
+  bounded_tsq_owner<6>();
+}
+
+TEST_CASE("BoundedTaskQueue.Owner.LogSize=7" * doctest::timeout(300)) {
+  bounded_tsq_owner<7>();
+}
+
+TEST_CASE("BoundedTaskQueue.Owner.LogSize=8" * doctest::timeout(300)) {
+  bounded_tsq_owner<8>();
+}
+
+TEST_CASE("BoundedTaskQueue.Owner.LogSize=9" * doctest::timeout(300)) {
+  bounded_tsq_owner<9>();
+}
+
+TEST_CASE("BoundedTaskQueue.Owner.LogSize=10" * doctest::timeout(300)) {
+  bounded_tsq_owner<10>();
+}
+
+
+// ============================================================================
+// UnboundedTaskQueue Test
+// ============================================================================
+
+// Procedure: unbounded_tsq_owner
+void unbounded_tsq_owner() {
+
+  for(size_t N=1; N<=777777; N=N*2+1) {
+    tf::UnboundedTaskQueue<void*> queue;
+    std::vector<void*> gold(N);
+
+    REQUIRE(queue.empty());
+ + // push and pop + for(size_t i=0; i queue; + + std::vector gold; + std::atomic consumed; + + // 1, 4, 13, 40, 121, 364, 1093, 3280, 9841, 29524, 88573 + for(size_t N=1; N<=88573; N=N*3+1) { + + REQUIRE(queue.empty()); + + gold.resize(N); + consumed = 0; + + for(size_t i=0; i threads; + std::vector> stolens(M); + for(size_t i=0; i items; + while(consumed != N) { + auto ptr = queue.pop(); + if(ptr != nullptr) { + items.push_back(ptr); + consumed.fetch_add(1, std::memory_order_relaxed); + } + } + REQUIRE(queue.steal() == nullptr); + REQUIRE(queue.pop() == nullptr); + REQUIRE(queue.empty()); + + // join thieves + for(auto& thread : threads) thread.join(); + + // merge items + for(size_t i=0; i queue; + + std::vector gold; + std::atomic consumed; + + // 1, 4, 13, 40, 121, 364, 1093, 3280, 9841, 29524, 88573, 265720 + for(size_t N=1; N<=265720; N=N*3+1) { + + REQUIRE(queue.empty()); + + gold.resize(N); + consumed = 0; + + for(size_t i=0; i threads; + std::vector> stolens(M); + for(size_t i=0; i items; + while(consumed != N) { + auto ptr = queue.pop(); + if(ptr != nullptr) { + items.push_back(ptr); + consumed.fetch_add(1, std::memory_order_relaxed); + } + } + REQUIRE(queue.steal() == nullptr); + REQUIRE(queue.pop() == nullptr); + REQUIRE(queue.empty()); + + // join thieves + for(auto& thread : threads) thread.join(); + + // merge items + for(size_t i=0; i +void mpmc_basics() { + + tf::MPMC mpmc; + size_t N = (1< data(N+1, -1); + + REQUIRE(mpmc.capacity() == N); + + REQUIRE(mpmc.empty() == true); + REQUIRE(mpmc.try_dequeue() == std::nullopt); + + for(size_t i=0; i(); +} + +TEST_CASE("BoundedMPMC.Basics.LogSize=2") { + mpmc_basics(); +} + +TEST_CASE("BoundedMPMC.Basics.LogSize=3") { + mpmc_basics(); +} + +TEST_CASE("BoundedMPMC.Basics.LogSize=4") { + mpmc_basics(); +} + +TEST_CASE("BoundedMPMC.Basics.LogSize=5") { + mpmc_basics(); +} + +TEST_CASE("BoundedMPMC.Basics.LogSize=6") { + mpmc_basics(); +} + +TEST_CASE("BoundedMPMC.Basics.LogSize=7") { + mpmc_basics(); +} + +TEST_CASE("BoundedMPMC.Basics.LogSize=8") { + mpmc_basics(); +} + +TEST_CASE("BoundedMPMC.Basics.LogSize=9") { + mpmc_basics(); +} + +TEST_CASE("BoundedMPMC.Basics.LogSize=10") { + mpmc_basics(); +} + +// mpmc +template +void mpmc(unsigned num_producers, unsigned num_consumers) { + + const int N = 6543; + + std::atomic pcnt(0), ccnt(0), ans(0); + std::vector threads; + + tf::MPMC mpmc; + + for(unsigned i=0; i= N) { + break; + } + mpmc.enqueue(v); + } + }); + } + + for(auto & thread : threads) { + thread.join(); + } + + REQUIRE(ans.load() == (((N-1)*N) >> 1)); +} + +TEST_CASE("BoundedMPMC.1C1P") { + mpmc(1, 1); + mpmc(1, 1); +} + +TEST_CASE("BoundedMPMC.1C2P") { + mpmc(1, 2); + mpmc(1, 2); +} + +TEST_CASE("BoundedMPMC.1C3P") { + mpmc(1, 3); + mpmc(1, 3); +} + +TEST_CASE("BoundedMPMC.1C4P") { + mpmc(1, 4); + mpmc(1, 4); +} + +TEST_CASE("BoundedMPMC.2C1P") { + mpmc(2, 1); + mpmc(2, 1); +} + +TEST_CASE("BoundedMPMC.2C2P") { + mpmc(2, 2); + mpmc(2, 2); +} + +TEST_CASE("BoundedMPMC.2C3P") { + mpmc(2, 3); + mpmc(2, 3); +} + +TEST_CASE("BoundedMPMC.2C4P") { + mpmc(2, 4); + mpmc(2, 4); +} + +TEST_CASE("BoundedMPMC.3C1P") { + mpmc(3, 1); + mpmc(3, 1); +} + +TEST_CASE("BoundedMPMC.3C2P") { + mpmc(3, 2); + mpmc(3, 2); +} + +TEST_CASE("BoundedMPMC.3C3P") { + mpmc(3, 3); + mpmc(3, 3); +} + +TEST_CASE("BoundedMPMC.3C4P") { + mpmc(3, 4); + mpmc(3, 4); +} + +TEST_CASE("BoundedMPMC.4C1P") { + mpmc(4, 1); + mpmc(4, 1); +} + +TEST_CASE("BoundedMPMC.4C2P") { + mpmc(4, 2); + mpmc(4, 2); +} + +TEST_CASE("BoundedMPMC.4C3P") { + mpmc(4, 3); + mpmc(4, 3); 
+} + +TEST_CASE("BoundedMPMC.4C4P") { + mpmc(4, 4); + mpmc(4, 4); +} + +// ------------------------------------------------------------------------------------------------ +// BoundedMPMC Specialization on Pointer Type +// ------------------------------------------------------------------------------------------------ + +template +void mpmc_pointer_basics() { + + tf::MPMC mpmc; + size_t N = (1<> data(N+1); + + REQUIRE(mpmc.capacity() == N); + + REQUIRE(mpmc.empty() == true); + REQUIRE(mpmc.try_dequeue() == nullptr); + + for(size_t i=0; i(); +} + +TEST_CASE("BoundedMPMC.Pointer.Basics.LogSize=2") { + mpmc_pointer_basics(); +} + +TEST_CASE("BoundedMPMC.Pointer.Basics.LogSize=3") { + mpmc_pointer_basics(); +} + +TEST_CASE("BoundedMPMC.Pointer.Basics.LogSize=4") { + mpmc_pointer_basics(); +} + +TEST_CASE("BoundedMPMC.Pointer.Basics.LogSize=5") { + mpmc_pointer_basics(); +} + +TEST_CASE("BoundedMPMC.Pointer.Basics.LogSize=6") { + mpmc_pointer_basics(); +} + +TEST_CASE("BoundedMPMC.Pointer.Basics.LogSize=7") { + mpmc_pointer_basics(); +} + +TEST_CASE("BoundedMPMC.Pointer.Basics.LogSize=8") { + mpmc_pointer_basics(); +} + +TEST_CASE("BoundedMPMC.Pointer.Basics.LogSize=9") { + mpmc_pointer_basics(); +} + +TEST_CASE("BoundedMPMC.Pointer.Basics.LogSize=10") { + mpmc_pointer_basics(); +} + + diff --git a/unittests/test_reduce.cpp b/unittests/test_reduce.cpp index d510c01d1..423090986 100644 --- a/unittests/test_reduce.cpp +++ b/unittests/test_reduce.cpp @@ -41,7 +41,7 @@ struct MoveOnly2{ // -------------------------------------------------------- template -void reduce(unsigned W) { +void reduce_min(unsigned W) { tf::Executor executor(W); tf::Taskflow taskflow; @@ -86,199 +86,135 @@ void reduce(unsigned W) { } // guided -TEST_CASE("Reduce.Guided.1thread" * doctest::timeout(300)) { - reduce>(1); +TEST_CASE("ReduceMin.Guided.1thread" * doctest::timeout(300)) { + reduce_min>(1); } -TEST_CASE("Reduce.Guided.2threads" * doctest::timeout(300)) { - reduce>(2); +TEST_CASE("ReduceMin.Guided.2threads" * doctest::timeout(300)) { + reduce_min>(2); } -TEST_CASE("Reduce.Guided.3threads" * doctest::timeout(300)) { - reduce>(3); +TEST_CASE("ReduceMin.Guided.3threads" * doctest::timeout(300)) { + reduce_min>(3); } -TEST_CASE("Reduce.Guided.4threads" * doctest::timeout(300)) { - reduce>(4); +TEST_CASE("ReduceMin.Guided.4threads" * doctest::timeout(300)) { + reduce_min>(4); } -TEST_CASE("Reduce.Guided.5threads" * doctest::timeout(300)) { - reduce>(5); +TEST_CASE("ReduceMin.Guided.5threads" * doctest::timeout(300)) { + reduce_min>(5); } -TEST_CASE("Reduce.Guided.6threads" * doctest::timeout(300)) { - reduce>(6); +TEST_CASE("ReduceMin.Guided.6threads" * doctest::timeout(300)) { + reduce_min>(6); } -TEST_CASE("Reduce.Guided.7threads" * doctest::timeout(300)) { - reduce>(7); +TEST_CASE("ReduceMin.Guided.7threads" * doctest::timeout(300)) { + reduce_min>(7); } -TEST_CASE("Reduce.Guided.8threads" * doctest::timeout(300)) { - reduce>(8); -} - -TEST_CASE("Reduce.Guided.9threads" * doctest::timeout(300)) { - reduce>(9); -} - -TEST_CASE("Reduce.Guided.10threads" * doctest::timeout(300)) { - reduce>(10); -} - -TEST_CASE("Reduce.Guided.11threads" * doctest::timeout(300)) { - reduce>(11); -} - -TEST_CASE("Reduce.Guided.12threads" * doctest::timeout(300)) { - reduce>(12); +TEST_CASE("ReduceMin.Guided.8threads" * doctest::timeout(300)) { + reduce_min>(8); } // dynamic -TEST_CASE("Reduce.Dynamic.1thread" * doctest::timeout(300)) { - reduce>(1); -} - -TEST_CASE("Reduce.Dynamic.2threads" * doctest::timeout(300)) { - reduce>(2); -} - 
-TEST_CASE("Reduce.Dynamic.3threads" * doctest::timeout(300)) { - reduce>(3); +TEST_CASE("ReduceMin.Dynamic.1thread" * doctest::timeout(300)) { + reduce_min>(1); } -TEST_CASE("Reduce.Dynamic.4threads" * doctest::timeout(300)) { - reduce>(4); +TEST_CASE("ReduceMin.Dynamic.2threads" * doctest::timeout(300)) { + reduce_min>(2); } -TEST_CASE("Reduce.Dynamic.5threads" * doctest::timeout(300)) { - reduce>(5); +TEST_CASE("ReduceMin.Dynamic.3threads" * doctest::timeout(300)) { + reduce_min>(3); } -TEST_CASE("Reduce.Dynamic.6threads" * doctest::timeout(300)) { - reduce>(6); +TEST_CASE("ReduceMin.Dynamic.4threads" * doctest::timeout(300)) { + reduce_min>(4); } -TEST_CASE("Reduce.Dynamic.7threads" * doctest::timeout(300)) { - reduce>(7); +TEST_CASE("ReduceMin.Dynamic.5threads" * doctest::timeout(300)) { + reduce_min>(5); } -TEST_CASE("Reduce.Dynamic.8threads" * doctest::timeout(300)) { - reduce>(8); +TEST_CASE("ReduceMin.Dynamic.6threads" * doctest::timeout(300)) { + reduce_min>(6); } -TEST_CASE("Reduce.Dynamic.9threads" * doctest::timeout(300)) { - reduce>(9); +TEST_CASE("ReduceMin.Dynamic.7threads" * doctest::timeout(300)) { + reduce_min>(7); } -TEST_CASE("Reduce.Dynamic.10threads" * doctest::timeout(300)) { - reduce>(10); -} - -TEST_CASE("Reduce.Dynamic.11threads" * doctest::timeout(300)) { - reduce>(11); -} - -TEST_CASE("Reduce.Dynamic.12threads" * doctest::timeout(300)) { - reduce>(12); +TEST_CASE("ReduceMin.Dynamic.8threads" * doctest::timeout(300)) { + reduce_min>(8); } // static -TEST_CASE("Reduce.Static.1thread" * doctest::timeout(300)) { - reduce>(1); -} - -TEST_CASE("Reduce.Static.2threads" * doctest::timeout(300)) { - reduce>(2); +TEST_CASE("ReduceMin.Static.1thread" * doctest::timeout(300)) { + reduce_min>(1); } -TEST_CASE("Reduce.Static.3threads" * doctest::timeout(300)) { - reduce>(3); +TEST_CASE("ReduceMin.Static.2threads" * doctest::timeout(300)) { + reduce_min>(2); } -TEST_CASE("Reduce.Static.4threads" * doctest::timeout(300)) { - reduce>(4); +TEST_CASE("ReduceMin.Static.3threads" * doctest::timeout(300)) { + reduce_min>(3); } -TEST_CASE("Reduce.Static.5threads" * doctest::timeout(300)) { - reduce>(5); +TEST_CASE("ReduceMin.Static.4threads" * doctest::timeout(300)) { + reduce_min>(4); } -TEST_CASE("Reduce.Static.6threads" * doctest::timeout(300)) { - reduce>(6); +TEST_CASE("ReduceMin.Static.5threads" * doctest::timeout(300)) { + reduce_min>(5); } -TEST_CASE("Reduce.Static.7threads" * doctest::timeout(300)) { - reduce>(7); +TEST_CASE("ReduceMin.Static.6threads" * doctest::timeout(300)) { + reduce_min>(6); } -TEST_CASE("Reduce.Static.8threads" * doctest::timeout(300)) { - reduce>(8); +TEST_CASE("ReduceMin.Static.7threads" * doctest::timeout(300)) { + reduce_min>(7); } -TEST_CASE("Reduce.Static.9threads" * doctest::timeout(300)) { - reduce>(9); -} - -TEST_CASE("Reduce.Static.10threads" * doctest::timeout(300)) { - reduce>(10); -} - -TEST_CASE("Reduce.Static.11threads" * doctest::timeout(300)) { - reduce>(11); -} - -TEST_CASE("Reduce.Static.12threads" * doctest::timeout(300)) { - reduce>(12); +TEST_CASE("ReduceMin.Static.8threads" * doctest::timeout(300)) { + reduce_min>(8); } // random -TEST_CASE("Reduce.Random.1thread" * doctest::timeout(300)) { - reduce>(1); -} - -TEST_CASE("Reduce.Random.2threads" * doctest::timeout(300)) { - reduce>(2); +TEST_CASE("ReduceMin.Random.1thread" * doctest::timeout(300)) { + reduce_min>(1); } -TEST_CASE("Reduce.Random.3threads" * doctest::timeout(300)) { - reduce>(3); +TEST_CASE("ReduceMin.Random.2threads" * doctest::timeout(300)) { + reduce_min>(2); } 
-TEST_CASE("Reduce.Random.4threads" * doctest::timeout(300)) { - reduce>(4); +TEST_CASE("ReduceMin.Random.3threads" * doctest::timeout(300)) { + reduce_min>(3); } -TEST_CASE("Reduce.Random.5threads" * doctest::timeout(300)) { - reduce>(5); +TEST_CASE("ReduceMin.Random.4threads" * doctest::timeout(300)) { + reduce_min>(4); } -TEST_CASE("Reduce.Random.6threads" * doctest::timeout(300)) { - reduce>(6); +TEST_CASE("ReduceMin.Random.5threads" * doctest::timeout(300)) { + reduce_min>(5); } -TEST_CASE("Reduce.Random.7threads" * doctest::timeout(300)) { - reduce>(7); +TEST_CASE("ReduceMin.Random.6threads" * doctest::timeout(300)) { + reduce_min>(6); } -TEST_CASE("Reduce.Random.8threads" * doctest::timeout(300)) { - reduce>(8); +TEST_CASE("ReduceMin.Random.7threads" * doctest::timeout(300)) { + reduce_min>(7); } -TEST_CASE("Reduce.Random.9threads" * doctest::timeout(300)) { - reduce>(9); -} - -TEST_CASE("Reduce.Random.10threads" * doctest::timeout(300)) { - reduce>(10); -} - -TEST_CASE("Reduce.Random.11threads" * doctest::timeout(300)) { - reduce>(11); -} - -TEST_CASE("Reduce.Random.12threads" * doctest::timeout(300)) { - reduce>(12); +TEST_CASE("ReduceMin.Random.8threads" * doctest::timeout(300)) { + reduce_min>(8); } // -------------------------------------------------------- @@ -362,22 +298,6 @@ TEST_CASE("ReduceSum.Guided.8threads" * doctest::timeout(300)) { reduce_sum>(8); } -TEST_CASE("ReduceSum.Guided.9threads" * doctest::timeout(300)) { - reduce_sum>(9); -} - -TEST_CASE("ReduceSum.Guided.10threads" * doctest::timeout(300)) { - reduce_sum>(10); -} - -TEST_CASE("ReduceSum.Guided.11threads" * doctest::timeout(300)) { - reduce_sum>(11); -} - -TEST_CASE("ReduceSum.Guided.12threads" * doctest::timeout(300)) { - reduce_sum>(12); -} - // dynamic TEST_CASE("ReduceSum.Dynamic.1thread" * doctest::timeout(300)) { reduce_sum>(1); @@ -411,22 +331,6 @@ TEST_CASE("ReduceSum.Dynamic.8threads" * doctest::timeout(300)) { reduce_sum>(8); } -TEST_CASE("ReduceSum.Dynamic.9threads" * doctest::timeout(300)) { - reduce_sum>(9); -} - -TEST_CASE("ReduceSum.Dynamic.10threads" * doctest::timeout(300)) { - reduce_sum>(10); -} - -TEST_CASE("ReduceSum.Dynamic.11threads" * doctest::timeout(300)) { - reduce_sum>(11); -} - -TEST_CASE("ReduceSum.Dynamic.12threads" * doctest::timeout(300)) { - reduce_sum>(12); -} - // static TEST_CASE("ReduceSum.Static.1thread" * doctest::timeout(300)) { reduce_sum>(1); @@ -460,22 +364,6 @@ TEST_CASE("ReduceSum.Static.8threads" * doctest::timeout(300)) { reduce_sum>(8); } -TEST_CASE("ReduceSum.Static.9threads" * doctest::timeout(300)) { - reduce_sum>(9); -} - -TEST_CASE("ReduceSum.Static.10threads" * doctest::timeout(300)) { - reduce_sum>(10); -} - -TEST_CASE("ReduceSum.Static.11threads" * doctest::timeout(300)) { - reduce_sum>(11); -} - -TEST_CASE("ReduceSum.Static.12threads" * doctest::timeout(300)) { - reduce_sum>(12); -} - // random TEST_CASE("ReduceSum.Random.1thread" * doctest::timeout(300)) { reduce_sum>(1); @@ -509,22 +397,193 @@ TEST_CASE("ReduceSum.Random.8threads" * doctest::timeout(300)) { reduce_sum>(8); } -TEST_CASE("ReduceSum.Random.9threads" * doctest::timeout(300)) { - reduce_sum>(9); +// -------------------------------------------------------- +// Testcase: reduce_by_index_sum +// -------------------------------------------------------- + +template +void reduce_by_index_sum(unsigned W) { + + tf::Executor executor(W); + tf::Taskflow taskflow; + + std::vector vec(1000); + + for(auto& i : vec) i = ::rand() % 100 - 50; + + for(size_t n=1; n range; + + auto stask = 
taskflow.emplace([&](){ + range.reset(0, vec.size(), 1); + REQUIRE(range.size() == vec.size()); + for(auto itr = vec.begin(); itr != vec.end(); itr++) { + sum += *itr; + } + }); + + tf::Task ptask; + + ptask = taskflow.reduce_by_index( + std::ref(range), + sol, + [&](tf::IndexRange subrange, std::optional running_total){ + int lsum = running_total ? *running_total : 0; + for(size_t i=subrange.begin(); i(), + P(c) + ); + + stask.precede(ptask); + + executor.run(taskflow).wait(); + + REQUIRE(sol == sum); + } + } +} + +// guided +TEST_CASE("ReduceByIndexSum.Guided.1thread" * doctest::timeout(300)) { + reduce_by_index_sum>(1); +} + +TEST_CASE("ReduceByIndexSum.Guided.2threads" * doctest::timeout(300)) { + reduce_by_index_sum>(2); +} + +TEST_CASE("ReduceByIndexSum.Guided.3threads" * doctest::timeout(300)) { + reduce_by_index_sum>(3); +} + +TEST_CASE("ReduceByIndexSum.Guided.4threads" * doctest::timeout(300)) { + reduce_by_index_sum>(4); +} + +TEST_CASE("ReduceByIndexSum.Guided.5threads" * doctest::timeout(300)) { + reduce_by_index_sum>(5); +} + +TEST_CASE("ReduceByIndexSum.Guided.6threads" * doctest::timeout(300)) { + reduce_by_index_sum>(6); +} + +TEST_CASE("ReduceByIndexSum.Guided.7threads" * doctest::timeout(300)) { + reduce_by_index_sum>(7); +} + +TEST_CASE("ReduceByIndexSum.Guided.8threads" * doctest::timeout(300)) { + reduce_by_index_sum>(8); +} + +// dynamic +TEST_CASE("ReduceByIndexSum.Dynamic.1thread" * doctest::timeout(300)) { + reduce_by_index_sum>(1); +} + +TEST_CASE("ReduceByIndexSum.Dynamic.2threads" * doctest::timeout(300)) { + reduce_by_index_sum>(2); +} + +TEST_CASE("ReduceByIndexSum.Dynamic.3threads" * doctest::timeout(300)) { + reduce_by_index_sum>(3); +} + +TEST_CASE("ReduceByIndexSum.Dynamic.4threads" * doctest::timeout(300)) { + reduce_by_index_sum>(4); +} + +TEST_CASE("ReduceByIndexSum.Dynamic.5threads" * doctest::timeout(300)) { + reduce_by_index_sum>(5); +} + +TEST_CASE("ReduceByIndexSum.Dynamic.6threads" * doctest::timeout(300)) { + reduce_by_index_sum>(6); +} + +TEST_CASE("ReduceByIndexSum.Dynamic.7threads" * doctest::timeout(300)) { + reduce_by_index_sum>(7); +} + +TEST_CASE("ReduceByIndexSum.Dynamic.8threads" * doctest::timeout(300)) { + reduce_by_index_sum>(8); +} + +// static +TEST_CASE("ReduceByIndexSum.Static.1thread" * doctest::timeout(300)) { + reduce_by_index_sum>(1); +} + +TEST_CASE("ReduceByIndexSum.Static.2threads" * doctest::timeout(300)) { + reduce_by_index_sum>(2); +} + +TEST_CASE("ReduceByIndexSum.Static.3threads" * doctest::timeout(300)) { + reduce_by_index_sum>(3); } -TEST_CASE("ReduceSum.Random.10threads" * doctest::timeout(300)) { - reduce_sum>(10); +TEST_CASE("ReduceByIndexSum.Static.4threads" * doctest::timeout(300)) { + reduce_by_index_sum>(4); } -TEST_CASE("ReduceSum.Random.11threads" * doctest::timeout(300)) { - reduce_sum>(11); +TEST_CASE("ReduceByIndexSum.Static.5threads" * doctest::timeout(300)) { + reduce_by_index_sum>(5); } -TEST_CASE("ReduceSum.Random.12threads" * doctest::timeout(300)) { - reduce_sum>(12); +TEST_CASE("ReduceByIndexSum.Static.6threads" * doctest::timeout(300)) { + reduce_by_index_sum>(6); } +TEST_CASE("ReduceByIndexSum.Static.7threads" * doctest::timeout(300)) { + reduce_by_index_sum>(7); +} + +TEST_CASE("ReduceByIndexSum.Static.8threads" * doctest::timeout(300)) { + reduce_by_index_sum>(8); +} + +// random +TEST_CASE("ReduceByIndexSum.Random.1thread" * doctest::timeout(300)) { + reduce_by_index_sum>(1); +} + +TEST_CASE("ReduceByIndexSum.Random.2threads" * doctest::timeout(300)) { + reduce_by_index_sum>(2); +} + 
+TEST_CASE("ReduceByIndexSum.Random.3threads" * doctest::timeout(300)) { + reduce_by_index_sum>(3); +} + +TEST_CASE("ReduceByIndexSum.Random.4threads" * doctest::timeout(300)) { + reduce_by_index_sum>(4); +} + +TEST_CASE("ReduceByIndexSum.Random.5threads" * doctest::timeout(300)) { + reduce_by_index_sum>(5); +} + +TEST_CASE("ReduceByIndexSum.Random.6threads" * doctest::timeout(300)) { + reduce_by_index_sum>(6); +} + +TEST_CASE("ReduceByIndexSum.Random.7threads" * doctest::timeout(300)) { + reduce_by_index_sum>(7); +} + +TEST_CASE("ReduceByIndexSum.Random.8threads" * doctest::timeout(300)) { + reduce_by_index_sum>(8); +} // ---------------------------------------------------------------------------- // transform_reduce @@ -619,22 +678,6 @@ TEST_CASE("TransformReduce.Guided.8threads" * doctest::timeout(300)) { transform_reduce>(8); } -TEST_CASE("TransformReduce.Guided.9threads" * doctest::timeout(300)) { - transform_reduce>(9); -} - -TEST_CASE("TransformReduce.Guided.10threads" * doctest::timeout(300)) { - transform_reduce>(10); -} - -TEST_CASE("TransformReduce.Guided.11threads" * doctest::timeout(300)) { - transform_reduce>(11); -} - -TEST_CASE("TransformReduce.Guided.12threads" * doctest::timeout(300)) { - transform_reduce>(12); -} - // dynamic TEST_CASE("TransformReduce.Dynamic.1thread" * doctest::timeout(300)) { transform_reduce>(1); @@ -668,22 +711,6 @@ TEST_CASE("TransformReduce.Dynamic.8threads" * doctest::timeout(300)) { transform_reduce>(8); } -TEST_CASE("TransformReduce.Dynamic.9threads" * doctest::timeout(300)) { - transform_reduce>(9); -} - -TEST_CASE("TransformReduce.Dynamic.10threads" * doctest::timeout(300)) { - transform_reduce>(10); -} - -TEST_CASE("TransformReduce.Dynamic.11threads" * doctest::timeout(300)) { - transform_reduce>(11); -} - -TEST_CASE("TransformReduce.Dynamic.12threads" * doctest::timeout(300)) { - transform_reduce>(12); -} - // static TEST_CASE("TransformReduce.Static.1thread" * doctest::timeout(300)) { transform_reduce>(1); @@ -717,22 +744,6 @@ TEST_CASE("TransformReduce.Static.8threads" * doctest::timeout(300)) { transform_reduce>(8); } -TEST_CASE("TransformReduce.Static.9threads" * doctest::timeout(300)) { - transform_reduce>(9); -} - -TEST_CASE("TransformReduce.Static.10threads" * doctest::timeout(300)) { - transform_reduce>(10); -} - -TEST_CASE("TransformReduce.Static.11threads" * doctest::timeout(300)) { - transform_reduce>(11); -} - -TEST_CASE("TransformReduce.Static.12threads" * doctest::timeout(300)) { - transform_reduce>(12); -} - // random TEST_CASE("TransformReduce.Random.1thread" * doctest::timeout(300)) { transform_reduce>(1); @@ -766,22 +777,6 @@ TEST_CASE("TransformReduce.Random.8threads" * doctest::timeout(300)) { transform_reduce>(8); } -TEST_CASE("TransformReduce.Random.9threads" * doctest::timeout(300)) { - transform_reduce>(9); -} - -TEST_CASE("TransformReduce.Random.10threads" * doctest::timeout(300)) { - transform_reduce>(10); -} - -TEST_CASE("TransformReduce.Random.11threads" * doctest::timeout(300)) { - transform_reduce>(11); -} - -TEST_CASE("TransformReduce.Random.12threads" * doctest::timeout(300)) { - transform_reduce>(12); -} - // ---------------------------------------------------------------------------- // Transform & Reduce on Movable Data // ---------------------------------------------------------------------------- @@ -1032,22 +1027,6 @@ TEST_CASE("TransformReduceSum.Guided.8threads" * doctest::timeout(300)) { transform_reduce_sum>(8); } -TEST_CASE("TransformReduceSum.Guided.9threads" * doctest::timeout(300)) { - 
transform_reduce_sum>(9); -} - -TEST_CASE("TransformReduceSum.Guided.10threads" * doctest::timeout(300)) { - transform_reduce_sum>(10); -} - -TEST_CASE("TransformReduceSum.Guided.11threads" * doctest::timeout(300)) { - transform_reduce_sum>(11); -} - -TEST_CASE("TransformReduceSum.Guided.12threads" * doctest::timeout(300)) { - transform_reduce_sum>(12); -} - // dynamic TEST_CASE("TransformReduceSum.Dynamic.1thread" * doctest::timeout(300)) { transform_reduce_sum>(1); @@ -1081,22 +1060,6 @@ TEST_CASE("TransformReduceSum.Dynamic.8threads" * doctest::timeout(300)) { transform_reduce_sum>(8); } -TEST_CASE("TransformReduceSum.Dynamic.9threads" * doctest::timeout(300)) { - transform_reduce_sum>(9); -} - -TEST_CASE("TransformReduceSum.Dynamic.10threads" * doctest::timeout(300)) { - transform_reduce_sum>(10); -} - -TEST_CASE("TransformReduceSum.Dynamic.11threads" * doctest::timeout(300)) { - transform_reduce_sum>(11); -} - -TEST_CASE("TransformReduceSum.Dynamic.12threads" * doctest::timeout(300)) { - transform_reduce_sum>(12); -} - // static TEST_CASE("TransformReduceSum.Static.1thread" * doctest::timeout(300)) { transform_reduce_sum>(1); @@ -1130,22 +1093,6 @@ TEST_CASE("TransformReduceSum.Static.8threads" * doctest::timeout(300)) { transform_reduce_sum>(8); } -TEST_CASE("TransformReduceSum.Static.9threads" * doctest::timeout(300)) { - transform_reduce_sum>(9); -} - -TEST_CASE("TransformReduceSum.Static.10threads" * doctest::timeout(300)) { - transform_reduce_sum>(10); -} - -TEST_CASE("TransformReduceSum.Static.11threads" * doctest::timeout(300)) { - transform_reduce_sum>(11); -} - -TEST_CASE("TransformReduceSum.Static.12threads" * doctest::timeout(300)) { - transform_reduce_sum>(12); -} - // random TEST_CASE("TransformReduceSum.Random.1thread" * doctest::timeout(300)) { transform_reduce_sum>(1); @@ -1179,22 +1126,6 @@ TEST_CASE("TransformReduceSum.Random.8threads" * doctest::timeout(300)) { transform_reduce_sum>(8); } -TEST_CASE("TransformReduceSum.Random.9threads" * doctest::timeout(300)) { - transform_reduce_sum>(9); -} - -TEST_CASE("TransformReduceSum.Random.10threads" * doctest::timeout(300)) { - transform_reduce_sum>(10); -} - -TEST_CASE("TransformReduceSum.Random.11threads" * doctest::timeout(300)) { - transform_reduce_sum>(11); -} - -TEST_CASE("TransformReduceSum.Random.12threads" * doctest::timeout(300)) { - transform_reduce_sum>(12); -} - // ---------------------------------------------------------------------------- // binary_transform_reduce // ---------------------------------------------------------------------------- @@ -1254,196 +1185,133 @@ TEST_CASE("BinaryTransformReduce.Guided.1thread" * doctest::timeout(300)) { binary_transform_reduce>(1); } -TEST_CASE("BinaryTransformReduce.Guided.2thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Guided.2threads" * doctest::timeout(300)) { binary_transform_reduce>(2); } -TEST_CASE("BinaryTransformReduce.Guided.3thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Guided.3threads" * doctest::timeout(300)) { binary_transform_reduce>(3); } -TEST_CASE("BinaryTransformReduce.Guided.4thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Guided.4threads" * doctest::timeout(300)) { binary_transform_reduce>(4); } -TEST_CASE("BinaryTransformReduce.Guided.5thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Guided.5threads" * doctest::timeout(300)) { binary_transform_reduce>(5); } -TEST_CASE("BinaryTransformReduce.Guided.6thread" * doctest::timeout(300)) { 
+TEST_CASE("BinaryTransformReduce.Guided.6threads" * doctest::timeout(300)) { binary_transform_reduce>(6); } -TEST_CASE("BinaryTransformReduce.Guided.7thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Guided.7threads" * doctest::timeout(300)) { binary_transform_reduce>(7); } -TEST_CASE("BinaryTransformReduce.Guided.8thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Guided.8threads" * doctest::timeout(300)) { binary_transform_reduce>(8); } -TEST_CASE("BinaryTransformReduce.Guided.9thread" * doctest::timeout(300)) { - binary_transform_reduce>(9); -} - -TEST_CASE("BinaryTransformReduce.Guided.10thread" * doctest::timeout(300)) { - binary_transform_reduce>(10); -} - -TEST_CASE("BinaryTransformReduce.Guided.11thread" * doctest::timeout(300)) { - binary_transform_reduce>(11); -} - -TEST_CASE("BinaryTransformReduce.Guided.12thread" * doctest::timeout(300)) { - binary_transform_reduce>(12); -} - // dynamic TEST_CASE("BinaryTransformReduce.Dynamic.1thread" * doctest::timeout(300)) { binary_transform_reduce>(1); } -TEST_CASE("BinaryTransformReduce.Dynamic.2thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Dynamic.2threads" * doctest::timeout(300)) { binary_transform_reduce>(2); } -TEST_CASE("BinaryTransformReduce.Dynamic.3thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Dynamic.3threads" * doctest::timeout(300)) { binary_transform_reduce>(3); } -TEST_CASE("BinaryTransformReduce.Dynamic.4thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Dynamic.4threads" * doctest::timeout(300)) { binary_transform_reduce>(4); } -TEST_CASE("BinaryTransformReduce.Dynamic.5thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Dynamic.5threads" * doctest::timeout(300)) { binary_transform_reduce>(5); } -TEST_CASE("BinaryTransformReduce.Dynamic.6thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Dynamic.6threads" * doctest::timeout(300)) { binary_transform_reduce>(6); } -TEST_CASE("BinaryTransformReduce.Dynamic.7thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Dynamic.7threads" * doctest::timeout(300)) { binary_transform_reduce>(7); } -TEST_CASE("BinaryTransformReduce.Dynamic.8thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Dynamic.8threads" * doctest::timeout(300)) { binary_transform_reduce>(8); } -TEST_CASE("BinaryTransformReduce.Dynamic.9thread" * doctest::timeout(300)) { - binary_transform_reduce>(9); -} - -TEST_CASE("BinaryTransformReduce.Dynamic.10thread" * doctest::timeout(300)) { - binary_transform_reduce>(10); -} - -TEST_CASE("BinaryTransformReduce.Dynamic.11thread" * doctest::timeout(300)) { - binary_transform_reduce>(11); -} - -TEST_CASE("BinaryTransformReduce.Dynamic.12thread" * doctest::timeout(300)) { - binary_transform_reduce>(12); -} - // static TEST_CASE("BinaryTransformReduce.Static.1thread" * doctest::timeout(300)) { binary_transform_reduce>(1); } -TEST_CASE("BinaryTransformReduce.Static.2thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Static.2threads" * doctest::timeout(300)) { binary_transform_reduce>(2); } -TEST_CASE("BinaryTransformReduce.Static.3thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Static.3threads" * doctest::timeout(300)) { binary_transform_reduce>(3); } -TEST_CASE("BinaryTransformReduce.Static.4thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Static.4threads" * doctest::timeout(300)) { binary_transform_reduce>(4); } 
-TEST_CASE("BinaryTransformReduce.Static.5thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Static.5threads" * doctest::timeout(300)) { binary_transform_reduce>(5); } -TEST_CASE("BinaryTransformReduce.Static.6thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Static.6threads" * doctest::timeout(300)) { binary_transform_reduce>(6); } -TEST_CASE("BinaryTransformReduce.Static.7thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Static.7threads" * doctest::timeout(300)) { binary_transform_reduce>(7); } -TEST_CASE("BinaryTransformReduce.Static.8thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Static.8threads" * doctest::timeout(300)) { binary_transform_reduce>(8); } -TEST_CASE("BinaryTransformReduce.Static.9thread" * doctest::timeout(300)) { - binary_transform_reduce>(9); -} - -TEST_CASE("BinaryTransformReduce.Static.10thread" * doctest::timeout(300)) { - binary_transform_reduce>(10); -} - -TEST_CASE("BinaryTransformReduce.Static.11thread" * doctest::timeout(300)) { - binary_transform_reduce>(11); -} - -TEST_CASE("BinaryTransformReduce.Static.12thread" * doctest::timeout(300)) { - binary_transform_reduce>(12); -} - // random TEST_CASE("BinaryTransformReduce.Random.1thread" * doctest::timeout(300)) { binary_transform_reduce>(1); } -TEST_CASE("BinaryTransformReduce.Random.2thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Random.2threads" * doctest::timeout(300)) { binary_transform_reduce>(2); } -TEST_CASE("BinaryTransformReduce.Random.3thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Random.3threads" * doctest::timeout(300)) { binary_transform_reduce>(3); } -TEST_CASE("BinaryTransformReduce.Random.4thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Random.4threads" * doctest::timeout(300)) { binary_transform_reduce>(4); } -TEST_CASE("BinaryTransformReduce.Random.5thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Random.5threads" * doctest::timeout(300)) { binary_transform_reduce>(5); } -TEST_CASE("BinaryTransformReduce.Random.6thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Random.6threads" * doctest::timeout(300)) { binary_transform_reduce>(6); } -TEST_CASE("BinaryTransformReduce.Random.7thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Random.7threads" * doctest::timeout(300)) { binary_transform_reduce>(7); } -TEST_CASE("BinaryTransformReduce.Random.8thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduce.Random.8threads" * doctest::timeout(300)) { binary_transform_reduce>(8); } -TEST_CASE("BinaryTransformReduce.Random.9thread" * doctest::timeout(300)) { - binary_transform_reduce>(9); -} - -TEST_CASE("BinaryTransformReduce.Random.10thread" * doctest::timeout(300)) { - binary_transform_reduce>(10); -} - -TEST_CASE("BinaryTransformReduce.Random.11thread" * doctest::timeout(300)) { - binary_transform_reduce>(11); -} - -TEST_CASE("BinaryTransformReduce.Random.12thread" * doctest::timeout(300)) { - binary_transform_reduce>(12); -} // ---------------------------------------------------------------------------- // binary_transform_reduce_sum // ---------------------------------------------------------------------------- @@ -1502,197 +1370,133 @@ TEST_CASE("BinaryTransformReduceSum.Guided.1thread" * doctest::timeout(300)) { binary_transform_reduce_sum>(1); } -TEST_CASE("BinaryTransformReduceSum.Guided.2thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Guided.2threads" * 
doctest::timeout(300)) { binary_transform_reduce_sum>(2); } -TEST_CASE("BinaryTransformReduceSum.Guided.3thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Guided.3threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(3); } -TEST_CASE("BinaryTransformReduceSum.Guided.4thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Guided.4threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(4); } -TEST_CASE("BinaryTransformReduceSum.Guided.5thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Guided.5threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(5); } -TEST_CASE("BinaryTransformReduceSum.Guided.6thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Guided.6threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(6); } -TEST_CASE("BinaryTransformReduceSum.Guided.7thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Guided.7threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(7); } -TEST_CASE("BinaryTransformReduceSum.Guided.8thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Guided.8threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(8); } -TEST_CASE("BinaryTransformReduceSum.Guided.9thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(9); -} - -TEST_CASE("BinaryTransformReduceSum.Guided.10thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(10); -} - -TEST_CASE("BinaryTransformReduceSum.Guided.11thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(11); -} - -TEST_CASE("BinaryTransformReduceSum.Guided.12thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(12); -} - // dynamic TEST_CASE("BinaryTransformReduceSum.Dynamic.1thread" * doctest::timeout(300)) { binary_transform_reduce_sum>(1); } -TEST_CASE("BinaryTransformReduceSum.Dynamic.2thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Dynamic.2threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(2); } -TEST_CASE("BinaryTransformReduceSum.Dynamic.3thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Dynamic.3threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(3); } -TEST_CASE("BinaryTransformReduceSum.Dynamic.4thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Dynamic.4threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(4); } -TEST_CASE("BinaryTransformReduceSum.Dynamic.5thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Dynamic.5threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(5); } -TEST_CASE("BinaryTransformReduceSum.Dynamic.6thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Dynamic.6threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(6); } -TEST_CASE("BinaryTransformReduceSum.Dynamic.7thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Dynamic.7threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(7); } -TEST_CASE("BinaryTransformReduceSum.Dynamic.8thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Dynamic.8threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(8); } -TEST_CASE("BinaryTransformReduceSum.Dynamic.9thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(9); -} - -TEST_CASE("BinaryTransformReduceSum.Dynamic.10thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(10); -} - -TEST_CASE("BinaryTransformReduceSum.Dynamic.11thread" * 
doctest::timeout(300)) { - binary_transform_reduce_sum>(11); -} - -TEST_CASE("BinaryTransformReduceSum.Dynamic.12thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(12); -} - // static TEST_CASE("BinaryTransformReduceSum.Static.1thread" * doctest::timeout(300)) { binary_transform_reduce_sum>(1); } -TEST_CASE("BinaryTransformReduceSum.Static.2thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Static.2threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(2); } -TEST_CASE("BinaryTransformReduceSum.Static.3thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Static.3threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(3); } -TEST_CASE("BinaryTransformReduceSum.Static.4thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Static.4threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(4); } -TEST_CASE("BinaryTransformReduceSum.Static.5thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Static.5threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(5); } -TEST_CASE("BinaryTransformReduceSum.Static.6thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Static.6threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(6); } -TEST_CASE("BinaryTransformReduceSum.Static.7thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Static.7threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(7); } -TEST_CASE("BinaryTransformReduceSum.Static.8thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Static.8threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(8); } -TEST_CASE("BinaryTransformReduceSum.Static.9thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(9); -} - -TEST_CASE("BinaryTransformReduceSum.Static.10thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(10); -} - -TEST_CASE("BinaryTransformReduceSum.Static.11thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(11); -} - -TEST_CASE("BinaryTransformReduceSum.Static.12thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(12); -} - // random TEST_CASE("BinaryTransformReduceSum.Random.1thread" * doctest::timeout(300)) { binary_transform_reduce_sum>(1); } -TEST_CASE("BinaryTransformReduceSum.Random.2thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Random.2threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(2); } -TEST_CASE("BinaryTransformReduceSum.Random.3thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Random.3threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(3); } -TEST_CASE("BinaryTransformReduceSum.Random.4thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Random.4threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(4); } -TEST_CASE("BinaryTransformReduceSum.Random.5thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Random.5threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(5); } -TEST_CASE("BinaryTransformReduceSum.Random.6thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Random.6threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(6); } -TEST_CASE("BinaryTransformReduceSum.Random.7thread" * doctest::timeout(300)) { +TEST_CASE("BinaryTransformReduceSum.Random.7threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(7); } -TEST_CASE("BinaryTransformReduceSum.Random.8thread" * doctest::timeout(300)) { 
+TEST_CASE("BinaryTransformReduceSum.Random.8threads" * doctest::timeout(300)) { binary_transform_reduce_sum>(8); } -TEST_CASE("BinaryTransformReduceSum.Random.9thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(9); -} - -TEST_CASE("BinaryTransformReduceSum.Random.10thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(10); -} - -TEST_CASE("BinaryTransformReduceSum.Random.11thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(11); -} - -TEST_CASE("BinaryTransformReduceSum.Random.12thread" * doctest::timeout(300)) { - binary_transform_reduce_sum>(12); -} - // ---------------------------------------------------------------------------- // Closure Wrapper // ---------------------------------------------------------------------------- @@ -1872,3 +1676,141 @@ TEST_CASE("ClosureWrapper.TransformReduce2.Dynamic" * doctest::timeout(300)) { } } } + +// -------------------------------------------------------- +// Silent Async Reduce +// -------------------------------------------------------- + +void silent_async(unsigned W) { + + tf::Executor executor(W); + + std::vector vec(1000); + + for(auto& i : vec) i = ::rand() % 100 - 50; + + for(size_t n=1; n::max(); + int pmin = std::numeric_limits::max(); + + auto beg = vec.begin(); + auto end = vec.end(); + + for(auto itr = beg; itr != end; itr++) { + smin = std::min(*itr, smin); + } + + executor.silent_async(tf::make_reduce_task( + beg, end, pmin, [](int& l, int& r){ + return std::min(l, r); + })); + + executor.wait_for_all(); + + REQUIRE(smin != std::numeric_limits::max()); + REQUIRE(pmin != std::numeric_limits::max()); + REQUIRE(smin == pmin); + } +} + +TEST_CASE("Reduce.SilentAsync.1thread" * doctest::timeout(300)) { + silent_async(1); +} + +TEST_CASE("Reduce.SilentAsync.2threads" * doctest::timeout(300)) { + silent_async(2); +} + +TEST_CASE("Reduce.SilentAsync.3threads" * doctest::timeout(300)) { + silent_async(3); +} + +TEST_CASE("Reduce.SilentAsync.4threads" * doctest::timeout(300)) { + silent_async(4); +} + +TEST_CASE("Reduce.SilentAsync.5threads" * doctest::timeout(300)) { + silent_async(5); +} + +TEST_CASE("Reduce.SilentAsync.6threads" * doctest::timeout(300)) { + silent_async(6); +} + +TEST_CASE("Reduce.SilentAsync.7threads" * doctest::timeout(300)) { + silent_async(7); +} + +TEST_CASE("Reduce.SilentAsync.8threads" * doctest::timeout(300)) { + silent_async(8); +} + +// -------------------------------------------------------- +// Silent Dependent Async Reduce +// -------------------------------------------------------- + +void silent_dependent_async(unsigned W) { + + tf::Executor executor(W); + + std::vector vec(1000); + + for(auto& i : vec) i = ::rand() % 100 - 50; + + for(size_t n=1; n::max(); + int pmin = std::numeric_limits::max(); + + auto beg = vec.begin(); + auto end = vec.end(); + + for(auto itr = beg; itr != end; itr++) { + smin = std::min(*itr, smin); + } + + executor.silent_dependent_async(tf::make_reduce_task( + beg, end, pmin, [](int& l, int& r){ + return std::min(l, r); + })); + + executor.wait_for_all(); + + REQUIRE(smin != std::numeric_limits::max()); + REQUIRE(pmin != std::numeric_limits::max()); + REQUIRE(smin == pmin); + } +} + +TEST_CASE("Reduce.SilentDependentAsync.1thread" * doctest::timeout(300)) { + silent_dependent_async(1); +} + +TEST_CASE("Reduce.SilentDependentAsync.2threads" * doctest::timeout(300)) { + silent_dependent_async(2); +} + +TEST_CASE("Reduce.SilentDependentAsync.3threads" * doctest::timeout(300)) { + silent_dependent_async(3); +} + 
+TEST_CASE("Reduce.SilentDependentAsync.4threads" * doctest::timeout(300)) { + silent_dependent_async(4); +} + +TEST_CASE("Reduce.SilentDependentAsync.5threads" * doctest::timeout(300)) { + silent_dependent_async(5); +} + +TEST_CASE("Reduce.SilentDependentAsync.6threads" * doctest::timeout(300)) { + silent_dependent_async(6); +} + +TEST_CASE("Reduce.SilentDependentAsync.7threads" * doctest::timeout(300)) { + silent_dependent_async(7); +} + +TEST_CASE("Reduce.SilentDependentAsync.8threads" * doctest::timeout(300)) { + silent_dependent_async(8); +} diff --git a/unittests/test_runtimes.cpp b/unittests/test_runtimes.cpp index f68277aee..63781d7b1 100644 --- a/unittests/test_runtimes.cpp +++ b/unittests/test_runtimes.cpp @@ -83,3 +83,129 @@ TEST_CASE("Runtime.ExternalGraph.Simple" * doctest::timeout(300)) { } + +// -------------------------------------------------------------------------------------- +// Fibonacci +// -------------------------------------------------------------------------------------- + +size_t fibonacci(size_t N, tf::Runtime& rt) { + + if (N < 2) { + return N; + } + + size_t res1, res2; + + rt.silent_async([N, &res1](tf::Runtime& rt1){ res1 = fibonacci(N-1, rt1); }); + + // tail optimization + res2 = fibonacci(N-2, rt); + + // use corun to avoid blocking the worker from waiting the two children tasks to finish + rt.corun(); + + return res1 + res2; +} + +size_t fibonacci(size_t T, size_t N) { + tf::Executor executor(T); + size_t res; + executor.async([N, &res](tf::Runtime& rt){ res = fibonacci(N, rt); }).get(); + return res; +} + +TEST_CASE("Runtime.Fibonacci.1thread" * doctest::timeout(250)) { + REQUIRE(fibonacci(1, 25) == 75025); +} + +TEST_CASE("Runtime.Fibonacci.2threads" * doctest::timeout(250)) { + REQUIRE(fibonacci(2, 25) == 75025); +} + +TEST_CASE("Runtime.Fibonacci.3threads" * doctest::timeout(250)) { + REQUIRE(fibonacci(3, 25) == 75025); +} + +TEST_CASE("Runtime.Fibonacci.4threads" * doctest::timeout(250)) { + REQUIRE(fibonacci(4, 25) == 75025); +} + +// -------------------------------------------------------------------------------------- +// Fibonacci +// -------------------------------------------------------------------------------------- + +size_t fibonacci_swapped(size_t N, tf::Runtime& rt) { + + if (N < 2) { + return N; + } + + size_t res1, res2; + + // tail optimization + res1 = fibonacci_swapped(N-1, rt); + + rt.silent_async([N, &res2](tf::Runtime& rt2){ res2 = fibonacci_swapped(N-2, rt2); }); + + // use corun to avoid blocking the worker from waiting the two children tasks to finish + rt.corun(); + + return res1 + res2; +} + +size_t fibonacci_swapped(size_t T, size_t N) { + tf::Executor executor(T); + size_t res; + executor.async([N, &res](tf::Runtime& rt){ res = fibonacci_swapped(N, rt); }).get(); + return res; +} + +TEST_CASE("Runtime.Fibonacci.1thread" * doctest::timeout(250)) { + REQUIRE(fibonacci_swapped(1, 25) == 75025); +} + +TEST_CASE("Runtime.Fibonacci.2threads" * doctest::timeout(250)) { + REQUIRE(fibonacci_swapped(2, 25) == 75025); +} + +TEST_CASE("Runtime.Fibonacci.3threads" * doctest::timeout(250)) { + REQUIRE(fibonacci_swapped(3, 25) == 75025); +} + +TEST_CASE("Runtime.Fibonacci.4threads" * doctest::timeout(250)) { + REQUIRE(fibonacci_swapped(4, 25) == 75025); +} + +// -------------------------------------------------------- +// Testcase: Runtime.Cancel +// -------------------------------------------------------- + +TEST_CASE("Runtime.Cancel" * doctest::timeout(300)) { + + std::atomic reached(false); + std::atomic cancelled(false); + + 
tf::Executor executor; + tf::Taskflow taskflow; + taskflow.emplace([&](tf::Runtime &rt) { + reached = true; + while (!cancelled) { + std::this_thread::yield(); + if (rt.is_cancelled()) { + cancelled = true; + break; + } + } + }); + + auto future = executor.run(std::move(taskflow)); + + // Need to wait until we run the runtime task or the cancel may immediately + // cancel the entire taskflow before the runtime task starts. + while(!reached); + future.cancel(); + future.get(); + + REQUIRE(cancelled == true); +} + diff --git a/unittests/test_scalable_pipelines.cpp b/unittests/test_scalable_pipelines.cpp index e412405f7..fb8fb1cd7 100644 --- a/unittests/test_scalable_pipelines.cpp +++ b/unittests/test_scalable_pipelines.cpp @@ -64,7 +64,7 @@ void scalable_pipeline(size_t num_lines, size_t num_pipes) { size_t N = 0; std::vector< tf::Pipe> > pipes; - std::vector< int > data(num_lines, -1); + std::vector< size_t > data(num_lines, 0); for(size_t i=0; i> > pipes; - std::vector< int > data(num_lines, -1); + std::vector< size_t > data(num_lines, 0); tf::ScalablePipeline spl(num_lines); @@ -178,7 +178,7 @@ void scalable_pipeline_iterative_reset(size_t num_lines, size_t num_pipes) { size_t N = 0; std::vector< tf::Pipe> > pipes; - std::vector< int > data(num_lines, -1); + std::vector< size_t > data(num_lines, 0); tf::ScalablePipeline spl(num_lines); @@ -249,7 +249,7 @@ void scalable_pipeline_lines_reset(size_t num_lines, size_t num_pipes) { for(size_t l = 1; l <= num_lines; ++l) { tf::Taskflow taskflow; - std::vector data(l, -1); + std::vector data(l, 0); auto init = taskflow.emplace([&](){ for(size_t i=0; i semaphores(10); + + for(auto& sema : semaphores) { + REQUIRE(sema.value() == 0); + REQUIRE(sema.max_value() == 0); + sema.reset(2); + REQUIRE(sema.value() == 2); + REQUIRE(sema.max_value() == 2); + } + + size_t N = 1024; + size_t counter {0}; + + for(size_t i=0; i> counters(L); + std::vector semaphores(L); + + for(auto& semaphore : semaphores) { + semaphore.reset(1); + } + + tf::Executor executor(W); + tf::Taskflow taskflow; + + for(size_t i=0; i(1, 100000); } -TEST_CASE("ParallelSort.int.2.100000") { +TEST_CASE("ParallelSort.int.2.100000" * doctest::timeout(300)) { ps_pod(2, 100000); } -TEST_CASE("ParallelSort.int.3.100000") { +TEST_CASE("ParallelSort.int.3.100000" * doctest::timeout(300)) { ps_pod(3, 100000); } -TEST_CASE("ParallelSort.int.4.100000") { +TEST_CASE("ParallelSort.int.4.100000" * doctest::timeout(300)) { ps_pod(4, 100000); } -TEST_CASE("ParallelSort.ldouble.1.100000") { +TEST_CASE("ParallelSort.ldouble.1.100000" * doctest::timeout(300)) { ps_pod(1, 100000); } -TEST_CASE("ParallelSort.ldouble.2.100000") { +TEST_CASE("ParallelSort.ldouble.2.100000" * doctest::timeout(300)) { ps_pod(2, 100000); } -TEST_CASE("ParallelSort.ldouble.3.100000") { +TEST_CASE("ParallelSort.ldouble.3.100000" * doctest::timeout(300)) { ps_pod(3, 100000); } -TEST_CASE("ParallelSort.ldouble.4.100000") { +TEST_CASE("ParallelSort.ldouble.4.100000" * doctest::timeout(300)) { ps_pod(4, 100000); } @@ -118,19 +118,19 @@ void ps_object(size_t W, size_t N) { )); } -TEST_CASE("ParallelSort.object.1.100000") { +TEST_CASE("ParallelSort.object.1.100000" * doctest::timeout(300)) { ps_object(1, 100000); } -TEST_CASE("ParallelSort.object.2.100000") { +TEST_CASE("ParallelSort.object.2.100000" * doctest::timeout(300)) { ps_object(2, 100000); } -TEST_CASE("ParallelSort.object.3.100000") { +TEST_CASE("ParallelSort.object.3.100000" * doctest::timeout(300)) { ps_object(3, 100000); } -TEST_CASE("ParallelSort.object.4.100000") { 
+TEST_CASE("ParallelSort.object.4.100000" * doctest::timeout(300)) { ps_object(4, 100000); } @@ -158,93 +158,291 @@ void move_only_ps(unsigned W) { } -TEST_CASE("ParallelSort.MoveOnlyObject.1thread") { +TEST_CASE("ParallelSort.MoveOnlyObject.1thread" * doctest::timeout(300)) { move_only_ps(1); } -TEST_CASE("ParallelSort.MoveOnlyObject.2threads") { +TEST_CASE("ParallelSort.MoveOnlyObject.2threads" * doctest::timeout(300)) { move_only_ps(2); } -TEST_CASE("ParallelSort.MoveOnlyObject.3threads") { +TEST_CASE("ParallelSort.MoveOnlyObject.3threads" * doctest::timeout(300)) { move_only_ps(3); } -TEST_CASE("ParallelSort.MoveOnlyObject.4threads") { +TEST_CASE("ParallelSort.MoveOnlyObject.4threads" * doctest::timeout(300)) { move_only_ps(4); } +// ---------------------------------------------------------------------------- +// Parallel Sort with Async Tasks +// ---------------------------------------------------------------------------- + +void async(size_t W) { + + std::srand(static_cast(time(NULL))); + + tf::Executor executor(W); + std::vector data; + + for(size_t n=0; n < 100000; n = (n ? n*10 : 1)) { + + data.resize(n); + + for(auto& d : data) { + d = ::rand() % 1000 - 500; + } + + executor.async(tf::make_sort_task(data.begin(), data.end())); + executor.wait_for_all(); + REQUIRE(std::is_sorted(data.begin(), data.end())); + } +} + +TEST_CASE("ParallelSort.Async.1thread" * doctest::timeout(300)) { + async(1); +} + +TEST_CASE("ParallelSort.Async.2threads" * doctest::timeout(300)) { + async(2); +} + +TEST_CASE("ParallelSort.Async.3threads" * doctest::timeout(300)) { + async(3); +} + +TEST_CASE("ParallelSort.Async.4threads" * doctest::timeout(300)) { + async(4); +} + +// ---------------------------------------------------------------------------- +// Parallel Sort with Dependent Async Tasks +// ---------------------------------------------------------------------------- + +void dependent_async(size_t W) { + + std::srand(static_cast(time(NULL))); + + tf::Executor executor(W); + std::vector data; + + for(size_t n=0; n < 100000; n = (n ? n*10 : 1)) { + + data.resize(n); + + for(auto& d : data) { + d = ::rand() % 1000 - 500; + } + + executor.dependent_async(tf::make_sort_task(data.begin(), data.end())); + executor.wait_for_all(); + REQUIRE(std::is_sorted(data.begin(), data.end())); + } +} + +TEST_CASE("ParallelSort.DependentAsync.1thread" * doctest::timeout(300)) { + dependent_async(1); +} + +TEST_CASE("ParallelSort.DependentAsync.2threads" * doctest::timeout(300)) { + dependent_async(2); +} + +TEST_CASE("ParallelSort.DependentAsync.3threads" * doctest::timeout(300)) { + dependent_async(3); +} + +TEST_CASE("ParallelSort.DependentAsync.4threads" * doctest::timeout(300)) { + dependent_async(4); +} + +// ---------------------------------------------------------------------------- +// Parallel Sort with Silent Async Tasks +// ---------------------------------------------------------------------------- + +void silent_async(size_t W) { + + std::srand(static_cast(time(NULL))); + + tf::Executor executor(W); + std::vector data; + + for(size_t n=0; n < 100000; n = (n ? 
n*10 : 1)) { + + data.resize(n); + + for(auto& d : data) { + d = ::rand() % 1000 - 500; + } + + executor.silent_async(tf::make_sort_task(data.begin(), data.end())); + executor.wait_for_all(); + REQUIRE(std::is_sorted(data.begin(), data.end())); + } +} + +TEST_CASE("ParallelSort.SilentAsync.1thread" * doctest::timeout(300)) { + silent_async(1); +} + +TEST_CASE("ParallelSort.SilentAsync.2threads" * doctest::timeout(300)) { + silent_async(2); +} + +TEST_CASE("ParallelSort.SilentAsync.3threads" * doctest::timeout(300)) { + silent_async(3); +} + +TEST_CASE("ParallelSort.SilentAsync.4threads" * doctest::timeout(300)) { + silent_async(4); +} + +// ---------------------------------------------------------------------------- +// Parallel Sort with Silent Dependent Async Tasks +// ---------------------------------------------------------------------------- + +void silent_dependent_async(size_t W) { + + std::srand(static_cast(time(NULL))); + + tf::Executor executor(W); + std::vector data; + + for(size_t n=0; n < 100000; n = (n ? n*10 : 1)) { + + data.resize(n); + + for(auto& d : data) { + d = ::rand() % 1000 - 500; + } + + executor.silent_dependent_async(tf::make_sort_task(data.begin(), data.end())); + executor.wait_for_all(); + REQUIRE(std::is_sorted(data.begin(), data.end())); + } +} + +TEST_CASE("ParallelSort.SilentDependentAsync.1thread" * doctest::timeout(300)) { + silent_dependent_async(1); +} + +TEST_CASE("ParallelSort.SilentDependentAsync.2threads" * doctest::timeout(300)) { + silent_dependent_async(2); +} + +TEST_CASE("ParallelSort.SilentDependentAsync.3threads" * doctest::timeout(300)) { + silent_dependent_async(3); +} + +TEST_CASE("ParallelSort.SilentDependentAsync.4threads" * doctest::timeout(300)) { + silent_dependent_async(4); +} + + // -------------------------------------------------------- // Testcase: BubbleSort // -------------------------------------------------------- -TEST_CASE("BubbleSort" * doctest::timeout(300)) { - - for(unsigned w=1; w<=9; w+=2) { - - tf::Executor executor(w); - - for(int end=10; end <= 1000; end += 200) { - - tf::Taskflow taskflow("BubbleSort"); - - std::vector data(end); - - for(auto& d : data) d = ::rand()%100; - - auto gold = data; - std::sort(gold.begin(), gold.end()); - - std::atomicswapped; - - // init task - auto init = taskflow.emplace([&swapped](){ swapped = false; }); - auto cond = taskflow.emplace([&swapped](){ - if(swapped) { - swapped = false; - return 0; - } - return 1; - }); - auto stop = taskflow.emplace([](){}); - - auto even_phase = taskflow.emplace([&](tf::Subflow& sf){ - for(size_t i=0; i data[i+1]) { - std::swap(data[i], data[i+1]); - swapped = true; - } - }); - } - }); - - auto odd_phase = taskflow.emplace([&](tf::Subflow& sf) { - for(size_t i=1; i data[i+1]) { - std::swap(data[i], data[i+1]); - swapped = true; - } - }); - } - }); - - init.precede(even_phase).name("init"); - even_phase.precede(odd_phase).name("even-swap"); - odd_phase.precede(cond).name("odd-swap"); - cond.precede(even_phase, stop).name("cond"); - - executor.run(taskflow).wait(); - - REQUIRE(gold == data); - } +void bubble_sort(unsigned W) { + + tf::Executor executor(W); + tf::Taskflow taskflow; + + std::vector data; + + for(int end=1; end <= 1000; end *= 10) { + + taskflow.clear(); + data.resize(end); + + for(auto& d : data) d = ::rand()%100; + + auto gold = data; + std::sort(gold.begin(), gold.end()); + + std::atomic swapped; + + // init task + auto init = taskflow.emplace([&swapped](){ swapped = false; }); + auto cond = taskflow.emplace([&swapped](){ + if(swapped) 
{ + swapped = false; + return 0; + } + return 1; + }); + auto stop = taskflow.emplace([](){}); + + auto even_phase = taskflow.emplace([&](tf::Subflow& sf){ + for(size_t i=0; i data[i+1]) { + std::swap(data[i], data[i+1]); + swapped = true; + } + }); + } + }); + + auto odd_phase = taskflow.emplace([&](tf::Subflow& sf) { + for(size_t i=1; i data[i+1]) { + std::swap(data[i], data[i+1]); + swapped = true; + } + }); + } + }); + + init.precede(even_phase).name("init"); + even_phase.precede(odd_phase).name("even-swap"); + odd_phase.precede(cond).name("odd-swap"); + cond.precede(even_phase, stop).name("cond"); + + executor.run(taskflow).wait(); + + REQUIRE(gold == data); } } +TEST_CASE("BubbleSort.1thread" * doctest::timeout(300)) { + bubble_sort(1); +} + +TEST_CASE("BubbleSort.2threads" * doctest::timeout(300)) { + bubble_sort(2); +} + +TEST_CASE("BubbleSort.3threads" * doctest::timeout(300)) { + bubble_sort(3); +} + +TEST_CASE("BubbleSort.4threads" * doctest::timeout(300)) { + bubble_sort(4); +} + +TEST_CASE("BubbleSort.5threads" * doctest::timeout(300)) { + bubble_sort(5); +} + +TEST_CASE("BubbleSort.6threads" * doctest::timeout(300)) { + bubble_sort(6); +} + +TEST_CASE("BubbleSort.7threads" * doctest::timeout(300)) { + bubble_sort(7); +} + +TEST_CASE("BubbleSort.8threads" * doctest::timeout(300)) { + bubble_sort(8); +} + + // -------------------------------------------------------- // Testcase: SelectionSort // -------------------------------------------------------- -TEST_CASE("SelectionSort" * doctest::timeout(300)) { + +void selection_sort(unsigned W) { std::function< void(tf::Subflow& sf, std::vector&, int, int, int&) @@ -312,60 +510,91 @@ TEST_CASE("SelectionSort" * doctest::timeout(300)) { SM.succeed(SL, SR); }; - for(unsigned w=1; w<=9; w+=2) { - - tf::Executor executor(w); + tf::Executor executor(W); + tf::Taskflow taskflow; + std::vector data; - for(int end=16; end <= 512; end <<= 1) { - tf::Taskflow taskflow("SelectionSort"); + for(int end=1; end <= 256; end <<= 1) { - std::vector data(end); + taskflow.clear(); + data.resize(end); - for(auto& d : data) d = ::rand()%100; + for(auto& d : data) d = ::rand()%100; - auto gold = data; - std::sort(gold.begin(), gold.end()); + auto gold = data; + std::sort(gold.begin(), gold.end()); - int beg = 0; - int min = -1; + int beg = 0; + int min = -1; - auto start = taskflow.emplace([](){}); + auto start = taskflow.emplace([](){}); - auto argmin = taskflow.emplace( - [&spawn, &data, &beg, end, &min](tf::Subflow& sf) mutable { - spawn(sf, data, beg, end, min); - }).name(std::string("[0") - + ":" - + std::to_string(end) + ")"); + auto argmin = taskflow.emplace( + [&spawn, &data, &beg, end, &min](tf::Subflow& sf) mutable { + spawn(sf, data, beg, end, min); + }).name(std::string("[0") + + ":" + + std::to_string(end) + ")"); - auto putmin = taskflow.emplace([&](){ - std::swap(data[beg], data[min]); - //std::cout << "select " << data[beg] << '\n'; - beg++; - if(beg < end) { - min = -1; - return 0; - } - else return 1; - }); + auto putmin = taskflow.emplace([&](){ + std::swap(data[beg], data[min]); + //std::cout << "select " << data[beg] << '\n'; + beg++; + if(beg < end) { + min = -1; + return 0; + } + else return 1; + }); - start.precede(argmin); - argmin.precede(putmin); - putmin.precede(argmin); + start.precede(argmin); + argmin.precede(putmin); + putmin.precede(argmin); - executor.run(taskflow).wait(); + executor.run(taskflow).wait(); - REQUIRE(gold == data); - //std::exit(1); - } + REQUIRE(gold == data); + //std::exit(1); } +} + 
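Both refactored sorters drive their iterations with a condition task: a task whose callable returns an int selects, by zero-based index, which successor runs next, so cond.precede(body, stop) encodes the loop directly in the task graph. A stripped-down sketch of the idiom (the trip count and task names are illustrative, not from the patch):

#include <taskflow/taskflow.hpp>

int main() {

  tf::Executor executor(2);
  tf::Taskflow taskflow;

  int i = 0;

  auto init = taskflow.emplace([&](){ i = 0; }).name("init");
  auto body = taskflow.emplace([&](){ ++i; }).name("body");

  // returning 0 selects the first successor (body), 1 selects stop
  auto cond = taskflow.emplace([&](){ return (i < 5) ? 0 : 1; }).name("cond");
  auto stop = taskflow.emplace([](){}).name("stop");

  init.precede(body);
  body.precede(cond);
  cond.precede(body, stop);  // successor 0 loops back, successor 1 exits

  executor.run(taskflow).wait();

  return (i == 5) ? 0 : 1;
}

bubble_sort above follows this shape exactly (cond.precede(even_phase, stop)), with the loop body expanded into Subflow-spawned comparison tasks, and selection_sort uses the same trick through its putmin task.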
+TEST_CASE("SelectionSort.1thread" * doctest::timeout(300)) { + selection_sort(1); +} + +TEST_CASE("SelectionSort.2threads" * doctest::timeout(300)) { + selection_sort(2); +} + +TEST_CASE("SelectionSort.3threads" * doctest::timeout(300)) { + selection_sort(3); +} + +TEST_CASE("SelectionSort.4threads" * doctest::timeout(300)) { + selection_sort(4); +} + +TEST_CASE("SelectionSort.5threads" * doctest::timeout(300)) { + selection_sort(5); +} + +TEST_CASE("SelectionSort.6threads" * doctest::timeout(300)) { + selection_sort(6); +} +TEST_CASE("SelectionSort.7threads" * doctest::timeout(300)) { + selection_sort(7); +} + +TEST_CASE("SelectionSort.8threads" * doctest::timeout(300)) { + selection_sort(8); } // -------------------------------------------------------- // Testcase: MergeSort // -------------------------------------------------------- -TEST_CASE("MergeSort" * doctest::timeout(300)) { + +void merge_sort(unsigned W) { std::function&, int, int)> spawn; @@ -422,38 +651,69 @@ TEST_CASE("MergeSort" * doctest::timeout(300)) { SM.succeed(SL, SR); }; - for(unsigned w=1; w<=9; w+=2) { - - tf::Executor executor(w); + tf::Executor executor(W); + tf::Taskflow taskflow; + std::vector data; - for(int end=10; end <= 10000; end = end * 10) { - tf::Taskflow taskflow("MergeSort"); + for(int end=10; end <= 10000; end *= 10) { - std::vector data(end); + taskflow.clear(); + data.resize(end); - for(auto& d : data) d = ::rand()%100; + for(auto& d : data) d = ::rand()%100; - auto gold = data; + auto gold = data; - taskflow.emplace([&spawn, &data, end](tf::Subflow& sf){ - spawn(sf, data, 0, end); - }).name(std::string("[0") - + ":" - + std::to_string(end) + ")"); + taskflow.emplace([&spawn, &data, end](tf::Subflow& sf){ + spawn(sf, data, 0, end); + }).name(std::string("[0") + + ":" + + std::to_string(end) + ")"); - executor.run(taskflow).wait(); + executor.run(taskflow).wait(); - std::sort(gold.begin(), gold.end()); + std::sort(gold.begin(), gold.end()); - REQUIRE(gold == data); - } + REQUIRE(gold == data); } } +TEST_CASE("MergeSort.1thread" * doctest::timeout(300)) { + merge_sort(1); +} + +TEST_CASE("MergeSort.2threads" * doctest::timeout(300)) { + merge_sort(2); +} + +TEST_CASE("MergeSort.3threads" * doctest::timeout(300)) { + merge_sort(3); +} + +TEST_CASE("MergeSort.4threads" * doctest::timeout(300)) { + merge_sort(4); +} + +TEST_CASE("MergeSort.5threads" * doctest::timeout(300)) { + merge_sort(5); +} + +TEST_CASE("MergeSort.6threads" * doctest::timeout(300)) { + merge_sort(6); +} + +TEST_CASE("MergeSort.7threads" * doctest::timeout(300)) { + merge_sort(7); +} + +TEST_CASE("MergeSort.8threads" * doctest::timeout(300)) { + merge_sort(8); +} + // -------------------------------------------------------- // Testcase: QuickSort // -------------------------------------------------------- -TEST_CASE("QuickSort" * doctest::timeout(300)) { +void quick_sort(unsigned W) { using itr_t = std::vector::iterator; @@ -502,35 +762,68 @@ TEST_CASE("QuickSort" * doctest::timeout(300)) { + ')'); }; - for(unsigned w=1; w<=9; w+=2) { + tf::Executor executor(W); + tf::Taskflow taskflow; + std::vector data; - tf::Executor executor(w); + for(size_t end=1; end <= 10000; end *= 10) { - for(int end=16; end <= 16384; end <<= 1) { + taskflow.clear(); + data.resize(end); - tf::Taskflow taskflow("QuickSort"); + for(auto& d : data) d = ::rand()%100; - std::vector data(end); + auto gold = data; - for(auto& d : data) d = ::rand()%100; + taskflow.emplace([&spawn, &data](tf::Subflow& sf){ + spawn(sf, data, data.begin(), data.end()); + 
}).name(std::string("[0") + + ":" + + std::to_string(end) + ")"); - auto gold = data; + executor.run(taskflow).wait(); - taskflow.emplace([&spawn, &data](tf::Subflow& sf){ - spawn(sf, data, data.begin(), data.end()); - }).name(std::string("[0") - + ":" - + std::to_string(end) + ")"); + std::sort(gold.begin(), gold.end()); - executor.run(taskflow).wait(); + REQUIRE(gold == data); + } + +} - std::sort(gold.begin(), gold.end()); +TEST_CASE("QuickSort.1thread" * doctest::timeout(300)) { + quick_sort(1); +} - REQUIRE(gold == data); - } - } +TEST_CASE("QuickSort.2threads" * doctest::timeout(300)) { + quick_sort(2); +} + +TEST_CASE("QuickSort.3threads" * doctest::timeout(300)) { + quick_sort(3); +} + +TEST_CASE("QuickSort.4threads" * doctest::timeout(300)) { + quick_sort(4); +} + +TEST_CASE("QuickSort.5threads" * doctest::timeout(300)) { + quick_sort(5); } +TEST_CASE("QuickSort.6threads" * doctest::timeout(300)) { + quick_sort(6); +} + +TEST_CASE("QuickSort.7threads" * doctest::timeout(300)) { + quick_sort(7); +} + +TEST_CASE("QuickSort.8threads" * doctest::timeout(300)) { + quick_sort(8); +} + + + //// ---------------------------------------------------------------------------- //// Exception //// ---------------------------------------------------------------------------- diff --git a/unittests/test_subflows.cpp b/unittests/test_subflows.cpp index 00f9856cd..be2bd62d2 100644 --- a/unittests/test_subflows.cpp +++ b/unittests/test_subflows.cpp @@ -189,285 +189,585 @@ TEST_CASE("JoinedSubflow.8threads" * doctest::timeout(300)){ joined_subflow(8); } +//// -------------------------------------------------------- +//// Testcase: DetachedSubflow +//// -------------------------------------------------------- +// +//void detached_subflow(unsigned W) { +// +// using namespace std::literals::chrono_literals; +// +// SUBCASE("Trivial") { +// tf::Executor executor(W); +// tf::Taskflow tf; +// +// // empty flow with future +// tf::Task subflow3, subflow3_; +// std::atomic fu3v{0}, fu3v_{0}; +// +// // empty flow +// auto subflow1 = tf.emplace([&] (tf::Subflow& fb) { +// fu3v++; +// fb.detach(); +// }).name("subflow1"); +// +// // nested empty flow +// auto subflow2 = tf.emplace([&] (tf::Subflow& fb) { +// fu3v++; +// fb.emplace([&] (tf::Subflow& fb2) { +// fu3v++; +// fb2.emplace( [&] (tf::Subflow& fb3) { +// fu3v++; +// fb3.join(); +// }).name("subflow2_1_1"); +// fb2.detach(); +// }).name("subflow2_1"); +// fb.detach(); +// }).name("subflow2"); +// +// subflow3 = tf.emplace([&] (tf::Subflow& fb) { +// +// REQUIRE((fu3v >= 2 && fu3v <= 4)); +// +// fu3v++; +// fu3v_++; +// +// subflow3_ = fb.emplace([&] (tf::Subflow& fb2) { +// REQUIRE(fu3v_ == 3); +// fu3v++; +// fu3v_++; +// fb2.join(); +// }); +// subflow3_.name("subflow3_"); +// +// // hereafter we use 100us to avoid dangling reference ... 
+// auto s1 = fb.emplace([&] () { +// fu3v_++; +// fu3v++; +// }).name("s1"); +// +// auto s2 = fb.emplace([&] () { +// fu3v_++; +// fu3v++; +// }).name("s2"); +// +// auto s3 = fb.emplace([&] () { +// fu3v++; +// REQUIRE(fu3v_ == 4); +// }).name("s3"); +// +// s1.precede(subflow3_); +// s2.precede(subflow3_); +// subflow3_.precede(s3); +// +// REQUIRE(fu3v_ == 1); +// +// fb.detach(); +// +// //return 100; +// }); +// subflow3.name("subflow3"); +// +// // empty flow to test future +// auto subflow4 = tf.emplace([&] () { +// REQUIRE((fu3v >= 3 && fu3v <= 9)); +// fu3v++; +// }).name("subflow4"); +// +// subflow1.precede(subflow2); +// subflow2.precede(subflow3); +// subflow3.precede(subflow4); +// +// executor.run(tf).get(); +// +// REQUIRE(fu3v == 10); +// REQUIRE(fu3v_ == 4); +// +// } +//} +// +//TEST_CASE("DetachedSubflow.1thread" * doctest::timeout(300)) { +// detached_subflow(1); +//} +// +//TEST_CASE("DetachedSubflow.2threads" * doctest::timeout(300)) { +// detached_subflow(2); +//} +// +//TEST_CASE("DetachedSubflow.3threads" * doctest::timeout(300)) { +// detached_subflow(3); +//} +// +//TEST_CASE("DetachedSubflow.4threads" * doctest::timeout(300)) { +// detached_subflow(4); +//} +// +//TEST_CASE("DetachedSubflow.5threads" * doctest::timeout(300)) { +// detached_subflow(5); +//} +// +//TEST_CASE("DetachedSubflow.6threads" * doctest::timeout(300)) { +// detached_subflow(6); +//} +// +//TEST_CASE("DetachedSubflow.7threads" * doctest::timeout(300)) { +// detached_subflow(7); +//} +// +//TEST_CASE("DetachedSubflow.8threads" * doctest::timeout(300)) { +// detached_subflow(8); +//} +// +// +//// -------------------------------------------------------- +//// Testcase: TreeSubflow +//// -------------------------------------------------------- +//void detach_spawn(const int max_depth, std::atomic& counter, int depth, tf::Subflow& subflow) { +// if(depth < max_depth) { +// counter.fetch_add(1, std::memory_order_relaxed); +// subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfl){ +// detach_spawn(max_depth, counter, depth, sfl); } +// ); +// subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfr){ +// detach_spawn(max_depth, counter, depth, sfr); } +// ); +// subflow.detach(); +// } +//} +// +//void join_spawn(const int max_depth, std::atomic& counter, int depth, tf::Subflow& subflow) { +// if(depth < max_depth) { +// counter.fetch_add(1, std::memory_order_relaxed); +// subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfl){ +// join_spawn(max_depth, counter, depth, sfl); } +// ); +// subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfr){ +// join_spawn(max_depth, counter, depth, sfr); } +// ); +// } +//} +// +//void mix_spawn( +// const int max_depth, std::atomic& counter, int depth, tf::Subflow& subflow +//) { +// +// if(depth < max_depth) { +// auto ret = counter.fetch_add(1, std::memory_order_relaxed); +// subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfl){ +// mix_spawn(max_depth, counter, depth, sfl); } +// ).name(std::string("left") + std::to_string(ret%2)); +// subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfr){ +// mix_spawn(max_depth, counter, depth, sfr); } +// ).name(std::string("right") + std::to_string(ret%2)); +// if(ret % 2) { +// subflow.detach(); +// } +// } +//} +// +//TEST_CASE("TreeSubflow" * doctest::timeout(300)) { +// +// SUBCASE("AllDetach") { +// constexpr int max_depth {10}; +// for(int W=1; W<=4; W++) { +// std::atomic counter {0}; +// tf::Taskflow tf; +// tf.emplace([&](tf::Subflow& subflow){ +// 
detach_spawn(max_depth, counter, 0, subflow); +// }); +// +// tf::Executor executor(W); +// executor.run(tf).get(); +// REQUIRE(counter == (1< counter {0}; +// tf::Taskflow tf; +// tf.emplace([&](tf::Subflow& subflow){ +// join_spawn(max_depth, counter, 0, subflow); +// }); +// tf::Executor executor(W); +// executor.run(tf).get(); +// REQUIRE(counter == (1< counter {0}; +// tf::Taskflow tf; +// tf.emplace([&](tf::Subflow& subflow){ +// mix_spawn(max_depth, counter, 0, subflow); +// }).name("top task"); +// +// tf::Executor executor(W); +// executor.run(tf).get(); +// REQUIRE(counter == (1< fu3v{0}, fu3v_{0}; + taskflow.emplace([&res, N] (tf::Subflow& sbf) { + res = fibonacci_spawn(N, sbf); + }); - // empty flow - auto subflow1 = tf.emplace([&] (tf::Subflow& fb) { - fu3v++; - fb.detach(); - }).name("subflow1"); + executor.run(taskflow).wait(); - // nested empty flow - auto subflow2 = tf.emplace([&] (tf::Subflow& fb) { - fu3v++; - fb.emplace([&] (tf::Subflow& fb2) { - fu3v++; - fb2.emplace( [&] (tf::Subflow& fb3) { - fu3v++; - fb3.join(); - }).name("subflow2_1_1"); - fb2.detach(); - }).name("subflow2_1"); - fb.detach(); - }).name("subflow2"); + REQUIRE(res == 6765); +} - subflow3 = tf.emplace([&] (tf::Subflow& fb) { +TEST_CASE("FibSubflow.1thread" * doctest::timeout(300)) { + fibonacci(1); +} - REQUIRE((fu3v >= 2 && fu3v <= 4)); +TEST_CASE("FibSubflow.2threads" * doctest::timeout(300)) { + fibonacci(2); +} - fu3v++; - fu3v_++; +TEST_CASE("FibSubflow.4threads" * doctest::timeout(300)) { + fibonacci(4); +} - subflow3_ = fb.emplace([&] (tf::Subflow& fb2) { - REQUIRE(fu3v_ == 3); - fu3v++; - fu3v_++; - fb2.join(); - }); - subflow3_.name("subflow3_"); +TEST_CASE("FibSubflow.5threads" * doctest::timeout(300)) { + fibonacci(5); +} - // hereafter we use 100us to avoid dangling reference ... 
- auto s1 = fb.emplace([&] () { - fu3v_++; - fu3v++; - }).name("s1"); +TEST_CASE("FibSubflow.6threads" * doctest::timeout(300)) { + fibonacci(6); +} - auto s2 = fb.emplace([&] () { - fu3v_++; - fu3v++; - }).name("s2"); +TEST_CASE("FibSubflow.7threads" * doctest::timeout(300)) { + fibonacci(7); +} - auto s3 = fb.emplace([&] () { - fu3v++; - REQUIRE(fu3v_ == 4); - }).name("s3"); +TEST_CASE("FibSubflow.8threads" * doctest::timeout(300)) { + fibonacci(8); +} - s1.precede(subflow3_); - s2.precede(subflow3_); - subflow3_.precede(s3); +// ---------------------------------------------------------------------------- +// multiple subflow runs +// ---------------------------------------------------------------------------- +void multiple_subflow_runs(unsigned W) { - REQUIRE(fu3v_ == 1); + tf::Executor executor(W); + tf::Taskflow taskflow; - fb.detach(); + std::atomic count {0}; - //return 100; - }); - subflow3.name("subflow3"); + auto A = taskflow.emplace([&](){ count ++; }); + auto B = taskflow.emplace([&](tf::Subflow& subflow){ + count ++; + auto B1 = subflow.emplace([&](){ count++; }); + auto B2 = subflow.emplace([&](){ count++; }); + auto B3 = subflow.emplace([&](){ count++; }); + B1.precede(B3); B2.precede(B3); + }); + auto C = taskflow.emplace([&](){ count ++; }); + auto D = taskflow.emplace([&](){ count ++; }); + + A.precede(B, C); + B.precede(D); + C.precede(D); + + std::list> fu_list; + for(size_t i=0; i<500; i++) { + if(i == 499) { + executor.run(taskflow).get(); // Synchronize the first 500 runs + executor.run_n(taskflow, 500); // Run 500 times more + } + else if(i % 2) { + fu_list.push_back(executor.run(taskflow)); + } + else { + fu_list.push_back(executor.run(taskflow, [&, i=i](){ + REQUIRE(count == (i+1)*7); }) + ); + } + } - // empty flow to test future - auto subflow4 = tf.emplace([&] () { - REQUIRE((fu3v >= 3 && fu3v <= 9)); - fu3v++; - }).name("subflow4"); + executor.wait_for_all(); - subflow1.precede(subflow2); - subflow2.precede(subflow3); - subflow3.precede(subflow4); - - executor.run(tf).get(); + for(auto& fu: fu_list) { + REQUIRE(fu.valid()); + REQUIRE(fu.wait_for(std::chrono::seconds(1)) == std::future_status::ready); + } - REQUIRE(fu3v == 10); - REQUIRE(fu3v_ == 4); + REQUIRE(count == 7000); +} - } +TEST_CASE("MultipleSubflowRuns.1thread" * doctest::timeout(300)) { + multiple_subflow_runs(1); } -TEST_CASE("DetachedSubflow.1thread" * doctest::timeout(300)) { - detached_subflow(1); +TEST_CASE("MultipleSubflowRuns.2threads" * doctest::timeout(300)) { + multiple_subflow_runs(2); } -TEST_CASE("DetachedSubflow.2threads" * doctest::timeout(300)) { - detached_subflow(2); +TEST_CASE("MultipleSubflowRuns.3threads" * doctest::timeout(300)) { + multiple_subflow_runs(3); } -TEST_CASE("DetachedSubflow.3threads" * doctest::timeout(300)) { - detached_subflow(3); +TEST_CASE("MultipleSubflowRuns.4threads" * doctest::timeout(300)) { + multiple_subflow_runs(4); } -TEST_CASE("DetachedSubflow.4threads" * doctest::timeout(300)) { - detached_subflow(4); +TEST_CASE("MultipleSubflowRuns.4threads" * doctest::timeout(300)) { + multiple_subflow_runs(4); } -TEST_CASE("DetachedSubflow.5threads" * doctest::timeout(300)) { - detached_subflow(5); +TEST_CASE("MultipleSubflowRuns.5threads" * doctest::timeout(300)) { + multiple_subflow_runs(5); } -TEST_CASE("DetachedSubflow.6threads" * doctest::timeout(300)) { - detached_subflow(6); +TEST_CASE("MultipleSubflowRuns.6threads" * doctest::timeout(300)) { + multiple_subflow_runs(6); } -TEST_CASE("DetachedSubflow.7threads" * doctest::timeout(300)) { - 
detached_subflow(7); +TEST_CASE("MultipleSubflowRuns.7threads" * doctest::timeout(300)) { + multiple_subflow_runs(7); } -TEST_CASE("DetachedSubflow.8threads" * doctest::timeout(300)) { - detached_subflow(8); +TEST_CASE("MultipleSubflowRuns.8threads" * doctest::timeout(300)) { + multiple_subflow_runs(8); } +// ---------------------------------------------------------------------------- +// Multiple subflow runs with change +// ---------------------------------------------------------------------------- + +void multiple_subflow_runs_with_changed_taskflow(unsigned W) { + + tf::Executor executor(W); + tf::Taskflow taskflow; + + std::atomic count {0}; + + auto A = taskflow.emplace([&](){ count ++; }); + auto B = taskflow.emplace([&](tf::Subflow& subflow){ + count ++; + auto B1 = subflow.emplace([&](){ count++; }); + auto B2 = subflow.emplace([&](){ count++; }); + auto B3 = subflow.emplace([&](){ count++; }); + B1.precede(B3); B2.precede(B3); + }); + auto C = taskflow.emplace([&](){ count ++; }); + auto D = taskflow.emplace([&](){ count ++; }); + + A.precede(B, C); + B.precede(D); + C.precede(D); + + executor.run_n(taskflow, 10).get(); + REQUIRE(count == 70); + + auto E = taskflow.emplace([](){}); + D.precede(E); + executor.run_n(taskflow, 10).get(); + REQUIRE(count == 140); + + auto F = taskflow.emplace([](){}); + E.precede(F); + executor.run_n(taskflow, 10); + executor.wait_for_all(); + REQUIRE(count == 210); -// -------------------------------------------------------- -// Testcase: TreeSubflow -// -------------------------------------------------------- -void detach_spawn(const int max_depth, std::atomic& counter, int depth, tf::Subflow& subflow) { - if(depth < max_depth) { - counter.fetch_add(1, std::memory_order_relaxed); - subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfl){ - detach_spawn(max_depth, counter, depth, sfl); } - ); - subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfr){ - detach_spawn(max_depth, counter, depth, sfr); } - ); - subflow.detach(); - } } -void join_spawn(const int max_depth, std::atomic& counter, int depth, tf::Subflow& subflow) { - if(depth < max_depth) { - counter.fetch_add(1, std::memory_order_relaxed); - subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfl){ - join_spawn(max_depth, counter, depth, sfl); } - ); - subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfr){ - join_spawn(max_depth, counter, depth, sfr); } - ); - } +TEST_CASE("MultipleSubflowRuns.ChangedTaskflow.1thread" * doctest::timeout(300)) { + multiple_subflow_runs_with_changed_taskflow(1); } -void mix_spawn( - const int max_depth, std::atomic& counter, int depth, tf::Subflow& subflow -) { - - if(depth < max_depth) { - auto ret = counter.fetch_add(1, std::memory_order_relaxed); - subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfl){ - mix_spawn(max_depth, counter, depth, sfl); } - ).name(std::string("left") + std::to_string(ret%2)); - subflow.emplace([&, max_depth, depth=depth+1](tf::Subflow& sfr){ - mix_spawn(max_depth, counter, depth, sfr); } - ).name(std::string("right") + std::to_string(ret%2)); - if(ret % 2) { - subflow.detach(); - } - } +TEST_CASE("MultipleSubflowRuns.ChangedTaskflow.2threads" * doctest::timeout(300)) { + multiple_subflow_runs_with_changed_taskflow(2); } -TEST_CASE("TreeSubflow" * doctest::timeout(300)) { +TEST_CASE("MultipleSubflowRuns.ChangedTaskflow.3threads" * doctest::timeout(300)) { + multiple_subflow_runs_with_changed_taskflow(3); +} - SUBCASE("AllDetach") { - constexpr int max_depth {10}; - for(int W=1; W<=4; W++) 
{ - std::atomic counter {0}; - tf::Taskflow tf; - tf.emplace([&](tf::Subflow& subflow){ - detach_spawn(max_depth, counter, 0, subflow); - }); +TEST_CASE("MultipleSubflowRuns.ChangedTaskflow.4threads" * doctest::timeout(300)) { + multiple_subflow_runs_with_changed_taskflow(4); +} - tf::Executor executor(W); - executor.run(tf).get(); - REQUIRE(counter == (1< counter {0}; - tf::Taskflow tf; - tf.emplace([&](tf::Subflow& subflow){ - join_spawn(max_depth, counter, 0, subflow); - }); - tf::Executor executor(W); - executor.run(tf).get(); - REQUIRE(counter == (1< counter {0}; - tf::Taskflow tf; - tf.emplace([&](tf::Subflow& subflow){ - mix_spawn(max_depth, counter, 0, subflow); - }).name("top task"); - - tf::Executor executor(W); - executor.run(tf).get(); - REQUIRE(counter == (1< count {0}; + auto A = taskflow.emplace([&](){ count ++; }); + auto B = taskflow.emplace([&](tf::Subflow& subflow){ + count ++; + auto B1 = subflow.emplace([&](){ count++; }); + auto B2 = subflow.emplace([&](){ count++; }); + auto B3 = subflow.emplace([&](){ count++; }); + B1.precede(B3); B2.precede(B3); + }); + auto C = taskflow.emplace([&](){ count ++; }); + auto D = taskflow.emplace([&](){ count ++; }); - taskflow.emplace([&res, N] (tf::Subflow& sbf) { - res = fibonacci_spawn(N, sbf); + A.precede(B, C); + B.precede(D); + C.precede(D); + + executor.run_until(taskflow, [run=10]() mutable { return run-- == 0; }, + [&](){ + REQUIRE(count == 70); + count = 0; + } + ).get(); + + + executor.run_until(taskflow, [run=10]() mutable { return run-- == 0; }, + [&](){ + REQUIRE(count == 70); + count = 0; }); - executor.run(taskflow).wait(); + executor.run_until(taskflow, [run=10]() mutable { return run-- == 0; }, + [&](){ + REQUIRE(count == 70); + count = 0; + } + ).get(); +} - REQUIRE(res == 6765); +TEST_CASE("MultipleSubflowRuns.Predicate.1thread" * doctest::timeout(300)) { + multiple_subflow_runs_with_predicate(1); } -TEST_CASE("FibSubflow.1thread") { - fibonacci(1); +TEST_CASE("MultipleSubflowRuns.Predicate.2threads" * doctest::timeout(300)) { + multiple_subflow_runs_with_predicate(2); } -TEST_CASE("FibSubflow.2threads") { - fibonacci(2); +TEST_CASE("MultipleSubflowRuns.Predicate.3threads" * doctest::timeout(300)) { + multiple_subflow_runs_with_predicate(3); } -TEST_CASE("FibSubflow.4threads") { - fibonacci(4); +TEST_CASE("MultipleSubflowRuns.Predicate.4threads" * doctest::timeout(300)) { + multiple_subflow_runs_with_predicate(4); } -TEST_CASE("FibSubflow.5threads") { - fibonacci(5); +TEST_CASE("MultipleSubflowRuns.Predicate.4threads" * doctest::timeout(300)) { + multiple_subflow_runs_with_predicate(4); } -TEST_CASE("FibSubflow.6threads") { - fibonacci(6); +TEST_CASE("MultipleSubflowRuns.Predicate.5threads" * doctest::timeout(300)) { + multiple_subflow_runs_with_predicate(5); } -TEST_CASE("FibSubflow.7threads") { - fibonacci(7); +TEST_CASE("MultipleSubflowRuns.Predicate.6threads" * doctest::timeout(300)) { + multiple_subflow_runs_with_predicate(6); } -TEST_CASE("FibSubflow.8threads") { - fibonacci(8); +TEST_CASE("MultipleSubflowRuns.Predicate.7threads" * doctest::timeout(300)) { + multiple_subflow_runs_with_predicate(7); +} + +TEST_CASE("MultipleSubflowRuns.Predicate.8threads" * doctest::timeout(300)) { + multiple_subflow_runs_with_predicate(8); +} + +// ---------------------------------------------------------------------------- +// subflow state test +// ---------------------------------------------------------------------------- + +void bit_state(unsigned W) { + tf::Executor executor(W); + tf::Taskflow taskflow; + + auto init 
= taskflow.emplace([](){}); + + auto task = taskflow.emplace([](tf::Subflow& sf){ + // each newly spawned subflow should have clean status + REQUIRE(sf.joinable()); + REQUIRE(sf.retain() == false); + sf.join(); + sf.retain(true); + }); + + auto cond = taskflow.emplace([i=0]() mutable { + return (i++ < 100) ? 0 : 1; + }); + + init.precede(task); + task.precede(cond); + cond.precede(task); + + executor.run(taskflow).wait(); } + +TEST_CASE("Subflow.BitState.1thread") { + bit_state(1); +} + +TEST_CASE("Subflow.BitState.2threads") { + bit_state(2); +} + +TEST_CASE("Subflow.BitState.3threads") { + bit_state(3); +} + +TEST_CASE("Subflow.BitState.4threads") { + bit_state(4); +} + + + + + + + diff --git a/unittests/test_traversals.cpp b/unittests/test_traversals.cpp index 42b8df11e..d2adb1c23 100644 --- a/unittests/test_traversals.cpp +++ b/unittests/test_traversals.cpp @@ -75,55 +75,69 @@ std::unique_ptr make_chain(size_t num_nodes) { // -------------------------------------------------------- // Testcase: StaticTraversal // -------------------------------------------------------- -TEST_CASE("StaticTraversal" * doctest::timeout(300)) { +void static_traversal(unsigned W) { + size_t max_degree = 4; size_t num_nodes = 1000; - for(unsigned w=1; w<=4; w++) { - - auto nodes = make_dag(num_nodes, max_degree); + auto nodes = make_dag(num_nodes, max_degree); - tf::Taskflow tf; - tf::Executor executor(w); + tf::Taskflow tf; + tf::Executor executor(W); - std::atomic level(0); - std::vector tasks; + std::atomic level(0); + std::vector tasks; - for(size_t i=0; ilevel = ++level; - v->visited = true; - for(size_t j=0; jsuccessors.size(); ++j) { - v->successors[j]->dependents.fetch_sub(1); - } - }).name(nodes[i].name); + for(size_t i=0; ilevel = ++level; + v->visited = true; + for(size_t j=0; jsuccessors.size(); ++j) { + v->successors[j]->dependents.fetch_sub(1); + } + }).name(nodes[i].name); - tasks.push_back(task); - } + tasks.push_back(task); + } - for(size_t i=0; iidx]); - } + for(size_t i=0; iidx]); } + } - executor.run(tf).wait(); // block until finished + executor.run(tf).wait(); // block until finished - for(size_t i=0; ilevel); - } + for(size_t i=0; ilevel); } } } + +TEST_CASE("StaticTraversal.1thread" * doctest::timeout(300)) { + static_traversal(1); +} + +TEST_CASE("StaticTraversal.2threads" * doctest::timeout(300)) { + static_traversal(2); +} + +TEST_CASE("StaticTraversal.3threads" * doctest::timeout(300)) { + static_traversal(3); +} + +TEST_CASE("StaticTraversal.4threads" * doctest::timeout(300)) { + static_traversal(4); +} // -------------------------------------------------------- // Testcase: DynamicTraversal // -------------------------------------------------------- -TEST_CASE("DynamicTraversal" * doctest::timeout(300)) { +void dynamic_traversal(unsigned W) { std::atomic level; @@ -146,98 +160,52 @@ TEST_CASE("DynamicTraversal" * doctest::timeout(300)) { size_t max_degree = 4; size_t num_nodes = 1000; - for(unsigned w=1; w<=4; w++) { - - auto nodes = make_dag(num_nodes, max_degree); + auto nodes = make_dag(num_nodes, max_degree); - std::vector src; - for(size_t i=0; i src; + for(size_t i=0; ilevel); - } + for(size_t i=0; ilevel); } } } -// -------------------------------------------------------- -// Testcase: RecursiveTraversal -// -------------------------------------------------------- -//TEST_CASE("RecursiveTraversal" * doctest::timeout(300)) { -// -// std::atomic level; -// -// std::function traverse; -// -// traverse = [&] (Node* n, tf::Subflow& subflow) { -// REQUIRE(!n->visited); -// 
n->visited = true; -// size_t S = n->successors.size(); -// for(size_t i=0; isuccessors[i]->dependents.fetch_sub(1) == 1) { -// n->successors[i]->level = ++level; -// subflow.emplace([s=n->successors[i], &traverse](tf::Subflow &subflow){ -// traverse(s, subflow); -// }); -// } -// } -// }; -// -// size_t num_nodes = 1000; -// -// for(unsigned w=1; w<=4; w++) { -// -// auto nodes = make_chain(num_nodes); -// -// std::vector src; -// for(size_t i=0; ilevel); -// } -// } -// } -//} +TEST_CASE("DynamicTraversal.1thread" * doctest::timeout(300)) { + dynamic_traversal(1); +} + +TEST_CASE("DynamicTraversal.2threads" * doctest::timeout(300)) { + dynamic_traversal(2); +} + +TEST_CASE("DynamicTraversal.3threads" * doctest::timeout(300)) { + dynamic_traversal(3); +} + +TEST_CASE("DynamicTraversal.4threads" * doctest::timeout(300)) { + dynamic_traversal(4); +} // -------------------------------------------------------- // Testcase: ParallelTraversal diff --git a/unittests/test_utility.cpp b/unittests/test_utility.cpp index fa33fc21a..5736f7065 100644 --- a/unittests/test_utility.cpp +++ b/unittests/test_utility.cpp @@ -3,7 +3,7 @@ #include #include -#include +//#include #include #include #include @@ -217,6 +217,8 @@ TEST_CASE("distance.integral" * doctest::timeout(300)) { // -------------------------------------------------------- // Testcase: ObjectPool.Sequential // -------------------------------------------------------- +/* +// Due to random # generation, this threaded program has a bug void test_threaded_uuid(size_t N) { std::vector uuids(65536); @@ -240,10 +242,19 @@ void test_threaded_uuid(size_t N) { auto size = uuids.size(); std::sort(uuids.begin(), uuids.end()); - std::unique(uuids.begin(), uuids.end()); - REQUIRE(uuids.size() == size); + auto it = std::unique(uuids.begin(), uuids.end()); + REQUIRE(it - uuids.begin() == size); } +TEST_CASE("uuid.10threads") { + test_threaded_uuid(10); +} + +TEST_CASE("uuid.100threads") { + test_threaded_uuid(100); +} +*/ + TEST_CASE("uuid") { tf::UUID u1, u2, u3, u4; @@ -270,19 +281,14 @@ TEST_CASE("uuid") { // Uniqueness std::vector uuids(65536); std::sort(uuids.begin(), uuids.end()); - std::unique(uuids.begin(), uuids.end()); - REQUIRE(uuids.size() == 65536); + auto it = std::unique(uuids.begin(), uuids.end()); + REQUIRE(it - uuids.begin() == 65536); } -TEST_CASE("uuid.10threads") { - test_threaded_uuid(10); -} -TEST_CASE("uuid.100threads") { - test_threaded_uuid(100); -} +/* // -------------------------------------------------------- // Testcase: ObjectPool.Sequential @@ -436,6 +442,8 @@ TEST_CASE("ObjectPool.15threads" * doctest::timeout(300)) { TEST_CASE("ObjectPool.16threads" * doctest::timeout(300)) { threaded_objectpool(16); } +*/ + // -------------------------------------------------------- // Testcase: Reference Wrapper @@ -612,7 +620,133 @@ TEST_CASE("NextPow2") { REQUIRE(tf::is_pow2(64u) == true); } +// ---------------------------------------------------------------------------- +// count the number of trailing zeros +// ---------------------------------------------------------------------------- + +TEST_CASE("CTZ") { + REQUIRE(tf::ctz(0b00000001) == 0); + REQUIRE(tf::ctz(0b00000010) == 1); + REQUIRE(tf::ctz(0b00000100) == 2); + REQUIRE(tf::ctz(0b10000000) == 7); + + REQUIRE(tf::ctz(0b00000001ULL) == 0); + REQUIRE(tf::ctz(0b00000010ULL) == 1); + REQUIRE(tf::ctz(0b00000100ULL) == 2); + REQUIRE(tf::ctz(0x8000000000000000ULL) == 63); + + //REQUIRE(tf::ctz(0) == 32); // Undefined behavior, doesn't work for Windows + REQUIRE(tf::ctz(0xFFFFFFFF) == 0); 
+ REQUIRE(tf::ctz(0x00010000) == 16); + REQUIRE(tf::ctz(0x80000000) == 31); + + //REQUIRE(tf::ctz(0) == 64); // Undefined behavior, doesn't work for Windows + REQUIRE(tf::ctz(0xFFFFFFFFFFFFFFFFULL) == 0); + REQUIRE(tf::ctz(0x0000000100000000ULL) == 32); + REQUIRE(tf::ctz(0x0000000000008000ULL) == 15); + REQUIRE(tf::ctz(0x4000000000000000ULL) == 62); +} +// ---------------------------------------------------------------------------- +// test coprimes +// ---------------------------------------------------------------------------- + +TEST_CASE("Coprimes") { + + // Compile-time checks for known values + static_assert(tf::coprime(1) == 1); + static_assert(tf::coprime(2) == 1); + static_assert(tf::coprime(3) == 2); + static_assert(tf::coprime(4) == 3); + static_assert(tf::coprime(5) == 4); + static_assert(tf::coprime(6) == 5); + static_assert(tf::coprime(7) == 6); + static_assert(tf::coprime(8) == 7); + static_assert(tf::coprime(9) == 8); + static_assert(tf::coprime(10) == 9); + static_assert(tf::coprime(11) == 10); + static_assert(tf::coprime(12) == 11); + static_assert(tf::coprime(13) == 12); + static_assert(tf::coprime(14) == 13); + static_assert(tf::coprime(15) == 14); + static_assert(tf::coprime(16) == 15); + static_assert(tf::coprime(17) == 16); + static_assert(tf::coprime(18) == 17); + static_assert(tf::coprime(19) == 18); + static_assert(tf::coprime(20) == 19); + + constexpr auto coprime_table = tf::make_coprime_lut<51>(); + + static_assert(coprime_table[1] == 1); + static_assert(coprime_table[2] == 1); + static_assert(coprime_table[3] == 2); + static_assert(coprime_table[4] == 3); + static_assert(coprime_table[5] == 4); + static_assert(coprime_table[6] == 5); + static_assert(coprime_table[7] == 6); + static_assert(coprime_table[8] == 7); + static_assert(coprime_table[9] == 8); + static_assert(coprime_table[10] == 9); + static_assert(coprime_table[11] == 10); + static_assert(coprime_table[12] == 11); + static_assert(coprime_table[13] == 12); + static_assert(coprime_table[14] == 13); + static_assert(coprime_table[15] == 14); + static_assert(coprime_table[16] == 15); + static_assert(coprime_table[17] == 16); + static_assert(coprime_table[18] == 17); + static_assert(coprime_table[19] == 18); + static_assert(coprime_table[20] == 19); + + // Runtime assertions for all values up to 50 + for (size_t i = 1; i <= 50; ++i) { + REQUIRE(std::gcd(i, coprime_table[i]) == 1); + REQUIRE(tf::coprime(i) == coprime_table[i]); + + // randomly generate a coprime + auto v = ::rand() % 1048 + 2; + auto c = tf::coprime(v); + REQUIRE(std::gcd(v, c) == 1); + REQUIRE(c < v); + } + +} + +// ---------------------------------------------------------------------------- +// Log2 +// ---------------------------------------------------------------------------- + +TEST_CASE("FloorLog2") { + + REQUIRE(tf::floor_log2(1u) == 0); + REQUIRE(tf::floor_log2(2u) == 1); + REQUIRE(tf::floor_log2(4u) == 2); + REQUIRE(tf::floor_log2(8u) == 3); + REQUIRE(tf::floor_log2(16u) == 4); + REQUIRE(tf::floor_log2(32u) == 5); + REQUIRE(tf::floor_log2(64u) == 6); + REQUIRE(tf::floor_log2(128u) == 7); + REQUIRE(tf::floor_log2(256u) == 8); + + // Test non-powers of 2 (floor log2) + REQUIRE(tf::floor_log2(3u) == 1); + REQUIRE(tf::floor_log2(5u) == 2); + REQUIRE(tf::floor_log2(6u) == 2); + REQUIRE(tf::floor_log2(7u) == 2); + REQUIRE(tf::floor_log2(9u) == 3); + REQUIRE(tf::floor_log2(10u) == 3); + REQUIRE(tf::floor_log2(15u) == 3); + REQUIRE(tf::floor_log2(17u) == 4); + REQUIRE(tf::floor_log2(31u) == 4); + REQUIRE(tf::floor_log2(33u) == 5); + + // 
+  // Test large values
+  REQUIRE(tf::floor_log2(1023u) == 9);
+  REQUIRE(tf::floor_log2(1024u) == 10);
+  REQUIRE(tf::floor_log2(1025u) == 10);
+  REQUIRE(tf::floor_log2(std::numeric_limits<uint32_t>::max()) == 31);
+  REQUIRE(tf::floor_log2(std::numeric_limits<uint64_t>::max()) == 63);
+}
diff --git a/unittests/test_work_stealing.cpp b/unittests/test_work_stealing.cpp
index bcd0f5314..b538da73d 100644
--- a/unittests/test_work_stealing.cpp
+++ b/unittests/test_work_stealing.cpp
@@ -3,264 +3,11 @@
 
 #include <doctest.h>
 #include <taskflow/taskflow.hpp>
 
-// ============================================================================
-// Test without Priority
-// ============================================================================
-
-// Procedure: tsq_owner
-void tsq_owner() {
-
-  for(size_t N=1; N<=777777; N=N*2+1) {
-    tf::TaskQueue<void*> queue;
-    std::vector<void*> gold(N);
-
-    REQUIRE(queue.empty());
-
-    // push and pop
-    for(size_t i=0; i
-    tf::TaskQueue<void*> queue;
-    std::vector<void*> gold(N);
-    std::atomic<size_t> consumed {0};
-
-    for(size_t i=0; i
-    std::vector<std::thread> threads;
-    std::vector<std::vector<void*>> stolens(M);
-    for(size_t i=0; i
-      std::vector<void*> items;
-      while(consumed != N) {
-        auto ptr = queue.pop();
-        if(ptr != nullptr) {
-          items.push_back(ptr);
-          consumed.fetch_add(1, std::memory_order_relaxed);
-        }
-      }
-      REQUIRE(queue.steal() == nullptr);
-      REQUIRE(queue.pop() == nullptr);
-      REQUIRE(queue.empty());
-
-      // join thieves
-      for(auto& thread : threads) thread.join();
-
-      // merge items
-      for(size_t i=0; i
-  tf::TaskQueue<void*> queue;
-
-  //for(unsigned p=0; p
-  std::vector<std::pair<void*, unsigned>> gold(N);
-
-    REQUIRE(queue.empty());
-    REQUIRE(queue.pop() == nullptr);
-
-    for(unsigned p=0; p
+  std::atomic<size_t> counter{0};
+
+  tf::Task prev, curr;
+
   // large linear chain followed by many branches
   size_t N = 1000;
   size_t target = 0;
-  taskflow.clear();
   counter = 0;
 
   for(size_t l=0; l
+#if TF_CPP >= TF_CPP20
+void waiter(size_t W) {
+  tf::Executor executor(W);
 
-  tf::Task task, prev;
-  for(size_t i=0; i<10; i++) {
-    task = taskflow.emplace([&](){
-      //std::cout << executor.this_worker() << std::endl;
-      printf("linear by worker %d\n", executor.this_worker_id());
-      std::this_thread::sleep_for(std::chrono::milliseconds(100));
-    });
+  // wait until all workers stop stealing
+  while(executor.num_waiters() != W);
+
+  // now no worker should be stealing
+  REQUIRE(executor.num_waiters() == W);
 
-    if(i) {
-      prev.precede(task);
-    }
+  auto fu = executor.async([&](){
+    // this worker is running, so it can wait for the other W-1 workers to sleep
+    while(executor.num_waiters() != W-1);
 
-    prev = task;
-  }
+    return 1;
+  });
+
+  REQUIRE(fu.get() == 1);
+
+  tf::Taskflow taskflow;
 
-  for(size_t i=0; i<10; i++) {
+  for(size_t i=0; i<2048; i++) {
     taskflow.emplace([&](){
-      //std::cout << executor.this_worker() << std::endl;
-      printf("parallel by worker %d\n", executor.this_worker_id());
-      std::this_thread::sleep_for(std::chrono::milliseconds(100));
-    }).succeed(task);
+      // at the very least, this worker is not a waiter
+      REQUIRE(executor.num_waiters() < W);
+    });
   }
+
+  taskflow.emplace([&](){
+    // this worker can wait for the other W-1 workers to sleep
+    while(executor.num_waiters() != W-1);
+  });
 
   executor.run(taskflow).wait();
+}
 
+TEST_CASE("WorkStealing.Waiter.1thread") {
+  waiter(1);
 }
-
-//TEST_CASE("WS.broom.2threads") {
-//  ws_broom(10);
-//}
+
+TEST_CASE("WorkStealing.Waiter.2threads") {
+  waiter(2);
+}
+
+TEST_CASE("WorkStealing.Waiter.3threads") {
+  waiter(3);
+}
+
+TEST_CASE("WorkStealing.Waiter.4threads") {
+  waiter(4);
+}
+
+TEST_CASE("WorkStealing.Waiter.5threads") {
+  waiter(5);
+}
+
+TEST_CASE("WorkStealing.Waiter.6threads") {
+  waiter(6);
+}
+
+TEST_CASE("WorkStealing.Waiter.7threads") {
+  waiter(7);
+}
+
+TEST_CASE("WorkStealing.Waiter.8threads") {
+  waiter(8);
+}
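+
+// Note on the waiter test above: it assumes tf::Executor::num_waiters()
+// reports how many workers have stopped stealing and gone to sleep. Spinning
+// until num_waiters() == W therefore observes a fully idle executor, while a
+// task that is itself running on a worker can see at most W-1 waiters.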
+#endif  // TF_CPP >= TF_CPP20
 
 // ----------------------------------------------------------------------------
 // Continuation
 // ----------------------------------------------------------------------------
 
-void continuation_test(size_t W) {
+void continuation(size_t W) {
 
   tf::Taskflow taskflow;
   tf::Executor executor(W);
@@ -1204,42 +1026,35 @@
 }
 
 TEST_CASE("WorkStealing.Continuation.1thread" * doctest::timeout(300)) {
-  continuation_test(1);
+  continuation(1);
 }
 
 TEST_CASE("WorkStealing.Continuation.2threads" * doctest::timeout(300)) {
-  continuation_test(2);
+  continuation(2);
 }
 
 TEST_CASE("WorkStealing.Continuation.3threads" * doctest::timeout(300)) {
-  continuation_test(3);
+  continuation(3);
 }
 
 TEST_CASE("WorkStealing.Continuation.4threads" * doctest::timeout(300)) {
-  continuation_test(4);
+  continuation(4);
 }
 
 TEST_CASE("WorkStealing.Continuation.5threads" * doctest::timeout(300)) {
-  continuation_test(5);
+  continuation(5);
 }
 
 TEST_CASE("WorkStealing.Continuation.6threads" * doctest::timeout(300)) {
-  continuation_test(6);
+  continuation(6);
 }
 
 TEST_CASE("WorkStealing.Continuation.7threads" * doctest::timeout(300)) {
-  continuation_test(7);
+  continuation(7);
 }
 
 TEST_CASE("WorkStealing.Continuation.8threads" * doctest::timeout(300)) {
-  continuation_test(8);
+  continuation(8);
 }
-
-
-
-
-
-
-
diff --git a/unittests/test_workers.cpp b/unittests/test_workers.cpp
new file mode 100644
index 000000000..ae350fd61
--- /dev/null
+++ b/unittests/test_workers.cpp
@@ -0,0 +1,82 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+
+#include <doctest.h>
+#include <taskflow/taskflow.hpp>
+
+class CustomWorkerBehavior : public tf::WorkerInterface {
+
+  public:
+
+  CustomWorkerBehavior(std::atomic<size_t>& counter, std::vector<size_t>& ids) :
+    _counter {counter},
+    _ids     {ids} {
+  }
+
+  void scheduler_prologue(tf::Worker& wv) override {
+    _counter++;
+
+    std::scoped_lock lock(_mutex);
+    _ids.push_back(wv.id());
+  }
+
+  void scheduler_epilogue(tf::Worker&, std::exception_ptr) override {
+    _counter++;
+  }
+
+  std::atomic<size_t>& _counter;
+  std::vector<size_t>& _ids;
+
+  std::mutex _mutex;
+
+};
+
+void worker_interface_basics(unsigned W) {
+
+  std::atomic<size_t> counter{0};
+  std::vector<size_t> ids;
+
+  {
+    tf::Executor executor(W, tf::make_worker_interface<CustomWorkerBehavior>(counter, ids));
+  }
+
+  REQUIRE(counter == W*2);
+  REQUIRE(ids.size() == W);
+
+  std::sort(ids.begin(), ids.end());
+
+  // after sorting, the worker ids should be exactly 0, 1, ..., W-1
+  for(size_t i=0; i<W; i++) {
+    REQUIRE(ids[i] == i);
+  }
+}
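+
+// Hypothetical registrations (a sketch following the naming convention used
+// elsewhere in this suite):
+TEST_CASE("WorkerInterface.Basics.1thread") {
+  worker_interface_basics(1);
+}
+
+TEST_CASE("WorkerInterface.Basics.2threads") {
+  worker_interface_basics(2);
+}
+
+TEST_CASE("WorkerInterface.Basics.4threads") {
+  worker_interface_basics(4);
+}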