diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..29bb15de --- /dev/null +++ b/.gitmodules @@ -0,0 +1,9 @@ +[submodule "benchmark/ext/BitArray"] + path = benchmark/ext/BitArray + url = https://github.com/noporpoise/BitArray.git +[submodule "benchmark/ext/itsy_bitsy"] + path = benchmark/ext/itsy_bitsy + url = https://github.com/ThePhD/itsy_bitsy.git +[submodule "benchmark/ext/dynamic_bitset"] + path = benchmark/ext/dynamic_bitset + url = https://github.com/pinam45/dynamic_bitset.git diff --git a/CMakeLists.txt b/CMakeLists.txt index b887e70f..4668a909 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.14) # set the project name -project(Bit-Vector VERSION 0.1.1) +project(Bit-Vector VERSION 0.3.0) # set output directory of builds #set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) @@ -46,6 +46,7 @@ option(BITLIB_HWY "Build with google highway SIMD extensions" OFF) option(BITLIB_BENCHMARK "Build bitlib benchmarks" OFF) option(BITLIB_EXAMPLE "Build bitlib examples" OFF) option(BITLIB_TEST "Build bitlib tests" OFF) +option(BITLIB_PROFILE "Build simple example for profiling" OFF) option(BITLIB_COVERAGE "Compute test coverage" OFF) if (BITLIB_HWY) @@ -64,3 +65,7 @@ if(BITLIB_TEST) add_subdirectory(test) endif() +if(BITLIB_PROFILE) + add_subdirectory(profile) +endif() + diff --git a/README.md b/README.md index ab8a7cee..7d5168b9 100644 --- a/README.md +++ b/README.md @@ -130,83 +130,117 @@ Given that the majority of the library is focused on having the same interface a # Performance Benchmarks I used Google's [benchmark](https://github.com/google/benchmark) library for computing benchmarks. Each benchmark is formatted as `{bit, BitArray, std}::function` (size) [(alignment-tags)]. -* `bit` is for this library, `BitArray` is for the popular C-based [BitArray library](https://github.com/noporpoise/BitArray), and`std` is the standard library operating on the infamous `vector`. 
-* (size) denotes the size of the container in bits. `small = 1 << 4`, `large = 1 << 16` +* `bit` is for this library, `BitArray` is for the popular C-based [BitArray library](https://github.com/noporpoise/BitArray), `dynamic_bitset` is for [dynamic_bitset](https://github.com/pinam45/dynamic_bitset), a header-only library similar to Boost's `dynamic_bitset`, and `std` is the standard library operating on the infamous `vector<bool>`. +* (size) denotes the size of the container in bits. `small = 1 << 8`, `medium = 1 << 16`, `large = 1 << 26`, `huge = 1 << 31` * (alignment-tags) refers to the memory alignment of the bit-iterators. `U` means the iterator does not fall on a word boundary, `R` means the iterator is placed at random, and `A` means the iterator is aligned with a word boundary. For example, `bit::rotate (large) (ARA)` refers to our library's implementation of the `rotate` algorithm operating on a container of 65536 bits, where `first` and `last` are aligned but `n_first` is selected at random. ``` -2022-05-04T16:54:22-05:00 -Running ./bin/bench -Run on (80 X 2899.73 MHz CPU s) -CPU Caches: - L1 Data 32 KiB (x40) - L1 Instruction 32 KiB (x40) - L2 Unified 1024 KiB (x40) - L3 Unified 28160 KiB (x2) -Load Average: 1.12, 0.98, 0.54 --------------------------------------------------------------------------------- -Benchmark Time CPU Iterations --------------------------------------------------------------------------------- -bit::shift_left (small) (AA) 4.79 ns 4.79 ns 146028612 -bit::shift_left (small) (UU) 3.72 ns 3.72 ns 187172020 -std::shift_left (small) 37.8 ns 37.8 ns 18507630 -bit::shift_left (large) (AA) 78.9 ns 78.9 ns 8887302 -bit::shift_left (large) (UU) 243 ns 243 ns 2887952 -std::shift_left (large) 156867 ns 156869 ns 4463 -bit::shift_right (small) (UU) 3.48 ns 3.48 ns 201058677 -std::shift_right (small) 35.7 ns 35.7 ns 19186367 -bit::shift_right (large) (AA) 68.3 ns 68.3 ns 10249245 -std::shift_right (large) 132458 ns 132461 ns 5276 -bit::reverse (small) (UU) 8.73 ns 8.73 ns 
80176090 -std::reverse (small) 39.9 ns 39.9 ns 17545669 -bit::reverse (large) (AA) 842 ns 842 ns 830385 -bit::reverse (large) (UU) 1157 ns 1157 ns 605963 -std::reverse (large) 285799 ns 285792 ns 2456 -bit::transform(UnaryOp) (small) (AA) 5.22 ns 5.22 ns 134034538 -bit::transform(UnaryOp) (small) (UU) 6.28 ns 6.28 ns 111084155 -std::transform(UnaryOp) (small) 50.6 ns 50.6 ns 13837852 -bit::transform(UnaryOp) (large) (AA) 238 ns 238 ns 2956037 -bit::transform(UnaryOp) (large) (UU) 2005 ns 2005 ns 349160 -std::transform(UnaryOp) (large) 192498 ns 192502 ns 3637 -bit::transform(BinaryOp) (small) (AA) 7.50 ns 7.50 ns 93300797 -bit::transform(BinaryOp) (small) (UU) 7.85 ns 7.85 ns 89176138 -std::transform(BinaryOp) (small) 37.1 ns 37.1 ns 18848167 -bit::transform(BinaryOp) (large) (AA) 345 ns 345 ns 2030257 -bit::transform(BinaryOp) (large) (UU) 12924 ns 12925 ns 54165 -std::transform(BinaryOp) (large) 619243 ns 619246 ns 1134 -bit::rotate (small) (ARA) 9.14 ns 9.14 ns 123732722 -std::rotate (small) 79.7 ns 79.7 ns 9138769 -bit::rotate (large) (ARA) 7617 ns 7617 ns 92147 -std::rotate (large) 582126 ns 582135 ns 1207 -bit::count (small) (AA) 2.29 ns 2.29 ns 299434270 -std::count (small) 15.2 ns 15.2 ns 45934612 -bit::count (large) (AA) 457 ns 457 ns 1533128 -std::count (large) 57501 ns 57501 ns 12174 -bit::swap_ranges (small) (AA) 6.76 ns 6.76 ns 103735181 -bit::swap_ranges (small) (UU) 5.43 ns 5.43 ns 128688535 -std::swap_ranges (small) 27.8 ns 27.8 ns 25309938 -bit::swap_ranges (large) (AA) 446 ns 446 ns 1570781 -bit::swap_ranges (large) (UU) 5496 ns 5496 ns 127033 -std::swap_ranges (large) 507092 ns 507093 ns 1380 -bit::copy (small) (UU) 6.22 ns 6.22 ns 110731355 -std::copy (small) 27.7 ns 27.7 ns 25261667 -bit::copy (large) (UU) 5367 ns 5367 ns 130292 -std::copy (large) 184520 ns 184523 ns 3794 -bit::equal (small) (UU) 3.64 ns 3.64 ns 193325012 -std::equal (small) 32.2 ns 32.2 ns 21650629 -bit::equal (large) (UU) 1799 ns 1799 ns 389158 -std::equal (large) 200078 ns 
200080 ns 3499 -bit::move (small) (UU) 6.31 ns 6.31 ns 110834953 -std::move (small) 27.7 ns 27.7 ns 25270665 -bit::move (large) (UU) 5372 ns 5372 ns 130464 -std::move (large) 184090 ns 184094 ns 3803 -bit::copy_backward (small) (UU) 9.60 ns 9.60 ns 72952203 -std::copy_backward (small) 19.9 ns 19.9 ns 35227170 -bit::copy_backward (large) (UU) 7602 ns 7602 ns 92137 -std::copy_backward (large) 431622 ns 431616 ns 1619 -bit::fill (small) (UU) 4.35 ns 4.35 ns 160834380 -std::fill (small) 2.35 ns 2.35 ns 297524146 -bit::fill (huge) (UU) 17138 ns 17137 ns 40748 -std::fill (huge) 11840 ns 11839 ns 59666 +--------------------------------------------------------------------------------------- +Benchmark Time CPU Iterations +--------------------------------------------------------------------------------------- +bit::set (large) 1.90 ns 1.90 ns 367974893 +dynamic_bitset::set (large) 2.37 ns 2.37 ns 296837879 +bitarray::set (large) 2.19 ns 2.19 ns 319133940 +std::set (large) 2.39 ns 2.39 ns 293135332 +bit::shift_left (small) 26.8 ns 26.8 ns 25929070 +bit::shift_left (small) (UU) 22.4 ns 22.4 ns 31233265 +dynamic_bitset::shift_left (small) 13.1 ns 13.1 ns 53627207 +bitarray::shift_left (small) 38.2 ns 38.2 ns 18339126 +std::shift_left (small) 345 ns 345 ns 2029283 +bit::shift_left (large) 371224 ns 371211 ns 1886 +bit::shift_left (large) (UU) 371536 ns 371530 ns 1880 +dynamic_bitset::shift_left (large) 638896 ns 638880 ns 1097 +bitarray::shift_left (large) 3156273 ns 3156003 ns 222 +std::shift_left (large) 105227752 ns 105223527 ns 7 +bit::shift_right (small) 26.9 ns 26.9 ns 25976563 +bit::shift_right (small) (UU) 39.3 ns 39.3 ns 17962533 +dynamic_bitset::shift_right (small) 12.2 ns 12.2 ns 57419526 +bitarray::shift_right (small) 38.1 ns 38.1 ns 18325350 +std::shift_right (small) 504 ns 504 ns 1386280 +bit::shift_right (large) 413297 ns 413269 ns 1693 +bit::shift_right (large) (UU) 413692 ns 413655 ns 1682 +dynamic_bitset::shift_right (large) 557287 ns 557305 ns 1257 
+bitarray::shift_right (large) 3156463 ns 3156516 ns 222 +std::shift_right (large) 210100788 ns 210083631 ns 3 +bit::reverse (small) (UU) 43.4 ns 43.4 ns 16112098 +bitarray::reverse (small) (UU) 95.1 ns 95.1 ns 7387177 +std::reverse (small) 419 ns 419 ns 1677069 +bit::reverse (large) 1245260 ns 1245160 ns 563 +bit::reverse (large) (UU) 1800771 ns 1800680 ns 389 +bitarray::reverse (large) 16899481 ns 16898587 ns 41 +bitarray::reverse (large) (UU) 22719408 ns 22720393 ns 31 +std::reverse (large) 293563397 ns 293542850 ns 2 +bit::transform(UnaryOp) (small) 8.75 ns 8.75 ns 80079214 +bit::transform(UnaryOp) (small) (UU) 16.6 ns 16.6 ns 42254961 +dynamic_bitset::transform(UnaryOp) (small) 4.00 ns 4.00 ns 169219246 +bitarray::transform(UnaryOp) (small) 8.39 ns 8.39 ns 83877004 +std::transform(UnaryOp) (small) 763 ns 763 ns 917975 +bit::transform(UnaryOp) (large) 373982 ns 373950 ns 1853 +bit::transform(UnaryOp) (large) (UU) 2059234 ns 2059268 ns 339 +dynamic_bitset::transform(UnaryOp) (large) 379368 ns 379368 ns 1805 +bitarray::transform(UnaryOp) (large) 739552 ns 739544 ns 881 +std::transform(UnaryOp) (large) 197977698 ns 197969224 ns 4 +bit::transform(BinaryOp) (small) 4.38 ns 4.38 ns 160002060 +bit::transform(BinaryOp) (small) (UU) 42.1 ns 42.1 ns 16549758 +dynamic_bitset::transform(BinaryOp) (small) 4.36 ns 4.36 ns 160692979 +bitarray::transform(BinaryOp) (small) 10.7 ns 10.7 ns 66178974 +std::transform(BinaryOp) (small) 855 ns 855 ns 832115 +bit::transform(BinaryOp) (large) 763642 ns 763574 ns 912 +bit::transform(BinaryOp) (large) (UU) 10966202 ns 10966406 ns 64 +dynamic_bitset::transform(BinaryOp) (large) 758617 ns 758574 ns 906 +bitarray::transform(BinaryOp) (large) 518286 ns 518267 ns 1177 +std::transform(BinaryOp) (large) 802270688 ns 802303941 ns 1 +bit::rotate (small) 131 ns 131 ns 16525922 +std::rotate (small) 1782 ns 1782 ns 417293 +bit::rotate (large) 7333284 ns 7333170 ns 96 +std::rotate (large) 514697313 ns 514718779 ns 1 +bit::count (small) 8.14 ns 8.14 
ns 86522765 +dynamic_bitset::count (small) 6.29 ns 6.29 ns 108878018 +bitarray::count (small) 5.47 ns 5.47 ns 133692569 +std::count (small) 234 ns 234 ns 2997782 +bit::count (large) 365194 ns 365159 ns 1919 +dynamic_bitset::count (large) 365279 ns 365269 ns 1919 +bitarray::count (large) 917302 ns 917185 ns 764 +std::count (large) 58934071 ns 58931785 ns 12 +bit::swap_ranges (small) 9.58 ns 9.57 ns 73128377 +bit::swap_ranges (small) (UU) 19.7 ns 19.7 ns 35498474 +std::swap_ranges (small) 756 ns 756 ns 912041 +bit::swap_ranges (large) 852205 ns 852241 ns 821 +bit::swap_ranges (large) (UU) 5691899 ns 5692145 ns 123 +std::swap_ranges (large) 522198664 ns 522161939 ns 1 +bit::copy (small) (UU) 25.0 ns 25.0 ns 28200772 +std::copy (small) 707 ns 707 ns 990757 +bit::copy (large) (UU) 5952278 ns 5951729 ns 116 +std::copy (large) 189551338 ns 189554366 ns 4 +bit::equal (small) (UU) 13.1 ns 13.1 ns 53616228 +std::equal (small) 886 ns 886 ns 790035 +bit::equal (large) (UU) 1960399 ns 1960375 ns 357 +std::equal (large) 234389098 ns 234398907 ns 3 +bit::move (small) (UU) 23.5 ns 23.5 ns 29764745 +std::move (small) 706 ns 706 ns 992054 +bit::move (large) (UU) 5135837 ns 5135619 ns 136 +std::move (large) 188961979 ns 188953500 ns 4 +bit::copy_backward (small) (UU) 39.0 ns 39.0 ns 17977387 +std::copy_backward (small) 527 ns 527 ns 1313265 +bit::copy_backward (large) (UU) 9163333 ns 9163038 ns 76 +std::copy_backward (large) 444362971 ns 444350668 ns 2 +bit::fill (small) (UU) 6.48 ns 6.48 ns 108934237 +dynamic_bitset::fill (small) 4.79 ns 4.79 ns 146205764 +bitarray::fill (small) 14.5 ns 14.5 ns 48030428 +std::fill (small) 9.15 ns 9.15 ns 76612702 +bit::fill (large) (UU) 440400 ns 440396 ns 1590 +dynamic_bitset::fill (large) 429375 ns 429359 ns 1631 +bitarray::fill (large) 369732 ns 369736 ns 1964 +std::fill (large) 356517 ns 356488 ns 1894 +bit::find (small) (UU) 3.10 ns 3.10 ns 228714994 +dynamic_bitset::find (small) 3.05 ns 3.05 ns 229830138 +bitarray::find (small) 7.38 ns 7.38 ns 
99039746 +std::find (small) 110 ns 110 ns 6311725 +bit::find (large) (UU) 182002 ns 182006 ns 3850 +dynamic_bitset::find (large) 259896 ns 259908 ns 2696 +bitarray::find (large) 252434 ns 252445 ns 2774 +std::find (large) 28570723 ns 28567762 ns 25 +``` + diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index b5d8dfd1..6870bea1 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -12,9 +12,15 @@ set(CMAKE_BUILD_TYPE Release) file(GLOB BENCH_SOURCES "src/*.cc") add_executable(bitlib-bench ${BENCH_SOURCES}) +add_subdirectory(ext/dynamic_bitset) + # specify benchmark-specific libraries -include_directories(${googlebench_SOURCE_DIR}/benchmark/include src/utils) -target_link_libraries(bitlib-bench PRIVATE benchmark::benchmark -pthread) +include_directories( + ${googlebench_SOURCE_DIR}/benchmark/include + src/utils + ext/BitArray + ext/itsy_bitsy/include) +target_link_libraries(bitlib-bench PRIVATE benchmark::benchmark -pthread ${CMAKE_CURRENT_LIST_DIR}/ext/BitArray/libbitarr.a sul::dynamic_bitset) target_compile_options(bitlib-bench PUBLIC -O3 -DNDEBUG -march=native -Wpedantic) install(TARGETS bitlib-bench DESTINATION .) 
diff --git a/benchmark/src/benchmark_main.cc b/benchmark/src/benchmark_main.cc index bb68366b..a8cad2dc 100644 --- a/benchmark/src/benchmark_main.cc +++ b/benchmark/src/benchmark_main.cc @@ -1,10 +1,9 @@ // =============================== TEST ROOT ================================ // // Project: The Experimental Bit Algorithms Library -// Name: test_root.cc +// Name: benchmark_main.cc // Description: Brings in all of the test headers into an object to be linked // with the test main -// Creator: Vincent Reverdy -// Contributor(s): Bryce Kille [2019] +// Contributor(s): Bryce Kille // License: BSD 3-Clause License // ========================================================================== // @@ -32,6 +31,7 @@ #include "swap_ranges-bench.hpp" #include "transform_bench.hpp" #include "equal_bench.hpp" +#include "rw_bench.hpp" // Third party libraries #include #include @@ -96,133 +96,231 @@ void register_bool_containers(F test_lambda_f, std::string func_name, unsigned i //BENCHMARK_MAIN(); int main(int argc, char** argv) { - unsigned int size_small = 1 << 4; - unsigned int size_medium = 1 << 8; - unsigned int size_large = 1 << 16; - unsigned int size_huge = 1 << 22; + unsigned int size_small = 1 << 8; + unsigned int size_medium = 1 << 16; + unsigned int size_large = 1 << 26; + unsigned int size_huge = 1 << 31; + // Read/write benchmarks + register_word_containers( + BM_BitSet, + "bit::set (large)", + size_large); + register_word_containers( + BM_DynamicBitsetSet, + "dynamic_bitset::set (large)", + size_large); + register_word_containers( + BM_BitArraySet, + "bitarray::set (large)", + size_large); + register_bool_containers( + BM_BoolSet, + "std::set (large)", + size_large); + // Shift benchmarks register_word_containers( BM_BitShiftLeft, - "bit::shift_left (small) (AA)", + "bit::shift_left (small)", size_small); register_word_containers( BM_BitShiftLeft_UU, "bit::shift_left (small) (UU)", size_small); + register_word_containers( + BM_DynamicBitsetShiftLeft, + 
"dynamic_bitset::shift_left (small)", + size_small); + register_word_containers( + BM_BitArrayShiftLeft, + "bitarray::shift_left (small)", + size_small); register_bool_containers( BM_BoolShiftLeft, "std::shift_left (small)", size_small); register_word_containers( BM_BitShiftLeft, - "bit::shift_left (large) (AA)", - size_huge); + "bit::shift_left (large)", + size_large); register_word_containers( BM_BitShiftLeft_UU, "bit::shift_left (large) (UU)", - size_huge); + size_large); + register_word_containers( + BM_DynamicBitsetShiftLeft, + "dynamic_bitset::shift_left (large)", + size_large); + register_word_containers( + BM_BitArrayShiftLeft, + "bitarray::shift_left (large) ", + size_large); register_bool_containers( BM_BoolShiftLeft, "std::shift_left (large)", - size_huge); + size_large); + register_word_containers( + BM_BitShiftRight, + "bit::shift_right (small) ", + size_small); register_word_containers( BM_BitShiftRight_UU, "bit::shift_right (small) (UU)", size_small); + register_word_containers( + BM_DynamicBitsetShiftRight, + "dynamic_bitset::shift_right (small)", + size_small); + register_word_containers( + BM_BitArrayShiftRight, + "bitarray::shift_right (small)", + size_small); register_bool_containers( BM_BoolShiftRight, "std::shift_right (small)", size_small); register_word_containers( BM_BitShiftRight, - "bit::shift_right (large) (AA)", - size_huge); + "bit::shift_right (large) ", + size_large); register_word_containers( BM_BitShiftRight_UU, "bit::shift_right (large) (UU)", - size_huge); + size_large); + register_word_containers( + BM_DynamicBitsetShiftRight, + "dynamic_bitset::shift_right (large)", + size_large); + register_word_containers( + BM_BitArrayShiftRight, + "bitarray::shift_right (large)", + size_large); register_bool_containers( BM_BoolShiftRight, "std::shift_right (large)", - size_huge); + size_large); // Reverse benchmarks register_word_containers( BM_BitReverse_UU, "bit::reverse (small) (UU)", size_small); + register_word_containers( + 
BM_BitArrayReverse_UU, + "bitarray::reverse (small) (UU)", + size_small); register_bool_containers( BM_BoolReverse, "std::reverse (small)", size_small); register_word_containers( BM_BitReverse, - "bit::reverse (large) (AA)", - size_huge); + "bit::reverse (large) ", + size_large); register_word_containers( BM_BitReverse_UU, "bit::reverse (large) (UU)", - size_huge); + size_large); + register_word_containers( + BM_BitArrayReverse, + "bitarray::reverse (large)", + size_large); + register_word_containers( + BM_BitArrayReverse_UU, + "bitarray::reverse (large) (UU)", + size_large); register_bool_containers( BM_BoolReverse, "std::reverse (large)", - size_huge); + size_large); // transform benchmarks register_word_containers( BM_BitTransformUnaryAA, - "bit::transform(UnaryOp) (small) (AA)", + "bit::transform(UnaryOp) (small) ", size_small); register_word_containers( BM_BitTransformUnaryUU, "bit::transform(UnaryOp) (small) (UU)", size_small); + register_word_containers( + BM_DynamicBitsetTransformUnary, + "dynamic_bitset::transform(UnaryOp) (small) ", + size_small); + register_word_containers( + BM_BitArrayTransformUnary, + "bitarray::transform(UnaryOp) (small) ", + size_small); register_bool_containers( BM_BoolTransformUnary, "std::transform(UnaryOp) (small)", size_small); register_word_containers( BM_BitTransformUnaryAA, - "bit::transform(UnaryOp) (large) (AA)", - size_huge); + "bit::transform(UnaryOp) (large) ", + size_large); register_word_containers( BM_BitTransformUnaryUU, "bit::transform(UnaryOp) (large) (UU)", - size_huge); + size_large); + register_word_containers( + BM_DynamicBitsetTransformUnary, + "dynamic_bitset::transform(UnaryOp) (large) ", + size_large); + register_word_containers( + BM_BitArrayTransformUnary, + "bitarray::transform(UnaryOp) (large) ", + size_large); register_bool_containers( BM_BoolTransformUnary, "std::transform(UnaryOp) (large)", - size_huge); + size_large); register_word_containers( BM_BitTransformBinaryAA, - "bit::transform(BinaryOp) 
(small) (AA)", + "bit::transform(BinaryOp) (small) ", size_small); register_word_containers( BM_BitTransformBinaryUU, "bit::transform(BinaryOp) (small) (UU)", size_small); + register_word_containers( + BM_DynamicBitsetTransformBinary, + "dynamic_bitset::transform(BinaryOp) (small) ", + size_small); + register_word_containers( + BM_BitArrayTransformBinary, + "bitarray::transform(BinaryOp) (small) ", + size_small); register_bool_containers( BM_BoolTransformBinary, "std::transform(BinaryOp) (small)", size_small); register_word_containers( BM_BitTransformBinaryAA, - "bit::transform(BinaryOp) (large) (AA)", - size_huge); + "bit::transform(BinaryOp) (large) ", + size_large); register_word_containers( BM_BitTransformBinaryUU, "bit::transform(BinaryOp) (large) (UU)", - size_huge); + size_large); + register_word_containers( + BM_DynamicBitsetTransformBinary, + "dynamic_bitset::transform(BinaryOp) (large) ", + size_large); + register_word_containers( + BM_BitArrayTransformBinary, + "bitarray::transform(BinaryOp) (large) ", + size_large); register_bool_containers( BM_BoolTransformBinary, "std::transform(BinaryOp) (large)", - size_huge); + size_large); // Rotate benchmarks register_word_containers( BM_BitRotate, - "bit::rotate (small) (ARA)", + "bit::rotate (small)", size_small); register_bool_containers( BM_BoolRotate, @@ -230,17 +328,25 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitRotate, - "bit::rotate (large) (ARA)", - size_huge); + "bit::rotate (large)", + size_large); register_bool_containers( BM_BoolRotate, "std::rotate (large)", - size_huge); + size_large); // Count benchmarks register_word_containers( BM_BitCount, - "bit::count (small) (AA)", + "bit::count (small) ", + size_small); + register_word_containers( + BM_DynamicBitsetCount, + "dynamic_bitset::count (small)", + size_small); + register_word_containers( + BM_BitArrayCount, + "bitarray::count (small)", size_small); register_bool_containers( BM_BoolCount, @@ -248,17 +354,25 @@ 
int main(int argc, char** argv) { size_small); register_word_containers( BM_BitCount, - "bit::count (large) (AA)", - size_huge); + "bit::count (large) ", + size_large); + register_word_containers( + BM_DynamicBitsetCount, + "dynamic_bitset::count (large)", + size_large); + register_word_containers( + BM_BitArrayCount, + "bitarray::count (large)", + size_large); register_bool_containers( BM_BoolCount, "std::count (large)", - size_huge); + size_large); // swap_ranges benchmarks register_word_containers( BM_BitSwapRangesAA, - "bit::swap_ranges (small) (AA)", + "bit::swap_ranges (small) ", size_small); register_word_containers( BM_BitSwapRangesUU, @@ -270,16 +384,16 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitSwapRangesAA, - "bit::swap_ranges (large) (AA)", - size_huge); + "bit::swap_ranges (large) ", + size_large); register_word_containers( BM_BitSwapRangesUU, "bit::swap_ranges (large) (UU)", - size_huge); + size_large); register_bool_containers( BM_BoolSwapRanges, "std::swap_ranges (large)", - size_huge); + size_large); // copy benchmarks register_word_containers( @@ -293,11 +407,11 @@ int main(int argc, char** argv) { register_word_containers( BM_BitCopy, "bit::copy (large) (UU)", - size_huge); + size_large); register_bool_containers( BM_BoolCopy, "std::copy (large)", - size_huge); + size_large); // Equal benchmarks register_word_containers( @@ -311,11 +425,11 @@ int main(int argc, char** argv) { register_word_containers( BM_BitEqual, "bit::equal (large) (UU)", - size_huge); + size_large); register_bool_containers( BM_BoolEqual, "std::equal (large)", - size_huge); + size_large); // move benchmarks register_word_containers( @@ -329,11 +443,11 @@ int main(int argc, char** argv) { register_word_containers( BM_BitMove, "bit::move (large) (UU)", - size_huge); + size_large); register_bool_containers( BM_BoolMove, "std::move (large)", - size_huge); + size_large); // copy_backward benchmarks register_word_containers( @@ -347,65 +461,97 
@@ int main(int argc, char** argv) { register_word_containers( BM_BitCopyBackward, "bit::copy_backward (large) (UU)", - size_huge); + size_large); register_bool_containers( BM_BoolCopyBackward, "std::copy_backward (large)", - size_huge); + size_large); // fill benchmarks register_word_containers( BM_BitFill, "bit::fill (small) (UU)", size_small); + register_bool_containers( + BM_DynamicBitsetFill, + "dynamic_bitset::fill (small)", + size_small); + register_bool_containers( + BM_BitArrayFill, + "bitarray::fill (small)", + size_small); register_bool_containers( BM_BoolFill, "std::fill (small)", size_small); register_word_containers( BM_BitFill, - "bit::fill (huge) (UU)", - size_huge); + "bit::fill (large) (UU)", + size_large); + register_bool_containers( + BM_DynamicBitsetFill, + "dynamic_bitset::fill (large)", + size_large); + register_bool_containers( + BM_BitArrayFill, + "bitarray::fill (large)", + size_large); register_bool_containers( BM_BoolFill, - "std::fill (huge)", - size_huge); + "std::fill (large)", + size_large); // find benchmarks register_word_containers( BM_BitFind, "bit::find (small) (UU)", size_small); + register_word_containers( + BM_DynamicBitsetFind, + "dynamic_bitset::find (small)", + size_small); + register_word_containers( + BM_BitArrayFind, + "bitarray::find (small)", + size_small); register_bool_containers( BM_BoolFind, "std::find (small)", size_small); register_word_containers( BM_BitFind, - "bit::find (huge) (UU)", - size_huge); + "bit::find (large) (UU)", + size_large); + register_word_containers( + BM_DynamicBitsetFind, + "dynamic_bitset::find (large)", + size_large); + register_word_containers( + BM_BitArrayFind, + "bitarray::find (large)", + size_large); register_bool_containers( BM_BoolFind, - "std::find (huge)", - size_huge); + "std::find (large)", + size_large); //// Search benchmarks //register_word_containers( //BM_BitSearch, //"Search_Bit_Large", - //size_huge); + //size_large); //register_bool_containers( //BM_BoolSearch, 
//"Search_Bool_Large", - //size_huge); + //size_large); //register_word_containers( //BM_BitSearch_WorstCase, //"Search_Bit_Large_WorstCase", - //size_huge); + //size_large); //register_bool_containers( //BM_BoolSearch_WorstCase, //"Search_Bool_Large_WorstCase", - //size_huge); + //size_large); benchmark::Initialize(&argc, argv); benchmark::RunSpecifiedBenchmarks(); } diff --git a/benchmark/src/count_bench.hpp b/benchmark/src/count_bench.hpp index 2ef9507d..a169d005 100644 --- a/benchmark/src/count_bench.hpp +++ b/benchmark/src/count_bench.hpp @@ -2,6 +2,8 @@ #include #include "test_utils.hpp" #include "bitlib/bit-algorithms/count.hpp" +#include "bit_array.h" +#include "sul/dynamic_bitset.hpp" auto BM_BitCount = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; @@ -19,6 +21,29 @@ auto BM_BitCount = [](benchmark::State& state, auto input) { }; +auto BM_BitArrayCount = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + for (auto _ : state) { + benchmark::DoNotOptimize(bit_array_num_bits_set(bitarr)); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + +auto BM_DynamicBitsetCount = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + sul::dynamic_bitset<> bitset1(total_bits, 1); + for (auto _ : state) { + benchmark::DoNotOptimize(bitset1.count()); + benchmark::ClobberMemory(); + } +}; + auto BM_BoolCount = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; diff --git 
a/benchmark/src/fill_bench.hpp b/benchmark/src/fill_bench.hpp index 58ff116e..9b17b932 100644 --- a/benchmark/src/fill_bench.hpp +++ b/benchmark/src/fill_bench.hpp @@ -1,6 +1,8 @@ #include #include #include "bitlib/bit-algorithms/fill.hpp" +#include "bit_array.h" +#include "sul/dynamic_bitset.hpp" auto BM_BitFill = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; @@ -17,6 +19,30 @@ auto BM_BitFill = [](benchmark::State& state, auto input) { } }; +auto BM_BitArrayFill = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + for (auto _ : state) { + bit_array_set_region(bitarr, 2, total_bits - 5); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + +auto BM_DynamicBitsetFill = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + using iterator_type = typename container_type::iterator; + unsigned int total_bits = std::get<2>(input); + sul::dynamic_bitset<> bitset1(total_bits, 0); + for (auto _ : state) { + bitset1.set(2, total_bits - 5, true); + benchmark::ClobberMemory(); + } +}; + auto BM_BoolFill = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; diff --git a/benchmark/src/find_bench.hpp b/benchmark/src/find_bench.hpp index 8d49faa3..050f8c92 100644 --- a/benchmark/src/find_bench.hpp +++ b/benchmark/src/find_bench.hpp @@ -1,6 +1,8 @@ #include #include #include "bitlib/bit-algorithms/find.hpp" +#include "bit_array.h" +#include "sul/dynamic_bitset.hpp" auto BM_BitFind = [](benchmark::State& state, auto input) { using 
container_type = typename std::tuple_element<0, decltype(input)>::type; @@ -11,13 +13,40 @@ auto BM_BitFind = [](benchmark::State& state, auto input) { container_type bitcont(container_size); auto first = bit::bit_iterator(std::begin(bitcont)); auto last = bit::bit_iterator(std::end(bitcont)); - *(first + (bitcont.size() / 2) + 4) = bit::bit1; + *(first + total_bits / 2 + 4) = bit::bit1; for (auto _ : state) { benchmark::DoNotOptimize(bit::find(first + 2, last - 3, bit::bit1)); benchmark::ClobberMemory(); } }; +auto BM_BitArrayFind = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + bit_array_set_bit(bitarr, total_bits/2 + 4); + bit_index_t result; + for (auto _ : state) { + benchmark::DoNotOptimize(bit_array_find_first_set_bit(bitarr, &result)); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + +auto BM_DynamicBitsetFind = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + using iterator_type = typename container_type::iterator; + unsigned int total_bits = std::get<2>(input); + sul::dynamic_bitset<> bitset1(total_bits, 0); + bitset1[total_bits / 2 + 4] = 1; + for (auto _ : state) { + benchmark::DoNotOptimize(bitset1.find_first()); + benchmark::ClobberMemory(); + } +}; + auto BM_BoolFind = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; diff --git a/benchmark/src/reverse_bench.hpp b/benchmark/src/reverse_bench.hpp index 9779f7ed..b83a0928 100644 --- a/benchmark/src/reverse_bench.hpp +++ b/benchmark/src/reverse_bench.hpp @@ -32,6 +32,30 @@ auto BM_BitReverse_UU = 
[](benchmark::State& state, auto input) { } }; +auto BM_BitArrayReverse = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + for (auto _ : state) { + bit_array_reverse(bitarr); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + +auto BM_BitArrayReverse_UU = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + for (auto _ : state) { + bit_array_reverse_region(bitarr, 2, total_bits - 5); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + auto BM_BoolReverse = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; diff --git a/benchmark/src/rw_bench.hpp b/benchmark/src/rw_bench.hpp index d9baae25..2aac15d1 100644 --- a/benchmark/src/rw_bench.hpp +++ b/benchmark/src/rw_bench.hpp @@ -1,6 +1,6 @@ #include #include "test_utils.hpp" -#include +#include "sul/dynamic_bitset.hpp" #include "bitlib/bitlib.hpp" auto BM_BitSet = [](benchmark::State& state, auto input) { @@ -12,11 +12,13 @@ auto BM_BitSet = [](benchmark::State& state, auto input) { auto bitvec1 = get_random_vec(container_size); auto first1 = bit::bit_iterator(std::begin(bitvec1)); - for (auto _ : state) + for (auto _ : state) { benchmark::DoNotOptimize(first1[total_bits/2] = bit::bit1); + benchmark::ClobberMemory(); + } }; -auto BM_CBitArrSet = [](benchmark::State& state, auto input) { +auto BM_BitArraySet = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, 
decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); @@ -24,15 +26,18 @@ auto BM_CBitArrSet = [](benchmark::State& state, auto input) { BIT_ARRAY* bitarr = bit_array_create(total_bits); for (auto _ : state) + { bit_array_set_bit(bitarr, total_bits/2); + benchmark::ClobberMemory(); + } }; -auto BM_BoostSet = [](benchmark::State& state, auto input) { +auto BM_DynamicBitsetSet = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); - boost::dynamic_bitset x(total_bits); + sul::dynamic_bitset x(total_bits); container_type boolvec1 = make_random_container (total_bits); for (auto i = 0; i < total_bits; ++i) { x[i] = boolvec1[i]; @@ -40,6 +45,7 @@ auto BM_BoostSet = [](benchmark::State& state, auto input) { for (auto _ : state) { (x[total_bits/2] = true); + benchmark::ClobberMemory(); } }; @@ -50,9 +56,13 @@ auto BM_BoolSet = [](benchmark::State& state, auto input) { container_type boolvec1 = make_random_container (container_size); for (auto _ : state) + { benchmark::DoNotOptimize(boolvec1[container_size/2] = true); + benchmark::ClobberMemory(); + } }; + auto BM_BitGet = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; @@ -62,11 +72,13 @@ auto BM_BitGet = [](benchmark::State& state, auto input) { auto bitvec1 = get_random_vec(container_size); auto first1 = bit::bit_iterator(std::begin(bitvec1)); - for (auto _ : state) + for (auto _ : state) { benchmark::DoNotOptimize(first1[total_bits/2]); + benchmark::ClobberMemory(); + } }; -auto BM_CBitArrGet = [](benchmark::State& state, auto input) { +auto BM_BitArrayGet = [](benchmark::State& state, auto input) { using 
container_type = typename std::tuple_element<0, decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); @@ -74,15 +86,18 @@ auto BM_CBitArrGet = [](benchmark::State& state, auto input) { BIT_ARRAY* bitarr = bit_array_create(total_bits); for (auto _ : state) + { benchmark::DoNotOptimize(bit_array_get_bit(bitarr, total_bits/2)); + benchmark::ClobberMemory(); + } }; -auto BM_BoostGet = [](benchmark::State& state, auto input) { +auto BM_DynamicBitsetGet = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); - boost::dynamic_bitset x(total_bits); + sul::dynamic_bitset x(total_bits); container_type boolvec1 = make_random_container (total_bits); for (auto i = 0; i < total_bits; ++i) { x[i] = boolvec1[i]; @@ -90,6 +105,7 @@ auto BM_BoostGet = [](benchmark::State& state, auto input) { for (auto _ : state) { benchmark::DoNotOptimize(x[total_bits/2]); + benchmark::ClobberMemory(); } }; @@ -98,10 +114,10 @@ auto BM_BoolGet = [](benchmark::State& state, auto input) { unsigned int total_bits = std::get<2>(input); auto container_size = total_bits; container_type boolvec1 = make_random_container (container_size); - - for (auto _ : state){ - bool b; - benchmark::DoNotOptimize(b = boolvec1[container_size/2]); + bool x; + for (auto _ : state) + { + benchmark::DoNotOptimize(x = boolvec1[container_size/2]); + benchmark::ClobberMemory(); } }; - diff --git a/benchmark/src/shift_bench.hpp b/benchmark/src/shift_bench.hpp index d85d6089..8f38383e 100644 --- a/benchmark/src/shift_bench.hpp +++ b/benchmark/src/shift_bench.hpp @@ -1,7 +1,10 @@ #include #include +#include #include #include "bitlib/bitlib.hpp" +#include "bit_array.h" +#include "sul/dynamic_bitset.hpp" auto BM_BitShiftLeft = [](benchmark::State& state, auto input) 
{ @@ -13,13 +16,14 @@ auto BM_BitShiftLeft = [](benchmark::State& state, auto input) { container_type bitcont = make_random_container(container_size); auto first = bit::bit_iterator(std::begin(bitcont)); auto last = bit::bit_iterator(std::end(bitcont)); - auto n = bit::distance(first, last) / 2; + auto n = total_bits / 2 - 1; for (auto _ : state) { benchmark::DoNotOptimize(bit::shift_left(first, last, n)); benchmark::ClobberMemory(); } }; + auto BM_BitShiftLeft_UU = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using word_type = typename std::tuple_element<1, decltype(input)>::type; @@ -30,13 +34,40 @@ auto BM_BitShiftLeft_UU = [](benchmark::State& state, auto input) { container_type bitcont = make_random_container(container_size); bit::bit_iterator first = bit::bit_iterator(bitcont.begin()) + 1; bit::bit_iterator last = bit::bit_iterator(bitcont.end()) - 1; - auto n = bit::distance(first, last) / 2 + 6; + auto n = total_bits / 2 + 3; for (auto _ : state) { benchmark::DoNotOptimize(bit::shift_left(first, last, n)); benchmark::ClobberMemory(); } }; +auto BM_BitArrayShiftLeft = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + auto n = total_bits / 2 - 1; + for (auto _ : state) { + bit_array_shift_right(bitarr, n, 0); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + + +auto BM_DynamicBitsetShiftLeft = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + using iterator_type = typename container_type::iterator; + unsigned int total_bits = std::get<2>(input); + sul::dynamic_bitset<> 
bitset1(total_bits, 1); + auto n = total_bits / 2 - 1; + for (auto _ : state) { + bitset1 <<= n; + benchmark::ClobberMemory(); + } +}; + auto BM_BoolShiftLeft = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; @@ -44,7 +75,7 @@ auto BM_BoolShiftLeft = [](benchmark::State& state, auto input) { container_type cont = make_random_container(container_size); auto first = cont.begin(); auto last = cont.end(); - auto n = std::distance(first, last) / 2 + 6; + auto n = std::distance(first, last) / 2 - 1; for (auto _ : state) { benchmark::DoNotOptimize(bit::word_shift_left(first, last, n)); benchmark::ClobberMemory(); @@ -60,7 +91,7 @@ auto BM_BitShiftRight = [](benchmark::State& state, auto input) { container_type bitcont = make_random_container(container_size); auto first = bit::bit_iterator(std::begin(bitcont)); auto last = bit::bit_iterator(std::end(bitcont)); - auto n = bit::distance(first, last) / 2; + auto n = total_bits / 2 - 1; for (auto _ : state) { benchmark::DoNotOptimize(bit::shift_right(first, last, n)); benchmark::ClobberMemory(); @@ -74,15 +105,41 @@ auto BM_BitShiftRight_UU = [](benchmark::State& state, auto input) { auto digits = bit::binary_digits::value; auto container_size = ceil(float(total_bits) / digits); container_type bitcont = make_random_container(container_size); - auto first = bit::bit_iterator(std::begin(bitcont)) + 2; - auto last = bit::bit_iterator(std::end(bitcont)) - 3; - auto n = bit::distance(first, last) / 2 + 6; + auto first = bit::bit_iterator(std::begin(bitcont)) + 1; + auto last = bit::bit_iterator(std::end(bitcont)) - 1; + auto n = total_bits / 2 + 3; for (auto _ : state) { benchmark::DoNotOptimize(bit::shift_right(first, last, n)); benchmark::ClobberMemory(); } }; +auto BM_DynamicBitsetShiftRight = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename 
std::tuple_element<1, decltype(input)>::type; + using iterator_type = typename container_type::iterator; + unsigned int total_bits = std::get<2>(input); + sul::dynamic_bitset<> bitset1(total_bits, 1); + auto n = total_bits / 2 - 1; + for (auto _ : state) { + bitset1 >>= n; + benchmark::ClobberMemory(); + } +}; + +auto BM_BitArrayShiftRight = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + auto n = total_bits / 2 - 1; + for (auto _ : state) { + bit_array_shift_right(bitarr, n, 0); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + auto BM_BoolShiftRight = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; @@ -90,7 +147,7 @@ auto BM_BoolShiftRight = [](benchmark::State& state, auto input) { container_type cont = make_random_container(container_size); auto first = cont.begin(); auto last = cont.end(); - auto n = std::distance(first, last) / 2 + 6; + auto n = std::distance(first, last) / 2 - 1; for (auto _ : state) { benchmark::DoNotOptimize(bit::word_shift_right(first, last, n)); benchmark::ClobberMemory(); diff --git a/benchmark/src/transform_bench.hpp b/benchmark/src/transform_bench.hpp index eb91ca3b..673d4953 100644 --- a/benchmark/src/transform_bench.hpp +++ b/benchmark/src/transform_bench.hpp @@ -1,27 +1,32 @@ #include +#include #include #include "test_utils.hpp" #include "bitlib/bitlib.hpp" +#include "sul/dynamic_bitset.hpp" +#include "bit_array.h" auto BM_BitTransformUnaryAA = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); auto 
digits = bit::binary_digits::value; - auto container_size = total_bits / digits + 1; + auto container_size = total_bits / digits; auto bitvec1 = get_random_vec(container_size); auto first1 = bit::bit_iterator(std::begin(bitvec1)); - auto bitvec2 = get_random_vec(container_size); - auto first2 = bit::bit_iterator(std::begin(bitvec2)); + auto last1 = bit::bit_iterator(std::end(bitvec1)); - auto unary_op = std::bit_not(); + constexpr auto unary_op = std::bit_not(); for (auto _ : state) - bit::transform( + { + benchmark::DoNotOptimize(bit::transform( + first1, + last1, first1, - first1 + total_bits, - first2, unary_op - ); + )); + benchmark::ClobberMemory(); + } }; auto BM_BitTransformUnaryUU = [](benchmark::State& state, auto input) { @@ -32,20 +37,49 @@ auto BM_BitTransformUnaryUU = [](benchmark::State& state, auto input) { auto container_size = total_bits / digits + 1; auto bitvec1 = get_random_vec(container_size); auto first1 = bit::bit_iterator(std::begin(bitvec1)); - auto bitvec2 = get_random_vec(container_size); - auto first2 = bit::bit_iterator(std::begin(bitvec2)); - auto unary_op = std::bit_not(); + constexpr auto unary_op = std::bit_not(); for (auto _ : state) - bit::transform( + { + benchmark::DoNotOptimize(bit::transform( first1 + 2, first1 + total_bits - 4, - first2 + 3, + first1 + 1, unary_op - ); + )); + benchmark::ClobberMemory(); + } }; +auto BM_BitArrayTransformUnary = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + + for (auto _ : state) + { + bit_array_not(bitarr, bitarr); + benchmark::ClobberMemory(); + } +}; + +auto BM_DynamicBitsetTransformUnary = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using WordType = typename 
std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + auto digits = bit::binary_digits::value; + auto container_size = total_bits / digits + 1; + auto bitvec1 = get_random_vec(container_size); + sul::dynamic_bitset bitset1(total_bits, 1); + std::memcpy((char*)bitset1.data(), static_cast((bitvec1.data())), total_bits / 8); + for (auto _ : state) { + benchmark::DoNotOptimize(bitset1.flip()); + benchmark::ClobberMemory(); + } +}; + auto BM_BoolTransformUnary = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); @@ -58,12 +92,15 @@ auto BM_BoolTransformUnary = [](benchmark::State& state, auto input) { auto unary_op = [](bool b) {return !b;}; for (auto _ : state) + { std::transform( first1, first1 + total_bits, first2, unary_op ); + benchmark::ClobberMemory(); + } }; @@ -77,18 +114,19 @@ auto BM_BitTransformBinaryAA = [](benchmark::State& state, auto input) { auto first1 = bit::bit_iterator(std::begin(bitvec1)); auto bitvec2 = get_random_vec(container_size); auto first2 = bit::bit_iterator(std::begin(bitvec2)); - auto bitvec3 = get_random_vec(container_size); - auto first3 = bit::bit_iterator(std::begin(bitvec3)); - auto binary_op = std::bit_and(); + constexpr auto binary_op = std::bit_and(); for (auto _ : state) + { bit::transform( first1, first1 + total_bits, first2, - first3, + first2, binary_op ); + benchmark::ClobberMemory(); + } }; auto BM_BitTransformBinaryUU = [](benchmark::State& state, auto input) { @@ -101,18 +139,51 @@ auto BM_BitTransformBinaryUU = [](benchmark::State& state, auto input) { auto first1 = bit::bit_iterator(std::begin(bitvec1)); auto bitvec2 = get_random_vec(container_size); auto first2 = bit::bit_iterator(std::begin(bitvec2)); - auto bitvec3 = get_random_vec(container_size); - auto first3 = bit::bit_iterator(std::begin(bitvec3)); - auto binary_op = std::bit_and(); + constexpr auto 
binary_op = std::bit_and(); for (auto _ : state) + { bit::transform( first1 + 2, first1 + total_bits - 4, first2 + 3, - first3 + 1, + first2 + 1, binary_op ); + benchmark::ClobberMemory(); + } +}; + +auto BM_DynamicBitsetTransformBinary = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using WordType = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + auto digits = bit::binary_digits::value; + auto container_size = total_bits / digits + 1; + auto bitvec1 = get_random_vec(container_size); + auto bitvec2 = get_random_vec(container_size); + sul::dynamic_bitset bitset1(total_bits, 1); + sul::dynamic_bitset bitset2(total_bits, 1); + std::memcpy((char*)bitset1.data(), static_cast((bitvec1.data())), total_bits / 8); + std::memcpy((char*)bitset2.data(), static_cast((bitvec2.data())), total_bits / 8); + for (auto _ : state) { + benchmark::DoNotOptimize(bitset1 &= bitset2); + benchmark::ClobberMemory(); + } +}; + +auto BM_BitArrayTransformBinary = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + BIT_ARRAY* bitarr2 = bit_array_create(total_bits); + + for (auto _ : state) + { + bit_array_and(bitarr, bitarr, bitarr2); + benchmark::ClobberMemory(); + } }; @@ -125,17 +196,19 @@ auto BM_BoolTransformBinary = [](benchmark::State& state, auto input) { container_type boolvec3 = make_random_container (container_size); auto first1 = boolvec1.begin(); auto first2 = boolvec2.begin(); - auto first3 = boolvec3.begin(); auto binary_op = [](bool a, bool b) {return a && b;}; for (auto _ : state) + { std::transform( first1, first1 + total_bits, first2, - first3, + first2, binary_op ); + benchmark::ClobberMemory(); + } 
}; diff --git a/include/bitlib/bit-algorithms/bit_algorithm.hpp b/include/bitlib/bit-algorithms/bit_algorithm.hpp index 9e51c5c7..51cc6615 100644 --- a/include/bitlib/bit-algorithms/bit_algorithm.hpp +++ b/include/bitlib/bit-algorithms/bit_algorithm.hpp @@ -2,7 +2,6 @@ // Project: The C++ Bit Library // Name: bit_algorithm.hpp // Description: Optimized versions of algorithms for bit manipulation -// Creator: Vincent Reverdy // Contributor(s): Vincent Reverdy [2015-2017] // Maghav Kumar [2016-2017] // Bryce Kille [2019] diff --git a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp index 2a06592a..984c499f 100644 --- a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp +++ b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp @@ -2,9 +2,7 @@ // Project: The Experimental Bit Algorithms Library // Name: bit_algorithm_details.hpp // Description: A set of utilities to assist in writing algorithms -// Creator: Vincent Reverdy // Contributor(s): Vincent Reverdy [2019] -// Collin Gress [2019] // Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // @@ -105,7 +103,7 @@ constexpr bool is_within( // Get next len bits beginning at start and store them in a word of type T template -T get_word(bit_iterator first, T len=binary_digits::value) +T get_word(bit_iterator first, size_t len=binary_digits::value) { using native_word_type = typename bit_iterator::word_type; constexpr T digits = binary_digits::value; @@ -123,7 +121,7 @@ T get_word(bit_iterator first, T len=binary_digits::value) // Fill up ret_word starting at bit [offset] using it // TODO define a mask and use the _bitblend that takes in the extra mask while (len > digits) { - ret_word = _bitblend( + ret_word = _bitblend( ret_word, static_cast(static_cast(*it) << offset), offset, @@ -134,7 +132,7 @@ T get_word(bit_iterator first, T len=binary_digits::value) len -= digits; } 
// Assign remaining len bits of last word - ret_word = _bitblend( + ret_word = _bitblend( ret_word, static_cast(static_cast(*it) << offset), offset, @@ -278,18 +276,16 @@ void write_word(src_type src, bit_iterator dst_bit_it, // Shifts the range [first, last) to the left by n, filling the empty // bits with 0 -// NOT OPTIMIZED. Will be replaced with std::shift eventually. -template -ForwardIt word_shift_left(ForwardIt first, - ForwardIt last, - typename ForwardIt::difference_type n +template +RandomAccessIt word_shift_left(RandomAccessIt first, + RandomAccessIt last, + typename RandomAccessIt::difference_type n ) { if (n <= 0) return last; if (n >= distance(first, last)) return first; - ForwardIt mid = first + n; + RandomAccessIt mid = first + n; auto ret = std::move(mid, last, first); - std::fill(ret, last, 0); return ret; } @@ -297,51 +293,17 @@ ForwardIt word_shift_left(ForwardIt first, // Shifts the range [first, right) to the left by n, filling the empty // bits with 0 // NOT OPTIMIZED. Will be replaced with std::shift eventually. 
-template -ForwardIt word_shift_right_dispatch(ForwardIt first, - ForwardIt last, - typename ForwardIt::difference_type n, - std::forward_iterator_tag -) { - auto d = distance(first, last); - if (n <= 0) return first; - if (n >= d) return last; - ForwardIt it = first; - std::advance(it, d-n); - std::rotate(first, it, last); - it = first; - std::advance(it, n); - std::fill(first, it, 0); - return std::next(first, n); -} - -template -ForwardIt word_shift_right_dispatch(ForwardIt first, - ForwardIt last, - typename ForwardIt::difference_type n, - std::random_access_iterator_tag -) { +template +RandomAccessIt word_shift_right(RandomAccessIt first, + RandomAccessIt last, + typename RandomAccessIt::difference_type n +) +{ auto d = distance(first, last); if (n <= 0) return first; if (n >= d) return last; - ForwardIt it = first; - std::advance(it, d-n); - auto ret = std::copy_backward(first, it, last); - std::fill(first, ret, 0); - return ret; -} - -template -ForwardIt word_shift_right(ForwardIt first, - ForwardIt last, - typename ForwardIt::difference_type n -) -{ - return word_shift_right_dispatch( - first, - last, - n, - typename std::iterator_traits::iterator_category()); + std::move_backward(first, last-n, last); + return std::next(first, n); } // returns a word consisting of all one bits diff --git a/include/bitlib/bit-algorithms/copy.hpp b/include/bitlib/bit-algorithms/copy.hpp index 7bdea6f8..2d507516 100644 --- a/include/bitlib/bit-algorithms/copy.hpp +++ b/include/bitlib/bit-algorithms/copy.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: copy.hpp // Description: Implementation of copy, copy_if, copy_n and copy_backward -// Creator: Vincent Reverdy // Contributor: Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/copy_backward.hpp b/include/bitlib/bit-algorithms/copy_backward.hpp index 971fb914..55bd6f69 
100644 --- a/include/bitlib/bit-algorithms/copy_backward.hpp +++ b/include/bitlib/bit-algorithms/copy_backward.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: copy_backward.hpp // Description: bit_iterator overloads for std::copy_backward -// Creator: Vincent Reverdy // Contributor(s): // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/count.hpp b/include/bitlib/bit-algorithms/count.hpp index f6cecc8b..1a804b17 100644 --- a/include/bitlib/bit-algorithms/count.hpp +++ b/include/bitlib/bit-algorithms/count.hpp @@ -13,9 +13,11 @@ #include // Project sources #include "bitlib/bit-iterator/bit.hpp" +#include "bitlib/bit-algorithms//libpopcnt.h" // Third-party libraries #ifdef BITLIB_HWY #include "hwy/highway.h" +HWY_BEFORE_NAMESPACE(); #endif // Miscellaneous @@ -56,37 +58,42 @@ count( result = _popcnt(first_value); ++it; } -#ifdef BITLIB_HWY - // ReduceSum not implemented for unsigned char - if constexpr (digits > 8) - { - // Align to boundary - for (; it != last.base() && !is_aligned(&(*it), 64); ++it) { - result += _popcnt(*it); - } - - // SIMD - hn::ScalableTag d; - for (; std::distance(it, last.base()) >= hn::Lanes(d); it += hn::Lanes(d)) - { - const auto popcntV = hn::PopulationCount(hn::Load(d, &*it)); - result += hn::ReduceSum(d, popcntV); - } - - // Remaining - for (; it != last.base(); ++it) { - result += _popcnt(*it); - } - } else -#endif +// The SIMD implementation here is actually slower than the standard +//#ifdef BITLIB_HWY + //// ReduceSum not implemented for unsigned char + //if constexpr (digits > 8) + //{ + //// Align to boundary + //for (; it != last.base() && !is_aligned(&(*it), 64); ++it) { + //result += _popcnt(*it); + //} + + //// SIMD + //hn::ScalableTag d; + //for (; std::distance(it, last.base()) >= hn::Lanes(d); it += hn::Lanes(d)) + //{ + //const auto popcntV = hn::PopulationCount(hn::Load(d, &*it)); + 
//result += hn::ReduceSum(d, popcntV); + //} + + //// Remaining + //for (; it != last.base(); ++it) { + //result += _popcnt(*it); + //} + //} else +//#endif { - result += std::transform_reduce( - it, - last.base(), - 0, - std::plus{}, - [](word_type word) {return _popcnt(word); } - ); + // std:: version + //result += std::transform_reduce( + //it, + //last.base(), + //0, + //std::plus{}, + //[](word_type word) {return _popcnt(word); } + //); + + // libpopcnt + result += popcnt(&*it, (digits / 8) * std::distance(it, last.base())); } if (last.position() != 0) { word_type last_value = *last.base() << (digits - last.position()); @@ -110,6 +117,9 @@ count( } } // namespace bit +#ifdef BITLIB_HWY +HWY_AFTER_NAMESPACE(); +#endif // ========================================================================== // #endif // _COUNT_HPP_INCLUDED diff --git a/include/bitlib/bit-algorithms/debug_utils.hpp b/include/bitlib/bit-algorithms/debug_utils.hpp index a621bc3a..74e590d1 100644 --- a/include/bitlib/bit-algorithms/debug_utils.hpp +++ b/include/bitlib/bit-algorithms/debug_utils.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: debug_utils.hpp // Description: Utilities useful for debugging -// Creator: Vincent Reverdy // Contributor: Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/equal.hpp b/include/bitlib/bit-algorithms/equal.hpp index b4c5e474..e3fb1449 100644 --- a/include/bitlib/bit-algorithms/equal.hpp +++ b/include/bitlib/bit-algorithms/equal.hpp @@ -1,7 +1,6 @@ // ================================= EQUAL =================================== // // Project: The Experimental Bit Algorithms Library // Name: equal.hpp -// Creator: Vincent Reverdy // Contributor: Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git 
a/include/bitlib/bit-algorithms/fill.hpp b/include/bitlib/bit-algorithms/fill.hpp index 1481f907..08f9c61b 100644 --- a/include/bitlib/bit-algorithms/fill.hpp +++ b/include/bitlib/bit-algorithms/fill.hpp @@ -2,9 +2,8 @@ // Project: The Experimental Bit Algorithms Library // Name: fill.hpp // Description: bit_iterator overloads for std::fill -// Creator: Vincent Reverdy -// Contributor(s): Vincent Reverdy [2019] -// Bryce Kille [2019] +// Contributor(s): Bryce Kille +// Vincent Reverdy [2019] // License: BSD 3-Clause License // ========================================================================== // #ifndef _FILL_HPP_INCLUDED @@ -20,6 +19,7 @@ // Third-party libraries #ifdef BITLIB_HWY #include "hwy/highway.h" +HWY_BEFORE_NAMESPACE(); #endif // Miscellaneous #define is_aligned(POINTER, BYTE_COUNT) \ @@ -83,5 +83,8 @@ void fill(bit_iterator first, bit_iterator last, // ========================================================================== // } // namespace bit +#ifdef BITLIB_HWY +HWY_AFTER_NAMESPACE(); +#endif #endif // _FILL_HPP_INCLUDED // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/find.hpp b/include/bitlib/bit-algorithms/find.hpp index fed905a0..b35583c7 100644 --- a/include/bitlib/bit-algorithms/find.hpp +++ b/include/bitlib/bit-algorithms/find.hpp @@ -15,6 +15,7 @@ // Third-party libraries #ifdef BITLIB_HWY #include "hwy/highway.h" +HWY_BEFORE_NAMESPACE(); #endif // Miscellaneous @@ -98,13 +99,13 @@ constexpr bit_iterator find( } #endif - // Finish out the remainder with typical for loop - while (it != last.base()) { - if ((bv == bit1 && (*it == 0)) || (bv == bit0 && (*it == static_cast(-1)))) { - ++it; - continue; - } + if (bv == bit1) { + it = std::find_if(it, last.base(), [](word_type a) {return a != 0;}); + } else { + it = std::find_if(it, last.base(), [](word_type a) {return a != static_cast(-1);}); + } + if (it != last.base()) { size_type num_trailing_complementary_bits 
= (bv == bit0) ? _tzcnt(static_cast(~*it)) : _tzcnt(static_cast(*it)); @@ -123,6 +124,9 @@ constexpr bit_iterator find( // ========================================================================== // } // namespace bit +#ifdef BITLIB_HWY +HWY_AFTER_NAMESPACE(); +#endif #endif // _FIND_HPP_INCLUDED // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/libpopcnt.h b/include/bitlib/bit-algorithms/libpopcnt.h new file mode 100644 index 00000000..ffcd976b --- /dev/null +++ b/include/bitlib/bit-algorithms/libpopcnt.h @@ -0,0 +1,798 @@ +/* + * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit + * population count) in an array as quickly as possible using + * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON. + * + * Copyright (c) 2016 - 2020, Kim Walisch + * Copyright (c) 2016 - 2018, Wojciech Muła + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LIBPOPCNT_H +#define LIBPOPCNT_H + +#include +#include + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif + +#ifdef __GNUC__ + #define GNUC_PREREQ(x, y) \ + (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y)) +#else + #define GNUC_PREREQ(x, y) 0 +#endif + +#ifdef __clang__ + #define CLANG_PREREQ(x, y) \ + (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y)) +#else + #define CLANG_PREREQ(x, y) 0 +#endif + +#if (_MSC_VER < 1900) && \ + !defined(__cplusplus) + #define inline __inline +#endif + +#if (defined(__i386__) || \ + defined(__x86_64__) || \ + defined(_M_IX86) || \ + defined(_M_X64)) + #define X86_OR_X64 +#endif + +#if GNUC_PREREQ(4, 2) || \ + __has_builtin(__builtin_popcount) + #define HAVE_BUILTIN_POPCOUNT +#endif + +#if GNUC_PREREQ(4, 2) || \ + CLANG_PREREQ(3, 0) + #define HAVE_ASM_POPCNT +#endif + +#if defined(X86_OR_X64) && \ + (defined(HAVE_ASM_POPCNT) || \ + defined(_MSC_VER)) + #define HAVE_POPCNT +#endif + +#if defined(X86_OR_X64) && \ + GNUC_PREREQ(4, 9) + #define HAVE_AVX2 +#endif + +#if defined(X86_OR_X64) && \ + GNUC_PREREQ(5, 0) + #define HAVE_AVX512 +#endif + +#if defined(X86_OR_X64) + /* MSVC compatible compilers (Windows) */ + #if defined(_MSC_VER) + /* clang-cl (LLVM 10 from 2020) requires /arch:AVX2 or + * /arch:AVX512 to enable vector instructions */ + #if defined(__clang__) + #if defined(__AVX2__) 
+ #define HAVE_AVX2 + #endif + #if defined(__AVX512__) + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif + /* MSVC 2017 or later does not require + * /arch:AVX2 or /arch:AVX512 */ + #elif _MSC_VER >= 1910 + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif + /* Clang (Unix-like OSes) */ + #elif CLANG_PREREQ(3, 8) && \ + __has_attribute(target) && \ + (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000) + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif +#endif + +/* + * Only enable CPUID runtime checks if this is really + * needed. E.g. do not enable if user has compiled + * using -march=native on a CPU that supports AVX512. + */ +#if defined(X86_OR_X64) && \ + (defined(__cplusplus) || \ + defined(_MSC_VER) || \ + (GNUC_PREREQ(4, 2) || \ + __has_builtin(__sync_val_compare_and_swap))) && \ + ((defined(HAVE_AVX512) && !(defined(__AVX512__) || defined(__AVX512BW__))) || \ + (defined(HAVE_AVX2) && !defined(__AVX2__)) || \ + (defined(HAVE_POPCNT) && !defined(__POPCNT__))) + #define HAVE_CPUID +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This uses fewer arithmetic operations than any other known + * implementation on machines with fast multiplication. + * It uses 12 arithmetic operations, one of which is a multiply. 
+ * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation + */ +static inline uint64_t popcount64(uint64_t x) +{ + uint64_t m1 = 0x5555555555555555ll; + uint64_t m2 = 0x3333333333333333ll; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll; + uint64_t h01 = 0x0101010101010101ll; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; +} + +#if defined(HAVE_ASM_POPCNT) && \ + defined(__x86_64__) + +static inline uint64_t popcnt64(uint64_t x) +{ + __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); + return x; +} + +#elif defined(HAVE_ASM_POPCNT) && \ + defined(__i386__) + +static inline uint32_t popcnt32(uint32_t x) +{ + __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); + return x; +} + +static inline uint64_t popcnt64(uint64_t x) +{ + return popcnt32((uint32_t) x) + + popcnt32((uint32_t)(x >> 32)); +} + +#elif defined(_MSC_VER) && \ + defined(_M_X64) + +#include + +static inline uint64_t popcnt64(uint64_t x) +{ + return _mm_popcnt_u64(x); +} + +#elif defined(_MSC_VER) && \ + defined(_M_IX86) + +#include + +static inline uint64_t popcnt64(uint64_t x) +{ + return _mm_popcnt_u32((uint32_t) x) + + _mm_popcnt_u32((uint32_t)(x >> 32)); +} + +/* non x86 CPUs */ +#elif defined(HAVE_BUILTIN_POPCOUNT) + +static inline uint64_t popcnt64(uint64_t x) +{ + return __builtin_popcountll(x); +} + +/* no hardware POPCNT, + * use pure integer algorithm */ +#else + +static inline uint64_t popcnt64(uint64_t x) +{ + return popcount64(x); +} + +#endif + +#if defined(HAVE_CPUID) + +#if defined(_MSC_VER) + #include + #include +#endif + +/* %ecx bit flags */ +#define bit_POPCNT (1 << 23) + +/* %ebx bit flags */ +#define bit_AVX2 (1 << 5) +#define bit_AVX512 (1 << 30) + +/* xgetbv bit flags */ +#define XSTATE_SSE (1 << 1) +#define XSTATE_YMM (1 << 2) +#define XSTATE_ZMM (7 << 5) + +static inline void run_cpuid(int eax, int ecx, int* abcd) +{ +#if defined(_MSC_VER) + __cpuidex(abcd, eax, ecx); +#else + int ebx = 0; + int edx = 0; + + #if 
defined(__i386__) && \ + defined(__PIC__) + /* in case of PIC under 32-bit EBX cannot be clobbered */ + __asm__ ("movl %%ebx, %%edi;" + "cpuid;" + "xchgl %%ebx, %%edi;" + : "=D" (ebx), + "+a" (eax), + "+c" (ecx), + "=d" (edx)); + #else + __asm__ ("cpuid;" + : "+b" (ebx), + "+a" (eax), + "+c" (ecx), + "=d" (edx)); + #endif + + abcd[0] = eax; + abcd[1] = ebx; + abcd[2] = ecx; + abcd[3] = edx; +#endif +} + +#if defined(HAVE_AVX2) || \ + defined(HAVE_AVX512) + +static inline int get_xcr0() +{ + int xcr0; + +#if defined(_MSC_VER) + xcr0 = (int) _xgetbv(0); +#else + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); +#endif + + return xcr0; +} + +#endif + +static inline int get_cpuid() +{ + int flags = 0; + int abcd[4]; + + run_cpuid(1, 0, abcd); + + if ((abcd[2] & bit_POPCNT) == bit_POPCNT) + flags |= bit_POPCNT; + +#if defined(HAVE_AVX2) || \ + defined(HAVE_AVX512) + + int osxsave_mask = (1 << 27); + + /* ensure OS supports extended processor state management */ + if ((abcd[2] & osxsave_mask) != osxsave_mask) + return 0; + + int ymm_mask = XSTATE_SSE | XSTATE_YMM; + int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM; + + int xcr0 = get_xcr0(); + + if ((xcr0 & ymm_mask) == ymm_mask) + { + run_cpuid(7, 0, abcd); + + if ((abcd[1] & bit_AVX2) == bit_AVX2) + flags |= bit_AVX2; + + if ((xcr0 & zmm_mask) == zmm_mask) + { + if ((abcd[1] & bit_AVX512) == bit_AVX512) + flags |= bit_AVX512; + } + } + +#endif + + return flags; +} + +#endif /* cpuid */ + +#if defined(HAVE_AVX2) + +#include + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline void CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) +{ + __m256i u = _mm256_xor_si256(a, b); + *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); + *l = _mm256_xor_si256(u, c); +} + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline __m256i popcnt256(__m256i v) +{ + __m256i lookup1 = _mm256_setr_epi8( + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 
6, 7, 7, 8, + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8 + ); + + __m256i lookup2 = _mm256_setr_epi8( + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0, + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0 + ); + + __m256i low_mask = _mm256_set1_epi8(0x0f); + __m256i lo = _mm256_and_si256(v, low_mask); + __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); + __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo); + __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi); + + return _mm256_sad_epu8(popcnt1, popcnt2); +} + +/* + * AVX2 Harley-Seal popcount (4th iteration). + * The algorithm is based on the paper "Faster Population Counts + * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and + * Wojciech Mula (23 Nov 2016). + * @see https://arxiv.org/abs/1611.07612 + */ +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline uint64_t popcnt_avx2(const __m256i* ptr, uint64_t size) +{ + __m256i cnt = _mm256_setzero_si256(); + __m256i ones = _mm256_setzero_si256(); + __m256i twos = _mm256_setzero_si256(); + __m256i fours = _mm256_setzero_si256(); + __m256i eights = _mm256_setzero_si256(); + __m256i sixteens = _mm256_setzero_si256(); + __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t* cnt64; + + for(; i < limit; i += 16) + { + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 0), _mm256_loadu_si256(ptr + i + 1)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 2), _mm256_loadu_si256(ptr + i + 3)); + CSA256(&foursA, &twos, twos, twosA, twosB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 4), _mm256_loadu_si256(ptr + i + 5)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 6), _mm256_loadu_si256(ptr + i + 7)); + CSA256(&foursB, &twos, twos, twosA, twosB); + CSA256(&eightsA, &fours, fours, foursA, foursB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 8), _mm256_loadu_si256(ptr + i + 9)); + 
CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 10), _mm256_loadu_si256(ptr + i + 11)); + CSA256(&foursA, &twos, twos, twosA, twosB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 12), _mm256_loadu_si256(ptr + i + 13)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 14), _mm256_loadu_si256(ptr + i + 15)); + CSA256(&foursB, &twos, twos, twosA, twosB); + CSA256(&eightsB, &fours, fours, foursA, foursB); + CSA256(&sixteens, &eights, eights, eightsA, eightsB); + + cnt = _mm256_add_epi64(cnt, popcnt256(sixteens)); + } + + cnt = _mm256_slli_epi64(cnt, 4); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3)); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2)); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1)); + cnt = _mm256_add_epi64(cnt, popcnt256(ones)); + + for(; i < size; i++) + cnt = _mm256_add_epi64(cnt, popcnt256(_mm256_loadu_si256(ptr + i))); + + cnt64 = (uint64_t*) &cnt; + + return cnt64[0] + + cnt64[1] + + cnt64[2] + + cnt64[3]; +} + +#endif + +#if defined(HAVE_AVX512) + +#include + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline __m512i popcnt512(__m512i v) +{ + __m512i m1 = _mm512_set1_epi8(0x55); + __m512i m2 = _mm512_set1_epi8(0x33); + __m512i m4 = _mm512_set1_epi8(0x0F); + __m512i vm = _mm512_and_si512(_mm512_srli_epi16(v, 1), m1); + __m512i t1 = _mm512_sub_epi8(v, vm); + __m512i tm = _mm512_and_si512(t1, m2); + __m512i tm2 = _mm512_and_si512(_mm512_srli_epi16(t1, 2), m2); + __m512i t2 = _mm512_add_epi8(tm, tm2); + __m512i tt = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)); + __m512i t3 = _mm512_and_si512(tt, m4); + + return _mm512_sad_epu8(t3, _mm512_setzero_si512()); +} + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline void CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) +{ + *l = _mm512_ternarylogic_epi32(c, b, a, 0x96); + *h = _mm512_ternarylogic_epi32(c, b, a, 0xe8); +} + 
+/* + * AVX512 Harley-Seal popcount (4th iteration). + * The algorithm is based on the paper "Faster Population Counts + * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and + * Wojciech Mula (23 Nov 2016). + * @see https://arxiv.org/abs/1611.07612 + */ +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline uint64_t popcnt_avx512(const __m512i* ptr, const uint64_t size) +{ + __m512i cnt = _mm512_setzero_si512(); + __m512i ones = _mm512_setzero_si512(); + __m512i twos = _mm512_setzero_si512(); + __m512i fours = _mm512_setzero_si512(); + __m512i eights = _mm512_setzero_si512(); + __m512i sixteens = _mm512_setzero_si512(); + __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t* cnt64; + + for(; i < limit; i += 16) + { + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 0), _mm512_loadu_si512(ptr + i + 1)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 2), _mm512_loadu_si512(ptr + i + 3)); + CSA512(&foursA, &twos, twos, twosA, twosB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 4), _mm512_loadu_si512(ptr + i + 5)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 6), _mm512_loadu_si512(ptr + i + 7)); + CSA512(&foursB, &twos, twos, twosA, twosB); + CSA512(&eightsA, &fours, fours, foursA, foursB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 8), _mm512_loadu_si512(ptr + i + 9)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 10), _mm512_loadu_si512(ptr + i + 11)); + CSA512(&foursA, &twos, twos, twosA, twosB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 12), _mm512_loadu_si512(ptr + i + 13)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 14), _mm512_loadu_si512(ptr + i + 15)); + CSA512(&foursB, &twos, twos, twosA, twosB); + CSA512(&eightsB, &fours, fours, foursA, foursB); + CSA512(&sixteens, &eights, eights, eightsA, eightsB); + + cnt = _mm512_add_epi64(cnt, 
popcnt512(sixteens)); + } + + cnt = _mm512_slli_epi64(cnt, 4); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(eights), 3)); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(fours), 2)); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(twos), 1)); + cnt = _mm512_add_epi64(cnt, popcnt512(ones)); + + for(; i < size; i++) + cnt = _mm512_add_epi64(cnt, popcnt512(_mm512_loadu_si512(ptr + i))); + + cnt64 = (uint64_t*) &cnt; + + return cnt64[0] + + cnt64[1] + + cnt64[2] + + cnt64[3] + + cnt64[4] + + cnt64[5] + + cnt64[6] + + cnt64[7]; +} + +#endif + +/* x86 CPUs */ +#if defined(X86_OR_X64) + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + const uint8_t* ptr = (const uint8_t*) data; + +/* + * CPUID runtime checks are only enabled if this is needed. + * E.g. CPUID is disabled when a user compiles his + * code using -march=native on a CPU with AVX512. 
+ */ +#if defined(HAVE_CPUID) + #if defined(__cplusplus) + /* C++11 thread-safe singleton */ + static const int cpuid = get_cpuid(); + #else + static int cpuid_ = -1; + int cpuid = cpuid_; + if (cpuid == -1) + { + cpuid = get_cpuid(); + + #if defined(_MSC_VER) + _InterlockedCompareExchange(&cpuid_, cpuid, -1); + #else + __sync_val_compare_and_swap(&cpuid_, -1, cpuid); + #endif + } + #endif +#endif + +#if defined(HAVE_AVX512) + #if defined(__AVX512__) || defined(__AVX512BW__) + /* AVX512 requires arrays >= 1024 bytes */ + if (i + 1024 <= size) + #else + if ((cpuid & bit_AVX512) && + i + 1024 <= size) + #endif + { + const __m512i* ptr512 = (const __m512i*)(ptr + i); + cnt += popcnt_avx512(ptr512, (size - i) / 64); + i = size - size % 64; + } +#endif + +#if defined(HAVE_AVX2) + #if defined(__AVX2__) + /* AVX2 requires arrays >= 512 bytes */ + if (i + 512 <= size) + #else + if ((cpuid & bit_AVX2) && + i + 512 <= size) + #endif + { + const __m256i* ptr256 = (const __m256i*)(ptr + i); + cnt += popcnt_avx2(ptr256, (size - i) / 32); + i = size - size % 32; + } +#endif + +#if defined(HAVE_POPCNT) + /* + * The user has compiled without -mpopcnt. + * Unfortunately the MSVC compiler does not have + * a POPCNT macro so we cannot get rid of the + * runtime check for MSVC. + */ + #if !defined(__POPCNT__) + if (cpuid & bit_POPCNT) + #endif + { + /* We use unaligned memory accesses here to improve performance */ + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + for (; i < size; i++) + cnt += popcnt64(ptr[i]); + + return cnt; + } +#endif + +#if !defined(HAVE_POPCNT) || \ + !defined(__POPCNT__) + /* + * Pure integer popcount algorithm. + * We use unaligned memory accesses here to improve performance. 
+ */ + for (; i < size - size % 8; i += 8) + cnt += popcount64(*(const uint64_t*)(ptr + i)); + + if (i < size) + { + uint64_t val = 0; + size_t bytes = (size_t)(size - i); + memcpy(&val, &ptr[i], bytes); + cnt += popcount64(val); + } + + return cnt; +#endif +} + +#elif defined(__ARM_NEON) || \ + defined(__aarch64__) + +#include + +static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t) +{ + return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t))); +} + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + uint64_t chunk_size = 64; + const uint8_t* ptr = (const uint8_t*) data; + + if (size >= chunk_size) + { + uint64_t iters = size / chunk_size; + uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0)); + uint8x16_t zero = vcombine_u8(vcreate_u8(0), vcreate_u8(0)); + + do + { + uint8x16_t t0 = zero; + uint8x16_t t1 = zero; + uint8x16_t t2 = zero; + uint8x16_t t3 = zero; + + /* + * After every 31 iterations we need to add the + * temporary sums (t0, t1, t2, t3) to the total sum. + * We must ensure that the temporary sums <= 255 + * and 31 * 8 bits = 248 which is OK. + */ + uint64_t limit = (i + 31 < iters) ? 
i + 31 : iters; + + /* Each iteration processes 64 bytes */ + for (; i < limit; i++) + { + uint8x16x4_t input = vld4q_u8(ptr); + ptr += chunk_size; + + t0 = vaddq_u8(t0, vcntq_u8(input.val[0])); + t1 = vaddq_u8(t1, vcntq_u8(input.val[1])); + t2 = vaddq_u8(t2, vcntq_u8(input.val[2])); + t3 = vaddq_u8(t3, vcntq_u8(input.val[3])); + } + + sum = vpadalq(sum, t0); + sum = vpadalq(sum, t1); + sum = vpadalq(sum, t2); + sum = vpadalq(sum, t3); + } + while (i < iters); + + i = 0; + size %= chunk_size; + + uint64_t tmp[2]; + vst1q_u64(tmp, sum); + cnt += tmp[0]; + cnt += tmp[1]; + } + +#if defined(__ARM_FEATURE_UNALIGNED) + /* We use unaligned memory accesses here to improve performance */ + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); +#else + if (i + 8 <= size) + { + /* Align memory to an 8 byte boundary */ + for (; (uintptr_t)(ptr + i) % 8; i++) + cnt += popcnt64(ptr[i]); + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + } +#endif + + if (i < size) + { + uint64_t val = 0; + size_t bytes = (size_t)(size - i); + memcpy(&val, &ptr[i], bytes); + cnt += popcount64(val); + } + + return cnt; +} + +/* all other CPUs */ +#else + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + const uint8_t* ptr = (const uint8_t*) data; + + if (size >= 8) + { + /* + * Since we don't know whether this CPU architecture + * supports unaligned memory accesses we align + * memory to an 8 byte boundary. 
+ */ + for (; (uintptr_t)(ptr + i) % 8; i++) + cnt += popcnt64(ptr[i]); + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + } + + for (; i < size; i++) + cnt += popcnt64(ptr[i]); + + return cnt; +} + +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LIBPOPCNT_H */ diff --git a/include/bitlib/bit-algorithms/move.hpp b/include/bitlib/bit-algorithms/move.hpp index def87ef4..401cbfdb 100644 --- a/include/bitlib/bit-algorithms/move.hpp +++ b/include/bitlib/bit-algorithms/move.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: move.hpp // Description: bit_iterator overloads for std::move -// Creator: Vincent Reverdy // Contributor(s): // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/reverse.hpp b/include/bitlib/bit-algorithms/reverse.hpp index 3cc21734..37dacb7c 100644 --- a/include/bitlib/bit-algorithms/reverse.hpp +++ b/include/bitlib/bit-algorithms/reverse.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: copy.hpp // Description: Implementation of reverse -// Creator: Vincent Reverdy // Contributor: Vincent Reverdy [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/rotate.hpp b/include/bitlib/bit-algorithms/rotate.hpp index b019d7be..f9509ba8 100644 --- a/include/bitlib/bit-algorithms/rotate.hpp +++ b/include/bitlib/bit-algorithms/rotate.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: rotate.hpp // Description: bit_iterator overloads for std::rotate -// Creator: Vincent Reverdy // Contributor(s): Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // @@ -239,13 +238,13 @@ bit_iterator rotate( // Single word subcases if 
(is_within(first, n_first)) { size_type k = distance(first, n_first); - word_type temp = get_word(first, k); + word_type temp = get_word(first, k); bit_iterator new_last = shift_left(first, last, k); write_word(temp, new_last, static_cast(k)); return new_last; } else if (is_within(n_first, last)) { size_type p = distance(n_first, last); - word_type temp = get_word(n_first, p); + word_type temp = get_word(n_first, p); auto new_last = shift_right(first, last, p); write_word(temp, first, static_cast(p)); return new_last; diff --git a/include/bitlib/bit-algorithms/shift.hpp b/include/bitlib/bit-algorithms/shift.hpp index 0ef14fe1..e598e0ca 100644 --- a/include/bitlib/bit-algorithms/shift.hpp +++ b/include/bitlib/bit-algorithms/shift.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: shift.hpp // Description: Implementation of shift_left and shift_right -// Creator: Vincent Reverdy // Contributor(s): Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // @@ -19,6 +18,7 @@ // Third-party libraries #ifdef BITLIB_HWY #include "hwy/highway.h" +HWY_BEFORE_NAMESPACE(); #endif // Miscellaneous #define is_aligned(POINTER, BYTE_COUNT) \ @@ -55,19 +55,25 @@ bit_iterator shift_left( // Types and constants using word_type = typename bit_iterator::word_type; using size_type = typename bit_iterator::size_type; + using difference_type = typename bit_iterator::difference_type; constexpr size_type digits = binary_digits::value; // Initialization - auto d = distance(first, last); + auto d = bit::distance(first, last); const bool is_first_aligned = first.position() == 0; const bool is_last_aligned = last.position() == 0; + auto middle = first + n; // Out of range cases if (n <= 0) return last; - if (n >= d) return first; - + if (n >= d) + { + //bit::fill(first, last, bit::bit0); + return first; + } // Single word case + // Triggered if all relevant bits are in first.base() if 
(std::next(first.base(), is_last_aligned) == last.base()) { *first.base() = _bitblend( *first.base(), @@ -81,79 +87,154 @@ bit_iterator shift_left( first.position(), (is_last_aligned ? digits : last.position()) - first.position() ); - return bit_iterator( - first.base(), - first.position() + d - n - ); + return first + d - n; } - // More initialization - size_type word_shifts = n / digits; - size_type remaining_bitshifts = n - digits*(word_shifts); - + // Triggered if all remaining bits can fit in a word + if (d - n <= digits) + { + word_type new_word = get_word(middle, d - n); + write_word(new_word, first, d - n); + return first + d - n; + } // Multiple word case word_type first_value = *first.base(); word_type last_value = !is_last_aligned ? *last.base() : 0; - // Shift words to the left using std::shift - RandomAccessIt new_last_base = STD_SHIFT_LEFT(first.base(), - last.base(), - word_shifts - ); - if (!is_last_aligned) { - // Mask out-of-range bits so that we don't incorporate them - *last.base() &= (static_cast(1) << last.position()) - 1; - *new_last_base = *last.base(); - if (word_shifts > 0) { - *last.base() = 0; + // Align first + if (!is_first_aligned) + { + if (first.position() >= middle.position()) + { + *first.base() = _bitblend( + *first.base(), + (*middle.base()) << (first.position() - middle.position()), + first.position(), + digits - first.position() + ); } + else + { + const int n1 = digits - middle.position(); + const int n2 = digits - first.position() - n1; + *first.base() = _bitblend( + *first.base(), + (*middle.base()) >> (middle.position() - first.position()), + first.position(), + n1 + ); + *first.base() = _bitblend( + *first.base(), + (*std::next(middle.base())) << (digits - n2), + first.position() + n1, + n2 + ); + } + const int shifted = std::min(d - n, (digits - first.position())); + first += shifted; + middle += shifted; } - // Shift bit sequence to the lsb - if (remaining_bitshifts) { - RandomAccessIt it = first.base(); - -#ifdef 
BITLIB_HWY - // Align to 64 bit boundary - for (; std::next(it, is_last_aligned) != new_last_base && !is_aligned(&*it, 64); it++) { - *it = _shrd(*it, *std::next(it), remaining_bitshifts); + if (middle.base() == last.base()) + { + const int bits_left = last.position() - middle.position(); + if (bits_left > 0) + { + *first.base() = _bitblend( + *first.base(), + *middle.base() >> middle.position(), + 0, + bits_left + ); + first += bits_left; } + // https://en.cppreference.com/w/cpp/algorithm/shift + // "Elements that are in the original range but not the new range + // are left in a valid but unspecified state." + // + //bit::fill(first, last, bit::bit0); + return first; + } - const hn::ScalableTag d; - for (; std::distance(it, new_last_base) >= hn::Lanes(d) + 10 + !is_last_aligned; it += hn::Lanes(d)) + // More initialization + d = bit::distance(first, last); + const size_type word_shifts = n / digits; + const size_type offset = middle.position(); + + // At this point, first is aligned + if (offset == 0) + { + first = bit::bit_iterator( + STD_SHIFT_LEFT(first.base(), + last.base(), + word_shifts), + 0 + ); + if (!is_last_aligned) { - const auto v = hn::ShiftRightSame(hn::Load(d, &*it), remaining_bitshifts); - const auto v_plus1 = hn::ShiftLeftSame(hn::LoadU(d, &*(it+1)), digits - remaining_bitshifts); - hn::Store(v | v_plus1, d, &*it); + write_word(*last.base(), first, last.position()); + first += last.position(); } + // https://en.cppreference.com/w/cpp/algorithm/shift + // "Elements that are in the original range but not the new range + // are left in a valid but unspecified state." 
+ // + //bit::fill(first, last, bit::bit0); + return first; + } + + // Shift bit sequence to the lsb +#ifdef BITLIB_HWY + // Align to 64 bit boundary + while (std::next(middle.base()) < last.base() && !is_aligned(&*first.base(), 64)) { + *first.base() = _shrd(*middle.base(), *std::next(middle.base()), offset); + first += digits; + middle += digits; + } + + const hn::ScalableTag d_tag; + while (std::distance(middle.base(), last.base()) >= hn::Lanes(d_tag) + 10 + !is_last_aligned) + { + const auto v = hn::ShiftRightSame(hn::LoadU(d_tag, &*middle.base()), offset); + const auto v_plus1 = hn::ShiftLeftSame(hn::LoadU(d_tag, &*(middle.base()+1)), digits - offset); + hn::Store(v | v_plus1, d_tag, &*first.base()); + first += hn::Lanes(d_tag)*digits; + middle += hn::Lanes(d_tag)*digits; + } #endif - // _shrd all words except the last - for (; std::next(it, is_last_aligned) != new_last_base; ++it) { - *it = _shrd(*it, *std::next(it), remaining_bitshifts); - } - // For the last word simply right shift - *it >>= remaining_bitshifts; + auto first_base = first.base(); + auto middle_base = middle.base(); + + while (std::next(middle_base) < last.base()) { + *first_base = _shrd(*middle_base, *std::next(middle_base), offset); + first_base++; + middle_base++;; } - // Blend bits of the first element - if (!is_first_aligned) { + first = bit_iterator(first_base, 0); + middle = bit_iterator(middle_base, middle.position()); + + // If middle is now penultimate word + if (std::next(middle.base()) == last.base()) + { *first.base() = _bitblend( - first_value, *first.base(), - first.position(), - digits - first.position() + *middle.base() >> offset, + 0, + digits - offset ); + first += digits - offset; + middle += digits - offset; } - // Blend bits of the last element - if (!is_last_aligned) { - *last.base() = _bitblend( - *last.base(), - last_value, - last.position(), - digits - last.position() - ); + + if (!is_last_aligned) + { + const difference_type bits_left = last.position() - 
middle.position(); + const word_type new_word = get_word(middle, bits_left); + write_word(new_word, first, bits_left); + first += bits_left; } - //TODO is this more or less inefficient than having a latent iterator? - bit_iterator d_last = next(first, d-n); - return d_last; + + //bit::fill(first, last, bit::bit0); + return first; } template @@ -170,7 +251,8 @@ bit_iterator shift_right( const bool is_first_aligned = first.position() == 0; const bool is_last_aligned = last.position() == 0; constexpr auto digits = binary_digits::value; - auto d = distance(first, last); + auto d = bit::distance(first, last); + bit_iterator middle = last - n; // Out of range cases if (n <= 0) return first; @@ -188,83 +270,110 @@ bit_iterator shift_right( first.position(), (is_last_aligned ? digits : last.position()) - first.position() ); - return bit_iterator( - first.base(), - first.position() + n - ); + return first + n; + } + + // Align last + if (last.position() != 0) + { + const size_type bits_to_align = std::min( + last.position(), + bit::distance(first, middle)); + const word_type word_to_write = get_word( + middle - bits_to_align, + bits_to_align); + write_word( + word_to_write, + last - bits_to_align, + bits_to_align); + middle -= bits_to_align; + last -= bits_to_align; + + // Nothing left to do + if (middle == first) + return first + n; } // More initialization - size_type word_shifts = n / digits; - size_type remaining_bitshifts = n - digits*(word_shifts); + const size_type word_shifts = n / digits; + const size_type offset = middle.position(); - // Multiple word case - word_type first_value = *first.base(); - word_type last_value = !is_last_aligned ? *last.base() : 0; - word_type mask = is_first_aligned ? 
- static_cast(-1) - : - static_cast( - (static_cast(1) << (digits - first.position())) - 1 - ) << first.position(); - *first.base() = *first.base() & mask; - // Shift words to the right - RandomAccessIt new_first_base = STD_SHIFT_RIGHT( - first.base(), - std::next( - last.base(), - !is_last_aligned), - word_shifts - ); - bit_iterator d_first(new_first_base, first.position()); // Shift bit sequence to the msb - if (remaining_bitshifts) { - auto it = is_last_aligned ? last.base() - 1 : last.base(); + if (offset == 0) { + auto new_first = bit::bit_iterator( + STD_SHIFT_RIGHT( + first.base(), + last.base(), + word_shifts), + first.position() + ); + // https://en.cppreference.com/w/cpp/algorithm/shift + // "Elements that are in the original range but not the new range + // are left in a valid but unspecified state." + // + //bit::fill(first, new_first, bit::bit0); + return first + n; + } + if (bit::distance(first, middle) >= digits) + { #ifdef BITLIB_HWY // Align to 64 bit boundary const hn::ScalableTag d; - for (; it != new_first_base && !is_aligned(&*(it - hn::Lanes(d) + 1), 64); it--) { - *it = _shld(*it, *(it - 1), remaining_bitshifts); + while (std::prev(middle.base()) > first.base() && !is_aligned(&*(last.base() - hn::Lanes(d)), 64)) { + *std::prev(last.base()) = _shrd(*std::prev(middle.base()), *middle.base(), offset); + last -= digits; + middle -= digits; } - for (; std::distance(new_first_base, it) >= hn::Lanes(d); it -= hn::Lanes(d)) + while (std::distance(first.base(), middle.base()) > hn::Lanes(d) + 1) { - const auto v = hn::ShiftLeftSame( - hn::Load(d, &*(it - hn::Lanes(d) + 1)), - remaining_bitshifts); - const auto v_plus1 = hn::ShiftRightSame( - hn::LoadU(d, &*(it - hn::Lanes(d))), - digits - remaining_bitshifts); - hn::Store(v | v_plus1, d, &*(it - hn::Lanes(d) + 1)); + const auto v = hn::ShiftRightSame( + hn::LoadU(d, &*(middle.base() - hn::Lanes(d))), + offset); + const auto v_plus1 = hn::ShiftLeftSame( + hn::LoadU(d, &*(middle.base() - hn::Lanes(d) + 
1)), + digits - offset); + hn::Store(v | v_plus1, d, &*(last.base() - hn::Lanes(d))); + + last -= digits * hn::Lanes(d); + middle -= digits * hn::Lanes(d); } #endif - for(; it != new_first_base; --it) { - *it = _shld(*it, *(it - 1), remaining_bitshifts); + auto last_base_prev = std::prev(last.base()); + auto middle_base_prev = std::prev(middle.base()); + + while (middle_base_prev > first.base()) { + *last_base_prev = _shrd(*middle_base_prev, *std::next(middle_base_prev), offset); + last_base_prev--; + middle_base_prev--; } - *it <<= remaining_bitshifts; - } - // Blend bits of the first element - if (!is_first_aligned) { - *first.base() = _bitblend( - first_value, - *first.base(), - first.position(), - digits - first.position() - ); + + if (first.position() <= middle.position()) + { + *last_base_prev = _shrd(*middle_base_prev, *std::next(middle_base_prev), offset); + last_base_prev--; + middle_base_prev--; + } + + last = bit_iterator(std::next(last_base_prev), last.position()); + middle = bit_iterator(std::next(middle_base_prev), middle.position()); } - // Blend bits of the last element - if (!is_last_aligned) { - *last.base() = _bitblend( - *last.base(), - last_value, - last.position(), - digits - last.position() - ); + + if (first.position() != middle.position()) + { + const size_type bits_to_align = bit::distance(first, middle); + const word_type word_to_write = get_word( + first, + bits_to_align); + write_word( + word_to_write, + last - bits_to_align, + bits_to_align); } - advance(d_first, remaining_bitshifts); - return d_first; + + return first + n; } // -------------------------------------------------------------------------- // @@ -272,5 +381,8 @@ bit_iterator shift_right( // ========================================================================== // } // namespace bit +#ifdef BITLIB_HWY +HWY_AFTER_NAMESPACE(); +#endif #endif // _SHIFT_HPP_INCLUDED // ========================================================================== // diff --git 
a/include/bitlib/bit-algorithms/swap_ranges.hpp b/include/bitlib/bit-algorithms/swap_ranges.hpp index 799ea470..67cd5a06 100644 --- a/include/bitlib/bit-algorithms/swap_ranges.hpp +++ b/include/bitlib/bit-algorithms/swap_ranges.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: swap_ranges.hpp // Description: bit_iterator overloads for std::swap_ranges -// Creator: Vincent Reverdy // Contributor(s): Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/transform.hpp b/include/bitlib/bit-algorithms/transform.hpp index 5d3321ec..88577882 100644 --- a/include/bitlib/bit-algorithms/transform.hpp +++ b/include/bitlib/bit-algorithms/transform.hpp @@ -2,8 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: transform.hpp // Description: bit_iterator overloads for std::transform -// Creator: Vincent Reverdy -// Contributor(s): // License: BSD 3-Clause License // ========================================================================== // #ifndef _TRANSFORM_HPP_INCLUDED @@ -13,6 +11,7 @@ // ============================== PREAMBLE ================================== // // C++ standard library #include +#include // Project sources #include "bitlib/bit-iterator/bit.hpp" // Third-party libraries @@ -73,14 +72,40 @@ constexpr bit_iterator transform( advance(first, partial_bits_to_op); it++; } + auto firstIt = first.base(); if (remaining_bits_to_op > 0) { const bool is_first_aligned = first.position() == 0; //size_type words_to_op = ::std::ceil(remaining_bits_to_op / static_cast(digits)); // d_first will be aligned at this point if (is_first_aligned && remaining_bits_to_op > digits) { - auto N = ::std::distance(first.base(), last.base()); - it = std::transform(first.base(), last.base(), it, unary_op); - first += digits * N; + auto N = ::std::distance(firstIt, last.base()); +#ifdef BITLIB_HWY + if constexpr 
(std::is_same_v>) + { + // Align to 64 bit boundary + for (; firstIt != last.base() && !is_aligned(&*firstIt, 64); firstIt++, it++) { + *it = unary_op(*firstIt); + } + + bool out_is_aligned = is_aligned(&*it, 64); + + constexpr hn::ScalableTag d; + for (; std::distance(firstIt, last.base()) >= hn::Lanes(d); firstIt += hn::Lanes(d), it += hn::Lanes(d)) + { + const auto v = hn::Not(hn::Load(d, &*firstIt)); + if (out_is_aligned) + { + hn::Store(v, d, &*it); + } else { + hn::StoreU(v, d, &*it); + } + } + } +#endif + size_t std_dist = ::std::distance(firstIt, last.base()); + it = std::transform(firstIt, last.base(), it, unary_op); + firstIt += std_dist; + first = bit_iterator(firstIt); remaining_bits_to_op -= digits * N; } else { while (remaining_bits_to_op >= digits) { diff --git a/include/bitlib/bit-algorithms/type_traits.hpp b/include/bitlib/bit-algorithms/type_traits.hpp index 2ddbe074..0cd1fb20 100644 --- a/include/bitlib/bit-algorithms/type_traits.hpp +++ b/include/bitlib/bit-algorithms/type_traits.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: type_traits.hpp // Description: Type traits for bits -// Creator: Vincent Reverdy // Contributor(s): Vincent Reverdy [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-containers/bit-containers.hpp b/include/bitlib/bit-containers/bit-containers.hpp index d2fbc378..bbbd540a 100644 --- a/include/bitlib/bit-containers/bit-containers.hpp +++ b/include/bitlib/bit-containers/bit-containers.hpp @@ -2,8 +2,6 @@ // Project: The Bit Algorithms Library // Name: bit-containers.hpp // Description: Brings in all of the container headers together -// Creator: Vincent Reverdy -// Contributor(s): Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // #ifndef _BIT_CONTAINERS_HPP_INCLUDED diff --git 
a/include/bitlib/bit-containers/bit_vector.hpp b/include/bitlib/bit-containers/bit_vector.hpp index f68d0764..f971c494 100644 --- a/include/bitlib/bit-containers/bit_vector.hpp +++ b/include/bitlib/bit-containers/bit_vector.hpp @@ -2,8 +2,7 @@ // Project: The Experimental Bit Algorithms Library // \file bit_vector.hpp // Description: Implementation of bit_vector -// Creator: Vincent Reverdy -// Contributor: Bryce Kille [2019] +// Contributor: Bryce Kille // License: BSD 3-Clause License // ========================================================================== // #ifndef _BIT_VECTOR_HPP_INCLUDED diff --git a/profile/CMakeLists.txt b/profile/CMakeLists.txt new file mode 100644 index 00000000..2efc67f3 --- /dev/null +++ b/profile/CMakeLists.txt @@ -0,0 +1,15 @@ +# set output directory of builds +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +# set build type +set(CMAKE_BUILD_TYPE RelWithDebInfo) + +# Add targets +file(GLOB PROFILE_SOURCES "src/*.cpp") +add_executable(bitlib-profile ${PROFILE_SOURCES}) + +# specify benchmark-specific libraries +include_directories(src/utils) + +target_compile_options(bitlib-profile PUBLIC -O2 -ggdb -Wpedantic) +install(TARGETS bitlib-profile DESTINATION .) 
diff --git a/profile/src/main.cpp b/profile/src/main.cpp new file mode 100644 index 00000000..9cd05bde --- /dev/null +++ b/profile/src/main.cpp @@ -0,0 +1,47 @@ +// ================================ PROFILE ================================= // +// Project: The Experimental Bit Algorithms Library +// Description: Used for profiling specific functions/algorithms +// Creator: Bryce Kille +// License: BSD 3-Clause License +// ========================================================================== // + + + +// ============================== PREAMBLE ================================== // +// C++ standard library +#include +#include +#include +#include +#include +// Project sources +#include "bitlib/bitlib.hpp" +#include "test_utils.hpp" +// Third party libraries +#include +#include +#include +#include +#include +#include +#include +// ========================================================================== // + + + +int main() +{ + using container_type = std::vector; + const int container_size = 1 << 24; + container_type bitcont = make_random_container(container_size); + auto first = bit::bit_iterator(std::begin(bitcont)); + auto last = bit::bit_iterator(std::end(bitcont)); + auto n = 1 << 10; + for (int i = 0; i < 100; i++) + { + if (i % 10 == 0) + std::cerr << i << "\n"; + bit::shift_left(first + 2, last, n + 4); + } + return 0; +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 07ab0a88..e84ae521 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -23,9 +23,11 @@ endif() # specify test-specific libraries include_directories(${googletest_SOURCE_DIR}/googletest/include/gtest src/utils) -target_link_libraries(bitlib-tests PUBLIC gtest gtest_main -pthread -lgcov --coverage) +target_link_libraries(bitlib-tests PUBLIC GTest::gtest GTest::gtest_main -pthread -lgcov --coverage) -set(BITLIB_GTEST_REPEAT 1) +if (NOT BITLIB_GTEST_REPEAT) + set(BITLIB_GTEST_REPEAT 1) +endif() enable_testing() gtest_discover_tests( diff --git a/test/src/fixtures.hpp 
b/test/src/fixtures.hpp index 1c9e00fd..b8cd5942 100644 --- a/test/src/fixtures.hpp +++ b/test/src/fixtures.hpp @@ -1,7 +1,7 @@ // =============================== FIXTURES ================================= // // Project: The Experimental Bit Algorithms Library // Description: Fixtures for testing -// Creator: Bryce Kille [2019] +// Contributor(s): Bryce Kille // License: BSD 3-Clause License // ========================================================================== // #ifndef _FIXTURES_HPP_INCLUDED diff --git a/test/src/test-rotate.cpp b/test/src/test-rotate.cpp index fdc93736..8ae3a554 100644 --- a/test/src/test-rotate.cpp +++ b/test/src/test-rotate.cpp @@ -2,8 +2,7 @@ // Project: The Experimental Bit Algorithms Library // Name: rotate.hpp // Description: Tests for rotate algorithms -// Creator: Vincent Reverdy -// Contributor(s): Bryce Kille [2019] +// Contributor(s): Bryce Kille // License: BSD 3-Clause License // ========================================================================== // diff --git a/test/src/vector_test.cpp b/test/src/vector_test.cpp index c7dff718..666b031d 100644 --- a/test/src/vector_test.cpp +++ b/test/src/vector_test.cpp @@ -1,7 +1,7 @@ // =============================== FIXTURES ================================= // // Project: The Experimental Bit Algorithms Library // Description: Fixtures for testing -// Creator: Bryce Kille [2019] +// Contributor(s): Bryce Kille // License: BSD 3-Clause License // ========================================================================== //