Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
ENH: Add SIMD sin/cos implementation with numpy-simd-routines
  numpy-simd-routines is added as a subrepo in the meson subprojects
  directory. The current FP configuration is static: ~1ulp is used for
  double-precision and ~4ulp for single-precision, with handling of
  floating-point errors and special cases, and extended precision for
  large arguments; subnormals are enabled by default too.

  numpy-simd-routines supports all SIMD extensions that are supported
  by Google Highway including non-FMA extensions and is fully independent
  from libm to guarantee unified results across all compilers and
  platforms.

  Full benchmarks will be provided within the pull-request, the following
  benchmark was tested against clang-19 and x86 CPU (Ryzen7 7700X)
  with AVX512 enabled.

  Note: there was no SIMD optimization enabled for sin/cos
  for double-precision, only for single-precision.

  | Before        | After       |  Ratio | Benchmark (Parameter)                    |
  |---------------|-------------|--------|------------------------------------------|
  | 713±6μs       | 633±6μs     |   0.89 | UnaryFP(<ufunc 'cos'>, 1, 2, 'f')        |
  | 717±9μs       | 637±6μs     |   0.89 | UnaryFP(<ufunc 'cos'>, 4, 1, 'f')        |
  | 705±3μs       | 607±10μs    |   0.86 | UnaryFP(<ufunc 'sin'>, 4, 1, 'f')        |
  | 714±10μs      | 595±0.5μs   |   0.83 | UnaryFP(<ufunc 'sin'>, 1, 2, 'f')        |
  | 370±0.3μs     | 277±4μs     |   0.75 | UnaryFP(<ufunc 'cos'>, 1, 1, 'f')        |
  | 373±2μs       | 236±0.6μs   |   0.63 | UnaryFP(<ufunc 'sin'>, 1, 1, 'f')        |
  | 1.06±0.01ms   | 648±3μs     |   0.61 | UnaryFP(<ufunc 'cos'>, 4, 2, 'f')        |
  | 1.06±0.01ms   | 617±30μs    |   0.58 | UnaryFP(<ufunc 'sin'>, 4, 2, 'f')        |
  | 5.06±0.06ms   | 2.61±0.3ms  |   0.52 | UnaryFPSpecial(<ufunc 'cos'>, 4, 2, 'd') |
  | 1.48±0ms      | 715±5μs     |   0.48 | UnaryFPSpecial(<ufunc 'sin'>, 1, 2, 'f') |
  | 1.50±0.01ms   | 639±6μs     |   0.43 | UnaryFPSpecial(<ufunc 'cos'>, 1, 2, 'f') |
  | 5.15±0.1ms    | 1.96±0.01ms |   0.38 | UnaryFPSpecial(<ufunc 'cos'>, 4, 1, 'd') |
  | 5.72±0.02ms   | 2.09±0.1ms  |   0.37 | UnaryFP(<ufunc 'cos'>, 4, 2, 'd')        |
  | 5.76±0.01ms   | 2.03±0.08ms |   0.35 | UnaryFP(<ufunc 'sin'>, 4, 2, 'd')        |
  | 5.07±0.08ms   | 1.76±0.2ms  |   0.35 | UnaryFPSpecial(<ufunc 'cos'>, 1, 2, 'd') |
  | 6.04±0.04ms   | 2.05±0.09ms |   0.34 | UnaryFPSpecial(<ufunc 'sin'>, 4, 2, 'd') |
  | 5.79±0.03ms   | 1.90±0.2ms  |   0.33 | UnaryFP(<ufunc 'sin'>, 4, 1, 'd')        |
  | 2.29±0.1ms    | 762±40μs    |   0.33 | UnaryFPSpecial(<ufunc 'sin'>, 4, 1, 'f') |
  | 5.72±0.1ms    | 1.75±0.07ms |   0.31 | UnaryFP(<ufunc 'cos'>, 4, 1, 'd')        |
  | 6.04±0.03ms   | 1.82±0.2ms  |   0.3  | UnaryFPSpecial(<ufunc 'sin'>, 4, 1, 'd') |
  | 2.49±0.1ms    | 748±30μs    |   0.3  | UnaryFPSpecial(<ufunc 'sin'>, 4, 2, 'f') |
  | 2.23±0.1ms    | 634±6μs     |   0.28 | UnaryFPSpecial(<ufunc 'cos'>, 4, 1, 'f') |
  | 1.31±0.03ms   | 367±5μs     |   0.28 | UnaryFPSpecial(<ufunc 'sin'>, 1, 1, 'f') |
  | 2.55±0.09ms   | 654±30μs    |   0.26 | UnaryFPSpecial(<ufunc 'cos'>, 4, 2, 'f') |
  | 4.97±0.03ms   | 1.14±0ms    |   0.23 | UnaryFPSpecial(<ufunc 'cos'>, 1, 1, 'd') |
  | 5.67±0.01ms   | 1.22±0.03ms |   0.22 | UnaryFP(<ufunc 'cos'>, 1, 2, 'd')        |
  | 5.76±0.03ms   | 1.28±0.06ms |   0.22 | UnaryFP(<ufunc 'sin'>, 1, 2, 'd')        |
  | 1.26±0.01ms   | 272±2μs     |   0.22 | UnaryFPSpecial(<ufunc 'cos'>, 1, 1, 'f') |
  | 7.03±0.02ms   | 1.31±0.01ms |   0.19 | UnaryFPSpecial(<ufunc 'sin'>, 1, 2, 'd') |
  | 5.67±0.01ms   | 810±9μs     |   0.14 | UnaryFP(<ufunc 'cos'>, 1, 1, 'd')        |
  | 5.71±0.01ms   | 817±40μs    |   0.14 | UnaryFP(<ufunc 'sin'>, 1, 1, 'd')        |
  | 7.05±0.03ms   | 915±4μs     |   0.13 | UnaryFPSpecial(<ufunc 'sin'>, 1, 1, 'd') |
  • Loading branch information
seiko2plus committed Sep 6, 2025
commit a3f746ee05adc6b6448c6627e925df8dc7753a94
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@
[submodule "numpy/_core/src/common/pythoncapi-compat"]
path = numpy/_core/src/common/pythoncapi-compat
url = https://github.com/python/pythoncapi-compat
[submodule "subprojects/numpy-simd-routines/src"]
path = subprojects/numpy-simd-routines/src
url = https://github.com/numpy/numpy-simd-routines
22 changes: 17 additions & 5 deletions numpy/_core/include/numpy/npy_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,24 @@
#endif
#endif

#if defined(_MSC_VER)
#define NPY_NOINLINE static __declspec(noinline)
#elif defined(__GNUC__) || defined(__clang__)
#define NPY_NOINLINE static __attribute__((noinline))
#ifdef _MSC_VER
#ifdef __cplusplus
#define NPY_NOINLINE __declspec(noinline)
#else
#define NPY_NOINLINE static __declspec(noinline)
#endif
#elif defined(__GNUC__)
#ifdef __cplusplus
#define NPY_NOINLINE __attribute__((noinline))
#else
#define NPY_NOINLINE static __attribute__((noinline))
#endif
#else
#define NPY_NOINLINE static
#ifdef __cplusplus
#define NPY_NOINLINE inline
#else
#define NPY_NOINLINE static inline
#endif
#endif

#ifdef __cplusplus
Expand Down
5 changes: 4 additions & 1 deletion numpy/_core/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -918,6 +918,9 @@ umath_gen_headers = [
src_file.process('src/umath/loops_utils.h.src'),
]

# Configure the 'numpy-simd-routines' subproject and fetch the dependency
# object it exports under the variable name 'npsr_dep'.
npsr = subproject('numpy-simd-routines')
npsr_dep = npsr.get_variable('npsr_dep')

foreach gen_mtargets : [
[
'loops_arithm_fp.dispatch.h',
Expand Down Expand Up @@ -1082,7 +1085,7 @@ foreach gen_mtargets : [
dispatch: gen_mtargets[2],
baseline: CPU_BASELINE,
prefix: 'NPY_',
dependencies: [py_dep, np_core_dep],
dependencies: [py_dep, np_core_dep, npsr_dep],
c_args: c_args_common + max_opt,
cpp_args: cpp_args_common + max_opt,
include_directories: [
Expand Down
16 changes: 14 additions & 2 deletions numpy/_core/src/common/simd/simd.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#ifndef NUMPY__CORE_SRC_COMMON_SIMD_SIMD_HPP_
#define NUMPY__CORE_SRC_COMMON_SIMD_SIMD_HPP_

/**
* This header provides a thin wrapper over Google's Highway SIMD library.
*
Expand All @@ -19,7 +18,9 @@
*/
#ifndef NPY_DISABLE_OPTIMIZATION
#include <hwy/highway.h>

#include <npsr/npsr.h>
#include <type_traits>
#include <limits>
/**
* We avoid using Highway scalar operations for the following reasons:
*
Expand Down Expand Up @@ -67,6 +68,17 @@ namespace hn = hwy::HWY_NAMESPACE;
// internally used by the template header
template <typename TLane>
using _Tag = hn::ScalableTag<TLane>;

/// NumPy SIMD Routines namespace alias
/// npsr is tag free by design so we only include it within main namespace (np::simd)
namespace sr = npsr::HWY_NAMESPACE;
/// Default precision configurations for NumPy SIMD Routines.
///
/// Resolves to the type of a default-constructed `npsr::Precise` when `T` is
/// `double` or `long double`, and to the type of
/// `npsr::Precise{npsr::kLowAccuracy}` for every other `T`.
/// NOTE(review): presumably the default configuration is the higher-accuracy
/// one intended for double precision and `kLowAccuracy` the relaxed one
/// intended for single precision; confirm against the npsr headers.
template <typename T>
using Precise = std::conditional_t<
std::is_same_v<T, double> || std::is_same_v<T, long double>,
decltype(npsr::Precise{}),
decltype(npsr::Precise{npsr::kLowAccuracy})
>;
#endif
#include "simd.inc.hpp"
} // namespace simd
Expand Down
14 changes: 14 additions & 0 deletions numpy/_core/src/common/simd/simd.inc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ LoadU(const TLane *ptr)
return hn::LoadU(_Tag<TLane>(), ptr);
}

/// Partial load: forwards to `hn::LoadN` using the default tag for `TLane`,
/// passing `ptr` and `n` through unchanged.
/// NOTE(review): Highway documents `LoadN` as loading at most `n` lanes and
/// zeroing the remaining lanes; confirm against the bundled Highway version.
template <typename TLane>
HWY_API Vec<TLane>
LoadN(const TLane *ptr, size_t n)
{
return hn::LoadN(_Tag<TLane>(), ptr, n);
}

/// Unaligned store of a vector to memory.
template <typename TLane>
HWY_API void
Expand All @@ -59,6 +66,13 @@ StoreU(const Vec<TLane> &a, TLane *ptr)
hn::StoreU(a, _Tag<TLane>(), ptr);
}

/// Partial store: forwards to `hn::StoreN` using the default tag for `TLane`,
/// passing `a`, `ptr` and `n` through unchanged.
/// NOTE(review): Highway documents `StoreN` as writing at most the first `n`
/// lanes of the vector; confirm against the bundled Highway version.
template <typename TLane>
HWY_API void
StoreN(const Vec<TLane> &a, TLane *ptr, size_t n)
{
hn::StoreN(a, _Tag<TLane>(), ptr, n);
}

/// Returns the number of vector lanes based on the lane type.
template <typename TLane>
HWY_API HWY_LANES_CONSTEXPR size_t
Expand Down
Loading
Loading