ENH: float64 sin/cos using Numpy intrinsics #23399


Merged (11 commits, Apr 25, 2023)
4 changes: 2 additions & 2 deletions numpy/core/code_generators/generate_umath.py
@@ -799,7 +799,7 @@ def english_upper(s):
None,
TD('e', dispatch=[('loops_umath_fp', 'e')]),
TD('f', dispatch=[('loops_trigonometric', 'f')]),
TD('d', dispatch=[('loops_umath_fp', 'd')]),
TD('d', dispatch=[('loops_trigonometric', 'd')]),
TD('g' + cmplx, f='cos'),
TD(P, f='cos'),
),
@@ -809,7 +809,7 @@ def english_upper(s):
None,
TD('e', dispatch=[('loops_umath_fp', 'e')]),
TD('f', dispatch=[('loops_trigonometric', 'f')]),
TD('d', dispatch=[('loops_umath_fp', 'd')]),
TD('d', dispatch=[('loops_trigonometric', 'd')]),
TD('g' + cmplx, f='sin'),
TD(P, f='sin'),
),
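
Editor's note: the two 'd' (float64) TD entries above reroute the sin/cos inner loops from the loops_umath_fp dispatch (the SVML-backed path) to the new kernels in loops_trigonometric shown below.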
8 changes: 8 additions & 0 deletions numpy/core/include/numpy/npy_common.h
@@ -105,6 +105,14 @@
#define NPY_FINLINE static
#endif

#if defined(_MSC_VER)
#define NPY_NOINLINE static __declspec(noinline)
#elif defined(__GNUC__) || defined(__clang__)
#define NPY_NOINLINE static __attribute__((noinline))
#else
#define NPY_NOINLINE static
#endif
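
/*
 * Editor's note (illustrative, not part of this patch): NPY_NOINLINE is the
 * counterpart of NPY_FINLINE above. This patch applies it to the scalar
 * fallback helpers in loops_trigonometric.dispatch.c.src so the cold path
 * stays out of the inlined SIMD loop, e.g.:
 *
 *     NPY_NOINLINE npyv_f64
 *     simd_cos_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits);
 */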

#ifdef HAVE___THREAD
#define NPY_TLS __thread
#else
16 changes: 6 additions & 10 deletions numpy/core/src/umath/loops.h.src
@@ -335,15 +335,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_@func@,

/**end repeat**/

/**begin repeat
* #func = sin, cos#
*/

NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))

/**end repeat**/

/**begin repeat
* #TYPE = FLOAT, DOUBLE#
*/
@@ -360,12 +351,17 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_trigonometric.dispatch.h"
#endif

/**begin repeat
* #TYPE = FLOAT, DOUBLE#
*/
/**begin repeat1
* #func = sin, cos#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, (
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, (
char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
))
/**end repeat1**/
/**end repeat**/

#ifndef NPY_DISABLE_OPTIMIZATION
243 changes: 229 additions & 14 deletions numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -9,12 +9,18 @@
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "fast_loop_macros.h"
/*
* TODO:
* - use a vectorized Payne-Hanek style reduction for large elements, or when
*   there's no native fused multiply-add support, instead of falling back to libc
*/
#if NPY_SIMD_F32 && NPY_SIMD_FMA3 // native support
#if NPY_SIMD_FMA3 // native support
/**begin repeat
* #check = F64, F32#
* #sfx = f64, f32#
*/
#if NPY_SIMD_@check@
/*
* Vectorized Cody-Waite range reduction technique
* Performs the reduction step x* = x - y*C in three steps:
@@ -23,14 +29,189 @@
* 3) x* = x - y*c3
* c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
*/
NPY_FINLINE npyv_f32
simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_f32 c3)
NPY_FINLINE npyv_@sfx@
simd_range_reduction_@sfx@(npyv_@sfx@ x, npyv_@sfx@ y, npyv_@sfx@ c1, npyv_@sfx@ c2, npyv_@sfx@ c3)
{
npyv_f32 reduced_x = npyv_muladd_f32(y, c1, x);
reduced_x = npyv_muladd_f32(y, c2, reduced_x);
reduced_x = npyv_muladd_f32(y, c3, reduced_x);
npyv_@sfx@ reduced_x = npyv_muladd_@sfx@(y, c1, x);
reduced_x = npyv_muladd_@sfx@(y, c2, reduced_x);
reduced_x = npyv_muladd_@sfx@(y, c3, reduced_x);
return reduced_x;
}
#endif
/**end repeat**/
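
/*
 * Editor's note: an illustrative scalar equivalent of the template above,
 * not part of this patch. With the reduction constant C split as
 * -(c1 + c2 + c3), where c1 and c2 are exactly representable and c3 carries
 * the residual, each fma() keeps the intermediate rounding error small:
 */
static double scalar_range_reduction_f64(double x, double y,
                                         double c1, double c2, double c3)
{
    /* needs <math.h> for fma() */
    double r = fma(y, c1, x);  /* x - y*C, high part (c1 ~= -C) */
    r = fma(y, c2, r);         /* apply the middle correction */
    r = fma(y, c3, r);         /* apply the low correction */
    return r;
}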

#if NPY_SIMD_F64
/**begin repeat
* #op = cos, sin#
*/
#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
NPY_FINLINE npyv_f64
#else
NPY_NOINLINE npyv_f64
#endif
simd_@op@_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits)
{
// MSVC can't compile direct access to vector lanes, and there are no
// npyv_get_lane/npyv_set_lane intrinsics, so copy the vector through memory
npy_double NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) out_copy[npyv_nlanes_f64];
npyv_storea_f64(out_copy, out);

for (unsigned i = 0; i < npyv_nlanes_f64; ++i) {
if (cmp_bits & (1 << i)) {
out_copy[i] = npy_@op@(out_copy[i]);
}
}

return npyv_loada_f64(out_copy);
}
/**end repeat**/

/*
* Approximate sine algorithm for x \in [-pi/2, pi/2]
* worst-case error is 3.5 ulp.
* abs error: 0x1.be222a58p-53 in [-pi/2, pi/2].
*/
NPY_FINLINE npyv_f64
simd_approx_sine_poly_f64(npyv_f64 r)
{
const npyv_f64 poly1 = npyv_setall_f64(-0x1.9f4a9c8b21dc9p-41);
const npyv_f64 poly2 = npyv_setall_f64(0x1.60e88a10163f2p-33);
const npyv_f64 poly3 = npyv_setall_f64(-0x1.ae6361b7254e7p-26);
const npyv_f64 poly4 = npyv_setall_f64(0x1.71de382e8d62bp-19);
const npyv_f64 poly5 = npyv_setall_f64(-0x1.a01a019aeb4ffp-13);
const npyv_f64 poly6 = npyv_setall_f64(0x1.111111110b25ep-7);
const npyv_f64 poly7 = npyv_setall_f64(-0x1.55555555554c3p-3);

npyv_f64 r2 = npyv_mul_f64(r, r);
npyv_f64 y = npyv_muladd_f64(poly1, r2, poly2);
y = npyv_muladd_f64(y, r2, poly3);
y = npyv_muladd_f64(y, r2, poly4);
y = npyv_muladd_f64(y, r2, poly5);
y = npyv_muladd_f64(y, r2, poly6);
y = npyv_muladd_f64(y, r2, poly7);
y = npyv_muladd_f64(npyv_mul_f64(y, r2), r, r);

return y;
}
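
/*
 * Editor's note: a scalar reading of the muladd chain above (illustrative,
 * not part of this patch). It is Horner's scheme on r2 = r*r:
 *   sin(r) ~= r + r^3*(p7 + r2*(p6 + r2*(p5 + r2*(p4 + r2*(p3 + r2*(p2 + r2*p1))))))
 * an odd polynomial whose low-order coefficients approach the Taylor values
 * (p7 ~= -1/3!, p6 ~= 1/5!):
 */
static double scalar_sin_poly(double r)
{
    const double p1 = -0x1.9f4a9c8b21dc9p-41, p2 =  0x1.60e88a10163f2p-33,
                 p3 = -0x1.ae6361b7254e7p-26, p4 =  0x1.71de382e8d62bp-19,
                 p5 = -0x1.a01a019aeb4ffp-13, p6 =  0x1.111111110b25ep-7,
                 p7 = -0x1.55555555554c3p-3;
    double r2 = r * r;
    double y = p1 * r2 + p2;
    y = y * r2 + p3;
    y = y * r2 + p4;
    y = y * r2 + p5;
    y = y * r2 + p6;
    y = y * r2 + p7;
    return (y * r2) * r + r;   /* r + r^3 * poly(r^2) */
}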

/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
NPY_FINLINE npyv_f64
simd_range_reduction_pi2(npyv_f64 r, npyv_f64 n) {
const npyv_f64 pi1 = npyv_setall_f64(-0x1.921fb54442d18p+1);
const npyv_f64 pi2 = npyv_setall_f64(-0x1.1a62633145c06p-53);
const npyv_f64 pi3 = npyv_setall_f64(-0x1.c1cd129024e09p-106);

return simd_range_reduction_f64(r, n, pi1, pi2, pi3);
}

NPY_FINLINE npyv_b64 simd_cos_range_check_f64(npyv_u64 ir) {
const npyv_u64 tiny_bound = npyv_setall_u64(0x202); /* top12 (asuint64 (0x1p-509)). */
const npyv_u64 simd_thresh = npyv_setall_u64(0x214); /* top12 (asuint64 (RangeVal)) - SIMD_TINY_BOUND. */

return npyv_cmpge_u64(npyv_sub_u64(npyv_shri_u64(ir, 52), tiny_bound), simd_thresh);
}

NPY_FINLINE npyv_b64 simd_sin_range_check_f64(npyv_u64 ir) {
const npyv_f64 range_val = npyv_setall_f64(0x1p23);

return npyv_cmpge_u64(ir, npyv_reinterpret_u64_f64(range_val));
}
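
/*
 * Editor's note (illustrative, not part of this patch): both checks operate
 * on raw bits. The cos variant folds two tests into one unsigned compare:
 * ir >> 52 yields the 11 exponent bits of |x| ("top12" with a zero sign
 * bit), and the subtraction wraps around for tiny inputs, so a single cmpge
 * flags |x| < 0x1p-509 (top12 < 0x202), |x| >= 0x1p23 (top12 >= 0x416 =
 * 0x202 + 0x214), and inf/nan (top12 == 0x7ff). A scalar version:
 */
static int cos_needs_fallback(double x)
{
    uint64_t bits;
    memcpy(&bits, &x, sizeof(bits));          /* needs <stdint.h>, <string.h> */
    bits &= 0x7fffffffffffffffULL;            /* |x| */
    return (bits >> 52) - 0x202 >= 0x214;     /* unsigned wrap-around trick */
}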

NPY_FINLINE npyv_f64
simd_cos_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
{
const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2);
const npyv_f64 half_pi = npyv_setall_f64(0x1.921fb54442d18p+0);
const npyv_f64 shift = npyv_setall_f64(0x1.8p52);

/* n = rint((|x|+pi/2)/pi) - 0.5. */
npyv_f64 n = npyv_muladd_f64(inv_pi, npyv_add_f64(r, half_pi), shift);
npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
n = npyv_sub_f64(n, shift);
n = npyv_sub_f64(n, npyv_setall_f64(0.5));

/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = simd_range_reduction_pi2(r, n);

/* sin(r) poly approx. */
npyv_f64 y = simd_approx_sine_poly_f64(r);

/* sign. */
return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), odd));
}

NPY_FINLINE npyv_f64
simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
{
const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2);
const npyv_f64 shift = npyv_setall_f64(0x1.8p52);

/* n = rint(|x|/pi). */
npyv_f64 n = npyv_muladd_f64(inv_pi, r, shift);
npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
n = npyv_sub_f64(n, shift);

/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = simd_range_reduction_pi2(r, n);

/* sin(r) poly approx. */
npyv_f64 y = simd_approx_sine_poly_f64(r);

/* sign. */
return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), sign), odd));
}
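
/*
 * Editor's note (illustrative, not part of this patch): the constant
 * 0x1.8p52 = 1.5 * 2^52 implements rint() and parity extraction in one go.
 * For |v| < 2^51, v + 0x1.8p52 lands in a binade where the ulp is 1, so
 * round-to-nearest leaves rint(v) in the low mantissa bits; shifting the
 * lowest bit up to bit 63 yields the sign mask for the (-1)^n factor in
 * sin(x) = (-1)^n * sin(x - n*pi). A scalar version (round-to-nearest
 * assumed):
 */
static double rint_and_parity(double v, uint64_t *sign_mask)
{
    const double shift = 0x1.8p52;
    double n = v + shift;             /* low mantissa bit now = parity */
    uint64_t bits;
    memcpy(&bits, &n, sizeof(bits));  /* needs <stdint.h>, <string.h> */
    *sign_mask = bits << 63;          /* 0 or 0x8000000000000000 */
    return n - shift;                 /* rint(v), exact */
}
/*
 * Worked example: for x = 100.0, n = rint(100/pi) = 32 and
 * r = 100 - 32*pi ~= -0.53096; n is even, so sin(100) ~= sin(r) ~= -0.50637,
 * which matches libm within the quoted 3.5 ulp bound.
 */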

/**begin repeat
* #op = cos, sin#
*/
NPY_FINLINE void
simd_@op@_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
{
const npyv_u64 abs_mask = npyv_setall_u64(0x7fffffffffffffff);
const int vstep = npyv_nlanes_f64;

npyv_f64 out = npyv_zero_f64();
npyv_f64 x_in;

for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
if (ssrc == 1) {
x_in = npyv_load_tillz_f64(src, len);
} else {
x_in = npyv_loadn_tillz_f64(src, ssrc, len);
}

npyv_u64 ir = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), abs_mask);
npyv_f64 r = npyv_reinterpret_f64_u64(ir);
npyv_u64 sign = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), npyv_not_u64(abs_mask));

npyv_b64 cmp = simd_@op@_range_check_f64(ir);
/* To trigger fenv exceptions correctly, set any special lanes to 1
   (which is neutral w.r.t. fenv). These lanes will be fixed by the
   scalar loop later. */
r = npyv_select_f64(cmp, npyv_setall_f64(1.0), r);

// Some lanes are in range, so at least one vector computation is useful
if (!npyv_all_b64(cmp)) {
out = simd_@op@_poly_f64(r, ir, sign);
}

if (npyv_any_b64(cmp)) {
out = npyv_select_f64(cmp, x_in, out);
out = simd_@op@_scalar_f64(out, npyv_tobits_b64(cmp));
}

if (sdst == 1) {
npyv_store_till_f64(dst, len, out);
} else {
npyv_storen_till_f64(dst, sdst, len, out);
}
}
npyv_cleanup();
}
/**end repeat**/
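
/*
 * Editor's note: a scalar paraphrase of one loop iteration above
 * (illustrative names, not part of this patch):
 *
 *     ax      = fabs(x[i]);
 *     special = range_check(ax);           // tiny/huge/inf/nan lanes
 *     r       = special ? 1.0 : ax;        // 1.0 raises no fenv flags
 *     out[i]  = poly(r, sign_of(x[i]));    // SIMD fast path
 *     if (special)
 *         out[i] = npy_cos(x[i]);          // or npy_sin: libm fallback
 *
 * Per vector register: npyv_select substitutes the neutral value, the poly
 * kernel is skipped only when every lane is special (npyv_all_b64), and
 * the noinline scalar helper patches the flagged lanes afterwards.
 */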
#endif // NPY_SIMD_F64

#if NPY_SIMD_F32
/*
* Approximate cosine algorithm for x \in [-PI/4, PI/4]
* Maximum ULP across all 32-bit floats = 0.875
@@ -198,24 +379,58 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
}
npyv_cleanup();
}
#endif // NPY_SIMD_FMA3
#endif // NPY_SIMD_F32
#endif // NPY_SIMD_FMA3

/**begin repeat
* #func = cos, sin#
* #enum = SIMD_COMPUTE_COS, SIMD_COMPUTE_SIN#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if NPY_SIMD_F64 && NPY_SIMD_FMA3
const double *src = (double*)args[0];
double *dst = (double*)args[1];
const int lsize = sizeof(src[0]);
const npy_intp ssrc = steps[0] / lsize;
const npy_intp sdst = steps[1] / lsize;
npy_intp len = dimensions[0];
assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));

if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
!npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)
) {
for (; len > 0; --len, src += ssrc, dst += sdst) {
simd_@func@_f64(src, 1, dst, 1, 1);
Review comment (Member): Could we add test(s) for this? Seems easy enough.
}
} else {
simd_@func@_f64(src, ssrc, dst, sdst, len);
}
#else
UNARY_LOOP {
const npy_double in1 = *(npy_double *)ip1;
*(npy_double *)op1 = npy_@func@(in1);
}
#endif
}
/**end repeat**/

/**begin repeat
* #func = sin, cos#
* #enum = SIMD_COMPUTE_SIN, SIMD_COMPUTE_COS#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
const float *src = (float*)args[0];
float *dst = (float*)args[1];
#if NPY_SIMD_F32 && NPY_SIMD_FMA3
const npy_float *src = (npy_float*)args[0];
npy_float *dst = (npy_float*)args[1];

const int lsize = sizeof(src[0]);
const npy_intp ssrc = steps[0] / lsize;
const npy_intp sdst = steps[1] / lsize;
npy_intp len = dimensions[0];
assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
#if NPY_SIMD_F32 && NPY_SIMD_FMA3
if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
) {
Expand All @@ -226,9 +441,9 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
simd_sincos_f32(src, ssrc, dst, sdst, len, @enum@);
}
#else
for (; len > 0; --len, src += ssrc, dst += sdst) {
const float src0 = *src;
*dst = npy_@func@f(src0);
UNARY_LOOP {
const npy_float in1 = *(npy_float *)ip1;
*(npy_float *)op1 = npy_@func@f(in1);
}
#endif
}