From 179038f94c213b528fbfb7ac6d55c94f95d18a62 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Sat, 7 Nov 2020 13:28:07 +0530
Subject: [PATCH 01/32] ENH: Added libdiv

---
 numpy/core/setup.py                     |  7 +++
 numpy/core/src/umath/fast_loop_macros.h | 13 +++++-
 numpy/core/src/umath/loops.c.src        | 58 +++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 68aa0a8513fb..a3eb16a5ceae 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -386,6 +386,10 @@ def check_mathlib(config_cmd):
                                "MATHLIB env variable")
     return mathlibs

+def check_libdivide():
+    return os.environ.get('NPY_USE_LIBDIVIDE') is not None
+
+
 def visibility_define(config):
     """Return the define value to use for NPY_VISIBILITY_HIDDEN (may be empty
     string)."""
@@ -442,6 +446,9 @@ def generate_config_h(ext, build_dir):
             mathlibs = check_mathlib(config_cmd)
             moredefs.append(('MATHLIB', ','.join(mathlibs)))

+            # Check if libdivide needs to be used
+            check_libdivide() and moredefs.append('USE_LIBDIVIDE')
+
             check_math_capabilities(config_cmd, ext, moredefs, mathlibs)
             moredefs.extend(cocache.check_ieee_macros(config_cmd)[0])
             moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[0])
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index 74bf016436dd..6fe0824cb882 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -46,12 +46,21 @@ abs_ptrdiff(char *a, char *b)
     npy_intp i;\
     for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)

-/** (ip1, ip2) -> (op1) */
-#define BINARY_LOOP\
+#define BINARY_LOOP_BASE\
     char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
     npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
     npy_intp n = dimensions[0];\
     npy_intp i;\
+
+#define BINARY_LOOP_FIXED\
+    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)
+
+#define BINARY_LOOP_SLIDING\
+    for(i = 0; i < n; i++, ip1 += is1, op1 += os1)
+
+/** (ip1, ip2) -> (op1) */
+#define BINARY_LOOP\
+    BINARY_LOOP_BASE\
     for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)

 /** (ip1, ip2) -> (op1, op2) */
 #define BINARY_LOOP_TWO_OUT\
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index ef3d5a21a413..d30f5a64ca0a 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -19,6 +19,7 @@
 #include "ufunc_object.h"

 #include <string.h> /* for memchr */
+#include

 /*
  * cutoff blocksize for pairwise summation
@@ -826,6 +827,7 @@ NPY_NO_EXPORT void
  * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
  * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
  * #c = ,,,l,ll#
+ * #div = s32, s32, s32, s64, s64#
  */

 NPY_NO_EXPORT NPY_GCC_OPT_3 void
@@ -840,6 +842,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
     UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
 }

+#ifndef USE_LIBDIVIDE
 NPY_NO_EXPORT void
 @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -865,6 +868,61 @@ NPY_NO_EXPORT void
         }
     }
 }
+#else
+NPY_NO_EXPORT void
+@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_BASE
+
+    if(!is2) {
+        struct libdivide_@div@_t fast_d = libdivide_@div@_gen(*(int*)ip2);
+        const @type@ in2 = *(@type@ *)ip2;
+        BINARY_LOOP_FIXED {
+            const @type@ in1 = *(@type@ *)ip1;
+            /*
+             * FIXME: On x86 at least, dividing the smallest representable integer
+             * by -1 causes a SIGFPE (division overflow). We treat this case here
+             * (to avoid a SIGFPE crash at python level), but a good solution would
+             * be to treat integer division problems separately from FPU exceptions
+             * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+             */
+            if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
+                npy_set_floatstatus_divbyzero();
+                *((@type@ *)op1) = 0;
+            }
+            else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
+                *((@type@ *)op1) = libdivide_@div@_do(in1, &fast_d) - 1;
+            }
+            else {
+                *((@type@ *)op1) = libdivide_@div@_do(in1, &fast_d);
+            }
+        }
+    }
+    else {
+        BINARY_LOOP_SLIDING { // XXX Lot of repeated code
+            const @type@ in1 = *(@type@ *)ip1;
+            const @type@ in2 = *(@type@ *)ip2;
+            /*
+             * FIXME: On x86 at least, dividing the smallest representable integer
+             * by -1 causes a SIGFPE (division overflow). We treat this case here
+             * (to avoid a SIGFPE crash at python level), but a good solution would
+             * be to treat integer division problems separately from FPU exceptions
+             * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+             */
+            if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
+                npy_set_floatstatus_divbyzero();
+                *((@type@ *)op1) = 0;
+            }
+            else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
+                *((@type@ *)op1) = in1/in2 - 1;
+            }
+            else {
+                *((@type@ *)op1) = in1/in2;
+            }
+        }
+    }
+}
+#endif

 NPY_NO_EXPORT void
 @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))

From e89175b20efe9383a805dc6515d615f3f8792f25 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Sat, 7 Nov 2020 21:58:09 +0530
Subject: [PATCH 02/32] ENH: Fixed typos in header | use in2 over ip2

---
 numpy/core/src/umath/fast_loop_macros.h | 6 +++---
 numpy/core/src/umath/loops.c.src        | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index 6fe0824cb882..90dcad3685d6 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -52,16 +52,16 @@ abs_ptrdiff(char *a, char *b)
     npy_intp n = dimensions[0];\
     npy_intp i;\

-#define BINARY_LOOP_FIXED\
+#define BINARY_LOOP_SLIDING\
     for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)

-#define BINARY_LOOP_SLIDING\
+#define BINARY_LOOP_FIXED\
     for(i = 0; i < n; i++, ip1 += is1, op1 += os1)

 /** (ip1, ip2) -> (op1) */
 #define BINARY_LOOP\
     BINARY_LOOP_BASE\
-    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)
+    BINARY_LOOP_SLIDING

 /** (ip1, ip2) -> (op1, op2) */
 #define BINARY_LOOP_TWO_OUT\
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index d30f5a64ca0a..ad50f021b926 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -875,8 +875,8 @@ NPY_NO_EXPORT void
     BINARY_LOOP_BASE

     if(!is2) {
-        struct libdivide_@div@_t fast_d = libdivide_@div@_gen(*(int*)ip2);
         const @type@ in2 = *(@type@ *)ip2;
+        struct libdivide_@div@_t fast_d = libdivide_@div@_gen(in2);
         BINARY_LOOP_FIXED {
             const @type@ in1 = *(@type@ *)ip1;
             /*

From 565759be07004e1994ce8497f5573fd73cded7d0 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Sun, 8 Nov 2020 12:13:11 +0530
Subject: [PATCH 03/32] ENH: Added optimal divisor

---
 numpy/core/setup.py              |   5 ++
 numpy/core/src/umath/loops.c.src | 100 ++++++++++++++++++++++-------
 2 files changed, 82 insertions(+), 23 deletions(-)

diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index a3eb16a5ceae..ca108863c355 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -389,6 +389,8 @@ def check_mathlib(config_cmd):
 def check_libdivide():
     return os.environ.get('NPY_USE_LIBDIVIDE') is not None

+def check_optimal_divisor():
+    return os.environ.get('NPY_USE_OPTIMAL_DIVISOR') is not None

 def visibility_define(config):
     """Return the define value to use for NPY_VISIBILITY_HIDDEN (may be empty
@@ -449,6 +451,9 @@ def generate_config_h(ext, build_dir):
             # Check if libdivide needs to be used
             check_libdivide() and moredefs.append('USE_LIBDIVIDE')

+            # Check if optimal divisor code needs to be used
+            check_optimal_divisor() and moredefs.append('USE_OPTIMAL_DIVISOR')
+
             check_math_capabilities(config_cmd, ext, moredefs, mathlibs)
             moredefs.extend(cocache.check_ieee_macros(config_cmd)[0])
             moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[0])
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index ad50f021b926..3a7543b99421 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -842,33 +842,61 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
     UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
 }

-#ifndef USE_LIBDIVIDE
+#ifdef USE_LIBDIVIDE
 NPY_NO_EXPORT void
 @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    BINARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
+    BINARY_LOOP_BASE
+
+    if(!is2) {
         const @type@ in2 = *(@type@ *)ip2;
-        /*
-         * FIXME: On x86 at least, dividing the smallest representable integer
-         * by -1 causes a SIGFPE (division overflow). We treat this case here
-         * (to avoid a SIGFPE crash at python level), but a good solution would
-         * be to treat integer division problems separately from FPU exceptions
-         * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
-         */
-        if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
-            npy_set_floatstatus_divbyzero();
-            *((@type@ *)op1) = 0;
-        }
-        else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
-            *((@type@ *)op1) = in1/in2 - 1;
+        struct libdivide_@div@_t fast_d = libdivide_@div@_gen(in2);
+        BINARY_LOOP_FIXED {
+            const @type@ in1 = *(@type@ *)ip1;
+            /*
+             * FIXME: On x86 at least, dividing the smallest representable integer
+             * by -1 causes a SIGFPE (division overflow). We treat this case here
+             * (to avoid a SIGFPE crash at python level), but a good solution would
+             * be to treat integer division problems separately from FPU exceptions
+             * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+             */
+            if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
+                npy_set_floatstatus_divbyzero();
+                *((@type@ *)op1) = 0;
+            }
+            else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
+                *((@type@ *)op1) = libdivide_@div@_do(in1, &fast_d) - 1;
+            }
+            else {
+                *((@type@ *)op1) = libdivide_@div@_do(in1, &fast_d);
+            }
         }
-        else {
-            *((@type@ *)op1) = in1/in2;
+        }
+    }
+    else {
+        BINARY_LOOP_SLIDING { // XXX Lot of repeated code
+            const @type@ in1 = *(@type@ *)ip1;
+            const @type@ in2 = *(@type@ *)ip2;
+            /*
+             * FIXME: On x86 at least, dividing the smallest representable integer
+             * by -1 causes a SIGFPE (division overflow). We treat this case here
+             * (to avoid a SIGFPE crash at python level), but a good solution would
+             * be to treat integer division problems separately from FPU exceptions
+             * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+             */
+            if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
+                npy_set_floatstatus_divbyzero();
+                *((@type@ *)op1) = 0;
+            }
+            else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
+                *((@type@ *)op1) = in1/in2 - 1;
+            }
+            else {
+                *((@type@ *)op1) = in1/in2;
+            }
+            }
         }
     }
 }
-#else
+#elif defined(USE_OPTIMAL_DIVISOR)
 NPY_NO_EXPORT void
 @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -876,7 +904,7 @@ NPY_NO_EXPORT void

     if(!is2) {
         const @type@ in2 = *(@type@ *)ip2;
-        struct libdivide_@div@_t fast_d = libdivide_@div@_gen(in2);
+        const float in2_f = (float) in2;
         BINARY_LOOP_FIXED {
             const @type@ in1 = *(@type@ *)ip1;
             /*
@@ -890,11 +918,11 @@ NPY_NO_EXPORT void
                 npy_set_floatstatus_divbyzero();
                 *((@type@ *)op1) = 0;
             }
-            else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
-                *((@type@ *)op1) = libdivide_@div@_do(in1, &fast_d) - 1;
+            else if ((in1 > 0) != (in2 > 0)) {
+                *((@type@ *)op1) = floor(in1/in2_f);
             }
             else {
-                *((@type@ *)op1) = libdivide_@div@_do(in1, &fast_d);
+                *((@type@ *)op1) = in1/in2;
             }
         }
     }
@@ -922,6 +950,32 @@ NPY_NO_EXPORT void
         }
     }
 }
+#else
+NPY_NO_EXPORT void
+@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+        /*
+         * FIXME: On x86 at least, dividing the smallest representable integer
+         * by -1 causes a SIGFPE (division overflow). We treat this case here
+         * (to avoid a SIGFPE crash at python level), but a good solution would
+         * be to treat integer division problems separately from FPU exceptions
+         * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+         */
+        if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
+            npy_set_floatstatus_divbyzero();
+            *((@type@ *)op1) = 0;
+        }
+        else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
+            *((@type@ *)op1) = in1/in2 - 1;
+        }
+        else {
+            *((@type@ *)op1) = in1/in2;
+        }
+    }
+}
 #endif

 NPY_NO_EXPORT void
 @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))

From d0c934cf1627eecdc2771e0dec945804669bb019 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Sun, 8 Nov 2020 13:59:29 +0530
Subject: [PATCH 04/32] ENH: Added libdivide header

---
 numpy/core/include/numpy/libdivide.h | 2079 ++++++++++++++++++++++++++
 numpy/core/src/umath/loops.c.src     |    7 +-
 2 files changed, 2085 insertions(+), 1 deletion(-)
 create mode 100644 numpy/core/include/numpy/libdivide.h

diff --git a/numpy/core/include/numpy/libdivide.h b/numpy/core/include/numpy/libdivide.h
new file mode 100644
index 000000000000..81057b7b43de
--- /dev/null
+++ b/numpy/core/include/numpy/libdivide.h
@@ -0,0 +1,2079 @@
+// libdivide.h - Optimized integer division
+// https://libdivide.com
+//
+// Copyright (C) 2010 - 2019 ridiculous_fish, <libdivide@ridiculousfish.com>
+// Copyright (C) 2016 - 2019 Kim Walisch, <kim.walisch@gmail.com>
+//
+// libdivide is dual-licensed under the Boost or zlib licenses.
+// You may use libdivide under the terms of either of these.
+// See LICENSE.txt for more details.
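+//
+// How the @TYPE@_divide loops above use this header: a divider struct is
+// generated once per scalar divisor, and each division in the hot loop is
+// then done with a multiply and shift via the *_do routine. A minimal,
+// illustrative sketch of that pattern (in/out/n are placeholder names; note
+// that libdivide_*_gen aborts on a zero divisor, so in2 == 0 must be handled
+// before a divider is generated):
+//
+//     struct libdivide_s32_t fast_d = libdivide_s32_gen(7);
+//     for (npy_intp i = 0; i < n; i++) {
+//         out[i] = libdivide_s32_do(in[i], &fast_d);  // same as in[i] / 7
+//     }
+//
+// The loops then turn C's truncated quotient into Python's floor quotient
+// by subtracting 1 when the operands' signs differ and the remainder is
+// nonzero, e.g. 7 / -2: truncation gives -3, floor division gives -4.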
+
+#ifndef LIBDIVIDE_H
+#define LIBDIVIDE_H
+
+#define LIBDIVIDE_VERSION "3.0"
+#define LIBDIVIDE_VERSION_MAJOR 3
+#define LIBDIVIDE_VERSION_MINOR 0
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+    #include <cstdlib>
+    #include <cstdio>
+    #include <type_traits>
+#else
+    #include <stdlib.h>
+    #include <stdio.h>
+#endif
+
+#if defined(LIBDIVIDE_AVX512)
+    #include <immintrin.h>
+#elif defined(LIBDIVIDE_AVX2)
+    #include <immintrin.h>
+#elif defined(LIBDIVIDE_SSE2)
+    #include <emmintrin.h>
+#endif
+
+#if defined(_MSC_VER)
+    #include <intrin.h>
+    // disable warning C4146: unary minus operator applied
+    // to unsigned type, result still unsigned
+    #pragma warning(disable: 4146)
+    #define LIBDIVIDE_VC
+#endif
+
+#if !defined(__has_builtin)
+    #define __has_builtin(x) 0
+#endif
+
+#if defined(__SIZEOF_INT128__)
+    #define HAS_INT128_T
+    // clang-cl on Windows does not yet support 128-bit division
+    #if !(defined(__clang__) && defined(LIBDIVIDE_VC))
+        #define HAS_INT128_DIV
+    #endif
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64)
+    #define LIBDIVIDE_X86_64
+#endif
+
+#if defined(__i386__)
+    #define LIBDIVIDE_i386
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+    #define LIBDIVIDE_GCC_STYLE_ASM
+#endif
+
+#if defined(__cplusplus) || defined(LIBDIVIDE_VC)
+    #define LIBDIVIDE_FUNCTION __FUNCTION__
+#else
+    #define LIBDIVIDE_FUNCTION __func__
+#endif
+
+#define LIBDIVIDE_ERROR(msg) \
+    do { \
+        fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", \
+            __LINE__, LIBDIVIDE_FUNCTION, msg); \
+        abort(); \
+    } while (0)
+
+#if defined(LIBDIVIDE_ASSERTIONS_ON)
+    #define LIBDIVIDE_ASSERT(x) \
+        do { \
+            if (!(x)) { \
+                fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", \
+                    __LINE__, LIBDIVIDE_FUNCTION, #x); \
+                abort(); \
+            } \
+        } while (0)
+#else
+    #define LIBDIVIDE_ASSERT(x)
+#endif
+
+#ifdef __cplusplus
+namespace libdivide {
+#endif
+
+// pack divider structs to prevent compilers from padding.
+// This reduces memory usage by up to 43% when using a large
+// array of libdivide dividers and improves performance
+// by up to 10% because of reduced memory bandwidth.
+#pragma pack(push, 1)
+
+struct libdivide_u32_t {
+    uint32_t magic;
+    uint8_t more;
+};
+
+struct libdivide_s32_t {
+    int32_t magic;
+    uint8_t more;
+};
+
+struct libdivide_u64_t {
+    uint64_t magic;
+    uint8_t more;
+};
+
+struct libdivide_s64_t {
+    int64_t magic;
+    uint8_t more;
+};
+
+struct libdivide_u32_branchfree_t {
+    uint32_t magic;
+    uint8_t more;
+};
+
+struct libdivide_s32_branchfree_t {
+    int32_t magic;
+    uint8_t more;
+};
+
+struct libdivide_u64_branchfree_t {
+    uint64_t magic;
+    uint8_t more;
+};
+
+struct libdivide_s64_branchfree_t {
+    int64_t magic;
+    uint8_t more;
+};
+
+#pragma pack(pop)
+
+// Explanation of the "more" field:
+//
+// * Bits 0-5 is the shift value (for shift path or mult path).
+// * Bit 6 is the add indicator for mult path.
+// * Bit 7 is set if the divisor is negative. We use bit 7 as the negative
+//   divisor indicator so that we can efficiently use sign extension to
+//   create a bitmask with all bits set to 1 (if the divisor is negative)
+//   or 0 (if the divisor is positive).
+// +// u32: [0-4] shift value +// [5] ignored +// [6] add indicator +// magic number of 0 indicates shift path +// +// s32: [0-4] shift value +// [5] ignored +// [6] add indicator +// [7] indicates negative divisor +// magic number of 0 indicates shift path +// +// u64: [0-5] shift value +// [6] add indicator +// magic number of 0 indicates shift path +// +// s64: [0-5] shift value +// [6] add indicator +// [7] indicates negative divisor +// magic number of 0 indicates shift path +// +// In s32 and s64 branchfree modes, the magic number is negated according to +// whether the divisor is negated. In branchfree strategy, it is not negated. + +enum { + LIBDIVIDE_32_SHIFT_MASK = 0x1F, + LIBDIVIDE_64_SHIFT_MASK = 0x3F, + LIBDIVIDE_ADD_MARKER = 0x40, + LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 +}; + +static inline struct libdivide_s32_t libdivide_s32_gen(int32_t d); +static inline struct libdivide_u32_t libdivide_u32_gen(uint32_t d); +static inline struct libdivide_s64_t libdivide_s64_gen(int64_t d); +static inline struct libdivide_u64_t libdivide_u64_gen(uint64_t d); + +static inline struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); +static inline struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); +static inline struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); +static inline struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); + +static inline int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom); +static inline uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom); +static inline int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom); +static inline uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom); + +static inline int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom); +static inline uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom); +static inline int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom); +static inline uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom); + +static inline int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); +static inline uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); +static inline int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); +static inline uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); + +static inline int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom); +static inline uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom); +static inline int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom); +static inline uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom); + +//////// Internal Utility Functions + +static inline uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { + uint64_t xl = x, yl = y; + uint64_t rl = xl * yl; + return (uint32_t)(rl >> 32); +} + +static inline int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { + int64_t xl = x, yl = y; + int64_t rl = xl * yl; + // needs to be arithmetic shift + return (int32_t)(rl >> 32); +} + +static inline uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { +#if defined(LIBDIVIDE_VC) && \ + 
defined(LIBDIVIDE_X86_64) + return __umulh(x, y); +#elif defined(HAS_INT128_T) + __uint128_t xl = x, yl = y; + __uint128_t rl = xl * yl; + return (uint64_t)(rl >> 64); +#else + // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + uint32_t mask = 0xFFFFFFFF; + uint32_t x0 = (uint32_t)(x & mask); + uint32_t x1 = (uint32_t)(x >> 32); + uint32_t y0 = (uint32_t)(y & mask); + uint32_t y1 = (uint32_t)(y >> 32); + uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); + uint64_t x0y1 = x0 * (uint64_t)y1; + uint64_t x1y0 = x1 * (uint64_t)y0; + uint64_t x1y1 = x1 * (uint64_t)y1; + uint64_t temp = x1y0 + x0y0_hi; + uint64_t temp_lo = temp & mask; + uint64_t temp_hi = temp >> 32; + + return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32); +#endif +} + +static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { +#if defined(LIBDIVIDE_VC) && \ + defined(LIBDIVIDE_X86_64) + return __mulh(x, y); +#elif defined(HAS_INT128_T) + __int128_t xl = x, yl = y; + __int128_t rl = xl * yl; + return (int64_t)(rl >> 64); +#else + // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + uint32_t mask = 0xFFFFFFFF; + uint32_t x0 = (uint32_t)(x & mask); + uint32_t y0 = (uint32_t)(y & mask); + int32_t x1 = (int32_t)(x >> 32); + int32_t y1 = (int32_t)(y >> 32); + uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); + int64_t t = x1 * (int64_t)y0 + x0y0_hi; + int64_t w1 = x0 * (int64_t)y1 + (t & mask); + + return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32); +#endif +} + +static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { +#if defined(__GNUC__) || \ + __has_builtin(__builtin_clz) + // Fast way to count leading zeros + return __builtin_clz(val); +#elif defined(LIBDIVIDE_VC) + unsigned long result; + if (_BitScanReverse(&result, val)) { + return 31 - result; + } + return 0; +#else + if (val == 0) + return 32; + int32_t result = 8; + uint32_t hi = 0xFFU << 24; + while ((val & hi) == 0) { + hi >>= 8; + result += 8; + } + while (val & hi) { + result -= 1; + hi <<= 1; + } + return result; +#endif +} + +static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { +#if defined(__GNUC__) || \ + __has_builtin(__builtin_clzll) + // Fast way to count leading zeros + return __builtin_clzll(val); +#elif defined(LIBDIVIDE_VC) && defined(_WIN64) + unsigned long result; + if (_BitScanReverse64(&result, val)) { + return 63 - result; + } + return 0; +#else + uint32_t hi = val >> 32; + uint32_t lo = val & 0xFFFFFFFF; + if (hi != 0) return libdivide_count_leading_zeros32(hi); + return 32 + libdivide_count_leading_zeros32(lo); +#endif +} + +// libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit +// uint {v}. The result must fit in 32 bits. +// Returns the quotient directly and the remainder in *r +static inline uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { +#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && \ + defined(LIBDIVIDE_GCC_STYLE_ASM) + uint32_t result; + __asm__("divl %[v]" + : "=a"(result), "=d"(*r) + : [v] "r"(v), "a"(u0), "d"(u1) + ); + return result; +#else + uint64_t n = ((uint64_t)u1 << 32) | u0; + uint32_t result = (uint32_t)(n / v); + *r = (uint32_t)(n - result * (uint64_t)v); + return result; +#endif +} + +// libdivide_128_div_64_to_64: divides a 128-bit uint {u1, u0} by a 64-bit +// uint {v}. The result must fit in 64 bits. 
+// Returns the quotient directly and the remainder in *r +static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { +#if defined(LIBDIVIDE_X86_64) && \ + defined(LIBDIVIDE_GCC_STYLE_ASM) + uint64_t result; + __asm__("divq %[v]" + : "=a"(result), "=d"(*r) + : [v] "r"(v), "a"(u0), "d"(u1) + ); + return result; +#elif defined(HAS_INT128_T) && \ + defined(HAS_INT128_DIV) + __uint128_t n = ((__uint128_t)u1 << 64) | u0; + uint64_t result = (uint64_t)(n / v); + *r = (uint64_t)(n - result * (__uint128_t)v); + return result; +#else + // Code taken from Hacker's Delight: + // http://www.hackersdelight.org/HDcode/divlu.c. + // License permits inclusion here per: + // http://www.hackersdelight.org/permissions.htm + + const uint64_t b = (1ULL << 32); // Number base (32 bits) + uint64_t un1, un0; // Norm. dividend LSD's + uint64_t vn1, vn0; // Norm. divisor digits + uint64_t q1, q0; // Quotient digits + uint64_t un64, un21, un10; // Dividend digit pairs + uint64_t rhat; // A remainder + int32_t s; // Shift amount for norm + + // If overflow, set rem. to an impossible value, + // and return the largest possible quotient + if (u1 >= v) { + *r = (uint64_t) -1; + return (uint64_t) -1; + } + + // count leading zeros + s = libdivide_count_leading_zeros64(v); + if (s > 0) { + // Normalize divisor + v = v << s; + un64 = (u1 << s) | (u0 >> (64 - s)); + un10 = u0 << s; // Shift dividend left + } else { + // Avoid undefined behavior of (u0 >> 64). + // The behavior is undefined if the right operand is + // negative, or greater than or equal to the length + // in bits of the promoted left operand. + un64 = u1; + un10 = u0; + } + + // Break divisor up into two 32-bit digits + vn1 = v >> 32; + vn0 = v & 0xFFFFFFFF; + + // Break right half of dividend into two digits + un1 = un10 >> 32; + un0 = un10 & 0xFFFFFFFF; + + // Compute the first quotient digit, q1 + q1 = un64 / vn1; + rhat = un64 - q1 * vn1; + + while (q1 >= b || q1 * vn0 > b * rhat + un1) { + q1 = q1 - 1; + rhat = rhat + vn1; + if (rhat >= b) + break; + } + + // Multiply and subtract + un21 = un64 * b + un1 - q1 * v; + + // Compute the second quotient digit + q0 = un21 / vn1; + rhat = un21 - q0 * vn1; + + while (q0 >= b || q0 * vn0 > b * rhat + un0) { + q0 = q0 - 1; + rhat = rhat + vn1; + if (rhat >= b) + break; + } + + *r = (un21 * b + un0 - q0 * v) >> s; + return q1 * b + q0; +#endif +} + +// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) +static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) { + if (signed_shift > 0) { + uint32_t shift = signed_shift; + *u1 <<= shift; + *u1 |= *u0 >> (64 - shift); + *u0 <<= shift; + } + else if (signed_shift < 0) { + uint32_t shift = -signed_shift; + *u0 >>= shift; + *u0 |= *u1 << (64 - shift); + *u1 >>= shift; + } +} + +// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. 
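+// (For intuition: u = 2^70 + 6, i.e. u_hi = 64, u_lo = 6, divided by
+// v = 2^65, i.e. v_hi = 2, v_lo = 0, returns 32 and leaves the remainder
+// 6 in *r_hi = 0, *r_lo = 6.)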
+static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { +#if defined(HAS_INT128_T) && \ + defined(HAS_INT128_DIV) + __uint128_t ufull = u_hi; + __uint128_t vfull = v_hi; + ufull = (ufull << 64) | u_lo; + vfull = (vfull << 64) | v_lo; + uint64_t res = (uint64_t)(ufull / vfull); + __uint128_t remainder = ufull - (vfull * res); + *r_lo = (uint64_t)remainder; + *r_hi = (uint64_t)(remainder >> 64); + return res; +#else + // Adapted from "Unsigned Doubleword Division" in Hacker's Delight + // We want to compute u / v + typedef struct { uint64_t hi; uint64_t lo; } u128_t; + u128_t u = {u_hi, u_lo}; + u128_t v = {v_hi, v_lo}; + + if (v.hi == 0) { + // divisor v is a 64 bit value, so we just need one 128/64 division + // Note that we are simpler than Hacker's Delight here, because we know + // the quotient fits in 64 bits whereas Hacker's Delight demands a full + // 128 bit quotient + *r_hi = 0; + return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo); + } + // Here v >= 2**64 + // We know that v.hi != 0, so count leading zeros is OK + // We have 0 <= n <= 63 + uint32_t n = libdivide_count_leading_zeros64(v.hi); + + // Normalize the divisor so its MSB is 1 + u128_t v1t = v; + libdivide_u128_shift(&v1t.hi, &v1t.lo, n); + uint64_t v1 = v1t.hi; // i.e. v1 = v1t >> 64 + + // To ensure no overflow + u128_t u1 = u; + libdivide_u128_shift(&u1.hi, &u1.lo, -1); + + // Get quotient from divide unsigned insn. + uint64_t rem_ignored; + uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored); + + // Undo normalization and division of u by 2. + u128_t q0 = {0, q1}; + libdivide_u128_shift(&q0.hi, &q0.lo, n); + libdivide_u128_shift(&q0.hi, &q0.lo, -63); + + // Make q0 correct or too small by 1 + // Equivalent to `if (q0 != 0) q0 = q0 - 1;` + if (q0.hi != 0 || q0.lo != 0) { + q0.hi -= (q0.lo == 0); // borrow + q0.lo -= 1; + } + + // Now q0 is correct. + // Compute q0 * v as q0v + // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo) + // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) + + // (q0.lo * v.hi << 64) + q0.lo * v.lo) + // Each term is 128 bit + // High half of full product (upper 128 bits!) are dropped + u128_t q0v = {0, 0}; + q0v.hi = q0.hi*v.lo + q0.lo*v.hi + libdivide_mullhi_u64(q0.lo, v.lo); + q0v.lo = q0.lo*v.lo; + + // Compute u - q0v as u_q0v + // This is the remainder + u128_t u_q0v = u; + u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow + u_q0v.lo -= q0v.lo; + + // Check if u_q0v >= v + // This checks if our remainder is larger than the divisor + if ((u_q0v.hi > v.hi) || + (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) { + // Increment q0 + q0.lo += 1; + q0.hi += (q0.lo == 0); // carry + + // Subtract v from remainder + u_q0v.hi -= v.hi + (u_q0v.lo < v.lo); + u_q0v.lo -= v.lo; + } + + *r_hi = u_q0v.hi; + *r_lo = u_q0v.lo; + + LIBDIVIDE_ASSERT(q0.hi == 0); + return q0.lo; +#endif +} + +////////// UINT32 + +static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u32_t result; + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. Because of this we also need to add back + // 1 in its recovery algorithm. 
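+        // For instance, d = 8 gives floor_log_2_d = 3: the plain divider
+        // stores shift 3 (n >> 3), while the branchfree divider stores
+        // shift 2, because libdivide_u32_branchfree_do always computes
+        // ((n - q) >> 1) + q first (q is 0 when magic is 0), so the result
+        // is (n >> 1) >> 2 == n >> 3.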
+ result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } else { + uint8_t more; + uint32_t rem, proposed_m; + proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint32_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && (e < (1U << floor_log_2_d))) { + // This power works + more = floor_log_2_d; + } else { + // We have to use the general 33-bit algorithm. We need to compute + // (2**power) / d. However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint32_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases. + } + return result; +} + +struct libdivide_u32_t libdivide_u32_gen(uint32_t d) { + return libdivide_internal_u32_gen(d, 0); +} + +struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1); + struct libdivide_u32_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)}; + return ret; +} + +uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return numer >> more; + } + else { + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint32_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_32_SHIFT_MASK); + } + else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); + uint32_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + + if (!denom->magic) { + return 1U << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(32 + shift) + // Therefore we have d = 2^(32 + shift) / m + // We need to ceil it. + // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint32_t hi_dividend = 1U << shift; + uint32_t rem_ignored; + return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). + // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now + // Also note that shift may be as high as 31, so shift + 1 will + // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and + // then double the quotient and remainder. 
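+        // Worked example, d = 7 (magic m = 613566757, shift = 2): below,
+        // half_n = 2^34 and d = m + 2^32 = 4908534053, so half_q = 3 with
+        // remainder 2454267025; doubling gives full_q = 6 (the doubled
+        // remainder stays below d), and the final +1 yields 7.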
+ uint64_t half_n = 1ULL << (32 + shift); + uint64_t d = (1ULL << 32) | denom->magic; + // Note that the quotient is guaranteed <= 32 bits, but the remainder + // may need 33! + uint32_t half_q = (uint32_t)(half_n / d); + uint64_t rem = half_n % d; + // We computed 2^(32+shift)/(m+2^32) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits + uint32_t full_q = half_q + half_q + ((rem<<1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + + if (!denom->magic) { + return 1U << (shift + 1); + } else { + // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). + // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now + // Also note that shift may be as high as 31, so shift + 1 will + // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and + // then double the quotient and remainder. + uint64_t half_n = 1ULL << (32 + shift); + uint64_t d = (1ULL << 32) | denom->magic; + // Note that the quotient is guaranteed <= 32 bits, but the remainder + // may need 33! + uint32_t half_q = (uint32_t)(half_n / d); + uint64_t rem = half_n % d; + // We computed 2^(32+shift)/(m+2^32) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits + uint32_t full_q = half_q + half_q + ((rem<<1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +/////////// UINT64 + +static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u64_t result; + uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. Because of this we also need to add back + // 1 in its recovery algorithm. + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } else { + uint64_t proposed_m, rem; + uint8_t more; + // (1 << (64 + floor_log_2_d)) / d + proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint64_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && e < (1ULL << floor_log_2_d)) { + // This power works + more = floor_log_2_d; + } else { + // We have to use the general 65-bit algorithm. We need to compute + // (2**power) / d. However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint64_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. 
If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases, + // which is why we do it outside of the if statement. + } + return result; +} + +struct libdivide_u64_t libdivide_u64_gen(uint64_t d) { + return libdivide_internal_u64_gen(d, 0); +} + +struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1); + struct libdivide_u64_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; + return ret; +} + +uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return numer >> more; + } + else { + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint64_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_64_SHIFT_MASK); + } + else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); + uint64_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + + if (!denom->magic) { + return 1ULL << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(64 + shift) + // Therefore we have d = 2^(64 + shift) / m + // We need to ceil it. + // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint64_t hi_dividend = 1ULL << shift; + uint64_t rem_ignored; + return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). + // Notice (m + 2^64) is a 65 bit number. This gets hairy. See + // libdivide_u32_recover for more on what we do here. + // TODO: do something better than 128 bit math + + // Full n is a (potentially) 129 bit value + // half_n is a 128 bit value + // Compute the hi half of half_n. Low half is 0. + uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; + // d is a 65 bit value. The high bit is always set to 1. + const uint64_t d_hi = 1, d_lo = denom->magic; + // Note that the quotient is guaranteed <= 64 bits, + // but the remainder may need 65! + uint64_t r_hi, r_lo; + uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + // We computed 2^(64+shift)/(m+2^64) + // Double the remainder ('dr') and check if that is larger than d + // Note that d is a 65 bit value, so r1 is small and so r1 + r1 + // cannot overflow + uint64_t dr_lo = r_lo + r_lo; + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); + uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); + return full_q + 1; + } +} + +uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + + if (!denom->magic) { + return 1ULL << (shift + 1); + } else { + // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). + // Notice (m + 2^64) is a 65 bit number. 
This gets hairy. See + // libdivide_u32_recover for more on what we do here. + // TODO: do something better than 128 bit math + + // Full n is a (potentially) 129 bit value + // half_n is a 128 bit value + // Compute the hi half of half_n. Low half is 0. + uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; + // d is a 65 bit value. The high bit is always set to 1. + const uint64_t d_hi = 1, d_lo = denom->magic; + // Note that the quotient is guaranteed <= 64 bits, + // but the remainder may need 65! + uint64_t r_hi, r_lo; + uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + // We computed 2^(64+shift)/(m+2^64) + // Double the remainder ('dr') and check if that is larger than d + // Note that d is a 65 bit value, so r1 is small and so r1 + r1 + // cannot overflow + uint64_t dr_lo = r_lo + r_lo; + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); + uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); + return full_q + 1; + } +} + +/////////// SINT32 + +static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s32_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint32_t ud = (uint32_t)d; + uint32_t absD = (d < 0) ? -ud : ud; + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and normal paths are exactly the same + result.magic = 0; + result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + } else { + LIBDIVIDE_ASSERT(floor_log_2_d >= 1); + + uint8_t more; + // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint32_t rem, proposed_m; + proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem); + const uint32_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if works if e < 2**floor_log_2_d. + if (!branchfree && e < (1U << floor_log_2_d)) { + // This power works + more = floor_log_2_d - 1; + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int32_t. + proposed_m += proposed_m; + const uint32_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + + proposed_m += 1; + int32_t magic = (int32_t)proposed_m; + + // Mark if we are negative. Note we only negate the magic number in the + // branchfull case. 
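+        // That is, a negative divisor reuses the magic/shift computed for
+        // |d|: the LIBDIVIDE_NEGATIVE_DIVISOR bit is set in more, and only
+        // the branching variant stores -magic; the branchfree do-routine
+        // instead applies the sign at the very end via (q ^ sign) - sign.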
+ if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s32_t libdivide_s32_gen(int32_t d) { + return libdivide_internal_s32_gen(d, 0); +} + +struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) { + struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1); + struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more}; + return result; +} + +int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + + if (!denom->magic) { + uint32_t sign = (int8_t)more >> 7; + uint32_t mask = (1U << shift) - 1; + uint32_t uq = numer + ((numer >> 31) & mask); + int32_t q = (int32_t)uq; + q >>= shift; + q = (q ^ sign) - sign; + return q; + } else { + uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int32_t sign = (int8_t)more >> 7; + // q += (more < 0 ? -numer : numer) + // cast required to avoid UB + uq += ((uint32_t)numer ^ sign) - sign; + } + int32_t q = (int32_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int32_t sign = (int8_t)more >> 7; + int32_t magic = denom->magic; + int32_t q = libdivide_mullhi_s32(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + uint32_t q_sign = (uint32_t)(q >> 31); + q += q_sign & ((1U << shift) - is_power_of_2); + + // Now arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + if (!denom->magic) { + uint32_t absD = 1U << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int32_t)absD; + } else { + // Unsigned math is much easier + // We negate the magic number only in the branchfull case, and we don't + // know which case we're in. However we have enough information to + // determine the correct sign of the magic number. The divisor was + // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, + // the magic number's sign is opposite that of the divisor. + // We want to compute the positive magic number. + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) + ? denom->magic > 0 : denom->magic < 0; + + // Handle the power of 2 case (including branchfree) + if (denom->magic == 0) { + int32_t result = 1U << shift; + return negative_divisor ? -result : result; + } + + uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic); + uint64_t n = 1ULL << (32 + shift); // this shift cannot exceed 30 + uint32_t q = (uint32_t)(n / d); + int32_t result = (int32_t)q; + result += 1; + return negative_divisor ? 
-result : result; + } +} + +int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) { + return libdivide_s32_recover((const struct libdivide_s32_t *)denom); +} + +///////////// SINT64 + +static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s64_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint64_t ud = (uint64_t)d; + uint64_t absD = (d < 0) ? -ud : ud; + uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and non-branchfree cases are the same + result.magic = 0; + result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + } else { + // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint8_t more; + uint64_t rem, proposed_m; + proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem); + const uint64_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if works if e < 2**floor_log_2_d. + if (!branchfree && e < (1ULL << floor_log_2_d)) { + // This power works + more = floor_log_2_d - 1; + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int32_t. + proposed_m += proposed_m; + const uint64_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we + // also set ADD_MARKER this is an annoying optimization that + // enables algorithm #4 to avoid the mask. 
However we always set it + // in the branchfree case + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + proposed_m += 1; + int64_t magic = (int64_t)proposed_m; + + // Mark if we are negative + if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s64_t libdivide_s64_gen(int64_t d) { + return libdivide_internal_s64_gen(d, 0); +} + +struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) { + struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1); + struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more}; + return ret; +} + +int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + + if (!denom->magic) { // shift path + uint64_t mask = (1ULL << shift) - 1; + uint64_t uq = numer + ((numer >> 63) & mask); + int64_t q = (int64_t)uq; + q >>= shift; + // must be arithmetic shift and then sign-extend + int64_t sign = (int8_t)more >> 7; + q = (q ^ sign) - sign; + return q; + } else { + uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int64_t sign = (int8_t)more >> 7; + // q += (more < 0 ? -numer : numer) + // cast required to avoid UB + uq += ((uint64_t)numer ^ sign) - sign; + } + int64_t q = (int64_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int64_t sign = (int8_t)more >> 7; + int64_t magic = denom->magic; + int64_t q = libdivide_mullhi_s64(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2. + uint64_t is_power_of_2 = (magic == 0); + uint64_t q_sign = (uint64_t)(q >> 63); + q += q_sign & ((1ULL << shift) - is_power_of_2); + + // Arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + if (denom->magic == 0) { // shift path + uint64_t absD = 1ULL << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int64_t)absD; + } else { + // Unsigned math is much easier + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) + ? denom->magic > 0 : denom->magic < 0; + + uint64_t d = (uint64_t)(magic_was_negated ? 
-denom->magic : denom->magic); + uint64_t n_hi = 1ULL << shift, n_lo = 0; + uint64_t rem_ignored; + uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored); + int64_t result = (int64_t)(q + 1); + if (negative_divisor) { + result = -result; + } + return result; + } +} + +int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) { + return libdivide_s64_recover((const struct libdivide_s64_t *)denom); +} + +#if defined(LIBDIVIDE_AVX512) + +static inline __m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom); +static inline __m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom); +static inline __m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom); +static inline __m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom); + +static inline __m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom); +static inline __m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom); +static inline __m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom); +static inline __m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +static inline __m512i libdivide_s64_signbits(__m512i v) {; + return _mm512_srai_epi64(v, 63); +} + +static inline __m512i libdivide_s64_shift_right_vector(__m512i v, int amt) { + return _mm512_srai_epi64(v, amt); +} + +// Here, b is assumed to contain one 32-bit value repeated. +static inline __m512i libdivide_mullhi_u32_vector(__m512i a, __m512i b) { + __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32); + __m512i a1X3X = _mm512_srli_epi64(a, 32); + __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); + __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask); + return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// b is one 32-bit value repeated. +static inline __m512i libdivide_mullhi_s32_vector(__m512i a, __m512i b) { + __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32); + __m512i a1X3X = _mm512_srli_epi64(a, 32); + __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); + __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask); + return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// Here, y is assumed to contain one 64-bit value repeated. +// https://stackoverflow.com/a/28827013 +static inline __m512i libdivide_mullhi_u64_vector(__m512i x, __m512i y) { + __m512i lomask = _mm512_set1_epi64(0xffffffff); + __m512i xh = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM) 0xB1); + __m512i yh = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM) 0xB1); + __m512i w0 = _mm512_mul_epu32(x, y); + __m512i w1 = _mm512_mul_epu32(x, yh); + __m512i w2 = _mm512_mul_epu32(xh, y); + __m512i w3 = _mm512_mul_epu32(xh, yh); + __m512i w0h = _mm512_srli_epi64(w0, 32); + __m512i s1 = _mm512_add_epi64(w1, w0h); + __m512i s1l = _mm512_and_si512(s1, lomask); + __m512i s1h = _mm512_srli_epi64(s1, 32); + __m512i s2 = _mm512_add_epi64(w2, s1l); + __m512i s2h = _mm512_srli_epi64(s2, 32); + __m512i hi = _mm512_add_epi64(w3, s1h); + hi = _mm512_add_epi64(hi, s2h); + + return hi; +} + +// y is one 64-bit value repeated. 
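+// The signed high half is recovered from the unsigned product via the
+// identity mulhi_s(x, y) = mulhi_u(x, y) - (x < 0 ? y : 0) - (y < 0 ? x : 0);
+// the two correction terms below are built by AND-ing each operand with the
+// other operand's sign mask.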
+static inline __m512i libdivide_mullhi_s64_vector(__m512i x, __m512i y) { + __m512i p = libdivide_mullhi_u64_vector(x, y); + __m512i t1 = _mm512_and_si512(libdivide_s64_signbits(x), y); + __m512i t2 = _mm512_and_si512(libdivide_s64_signbits(y), x); + p = _mm512_sub_epi64(p, t1); + p = _mm512_sub_epi64(p, t2); + return p; +} + +////////// UINT32 + +__m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm512_srli_epi32(numers, more); + } + else { + __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); + return _mm512_srli_epi32(t, shift); + } + else { + return _mm512_srli_epi32(q, more); + } + } +} + +__m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic)); + __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); + return _mm512_srli_epi32(t, denom->more); +} + +////////// UINT64 + +__m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm512_srli_epi64(numers, more); + } + else { + __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); + return _mm512_srli_epi64(t, shift); + } + else { + return _mm512_srli_epi64(q, more); + } + } +} + +__m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic)); + __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); + return _mm512_srli_epi64(t, denom->more); +} + +////////// SINT32 + +__m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + __m512i roundToZeroTweak = _mm512_set1_epi32(mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + __m512i q = _mm512_add_epi32(numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak)); + q = _mm512_srai_epi32(q, shift); + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); + return q; + } + else { + __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign)); + } + // q >>= shift + q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom) { 
+ int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(magic)); + q = _mm512_add_epi32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 + __m512i mask = _mm512_set1_epi32((1U << shift) - is_power_of_2); + q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm512_srai_epi32(q, shift); // q >>= shift + q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +__m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + __m512i roundToZeroTweak = _mm512_set1_epi64(mask); + // q = numer + ((numer >> 63) & roundToZeroTweak); + __m512i q = _mm512_add_epi64(numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vector(q, shift); + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); + return q; + } + else { + __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0) + return q; + } +} + +__m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + + // libdivide_mullhi_s64(numers, magic); + __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic)); + q = _mm512_add_epi64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. 
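+    // [Reviewer note] Worked power-of-two example: for d = 8 (magic == 0,
+    // shift == 3), q at this point is just the numerator, so q = -20 becomes
+    // (-20 + 7) >> 3 == -13 >> 3 == -2, the truncated quotient of -20 / 8,
+    // whereas a bare arithmetic shift would give floor(-20 / 8) == -3.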
+ uint32_t is_power_of_2 = (magic == 0); + __m512i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 + __m512i mask = _mm512_set1_epi64((1ULL << shift) - is_power_of_2); + q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift + q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#elif defined(LIBDIVIDE_AVX2) + +static inline __m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom); +static inline __m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom); +static inline __m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom); +static inline __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom); + +static inline __m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom); +static inline __m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom); +static inline __m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom); +static inline __m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +// Implementation of _mm256_srai_epi64(v, 63) (from AVX512). +static inline __m256i libdivide_s64_signbits(__m256i v) { + __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); + __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31); + return signBits; +} + +// Implementation of _mm256_srai_epi64 (from AVX512). +static inline __m256i libdivide_s64_shift_right_vector(__m256i v, int amt) { + const int b = 64 - amt; + __m256i m = _mm256_set1_epi64x(1ULL << (b - 1)); + __m256i x = _mm256_srli_epi64(v, amt); + __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m); + return result; +} + +// Here, b is assumed to contain one 32-bit value repeated. +static inline __m256i libdivide_mullhi_u32_vector(__m256i a, __m256i b) { + __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32); + __m256i a1X3X = _mm256_srli_epi64(a, 32); + __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); + __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask); + return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// b is one 32-bit value repeated. +static inline __m256i libdivide_mullhi_s32_vector(__m256i a, __m256i b) { + __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32); + __m256i a1X3X = _mm256_srli_epi64(a, 32); + __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); + __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask); + return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// Here, y is assumed to contain one 64-bit value repeated. 
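+// [Reviewer note] AVX2 has no 64x64-bit multiply, so the function below
+// assembles the high 64 bits from four 32x32->64 partial products: writing
+// x = xh*2^32 + xl and y = yh*2^32 + yl gives
+//     x*y = xh*yh*2^64 + (xh*yl + xl*yh)*2^32 + xl*yl,
+// and the returned high word is xh*yh plus the carries that the middle and
+// low terms propagate upward (w3 + s1h + s2h in the code).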
+// https://stackoverflow.com/a/28827013 +static inline __m256i libdivide_mullhi_u64_vector(__m256i x, __m256i y) { + __m256i lomask = _mm256_set1_epi64x(0xffffffff); + __m256i xh = _mm256_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h + __m256i yh = _mm256_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h + __m256i w0 = _mm256_mul_epu32(x, y); // x0l*y0l, x1l*y1l + __m256i w1 = _mm256_mul_epu32(x, yh); // x0l*y0h, x1l*y1h + __m256i w2 = _mm256_mul_epu32(xh, y); // x0h*y0l, x1h*y0l + __m256i w3 = _mm256_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h + __m256i w0h = _mm256_srli_epi64(w0, 32); + __m256i s1 = _mm256_add_epi64(w1, w0h); + __m256i s1l = _mm256_and_si256(s1, lomask); + __m256i s1h = _mm256_srli_epi64(s1, 32); + __m256i s2 = _mm256_add_epi64(w2, s1l); + __m256i s2h = _mm256_srli_epi64(s2, 32); + __m256i hi = _mm256_add_epi64(w3, s1h); + hi = _mm256_add_epi64(hi, s2h); + + return hi; +} + +// y is one 64-bit value repeated. +static inline __m256i libdivide_mullhi_s64_vector(__m256i x, __m256i y) { + __m256i p = libdivide_mullhi_u64_vector(x, y); + __m256i t1 = _mm256_and_si256(libdivide_s64_signbits(x), y); + __m256i t2 = _mm256_and_si256(libdivide_s64_signbits(y), x); + p = _mm256_sub_epi64(p, t1); + p = _mm256_sub_epi64(p, t2); + return p; +} + +////////// UINT32 + +__m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm256_srli_epi32(numers, more); + } + else { + __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); + return _mm256_srli_epi32(t, shift); + } + else { + return _mm256_srli_epi32(q, more); + } + } +} + +__m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic)); + __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); + return _mm256_srli_epi32(t, denom->more); +} + +////////// UINT64 + +__m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm256_srli_epi64(numers, more); + } + else { + __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); + return _mm256_srli_epi64(t, shift); + } + else { + return _mm256_srli_epi64(q, more); + } + } +} + +__m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic)); + __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); + return _mm256_srli_epi64(t, denom->more); +} + +////////// SINT32 + +__m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + __m256i roundToZeroTweak = _mm256_set1_epi32(mask); + // q = numer + ((numer >> 
31) & roundToZeroTweak); + __m256i q = _mm256_add_epi32(numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); + q = _mm256_srai_epi32(q, shift); + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); + return q; + } + else { + __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign)); + } + // q >>= shift + q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(magic)); + q = _mm256_add_epi32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 + __m256i mask = _mm256_set1_epi32((1U << shift) - is_power_of_2); + q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm256_srai_epi32(q, shift); // q >>= shift + q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +__m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + __m256i roundToZeroTweak = _mm256_set1_epi64x(mask); + // q = numer + ((numer >> 63) & roundToZeroTweak); + __m256i q = _mm256_add_epi64(numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vector(q, shift); + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); + return q; + } + else { + __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) + return q; + } +} + +__m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + + // libdivide_mullhi_s64(numers, magic); + __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic)); + q = _mm256_add_epi64(q, 
numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. + uint32_t is_power_of_2 = (magic == 0); + __m256i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 + __m256i mask = _mm256_set1_epi64x((1ULL << shift) - is_power_of_2); + q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift + q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#elif defined(LIBDIVIDE_SSE2) + +static inline __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom); +static inline __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom); +static inline __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom); +static inline __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom); + +static inline __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom); +static inline __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom); +static inline __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom); +static inline __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +// Implementation of _mm_srai_epi64(v, 63) (from AVX512). +static inline __m128i libdivide_s64_signbits(__m128i v) { + __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); + return signBits; +} + +// Implementation of _mm_srai_epi64 (from AVX512). +static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { + const int b = 64 - amt; + __m128i m = _mm_set1_epi64x(1ULL << (b - 1)); + __m128i x = _mm_srli_epi64(v, amt); + __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); + return result; +} + +// Here, b is assumed to contain one 32-bit value repeated. +static inline __m128i libdivide_mullhi_u32_vector(__m128i a, __m128i b) { + __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); + __m128i a1X3X = _mm_srli_epi64(a, 32); + __m128i mask = _mm_set_epi32(-1, 0, -1, 0); + __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask); + return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// SSE2 does not have a signed multiplication instruction, but we can convert +// unsigned to signed pretty efficiently. Again, b is just a 32 bit value +// repeated four times. +static inline __m128i libdivide_mullhi_s32_vector(__m128i a, __m128i b) { + __m128i p = libdivide_mullhi_u32_vector(a, b); + // t1 = (a >> 31) & y, arithmetic shift + __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); + __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); + p = _mm_sub_epi32(p, t1); + p = _mm_sub_epi32(p, t2); + return p; +} + +// Here, y is assumed to contain one 64-bit value repeated. 
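+// [Reviewer note] Same partial-product decomposition as the AVX2 path above;
+// the signed variant further below then corrects the unsigned high product
+// using the identity
+//     mulhi_s(x, y) = mulhi_u(x, y) - (x < 0 ? y : 0) - (y < 0 ? x : 0),
+// which is exactly what the libdivide_s64_signbits() masks implement.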
+// https://stackoverflow.com/a/28827013 +static inline __m128i libdivide_mullhi_u64_vector(__m128i x, __m128i y) { + __m128i lomask = _mm_set1_epi64x(0xffffffff); + __m128i xh = _mm_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h + __m128i yh = _mm_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h + __m128i w0 = _mm_mul_epu32(x, y); // x0l*y0l, x1l*y1l + __m128i w1 = _mm_mul_epu32(x, yh); // x0l*y0h, x1l*y1h + __m128i w2 = _mm_mul_epu32(xh, y); // x0h*y0l, x1h*y0l + __m128i w3 = _mm_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h + __m128i w0h = _mm_srli_epi64(w0, 32); + __m128i s1 = _mm_add_epi64(w1, w0h); + __m128i s1l = _mm_and_si128(s1, lomask); + __m128i s1h = _mm_srli_epi64(s1, 32); + __m128i s2 = _mm_add_epi64(w2, s1l); + __m128i s2h = _mm_srli_epi64(s2, 32); + __m128i hi = _mm_add_epi64(w3, s1h); + hi = _mm_add_epi64(hi, s2h); + + return hi; +} + +// y is one 64-bit value repeated. +static inline __m128i libdivide_mullhi_s64_vector(__m128i x, __m128i y) { + __m128i p = libdivide_mullhi_u64_vector(x, y); + __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); + __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); + p = _mm_sub_epi64(p, t1); + p = _mm_sub_epi64(p, t2); + return p; +} + +////////// UINT32 + +__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm_srli_epi32(numers, more); + } + else { + __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srli_epi32(t, shift); + } + else { + return _mm_srli_epi32(q, more); + } + } +} + +__m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic)); + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srli_epi32(t, denom->more); +} + +////////// UINT64 + +__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm_srli_epi64(numers, more); + } + else { + __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srli_epi64(t, shift); + } + else { + return _mm_srli_epi64(q, more); + } + } +} + +__m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic)); + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srli_epi64(t, denom->more); +} + +////////// SINT32 + +__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi32(mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + q = 
_mm_srai_epi32(q, shift); + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); + return q; + } + else { + __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); + } + // q >>= shift + q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(magic)); + q = _mm_add_epi32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 + __m128i mask = _mm_set1_epi32((1U << shift) - is_power_of_2); + q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm_srai_epi32(q, shift); // q >>= shift + q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +__m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi64x(mask); + // q = numer + ((numer >> 63) & roundToZeroTweak); + __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vector(q, shift); + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); + return q; + } + else { + __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + + // libdivide_mullhi_s64(numers, magic); + __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic)); + q = _mm_add_epi64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. 
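+    // [Reviewer note] magic == 0 is how gen() encodes a power-of-two
+    // divisor: no multiplier is needed and "more" then carries only the
+    // shift and the sign bit, which is why is_power_of_2 below can be
+    // recovered from magic alone.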
+ uint32_t is_power_of_2 = (magic == 0); + __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 + __m128i mask = _mm_set1_epi64x((1ULL << shift) - is_power_of_2); + q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift + q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +/////////// C++ stuff + +#ifdef __cplusplus + +// The C++ divider class is templated on both an integer type +// (like uint64_t) and an algorithm type. +// * BRANCHFULL is the default algorithm type. +// * BRANCHFREE is the branchfree algorithm type. +enum { + BRANCHFULL, + BRANCHFREE +}; + +#if defined(LIBDIVIDE_AVX512) + #define LIBDIVIDE_VECTOR_TYPE __m512i +#elif defined(LIBDIVIDE_AVX2) + #define LIBDIVIDE_VECTOR_TYPE __m256i +#elif defined(LIBDIVIDE_SSE2) + #define LIBDIVIDE_VECTOR_TYPE __m128i +#endif + +#if !defined(LIBDIVIDE_VECTOR_TYPE) + #define LIBDIVIDE_DIVIDE_VECTOR(ALGO) +#else + #define LIBDIVIDE_DIVIDE_VECTOR(ALGO) \ + LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const { \ + return libdivide_##ALGO##_do_vector(n, &denom); \ + } +#endif + +// The DISPATCHER_GEN() macro generates C++ methods (for the given integer +// and algorithm types) that redirect to libdivide's C API. +#define DISPATCHER_GEN(T, ALGO) \ + libdivide_##ALGO##_t denom; \ + dispatcher() { } \ + dispatcher(T d) \ + : denom(libdivide_##ALGO##_gen(d)) \ + { } \ + T divide(T n) const { \ + return libdivide_##ALGO##_do(n, &denom); \ + } \ + LIBDIVIDE_DIVIDE_VECTOR(ALGO) \ + T recover() const { \ + return libdivide_##ALGO##_recover(&denom); \ + } + +// The dispatcher selects a specific division algorithm for a given +// type and ALGO using partial template specialization. +template struct dispatcher { }; + +template<> struct dispatcher { DISPATCHER_GEN(int32_t, s32) }; +template<> struct dispatcher { DISPATCHER_GEN(int32_t, s32_branchfree) }; +template<> struct dispatcher { DISPATCHER_GEN(uint32_t, u32) }; +template<> struct dispatcher { DISPATCHER_GEN(uint32_t, u32_branchfree) }; +template<> struct dispatcher { DISPATCHER_GEN(int64_t, s64) }; +template<> struct dispatcher { DISPATCHER_GEN(int64_t, s64_branchfree) }; +template<> struct dispatcher { DISPATCHER_GEN(uint64_t, u64) }; +template<> struct dispatcher { DISPATCHER_GEN(uint64_t, u64_branchfree) }; + +// This is the main divider class for use by the user (C++ API). +// The actual division algorithm is selected using the dispatcher struct +// based on the integer and algorithm template parameters. +template +class divider { +public: + // We leave the default constructor empty so that creating + // an array of dividers and then initializing them + // later doesn't slow us down. + divider() { } + + // Constructor that takes the divisor as a parameter + divider(T d) : div(d) { } + + // Divides n by the divisor + T divide(T n) const { + return div.divide(n); + } + + // Recovers the divisor, returns the value that was + // used to initialize this divider object. + T recover() const { + return div.recover(); + } + + bool operator==(const divider& other) const { + return div.denom.magic == other.denom.magic && + div.denom.more == other.denom.more; + } + + bool operator!=(const divider& other) const { + return !(*this == other); + } + +#if defined(LIBDIVIDE_VECTOR_TYPE) + // Treats the vector as packed integer values with the same type as + // the divider (e.g. 
s32, u32, s64, u64) and divides each of + // them by the divider, returning the packed quotients. + LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const { + return div.divide(n); + } +#endif + +private: + // Storage for the actual divisor + dispatcher::value, + std::is_signed::value, sizeof(T), ALGO> div; +}; + +// Overload of operator / for scalar division +template +T operator/(T n, const divider& div) { + return div.divide(n); +} + +// Overload of operator /= for scalar division +template +T& operator/=(T& n, const divider& div) { + n = div.divide(n); + return n; +} + +#if defined(LIBDIVIDE_VECTOR_TYPE) + // Overload of operator / for vector division + template + LIBDIVIDE_VECTOR_TYPE operator/(LIBDIVIDE_VECTOR_TYPE n, const divider& div) { + return div.divide(n); + } + // Overload of operator /= for vector division + template + LIBDIVIDE_VECTOR_TYPE& operator/=(LIBDIVIDE_VECTOR_TYPE& n, const divider& div) { + n = div.divide(n); + return n; + } +#endif + +// libdivdie::branchfree_divider +template +using branchfree_divider = divider; + +} // namespace libdivide + +#endif // __cplusplus + +#endif // LIBDIVIDE_H diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 3a7543b99421..114cfed8b553 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -19,7 +19,12 @@ #include "ufunc_object.h" #include /* for memchr */ -#include + +/* Use Libdivide for faster division */ +/* TODO Explore placing specialised versions in `numpy/core/src/common/simd` */ +#ifdef USE_LIBDIVIDE +#include "numpy/libdivide.h" +#endif /* * cutoff blocksize for pairwise summation From b02399ac1c0838a84c6d966ef2c34cd60c82c30c Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Sun, 8 Nov 2020 18:08:32 +0530 Subject: [PATCH 05/32] ENH: Made libdivide default --- numpy/core/setup.py | 14 +++----- numpy/core/src/umath/loops.c.src | 58 ++------------------------------ 2 files changed, 6 insertions(+), 66 deletions(-) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index ca108863c355..448499926f57 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -386,11 +386,8 @@ def check_mathlib(config_cmd): "MATHLIB env variable") return mathlibs -def check_libdivide(): - return os.environ.get('NPY_USE_LIBDIVIDE') is not None - -def check_optimal_divisor(): - return os.environ.get('NPY_USE_OPTIMAL_DIVISOR') is not None +def check_use_legacy_division(): + return os.environ.get('NPY_USE_LEGACY_DIVISION') is not None def visibility_define(config): """Return the define value to use for NPY_VISIBILITY_HIDDEN (may be empty @@ -448,11 +445,8 @@ def generate_config_h(ext, build_dir): mathlibs = check_mathlib(config_cmd) moredefs.append(('MATHLIB', ','.join(mathlibs))) - # Check if libdivide needs to be used - check_libdivide() and moredefs.append('USE_LIBDIVIDE') - - # Check if optimal divisor code needs to be used - check_optimal_divisor() and moredefs.append('USE_OPTIMAL_DIVISOR') + # Check if legacy division needs to be used + check_use_legacy_division() and moredefs.append('USE_LEGACY_DIVISION') check_math_capabilities(config_cmd, ext, moredefs, mathlibs) moredefs.extend(cocache.check_ieee_macros(config_cmd)[0]) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 114cfed8b553..0a493affbf96 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -22,7 +22,7 @@ /* Use Libdivide for faster division */ /* TODO Explore placing specialised versions in `numpy/core/src/common/simd` */ -#ifdef 
USE_LIBDIVIDE +#ifndef USE_LEGACY_DIVISION #include "numpy/libdivide.h" #endif @@ -847,7 +847,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0)); } -#ifdef USE_LIBDIVIDE +#ifndef USE_LEGACY_DIVISION NPY_NO_EXPORT void @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -901,60 +901,6 @@ NPY_NO_EXPORT void } } } -#elif defined(USE_OPTIMAL_DIVISOR) -NPY_NO_EXPORT void -@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP_BASE - - if(!is2) { - const @type@ in2 = *(@type@ *)ip2; - const float in2_f = (float) in2; - BINARY_LOOP_FIXED { - const @type@ in1 = *(@type@ *)ip1; - /* - * FIXME: On x86 at least, dividing the smallest representable integer - * by -1 causes a SIFGPE (division overflow). We treat this case here - * (to avoid a SIGFPE crash at python level), but a good solution would - * be to treat integer division problems separately from FPU exceptions - * (i.e. a different approach than npy_set_floatstatus_divbyzero()). - */ - if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } - else if ((in1 > 0) != (in2 > 0)) { - *((@type@ *)op1) = floor(in1/in2_f); - } - else { - *((@type@ *)op1) = in1/in2; - } - } - } - else { - BINARY_LOOP_SLIDING { // XXX Lot of repeated code - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* - * FIXME: On x86 at least, dividing the smallest representable integer - * by -1 causes a SIFGPE (division overflow). We treat this case here - * (to avoid a SIGFPE crash at python level), but a good solution would - * be to treat integer division problems separately from FPU exceptions - * (i.e. a different approach than npy_set_floatstatus_divbyzero()). - */ - if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } - else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) { - *((@type@ *)op1) = in1/in2 - 1; - } - else { - *((@type@ *)op1) = in1/in2; - } - } - } -} #else NPY_NO_EXPORT void @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) From f0ddb7c6839fc2799cce891cb7ef71c2b0dfb097 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Sun, 8 Nov 2020 18:23:19 +0530 Subject: [PATCH 06/32] ENH: Handled divide by 0 case --- numpy/core/src/umath/loops.c.src | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 0a493affbf96..43f839931482 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -851,11 +851,12 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void NPY_NO_EXPORT void @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { + static const struct libdivide_@div@_t EmptyStruct; BINARY_LOOP_BASE if(!is2) { const @type@ in2 = *(@type@ *)ip2; - struct libdivide_@div@_t fast_d = libdivide_@div@_gen(in2); + struct libdivide_@div@_t fast_d = in2 ? 
libdivide_@div@_gen(in2) : EmptyStruct; BINARY_LOOP_FIXED { const @type@ in1 = *(@type@ *)ip1; /* From 72dcc042ee572f5522b2731807a15c1fefac1315 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Mon, 9 Nov 2020 18:46:13 +0530 Subject: [PATCH 07/32] ENH: Added libdivide zlib license --- LICENSES_bundled.txt | 5 +++++ .../core/include/numpy/libdivide/LICENSE.txt | 21 +++++++++++++++++++ .../include/numpy/{ => libdivide}/libdivide.h | 0 numpy/core/src/umath/loops.c.src | 2 +- 4 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 numpy/core/include/numpy/libdivide/LICENSE.txt rename numpy/core/include/numpy/{ => libdivide}/libdivide.h (100%) diff --git a/LICENSES_bundled.txt b/LICENSES_bundled.txt index 00b7473777ca..80557d3e6ee7 100644 --- a/LICENSES_bundled.txt +++ b/LICENSES_bundled.txt @@ -15,3 +15,8 @@ Name: dragon4 Files: numpy/core/src/multiarray/dragon4.c License: MIT For license text, see numpy/core/src/multiarray/dragon4.c + +Name: libdivide +Files: numpy/core/include/numpy/libdivide/* +License: zlib + For license text, see numpy/core/include/numpy/libdivide/LICENSE.txt diff --git a/numpy/core/include/numpy/libdivide/LICENSE.txt b/numpy/core/include/numpy/libdivide/LICENSE.txt new file mode 100644 index 000000000000..d72a7c388d40 --- /dev/null +++ b/numpy/core/include/numpy/libdivide/LICENSE.txt @@ -0,0 +1,21 @@ + zlib License + ------------ + + Copyright (C) 2010 - 2019 ridiculous_fish, + Copyright (C) 2016 - 2019 Kim Walisch, + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
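[Reviewer note] For readers following the series: the guard that the previous
commit keeps refining exists because passing 0 to libdivide's gen() functions
is undefined behaviour, so a divisor struct may only be built after the zero
check. Below is a minimal scalar sketch of the gen/do pattern the ufunc loops
rely on; the standalone harness, the divisor value, and the include path are
illustrative assumptions, not part of this patch series:

    /* build sketch: cc demo.c -I numpy/core/include (hypothetical layout) */
    #include <stdio.h>
    #include "numpy/libdivide/libdivide.h"

    int main(void)
    {
        const int32_t d = 7;    /* runtime-constant, non-zero divisor */
        /* Precompute the magic number and shift once per divisor... */
        struct libdivide_s32_t fast_d = libdivide_s32_gen(d);
        for (int32_t n = -9; n <= 9; n += 3) {
            /* ...then each division is a multiply plus shifts. */
            printf("%d / %d = %d\n", (int)n, (int)d,
                   (int)libdivide_s32_do(n, &fast_d));
        }
        return 0;
    }

Like C's / operator, libdivide_s32_do() truncates toward zero, which is why
the loops in this series post-adjust negative quotients to get floor-division
semantics.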
diff --git a/numpy/core/include/numpy/libdivide.h b/numpy/core/include/numpy/libdivide/libdivide.h similarity index 100% rename from numpy/core/include/numpy/libdivide.h rename to numpy/core/include/numpy/libdivide/libdivide.h diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 43f839931482..ae99d5bf3daa 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -23,7 +23,7 @@ /* Use Libdivide for faster division */ /* TODO Explore placing specialised versions in `numpy/core/src/common/simd` */ #ifndef USE_LEGACY_DIVISION -#include "numpy/libdivide.h" +#include "numpy/libdivide/libdivide.h" #endif /* From 19835d291fd67b57976a818b08a20e9d9734c787 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Tue, 10 Nov 2020 23:25:45 +0530 Subject: [PATCH 08/32] ENH: Removed empty structure --- numpy/core/src/umath/loops.c.src | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index ae99d5bf3daa..c82626385ce9 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -851,12 +851,13 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void NPY_NO_EXPORT void @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - static const struct libdivide_@div@_t EmptyStruct; BINARY_LOOP_BASE if(!is2) { const @type@ in2 = *(@type@ *)ip2; - struct libdivide_@div@_t fast_d = in2 ? libdivide_@div@_gen(in2) : EmptyStruct; + + /* Creating a divisor of 0 is treated as an error by libdivide */ + struct libdivide_@div@_t fast_d = in2 ? libdivide_@div@_gen(in2) : (struct libdivide_@div@_t){0}; BINARY_LOOP_FIXED { const @type@ in1 = *(@type@ *)ip1; /* From 3975a28d8b3efa385c58a0196f55d7d377e21a77 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Wed, 11 Nov 2020 11:08:41 +0530 Subject: [PATCH 09/32] ENH: Auto generate libdivide structs --- numpy/core/src/umath/loops.c.src | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index c82626385ce9..061dbb230f55 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -832,7 +832,6 @@ NPY_NO_EXPORT void * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# * #c = ,,,l,ll# - * #div = s32, s32, s32, s64, s64# */ NPY_NO_EXPORT NPY_GCC_OPT_3 void @@ -847,6 +846,19 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0)); } +/* Using nested loops, few more fields to be added in the future */ +/**begin repeat1 + * #kind = t, gen, do# + */ +/* Libdivde only supports 32 and 64 bit types + * We try to pick the best possible one */ +#if NPY_BITSOF_@TYPE@ <= 32 +#define libdivide_@type@_@kind@ libdivide_s32_@kind@ +#else +#define libdivide_@type@_@kind@ libdivide_s64_@kind@ +#endif +/**end repeat1**/ + #ifndef USE_LEGACY_DIVISION NPY_NO_EXPORT void @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) @@ -857,7 +869,7 @@ NPY_NO_EXPORT void const @type@ in2 = *(@type@ *)ip2; /* Creating a divisor of 0 is treated as an error by libdivide */ - struct libdivide_@div@_t fast_d = in2 ? libdivide_@div@_gen(in2) : (struct libdivide_@div@_t){0}; + struct libdivide_@type@_t fast_d = in2 ? 
libdivide_@type@_gen(in2) : (struct libdivide_@type@_t){0}; BINARY_LOOP_FIXED { const @type@ in1 = *(@type@ *)ip1; /* @@ -872,10 +884,10 @@ NPY_NO_EXPORT void *((@type@ *)op1) = 0; } else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) { - *((@type@ *)op1) = libdivide_@div@_do(in1, &fast_d) - 1; + *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d) - 1; } else { - *((@type@ *)op1) = libdivide_@div@_do(in1, &fast_d); + *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d); } } } From 90e6cf529e27543a974b53c0aa912329c6374f21 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Wed, 11 Nov 2020 11:45:08 +0530 Subject: [PATCH 10/32] ENH: Logic to optimize % --- numpy/core/src/umath/loops.c.src | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 061dbb230f55..a58ee5a788b8 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -883,11 +883,12 @@ NPY_NO_EXPORT void npy_set_floatstatus_divbyzero(); *((@type@ *)op1) = 0; } - else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) { - *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d) - 1; - } else { *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d); + + if((*((@type@ *)op1) <= 0) && (*((@type@ *)op1) * in2 != in1)) { + *((@type@ *)op1) = *((@type@ *)op1) - 1; + } } } } From 969aa039d962818459a434ffdd3976865c87afe1 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Wed, 11 Nov 2020 12:10:12 +0530 Subject: [PATCH 11/32] ENH: Fix breaking case --- numpy/core/src/umath/loops.c.src | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index a58ee5a788b8..45a9ccef4c8b 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -886,7 +886,7 @@ NPY_NO_EXPORT void else { *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d); - if((*((@type@ *)op1) <= 0) && (*((@type@ *)op1) * in2 != in1)) { + if(((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) { *((@type@ *)op1) = *((@type@ *)op1) - 1; } } From 44a3a31d23fe9429da4cff067685b8adbbe0106a Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Wed, 11 Nov 2020 16:19:06 +0530 Subject: [PATCH 12/32] ENH: Change comments Co-authored-by: Eric Wieser --- numpy/core/src/umath/loops.c.src | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 45a9ccef4c8b..c99a6d7f3836 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -846,12 +846,11 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? 
-1 : 0)); } -/* Using nested loops, few more fields to be added in the future */ +/* Libdivide only supports 32 and 64 bit types + * We try to pick the best possible one */ /**begin repeat1 * #kind = t, gen, do# */ -/* Libdivde only supports 32 and 64 bit types - * We try to pick the best possible one */ #if NPY_BITSOF_@TYPE@ <= 32 #define libdivide_@type@_@kind@ libdivide_s32_@kind@ #else From b3d70efeef31b953fa977a61311035c9e02e9ad2 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Wed, 11 Nov 2020 19:06:07 +0530 Subject: [PATCH 13/32] ENH: Improved floor division (#17727) --- doc/release/upcoming_changes/17727.improvement.rst | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100755 doc/release/upcoming_changes/17727.improvement.rst diff --git a/doc/release/upcoming_changes/17727.improvement.rst b/doc/release/upcoming_changes/17727.improvement.rst new file mode 100755 index 000000000000..83054a3ea1da --- /dev/null +++ b/doc/release/upcoming_changes/17727.improvement.rst @@ -0,0 +1,8 @@ +Improved performance in integer division of NumPy arrays +-------------------------------------------------------- +Integer division of NumPy arrays now uses libdivide. +With builtin support for SSE2, AVX2 and AVX512 vector +division from libdivide and other minor improvements, +there is a large speedup. +The ``//`` operator and ``np.floor_divide`` makes use +of the new changes. From 931134bfa428e0aa50fff8583fa526c2da1bbc53 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Wed, 11 Nov 2020 22:15:08 +0530 Subject: [PATCH 14/32] ENH: Added asv benchmarks --- benchmarks/benchmarks/bench_ufunc.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py index 9f45a72575ff..c388da5b5adc 100644 --- a/benchmarks/benchmarks/bench_ufunc.py +++ b/benchmarks/benchmarks/bench_ufunc.py @@ -134,6 +134,17 @@ def time_less_than_scalar2(self, dtype): (self.d < 1) +class CustomScalarInt(Benchmark): + params = [10**size for size in range(1, 8)] + param_names = ['size'] + + def setup(self, size): + self.x = np.arange(size) + + def time_floor_divide(self, size): + self.x//8 + + class Scalar(Benchmark): def setup(self): self.x = np.asarray(1.0) From 6e2e281a270652cee0028e4e1e98a1c19b57b11b Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Thu, 12 Nov 2020 08:59:37 +0530 Subject: [PATCH 15/32] ENH: Change comments Co-authored-by: Sebastian Berg --- numpy/core/src/umath/loops.c.src | 1 - 1 file changed, 1 deletion(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index c99a6d7f3836..fe60993a762b 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -21,7 +21,6 @@ #include /* for memchr */ /* Use Libdivide for faster division */ -/* TODO Explore placing specialised versions in `numpy/core/src/common/simd` */ #ifndef USE_LEGACY_DIVISION #include "numpy/libdivide/libdivide.h" #endif From 90a84af7ae1edd91cc5a45069ad6a824e436d3cd Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Thu, 12 Nov 2020 09:00:10 +0530 Subject: [PATCH 16/32] ENH: Linting Co-authored-by: Sebastian Berg --- numpy/core/src/umath/loops.c.src | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index fe60993a762b..448e774ccd48 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -884,7 +884,7 @@ NPY_NO_EXPORT void else { *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d); - 
if(((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) { + if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) { *((@type@ *)op1) = *((@type@ *)op1) - 1; } } From 61c3d38e3293c08f48621ca52808097845252f83 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Thu, 12 Nov 2020 09:13:03 +0530 Subject: [PATCH 17/32] MAINT: Added libdivide as linguist-vendored --- .gitattributes | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitattributes b/.gitattributes index bce3dbe6daad..f4b6c0dcfbd1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -14,6 +14,7 @@ doc/release/*.rst merge=union numpy/linalg/lapack_lite/f2c.c linguist-vendored numpy/linalg/lapack_lite/f2c.h linguist-vendored tools/npy_tempita/* linguist-vendored +numpy/core/include/numpy/libdivide/* linguist-vendored # Mark some files as generated numpy/linalg/lapack_lite/f2c_*.c linguist-generated From 827bc38a21f8dbeb3b992a26751ef723577cb7d9 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Thu, 12 Nov 2020 10:05:32 +0530 Subject: [PATCH 18/32] ENH: Removed legacy division --- numpy/core/setup.py | 6 ------ numpy/core/src/umath/loops.c.src | 37 ++++++-------------------------- 2 files changed, 6 insertions(+), 37 deletions(-) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 448499926f57..68aa0a8513fb 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -386,9 +386,6 @@ def check_mathlib(config_cmd): "MATHLIB env variable") return mathlibs -def check_use_legacy_division(): - return os.environ.get('NPY_USE_LEGACY_DIVISION') is not None - def visibility_define(config): """Return the define value to use for NPY_VISIBILITY_HIDDEN (may be empty string).""" @@ -445,9 +442,6 @@ def generate_config_h(ext, build_dir): mathlibs = check_mathlib(config_cmd) moredefs.append(('MATHLIB', ','.join(mathlibs))) - # Check if legacy division needs to be used - check_use_legacy_division() and moredefs.append('USE_LEGACY_DIVISION') - check_math_capabilities(config_cmd, ext, moredefs, mathlibs) moredefs.extend(cocache.check_ieee_macros(config_cmd)[0]) moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[0]) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 448e774ccd48..b37f4c4272f4 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -857,7 +857,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void #endif /**end repeat1**/ -#ifndef USE_LEGACY_DIVISION NPY_NO_EXPORT void @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -884,6 +883,7 @@ NPY_NO_EXPORT void else { *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d); + /* Negative quotients needs to be rounded down */ if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) { *((@type@ *)op1) = *((@type@ *)op1) - 1; } @@ -905,42 +905,17 @@ NPY_NO_EXPORT void npy_set_floatstatus_divbyzero(); *((@type@ *)op1) = 0; } - else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) { - *((@type@ *)op1) = in1/in2 - 1; - } else { *((@type@ *)op1) = in1/in2; + + /* Negative quotients needs to be rounded down */ + if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) { + *((@type@ *)op1) = *((@type@ *)op1) - 1; + } } } } } -#else -NPY_NO_EXPORT void -@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* - * FIXME: On x86 at least, dividing the smallest representable integer - * by -1 causes a SIFGPE 
(division overflow). We treat this case here - * (to avoid a SIGFPE crash at python level), but a good solution would - * be to treat integer division problems separately from FPU exceptions - * (i.e. a different approach than npy_set_floatstatus_divbyzero()). - */ - if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } - else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) { - *((@type@ *)op1) = in1/in2 - 1; - } - else { - *((@type@ *)op1) = in1/in2; - } - } -} -#endif NPY_NO_EXPORT void @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) From 0ce0ebd3b895678f2a59797564e17a0aedad6872 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Thu, 12 Nov 2020 09:38:17 +0530 Subject: [PATCH 19/32] ENH: Improved floor division (#17727) --- .../{17727.improvement.rst => 17727.performance.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename doc/release/upcoming_changes/{17727.improvement.rst => 17727.performance.rst} (100%) diff --git a/doc/release/upcoming_changes/17727.improvement.rst b/doc/release/upcoming_changes/17727.performance.rst similarity index 100% rename from doc/release/upcoming_changes/17727.improvement.rst rename to doc/release/upcoming_changes/17727.performance.rst From c85c44a8091dffc921ac81059280f99b9d4dc198 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Fri, 13 Nov 2020 12:02:03 +0530 Subject: [PATCH 20/32] ENH: Added libdivide to timedelta --- numpy/core/src/umath/loops.c.src | 87 +++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 19 deletions(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index b37f4c4272f4..a7c0cb365f22 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -862,7 +862,7 @@ NPY_NO_EXPORT void { BINARY_LOOP_BASE - if(!is2) { + if (!is2) { const @type@ in2 = *(@type@ *)ip2; /* Creating a divisor of 0 is treated as an error by libdivide */ @@ -1403,14 +1403,33 @@ TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const NPY_NO_EXPORT void TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - BINARY_LOOP { - const npy_timedelta in1 = *(npy_timedelta *)ip1; + BINARY_LOOP_BASE + + if (!is2) { const npy_int64 in2 = *(npy_int64 *)ip2; - if (in1 == NPY_DATETIME_NAT || in2 == 0) { - *((npy_timedelta *)op1) = NPY_DATETIME_NAT; + + /* Creating a divisor of 0 is treated as an error by libdivide */ + struct libdivide_s64_t fast_d = in2 ? 
libdivide_s64_gen(in2) : (struct libdivide_s64_t){0}; + BINARY_LOOP_FIXED { + const npy_timedelta in1 = *(npy_timedelta *)ip1; + if (in1 == NPY_DATETIME_NAT || in2 == 0) { + *((npy_timedelta *)op1) = NPY_DATETIME_NAT; + } + else { + *((npy_timedelta *)op1) = libdivide_s64_do(in1, &fast_d);; + } } - else { - *((npy_timedelta *)op1) = in1 / in2; + } + else { + BINARY_LOOP_SLIDING { + const npy_timedelta in1 = *(npy_timedelta *)ip1; + const npy_int64 in2 = *(npy_int64 *)ip2; + if (in1 == NPY_DATETIME_NAT || in2 == 0) { + *((npy_timedelta *)op1) = NPY_DATETIME_NAT; + } + else { + *((npy_timedelta *)op1) = in1 / in2; + } } } } @@ -1482,23 +1501,53 @@ TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const NPY_NO_EXPORT void TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - BINARY_LOOP { - const npy_timedelta in1 = *(npy_timedelta *)ip1; + /* TODO: This code is similar to array floor divide*/ + BINARY_LOOP_BASE + + if (!is2) { const npy_timedelta in2 = *(npy_timedelta *)ip2; - if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) { - npy_set_floatstatus_invalid(); - *((npy_int64 *)op1) = 0; - } - else if (in2 == 0) { - npy_set_floatstatus_divbyzero(); - *((npy_int64 *)op1) = 0; + + /* Creating a divisor of 0 is treated as an error by libdivide */ + struct libdivide_s64_t fast_d = in2 ? libdivide_s64_gen(in2) : (struct libdivide_s64_t){0}; + BINARY_LOOP_FIXED { + const npy_timedelta in1 = *(npy_timedelta *)ip1; + if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) { + npy_set_floatstatus_invalid(); + *((npy_int64 *)op1) = 0; + } + else if (in2 == 0) { + npy_set_floatstatus_divbyzero(); + *((npy_int64 *)op1) = 0; + } + else { + *((npy_int64 *)op1) = libdivide_s64_do(in1, &fast_d); + + /* Negative quotients needs to be rounded down */ + if (((in1 > 0) != (in2 > 0)) && (*((npy_int64 *)op1) * in2 != in1)) { + *((npy_int64 *)op1) = *((npy_int64 *)op1) - 1; + } + } } - else { - if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) { - *((npy_int64 *)op1) = in1/in2 - 1; + } + else { + BINARY_LOOP_SLIDING { + const npy_timedelta in1 = *(npy_timedelta *)ip1; + const npy_timedelta in2 = *(npy_timedelta *)ip2; + if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) { + npy_set_floatstatus_invalid(); + *((npy_int64 *)op1) = 0; + } + else if (in2 == 0) { + npy_set_floatstatus_divbyzero(); + *((npy_int64 *)op1) = 0; } else { *((npy_int64 *)op1) = in1/in2; + + /* Negative quotients needs to be rounded down */ + if (((in1 > 0) != (in2 > 0)) && (*((npy_int64 *)op1) * in2 != in1)) { + *((npy_int64 *)op1) = *((npy_int64 *)op1) - 1; + } } } } From 0517f134365808f8b81c6646cad1b0fe431f6d99 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Fri, 20 Nov 2020 23:47:45 +0530 Subject: [PATCH 21/32] TST: Added UT for floor divide --- numpy/core/tests/test_umath.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 818b2ad6c842..bd7dd23d829a 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -249,6 +249,29 @@ def test_division_int(self): assert_equal(x // 100, [0, 0, 0, 1, -1, -1, -1, -1, -2]) assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80]) + @pytest.mark.parametrize("input_dtype", + [np.int8, np.int16, np.int32, np.int64]) + def test_division_int_boundary(self, input_dtype): + class ListWithDiv(list): + def __floordiv__(self, divisor): + return [i//divisor for i in self] + + iinfo = 
np.iinfo(input_dtype) + + # Create array with min, 25th percentile, 0, 75th percentile, max + arr = ListWithDiv([iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]) + dividends = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max] + a = np.array(arr, dtype = input_dtype) + + for dividend in dividends: + div_a = a // dividend + div_arr = arr // dividend + assert_(all(div_a == div_arr)) + + with np.errstate(divide='raise'): + with pytest.raises(FloatingPointError): + a // 0 + def test_division_complex(self): # check that implementation is correct msg = "Complex division implementation check" From a769d6f402b6aba2ebe9268635872fc1166d9510 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Sat, 21 Nov 2020 18:41:21 +0530 Subject: [PATCH 22/32] ENH: Improved floor division (#17727) --- doc/release/upcoming_changes/17727.performance.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/release/upcoming_changes/17727.performance.rst b/doc/release/upcoming_changes/17727.performance.rst index 83054a3ea1da..c3a08bc8e04a 100755 --- a/doc/release/upcoming_changes/17727.performance.rst +++ b/doc/release/upcoming_changes/17727.performance.rst @@ -1,8 +1,7 @@ Improved performance in integer division of NumPy arrays -------------------------------------------------------- -Integer division of NumPy arrays now uses libdivide. -With builtin support for SSE2, AVX2 and AVX512 vector -division from libdivide and other minor improvements, -there is a large speedup. +Integer division of NumPy arrays now uses libdivide when +the divisor is a constant. With the usage of libdivde and +other minor optimizations, there is a large speedup. The ``//`` operator and ``np.floor_divide`` makes use of the new changes. From 0e2116f2d61ed15e69ca1b4e31e8709f17b86f71 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Sat, 21 Nov 2020 19:23:20 +0530 Subject: [PATCH 23/32] ENH: Optimized 0 divisor cases --- numpy/core/src/umath/fast_loop_macros.h | 3 + numpy/core/src/umath/loops.c.src | 113 +++++++++++++++--------- 2 files changed, 74 insertions(+), 42 deletions(-) diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h index 90dcad3685d6..7ff4d1602956 100644 --- a/numpy/core/src/umath/fast_loop_macros.h +++ b/numpy/core/src/umath/fast_loop_macros.h @@ -58,6 +58,9 @@ abs_ptrdiff(char *a, char *b) #define BINARY_LOOP_FIXED\ for(i = 0; i < n; i++, ip1 += is1, op1 += os1) +#define BINARY_LOOP_ZERO\ + for(i = 0; i < n; i++, op1 += os1) + /** (ip1, ip2) -> (op1) */ #define BINARY_LOOP\ BINARY_LOOP_BASE\ diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index a7c0cb365f22..bfd23924c701 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -862,36 +862,45 @@ NPY_NO_EXPORT void { BINARY_LOOP_BASE - if (!is2) { + /* When the divisor is a constant, use libdivde for faster division */ + if (steps[1] == 0) { const @type@ in2 = *(@type@ *)ip2; - /* Creating a divisor of 0 is treated as an error by libdivide */ - struct libdivide_@type@_t fast_d = in2 ? libdivide_@type@_gen(in2) : (struct libdivide_@type@_t){0}; - BINARY_LOOP_FIXED { - const @type@ in1 = *(@type@ *)ip1; - /* - * FIXME: On x86 at least, dividing the smallest representable integer - * by -1 causes a SIFGPE (division overflow). We treat this case here - * (to avoid a SIGFPE crash at python level), but a good solution would - * be to treat integer division problems separately from FPU exceptions - * (i.e. 
+                 * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+                 */
+                if (in1 == NPY_MIN_@TYPE@ && in2 == -1) {
+                    npy_set_floatstatus_divbyzero();
+                    *((@type@ *)op1) = 0;
+                }
+                else {
+                    *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d);
 
-                /* Negative quotients need to be rounded down */
-                if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
-                    *((@type@ *)op1) = *((@type@ *)op1) - 1;
+                    /* Negative quotients need to be rounded down */
+                    if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
+                        *((@type@ *)op1) = *((@type@ *)op1) - 1;
+                    }
                 }
             }
         }
     }
     else {
-        BINARY_LOOP_SLIDING { // XXX Lot of repeated code
+        BINARY_LOOP_SLIDING {
             const @type@ in1 = *(@type@ *)ip1;
             const @type@ in2 = *(@type@ *)ip2;
             /*
@@ -1405,18 +1414,27 @@ TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const
 NPY_NO_EXPORT void
 TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     BINARY_LOOP_BASE
 
-    if (!is2) {
+    /* When the divisor is a constant, use libdivde for faster division */
+    if (steps[1] == 0) {
         const npy_int64 in2 = *(npy_int64 *)ip2;
-        /* Creating a divisor of 0 is treated as an error by libdivide */
-        struct libdivide_s64_t fast_d = in2 ? libdivide_s64_gen(in2) : (struct libdivide_s64_t){0};
-        BINARY_LOOP_FIXED {
-            const npy_timedelta in1 = *(npy_timedelta *)ip1;
-            if (in1 == NPY_DATETIME_NAT || in2 == 0) {
+        /* If divisor is 0, we need not compute anything */
+        if (in2 == 0) {
+            BINARY_LOOP_ZERO {
+                npy_set_floatstatus_divbyzero();
                 *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
             }
-            else {
-                *((npy_timedelta *)op1) = libdivide_s64_do(in1, &fast_d);
+        }
+        else {
+            struct libdivide_s64_t fast_d = libdivide_s64_gen(in2);
+            BINARY_LOOP_FIXED {
+                const npy_timedelta in1 = *(npy_timedelta *)ip1;
+                if (in1 == NPY_DATETIME_NAT) {
+                    *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+                }
+                else {
+                    *((npy_timedelta *)op1) = libdivide_s64_do(in1, &fast_d);
+                }
             }
         }
     }
@@ -1501,30 +1519,41 @@ TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const
 NPY_NO_EXPORT void
 TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    /* TODO: This code is similar to array floor divide*/
+    /* NOTE: This code is similar to array floor divide*/
     BINARY_LOOP_BASE
 
-    if (!is2) {
+    /* When the divisor is a constant, use libdivde for faster division */
+    if (steps[1] == 0) {
         const npy_timedelta in2 = *(npy_timedelta *)ip2;
 
-        /* Creating a divisor of 0 is treated as an error by libdivide */
-        struct libdivide_s64_t fast_d = in2 ? libdivide_s64_gen(in2) : (struct libdivide_s64_t){0};
-        BINARY_LOOP_FIXED {
-            const npy_timedelta in1 = *(npy_timedelta *)ip1;
-            if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
-                npy_set_floatstatus_invalid();
+        /* If divisor is 0 or NAT, we need not compute anything */
+        if (in2 == 0) {
+            BINARY_LOOP_ZERO {
+                npy_set_floatstatus_divbyzero();
                 *((npy_int64 *)op1) = 0;
             }
-            else if (in2 == 0) {
-                npy_set_floatstatus_divbyzero();
+        }
+        else if (in2 == NPY_DATETIME_NAT) {
+            BINARY_LOOP_ZERO {
+                npy_set_floatstatus_invalid();
                 *((npy_int64 *)op1) = 0;
             }
-            else {
-                *((npy_int64 *)op1) = libdivide_s64_do(in1, &fast_d);
+        }
+        else {
+            struct libdivide_s64_t fast_d = libdivide_s64_gen(in2);
+            BINARY_LOOP_FIXED {
+                const npy_timedelta in1 = *(npy_timedelta *)ip1;
+                if (in1 == NPY_DATETIME_NAT) {
+                    npy_set_floatstatus_invalid();
+                    *((npy_int64 *)op1) = 0;
+                }
+                else {
+                    *((npy_int64 *)op1) = libdivide_s64_do(in1, &fast_d);
 
-                /* Negative quotients need to be rounded down */
-                if (((in1 > 0) != (in2 > 0)) && (*((npy_int64 *)op1) * in2 != in1)) {
-                    *((npy_int64 *)op1) = *((npy_int64 *)op1) - 1;
+                    /* Negative quotients need to be rounded down */
+                    if (((in1 > 0) != (in2 > 0)) && (*((npy_int64 *)op1) * in2 != in1)) {
+                        *((npy_int64 *)op1) = *((npy_int64 *)op1) - 1;
+                    }
                 }
             }
         }
     }
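The ``steps[1] == 0`` test introduced above is how the inner loop recognizes a constant divisor: when a scalar is broadcast against an array, the scalar operand is walked with a stride of zero. The same zero stride is visible from Python (an illustration, not part of the patch):

    >>> import numpy as np
    >>> a = np.arange(10)
    >>> b = np.broadcast_to(7, a.shape)   # scalar broadcast against a
    >>> b.strides
    (0,)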
From f93ca93e93a9a215d25751cee442665018e345e6 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Sat, 21 Nov 2020 20:04:27 +0530
Subject: [PATCH 24/32] TST: Minor changes to floor divide | Added cases for
 timedelta divide

---
 numpy/core/tests/test_umath.py | 53 ++++++++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 12 deletions(-)

diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index bd7dd23d829a..846968c118bb 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -252,25 +252,54 @@ def test_division_int(self):
     @pytest.mark.parametrize("input_dtype",
             [np.int8, np.int16, np.int32, np.int64])
     def test_division_int_boundary(self, input_dtype):
-        class ListWithDiv(list):
-            def __floordiv__(self, divisor):
-                return [i//divisor for i in self]
-
         iinfo = np.iinfo(input_dtype)
 
-        # Create array with min, 25th percentile, 0, 75th percentile, max
-        arr = ListWithDiv([iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max])
-        dividends = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
-        a = np.array(arr, dtype=input_dtype)
+        # Create list with min, 25th percentile, 0, 75th percentile, max
+        lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
+        divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
+        a = np.array(lst, dtype=input_dtype)
 
-        for dividend in dividends:
-            div_a = a // dividend
-            div_arr = arr // dividend
-            assert_(all(div_a == div_arr))
+        for divisor in divisors:
+            div_a = a // divisor
+            b = a.copy(); b //= divisor
+            div_lst = [i // divisor for i in lst]
+            assert_(all(div_a == div_lst))
+            assert_(all(div_a == b))
 
         with np.errstate(divide='raise'):
             with pytest.raises(FloatingPointError):
                 a // 0
+            with pytest.raises(FloatingPointError):
+                a //= 0
+
+    @pytest.mark.parametrize(
+        "dividend,divisor,quotient",
+        [(np.timedelta64(2,'Y'), np.timedelta64(2,'M'), 12),
+         (np.timedelta64(2,'Y'), np.timedelta64(-2,'M'), -12),
+         (np.timedelta64(-2,'Y'), np.timedelta64(2,'M'), -12),
+         (np.timedelta64(-2,'Y'), np.timedelta64(-2,'M'), 12),
+         (np.timedelta64(2,'M'), np.timedelta64(-2,'Y'), -1),
+         (np.timedelta64(2,'Y'), np.timedelta64(0,'M'), 0),
+         (np.timedelta64(2,'Y'), 2, np.timedelta64(1,'Y')),
+         (np.timedelta64(2,'Y'), -2, np.timedelta64(-1,'Y')),
+         (np.timedelta64(-2,'Y'), 2, np.timedelta64(-1,'Y')),
+         (np.timedelta64(-2,'Y'), -2, np.timedelta64(1,'Y')),
+         (np.timedelta64(-2,'Y'), -3, np.timedelta64(0,'Y')),
+         (np.timedelta64(-2,'Y'), 0, np.timedelta64('Nat','Y')),
+        ])
+    def test_division_int_timedelta(self, dividend, divisor, quotient):
+        # If either divisor is 0 or quotient is Nat, check for division by 0
+        if divisor and (isinstance(quotient, int) or not np.isnat(quotient)):
+            assert_(dividend // divisor == quotient)
+
+            # Test for arrays as well
+            assert_(all(
+                np.array([dividend]*5) // divisor \
+                == np.array([quotient]*5)))
+        else:
+            with np.errstate(divide='raise', invalid='raise'):
+                with pytest.raises(FloatingPointError):
+                    dividend // divisor
 
     def test_division_complex(self):

From 285d810bcbaa883c23282f067d51f7329e8869b1 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Sun, 22 Nov 2020 11:33:09 +0530
Subject: [PATCH 25/32] ENH: Remove looping definitions | Renamed fast loop
 macros

---
 numpy/core/src/umath/fast_loop_macros.h | 15 +++--------
 numpy/core/src/umath/loops.c.src        | 32 ++++++++++++-------------
 2 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index 7ff4d1602956..5c22c6f1c2f8 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -46,7 +46,7 @@ abs_ptrdiff(char *a, char *b)
     npy_intp i;\
     for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
 
-#define BINARY_LOOP_BASE\
+#define BINARY_DEFS\
     char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
     npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
     npy_intp n = dimensions[0];\
@@ -55,15 +55,9 @@ abs_ptrdiff(char *a, char *b)
 #define BINARY_LOOP_SLIDING\
     for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)
 
-#define BINARY_LOOP_FIXED\
-    for(i = 0; i < n; i++, ip1 += is1, op1 += os1)
-
-#define BINARY_LOOP_ZERO\
-    for(i = 0; i < n; i++, op1 += os1)
-
 /** (ip1, ip2) -> (op1) */
 #define BINARY_LOOP\
-    BINARY_LOOP_BASE\
+    BINARY_DEFS\
     BINARY_LOOP_SLIDING
 
 /** (ip1, ip2) -> (op1, op2) */
@@ -167,10 +161,7 @@ abs_ptrdiff(char *a, char *b)
 #define IVDEP_LOOP
 #endif
 #define BASE_BINARY_LOOP_INP(tin, tout, op) \
-    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
-    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
-    npy_intp n = dimensions[0];\
-    npy_intp i;\
+    BINARY_DEFS\
     IVDEP_LOOP \
     for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
        const tin in1 = *(tin *)ip1; \
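With the renaming, declarations (BINARY_DEFS) and iteration (BINARY_LOOP_SLIDING) can now be combined freely by each loop. Expanded by hand for a concrete element type, a loop built from the two macros looks roughly like this (an illustrative expansion, not code from the patch):

    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
    npy_intp n = dimensions[0];
    npy_intp i;
    for (i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_int32 in1 = *(npy_int32 *)ip1;
        const npy_int32 in2 = *(npy_int32 *)ip2;
        *((npy_int32 *)op1) = in1 / in2;   /* the division body goes here */
    }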
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index bfd23924c701..29d9959b44f9 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -847,20 +847,20 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
 /* Libdivide only supports 32 and 64 bit types
  * We try to pick the best possible one
 */
-/**begin repeat1
- * #kind = t, gen, do#
- */
 #if NPY_BITSOF_@TYPE@ <= 32
-#define libdivide_@type@_@kind@ libdivide_s32_@kind@
+#define libdivide_@type@_t libdivide_s32_t
+#define libdivide_@type@_gen libdivide_s32_gen
+#define libdivide_@type@_do libdivide_s32_do
 #else
-#define libdivide_@type@_@kind@ libdivide_s64_@kind@
+#define libdivide_@type@_t libdivide_s64_t
+#define libdivide_@type@_gen libdivide_s64_gen
+#define libdivide_@type@_do libdivide_s64_do
 #endif
-/**end repeat1**/
 
 NPY_NO_EXPORT void
 @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    BINARY_LOOP_BASE
+    BINARY_DEFS
 
     /* When the divisor is a constant, use libdivde for faster division */
     if (steps[1] == 0) {
@@ -868,14 +868,14 @@ NPY_NO_EXPORT void
 
         /* If divisor is 0, we need not compute anything*/
         if (in2 == 0) {
-            BINARY_LOOP_ZERO {
+            BINARY_LOOP_SLIDING {
                 npy_set_floatstatus_divbyzero();
                 *((@type@ *)op1) = 0;
             }
         }
         else {
             struct libdivide_@type@_t fast_d = libdivide_@type@_gen(in2);
-            BINARY_LOOP_FIXED {
+            BINARY_LOOP_SLIDING {
                 const @type@ in1 = *(@type@ *)ip1;
                 /*
                  * FIXME: On x86 at least, dividing the smallest representable integer
@@ -1412,7 +1412,7 @@ TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const
 NPY_NO_EXPORT void
 TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    BINARY_LOOP_BASE
+    BINARY_DEFS
 
     /* When the divisor is a constant, use libdivde for faster division */
     if (steps[1] == 0) {
@@ -1420,14 +1420,14 @@ TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *s
 
         /* If divisor is 0, we need not compute anything */
         if (in2 == 0) {
-            BINARY_LOOP_ZERO {
+            BINARY_LOOP_SLIDING {
                 npy_set_floatstatus_divbyzero();
                 *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
             }
         }
         else {
             struct libdivide_s64_t fast_d = libdivide_s64_gen(in2);
-            BINARY_LOOP_FIXED {
+            BINARY_LOOP_SLIDING {
                 const npy_timedelta in1 = *(npy_timedelta *)ip1;
                 if (in1 == NPY_DATETIME_NAT) {
                     *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
@@ -1520,7 +1520,7 @@ NPY_NO_EXPORT void
 TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     /* NOTE: This code is similar to array floor divide*/
-    BINARY_LOOP_BASE
+    BINARY_DEFS
 
     /* When the divisor is a constant, use libdivde for faster division */
     if (steps[1] == 0) {
@@ -1528,20 +1528,20 @@ TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp co
 
         /* If divisor is 0 or NAT, we need not compute anything */
         if (in2 == 0) {
-            BINARY_LOOP_ZERO {
+            BINARY_LOOP_SLIDING {
                 npy_set_floatstatus_divbyzero();
                 *((npy_int64 *)op1) = 0;
             }
         }
         else if (in2 == NPY_DATETIME_NAT) {
-            BINARY_LOOP_ZERO {
+            BINARY_LOOP_SLIDING {
                 npy_set_floatstatus_invalid();
                 *((npy_int64 *)op1) = 0;
             }
         }
         else {
             struct libdivide_s64_t fast_d = libdivide_s64_gen(in2);
-            BINARY_LOOP_FIXED {
+            BINARY_LOOP_SLIDING {
                 const npy_timedelta in1 = *(npy_timedelta *)ip1;
                 if (in1 == NPY_DATETIME_NAT) {
                     npy_set_floatstatus_invalid();
                     *((npy_int64 *)op1) = 0;
                 }
                 else {
                     *((npy_int64 *)op1) = libdivide_s64_do(in1, &fast_d);
 
-                    /* Negative quotients need to be rounded down */
-                    if (((in1 > 0) != (in2 > 0)) && (*((npy_int64 *)op1) * in2 != in1)) {
-                        *((npy_int64 *)op1) = *((npy_int64 *)op1) - 1;
+                    /* Negative quotients need to be rounded down */
+                    if (((in1 > 0) != (in2 > 0)) && (*((npy_int64 *)op1) * in2 != in1)) {
+                        *((npy_int64 *)op1) = *((npy_int64 *)op1) - 1;
+                    }
                 }
             }

From 98257957b67835b63badd0932c481482c650e0d0 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Sun, 22 Nov 2020 11:36:24 +0530
Subject: [PATCH 26/32] ENH: Removed unused macro check

---
 numpy/core/src/umath/loops.c.src | 2 --
 1 file changed, 2 deletions(-)

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 29d9959b44f9..5e4c0de0456b 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -21,9 +21,7 @@
 #include <string.h> /* for memchr */
 
 /* Use Libdivide for faster division */
-#ifndef USE_LEGACY_DIVISION
 #include "numpy/libdivide/libdivide.h"
-#endif
 
 /*
  * cutoff blocksize for pairwise summation
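After these two patches, every integer width funnels into libdivide's two signed kernels through the template defines above, always in a generate-once, divide-many pattern. A minimal sketch of that pattern against libdivide's public C API (assuming only the bundled header path shown in the diff; not code from the series):

    #include "numpy/libdivide/libdivide.h"

    /* Divide a buffer in place by one fixed, nonzero divisor. */
    static void divide_all(int64_t *x, size_t n, int64_t divisor)
    {
        struct libdivide_s64_t d = libdivide_s64_gen(divisor);  /* precompute once */
        for (size_t i = 0; i < n; i++) {
            x[i] = libdivide_s64_do(x[i], &d);  /* truncated quotient, like C '/' */
        }
    }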
From 1f104fd565cce988cddb6120564b157f3f9ef240 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Mon, 23 Nov 2020 12:34:44 +0530
Subject: [PATCH 27/32] BUG: Added better 0 checks

---
 numpy/core/src/umath/loops.c.src | 38 ++++++++++++++++++++++++++----
 numpy/core/tests/test_umath.py   | 30 +++++++++++++++++--------
 2 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5e4c0de0456b..c9f3b27b87ca 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -862,12 +862,21 @@ NPY_NO_EXPORT void
     /* When the divisor is a constant, use libdivde for faster division */
     if (steps[1] == 0) {
+        /* If divisor is 0, set warning*/
+        if (*(@type@ *)ip2 == 0) {
+            npy_set_floatstatus_divbyzero();
+        }
+
+        /* In case of empty array, just return*/
+        if (n == 0) {
+            return;
+        }
+
         const @type@ in2 = *(@type@ *)ip2;
 
         /* If divisor is 0, we need not compute anything*/
         if (in2 == 0) {
             BINARY_LOOP_SLIDING {
-                npy_set_floatstatus_divbyzero();
                 *((@type@ *)op1) = 0;
             }
         }
@@ -1410,16 +1419,26 @@ TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const
 NPY_NO_EXPORT void
 TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
+    /* NOTE: This code is similar to array floor divide*/
     BINARY_DEFS
 
     /* When the divisor is a constant, use libdivde for faster division */
     if (steps[1] == 0) {
+        /* If divisor is 0, set warning*/
+        if (*(npy_int64 *)ip2 == 0) {
+            npy_set_floatstatus_divbyzero();
+        }
+
+        /* In case of empty array, just return*/
+        if (n == 0) {
+            return;
+        }
+
         const npy_int64 in2 = *(npy_int64 *)ip2;
 
         /* If divisor is 0, we need not compute anything */
         if (in2 == 0) {
             BINARY_LOOP_SLIDING {
-                npy_set_floatstatus_divbyzero();
                 *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
             }
         }
@@ -1522,18 +1541,29 @@ TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp co
 
     /* When the divisor is a constant, use libdivde for faster division */
     if (steps[1] == 0) {
+        /* If divisor is 0 or NAT, set warning*/
+        if (*(npy_timedelta *)ip2 == 0) {
+            npy_set_floatstatus_divbyzero();
+        }
+        else if(*(npy_timedelta *)ip2 == NPY_DATETIME_NAT) {
+            npy_set_floatstatus_invalid();
+        }
+
+        /* In case of empty array, just return*/
+        if (n == 0) {
+            return;
+        }
+
         const npy_timedelta in2 = *(npy_timedelta *)ip2;
 
         /* If divisor is 0 or NAT, we need not compute anything */
         if (in2 == 0) {
             BINARY_LOOP_SLIDING {
-                npy_set_floatstatus_divbyzero();
                 *((npy_int64 *)op1) = 0;
             }
         }
         else if (in2 == NPY_DATETIME_NAT) {
             BINARY_LOOP_SLIDING {
-                npy_set_floatstatus_invalid();
                 *((npy_int64 *)op1) = 0;
             }
         }
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 846968c118bb..215913da51eb 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -263,14 +263,20 @@ def test_division_int_boundary(self, input_dtype):
             div_a = a // divisor
             b = a.copy(); b //= divisor
             div_lst = [i // divisor for i in lst]
-            assert_(all(div_a == div_lst))
-            assert_(all(div_a == b))
+
+            msg = "Integer arrays floor division check (//)"
+            assert all(div_a == div_lst), msg
+
+            msg = "Integer arrays floor division check (//=)"
+            assert all(div_a == b), msg
 
         with np.errstate(divide='raise'):
             with pytest.raises(FloatingPointError):
                 a // 0
             with pytest.raises(FloatingPointError):
                 a //= 0
+            with pytest.raises(FloatingPointError):
+                np.array([], dtype=input_dtype) // 0
 
     @pytest.mark.parametrize(
         "dividend,divisor,quotient",
         [(np.timedelta64(2,'Y'), np.timedelta64(2,'M'), 12),
@@ -279,7 +285,8 @@ def test_division_int_boundary(self, input_dtype):
          (np.timedelta64(-2,'Y'), np.timedelta64(-2,'M'), 12),
          (np.timedelta64(2,'M'), np.timedelta64(-2,'Y'), -1),
-         (np.timedelta64(2,'Y'), np.timedelta64(0,'M'), 0),
+         (np.timedelta64(2,'Y'), np.timedelta64(0,'M'), None),
+         (np.array([], dtype='timedelta64[Y]'), np.timedelta64('Nat','M'), None),
          (np.timedelta64(2,'Y'), 2, np.timedelta64(1,'Y')),
          (np.timedelta64(2,'Y'), -2, np.timedelta64(-1,'Y')),
          (np.timedelta64(-2,'Y'), 2, np.timedelta64(-1,'Y')),
@@ -287,15 +294,20 @@ def test_division_int_boundary(self, input_dtype):
          (np.timedelta64(-2,'Y'), -2, np.timedelta64(1,'Y')),
          (np.timedelta64(-2,'Y'), -3, np.timedelta64(0,'Y')),
          (np.timedelta64(-2,'Y'), 0, np.timedelta64('Nat','Y')),
+         (np.array([], dtype='timedelta64[Y]'), 0, None),
         ])
     def test_division_int_timedelta(self, dividend, divisor, quotient):
-        # If either divisor is 0 or quotient is Nat, check for division by 0
-        if divisor and (isinstance(quotient, int) or not np.isnat(quotient)):
-            assert_(dividend // divisor == quotient)
+        # If either divisor is 0 or quotient is None or Nat, check for division by 0
+        if divisor and (isinstance(quotient, int) or
+                not (quotient is None or np.isnat(quotient))):
+            msg = "Timedelta floor division check"
+            assert dividend // divisor == quotient, msg
 
             # Test for arrays as well
-            assert_(all(
-                np.array([dividend]*5) // divisor \
-                == np.array([quotient]*5)))
+            msg = "Timedelta arrays floor division check"
+            dividend_array = np.array([dividend]*5)
+            quotient_array = np.array([quotient]*5)
+            assert all(dividend_array // divisor == quotient_array), msg
         else:
             with np.errstate(divide='raise', invalid='raise'):
                 with pytest.raises(FloatingPointError):
                     dividend // divisor

From 2fde590521fd88bde9e9df1c960e38df438bd040 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Mon, 23 Nov 2020 14:42:35 +0530
Subject: [PATCH 28/32] BENCH: Added floor divide benchmarks (#17727)

---
 benchmarks/benchmarks/bench_ufunc.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index c388da5b5adc..ef3ebe75f9bb 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -134,15 +134,20 @@ def time_less_than_scalar2(self, dtype):
         (self.d < 1)
 
 
-class CustomScalarInt(Benchmark):
-    params = [10**size for size in range(1, 8)]
-    param_names = ['size']
-
-    def setup(self, size):
-        self.x = np.arange(size)
-
-    def time_floor_divide(self, size):
-        self.x//8
+class CustomScalarFloorDivideInt(Benchmark):
+    params = ([np.int8, np.int16, np.int32, np.int64], [8, -8, 43, -43])
+    param_names = ['dtype', 'divisors']
+    max_value = 10**7
+    min_value = -10**7
+
+    def setup(self, dtype, divisor):
+        iinfo = np.iinfo(dtype)
+        self.x = np.arange(
+            max(iinfo.min, self.min_value),
+            min(iinfo.max, self.max_value))
+
+    def time_floor_divide_int(self, dtype, divisor):
+        self.x // divisor
 
 
 class Scalar(Benchmark):
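The new benchmark sweeps element dtypes and signed divisors. Outside of asv, roughly the same measurement can be reproduced directly (an illustrative sketch; absolute numbers depend on compiler flags and hardware):

    import numpy as np
    from timeit import timeit

    x = np.arange(-10**6, 10**6, dtype=np.int32)
    for divisor in (8, -8, 43, -43):
        t = timeit(lambda: x // divisor, number=50)
        print(f"int32 // {divisor}: {t:.3f} s")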
From 8912ffd9da549bb5a4dbb34eb9de10fd1c19ce43 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Mon, 23 Nov 2020 14:45:56 +0530
Subject: [PATCH 29/32] DOC: Improved floor division (#17727)

---
 doc/release/upcoming_changes/17727.performance.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/release/upcoming_changes/17727.performance.rst b/doc/release/upcoming_changes/17727.performance.rst
index c3a08bc8e04a..7b447a3b203d 100755
--- a/doc/release/upcoming_changes/17727.performance.rst
+++ b/doc/release/upcoming_changes/17727.performance.rst
@@ -1,7 +1,7 @@
 Improved performance in integer division of NumPy arrays
 --------------------------------------------------------
-Integer division of NumPy arrays now uses libdivide when
-the divisor is a constant. With the usage of libdivide and
+Integer division of NumPy arrays now uses `libdivide `
+when the divisor is a constant. With the usage of libdivide and
 other minor optimizations, there is a large speedup.
 The ``//`` operator and ``np.floor_divide`` make use of
 the new changes.

From a5e12353c78e6be9a04a0bd2b2c92daa74875a64 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Mon, 23 Nov 2020 15:16:20 +0530
Subject: [PATCH 30/32] BENCH: Improve floor divide benchmarks (#17727)

---
 benchmarks/benchmarks/bench_ufunc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index ef3ebe75f9bb..13b7382a1708 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -135,7 +135,7 @@ def time_less_than_scalar2(self, dtype):
 
 class CustomScalarFloorDivideInt(Benchmark):
-    params = ([np.int8, np.int16, np.int32, np.int64], [8, -8, 43, -43])
+    params = ([np.int8, np.int16, np.int32, np.int64], [8, -8, 43, -43, 0])
     param_names = ['dtype', 'divisors']
     max_value = 10**7
     min_value = -10**7
@@ -144,7 +144,7 @@ def setup(self, dtype, divisor):
         iinfo = np.iinfo(dtype)
         self.x = np.arange(
             max(iinfo.min, self.min_value),
-            min(iinfo.max, self.max_value))
+            min(iinfo.max, self.max_value), dtype=dtype)
 
     def time_floor_divide_int(self, dtype, divisor):
         self.x // divisor

From ca4ba20fabcae7dd0944c0374ed0e452c684d4ac Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Mon, 23 Nov 2020 21:24:04 +0530
Subject: [PATCH 31/32] BUG,TST: Fixed division by 0 status setting

---
 numpy/core/src/umath/loops.c.src | 28 +++++++---------------------
 numpy/core/tests/test_umath.py   | 13 +++++--------
 2 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index c9f3b27b87ca..6637c0e4e4f9 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -860,13 +860,8 @@ NPY_NO_EXPORT void
 {
     BINARY_DEFS
 
-    /* When the divisor is a constant, use libdivde for faster division */
+    /* When the divisor is a constant, use libdivide for faster division */
     if (steps[1] == 0) {
-        /* If divisor is 0, set warning*/
-        if (*(@type@ *)ip2 == 0) {
-            npy_set_floatstatus_divbyzero();
-        }
-
         /* In case of empty array, just return*/
         if (n == 0) {
             return;
@@ -876,6 +871,7 @@ NPY_NO_EXPORT void
 
         /* If divisor is 0, we need not compute anything*/
         if (in2 == 0) {
+            npy_set_floatstatus_divbyzero();
             BINARY_LOOP_SLIDING {
                 *((@type@ *)op1) = 0;
             }
@@ -1422,13 +1418,8 @@ TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *s
     /* NOTE: This code is similar to array floor divide*/
     BINARY_DEFS
 
-    /* When the divisor is a constant, use libdivde for faster division */
+    /* When the divisor is a constant, use libdivide for faster division */
     if (steps[1] == 0) {
-        /* If divisor is 0, set warning*/
-        if (*(npy_int64 *)ip2 == 0) {
-            npy_set_floatstatus_divbyzero();
-        }
-
         /* In case of empty array, just return*/
         if (n == 0) {
             return;
@@ -1438,6 +1429,7 @@ TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *s
 
         /* If divisor is 0, we need not compute anything */
         if (in2 == 0) {
+            npy_set_floatstatus_divbyzero();
             BINARY_LOOP_SLIDING {
                 *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
             }
@@ -1539,16 +1531,8 @@ TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp co
     /* NOTE: This code is similar to array floor divide*/
     BINARY_DEFS
 
-    /* When the divisor is a constant, use libdivde for faster division */
+    /* When the divisor is a constant, use libdivide for faster division */
     if (steps[1] == 0) {
-        /* If divisor is 0 or NAT, set warning*/
-        if (*(npy_timedelta *)ip2 == 0) {
-            npy_set_floatstatus_divbyzero();
-        }
-        else if(*(npy_timedelta *)ip2 == NPY_DATETIME_NAT) {
-            npy_set_floatstatus_invalid();
-        }
-
         /* In case of empty array, just return*/
         if (n == 0) {
             return;
@@ -1558,11 +1542,13 @@ TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp co
 
         /* If divisor is 0 or NAT, we need not compute anything */
         if (in2 == 0) {
+            npy_set_floatstatus_divbyzero();
             BINARY_LOOP_SLIDING {
                 *((npy_int64 *)op1) = 0;
             }
         }
         else if (in2 == NPY_DATETIME_NAT) {
+            npy_set_floatstatus_invalid();
             BINARY_LOOP_SLIDING {
                 *((npy_int64 *)op1) = 0;
             }
         }
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 215913da51eb..2655192737da 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -275,8 +275,8 @@ def test_division_int_boundary(self, input_dtype):
                 a // 0
             with pytest.raises(FloatingPointError):
                 a //= 0
-            with pytest.raises(FloatingPointError):
-                np.array([], dtype=input_dtype) // 0
+
+        np.array([], dtype=input_dtype) // 0
 
@@ -285,8 +285,7 @@ def test_division_int_boundary(self, input_dtype):
          (np.timedelta64(-2,'Y'), np.timedelta64(-2,'M'), 12),
          (np.timedelta64(2,'M'), np.timedelta64(-2,'Y'), -1),
-         (np.timedelta64(2,'Y'), np.timedelta64(0,'M'), None),
-         (np.array([], dtype='timedelta64[Y]'), np.timedelta64('Nat','M'), None),
+         (np.timedelta64(2,'Y'), np.timedelta64(0,'M'), 0),
          (np.timedelta64(2,'Y'), 2, np.timedelta64(1,'Y')),
          (np.timedelta64(2,'Y'), -2, np.timedelta64(-1,'Y')),
          (np.timedelta64(-2,'Y'), 2, np.timedelta64(-1,'Y')),
@@ -294,15 +293,13 @@ def test_division_int_boundary(self, input_dtype):
          (np.timedelta64(-2,'Y'), -2, np.timedelta64(1,'Y')),
          (np.timedelta64(-2,'Y'), -3, np.timedelta64(0,'Y')),
          (np.timedelta64(-2,'Y'), 0, np.timedelta64('Nat','Y')),
-         (np.array([], dtype='timedelta64[Y]'), 0, None),
         ])
     def test_division_int_timedelta(self, dividend, divisor, quotient):
-        # If either divisor is 0 or quotient is None or Nat, check for division by 0
-        if divisor and (isinstance(quotient, int) or
-                not (quotient is None or np.isnat(quotient))):
+        # If either divisor is 0 or quotient is Nat, check for division by 0
+        if divisor and (isinstance(quotient, int) or not np.isnat(quotient)):
             msg = "Timedelta floor division check"
             assert dividend // divisor == quotient, msg
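After this patch the floating-point status is set once per call instead of once per element, and an empty operand returns before any status is touched. The intended Python-level behavior is therefore roughly this (a sketch of the expected semantics, not test code from the series):

    import numpy as np

    with np.errstate(divide='ignore'):
        print(np.array([1, 2, 3]) // 0)        # [0 0 0]; divide-by-zero status set once

    with np.errstate(divide='raise'):
        np.array([], dtype=np.int64) // 0      # no elements: returns empty, raises nothing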
From 28aa88312164987462b1e7744e0efb5bee65c724 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan
Date: Tue, 1 Dec 2020 08:52:50 +0530
Subject: [PATCH 32/32] MAINT: Linting fixes

---
 LICENSES_bundled.txt             |  2 +-
 numpy/core/src/umath/loops.c.src | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/LICENSES_bundled.txt b/LICENSES_bundled.txt
index 80557d3e6ee7..26c7a7829361 100644
--- a/LICENSES_bundled.txt
+++ b/LICENSES_bundled.txt
@@ -18,5 +18,5 @@ License: MIT
 Name: libdivide
 Files: numpy/core/include/numpy/libdivide/*
-License: zlib
+License: Zlib
 For license text, see numpy/core/include/numpy/libdivide/LICENSE.txt

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 6637c0e4e4f9..6823a13b213d 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -862,14 +862,14 @@ NPY_NO_EXPORT void
 
     /* When the divisor is a constant, use libdivide for faster division */
     if (steps[1] == 0) {
-        /* In case of empty array, just return*/
+        /* In case of empty array, just return */
         if (n == 0) {
             return;
         }
 
         const @type@ in2 = *(@type@ *)ip2;
 
-        /* If divisor is 0, we need not compute anything*/
+        /* If divisor is 0, we need not compute anything */
         if (in2 == 0) {
             npy_set_floatstatus_divbyzero();
             BINARY_LOOP_SLIDING {
@@ -1415,12 +1415,12 @@ TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const
 NPY_NO_EXPORT void
 TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    /* NOTE: This code is similar to array floor divide*/
+    /* NOTE: This code is similar to array floor divide */
     BINARY_DEFS
 
     /* When the divisor is a constant, use libdivide for faster division */
     if (steps[1] == 0) {
-        /* In case of empty array, just return*/
+        /* In case of empty array, just return */
         if (n == 0) {
             return;
         }
@@ -1528,12 +1528,12 @@ TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const
 NPY_NO_EXPORT void
 TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    /* NOTE: This code is similar to array floor divide*/
+    /* NOTE: This code is similar to array floor divide */
     BINARY_DEFS
 
     /* When the divisor is a constant, use libdivide for faster division */
     if (steps[1] == 0) {
-        /* In case of empty array, just return*/
+        /* In case of empty array, just return */
         if (n == 0) {
             return;
         }
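Taken together, the series changes only the speed of integer and timedelta floor division, not its results; a quick sanity check of that contract against Python's own floor division (illustrative only):

    import numpy as np

    a = np.array([-128, -64, 0, 63, 127], dtype=np.int8)
    for d in (7, -7, 43, -43):
        assert list(a // d) == [int(x) // d for x in a]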