Thanks to visit codestin.com
Credit goes to github.com

Skip to content

MAINT: cleanup of fast_loop_macros.h #13208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 31, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 52 additions & 69 deletions numpy/core/src/umath/fast_loop_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,83 +74,63 @@
/* binary loop: both inputs and the output contiguous */
#define IS_BINARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
steps[1] == sizeof(tin) && \
steps[2] == sizeof(tout))

/*
 * binary loop: first input a scalar (zero stride), second input and
 * output contiguous
 */
#define IS_BINARY_CONT_S1(tin, tout) (steps[0] == 0 && \
steps[1] == sizeof(tin) && \
steps[2] == sizeof(tout))

/*
 * binary loop: second input a scalar (zero stride), first input and
 * output contiguous
 */
#define IS_BINARY_CONT_S2(tin, tout) (steps[0] == sizeof(tin) && \
steps[1] == 0 && \
steps[2] == sizeof(tout))


/*
 * output-only loop body
 * `op` should be the code storing the result in `tout * out`
 * combine with NPY_GCC_OPT_3 to allow autovectorization
 * should only be used where it's worthwhile to avoid code bloat
 */
#define BASE_OUTPUT_LOOP(tout, op) \
OUTPUT_LOOP { \
tout * out = (tout *)op1; \
op; \
}
/*
 * output loop with a contiguous fast path: both branches expand to the
 * same generic macro, but the IS_OUTPUT_CONT guard lets the compiler
 * specialize (and autovectorize) the contiguous case
 */
#define OUTPUT_LOOP_FAST(tout, op) \
do { \
/* condition allows compiler to optimize the generic macro */ \
if (IS_OUTPUT_CONT(tout)) { \
BASE_OUTPUT_LOOP(tout, op) \
} \
else { \
BASE_OUTPUT_LOOP(tout, op) \
} \
} \
while (0)

/*
* loop with contiguous specialization
* op should be the code working on `tin in` and
* storing the result in `tout * out`
* storing the result in `tout *out`
* combine with NPY_GCC_OPT_3 to allow autovectorization
* should only be used where it's worthwhile to avoid code bloat
*/
#define BASE_UNARY_LOOP(tin, tout, op) \
UNARY_LOOP { \
const tin in = *(tin *)ip1; \
tout * out = (tout *)op1; \
tout *out = (tout *)op1; \
op; \
}
#define UNARY_LOOP_FAST(tin, tout, op) \

#define UNARY_LOOP_FAST(tin, tout, op) \
do { \
/* condition allows compiler to optimize the generic macro */ \
if (IS_UNARY_CONT(tin, tout)) { \
if (args[0] == args[1]) { \
BASE_UNARY_LOOP(tin, tout, op) \
/* condition allows compiler to optimize the generic macro */ \
if (IS_UNARY_CONT(tin, tout)) { \
if (args[0] == args[1]) { \
BASE_UNARY_LOOP(tin, tout, op) \
} \
else { \
BASE_UNARY_LOOP(tin, tout, op) \
} \
} \
else { \
BASE_UNARY_LOOP(tin, tout, op) \
} \
} \
else { \
BASE_UNARY_LOOP(tin, tout, op) \
} \
} \
while (0)

/*
* loop with contiguous specialization
* op should be the code working on `tin in1`, `tin in2` and
* storing the result in `tout * out`
* storing the result in `tout *out`
* combine with NPY_GCC_OPT_3 to allow autovectorization
* should only be used where it's worthwhile to avoid code bloat
*/
#define BASE_BINARY_LOOP(tin, tout, op) \
BINARY_LOOP { \
const tin in1 = *(tin *)ip1; \
const tin in2 = *(tin *)ip2; \
tout * out = (tout *)op1; \
tout *out = (tout *)op1; \
op; \
}

/*
* unfortunately gcc 6/7 regressed and we need to give it additional hints to
* vectorize inplace operations (PR80198)
Expand All @@ -171,59 +151,62 @@
for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
const tin in1 = *(tin *)ip1; \
const tin in2 = *(tin *)ip2; \
tout * out = (tout *)op1; \
tout *out = (tout *)op1; \
op; \
}

#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
const tin cin = *(tin *)cinp; \
BINARY_LOOP { \
const tin vin = *(tin *)vinp; \
tout * out = (tout *)op1; \
tout *out = (tout *)op1; \
op; \
}

/* PR80198 again, scalar works without the pragma */
#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
const tin cin = *(tin *)cinp; \
BINARY_LOOP { \
const tin vin = *(tin *)vinp; \
tout * out = (tout *)vinp; \
tout *out = (tout *)vinp; \
op; \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was suggesting replacing op; by *out = val so that the call passes @val@ instead of the assignment statement.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, but that was in output_loop which is now gone completely

}
#define BINARY_LOOP_FAST(tin, tout, op) \

#define BINARY_LOOP_FAST(tin, tout, op) \
do { \
/* condition allows compiler to optimize the generic macro */ \
if (IS_BINARY_CONT(tin, tout)) { \
if (abs_ptrdiff(args[2], args[0]) == 0 && \
abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
/* condition allows compiler to optimize the generic macro */ \
if (IS_BINARY_CONT(tin, tout)) { \
if (abs_ptrdiff(args[2], args[0]) == 0 && \
abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else if (abs_ptrdiff(args[2], args[1]) == 0 && \
abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else { \
BASE_BINARY_LOOP(tin, tout, op) \
} \
} \
else if (abs_ptrdiff(args[2], args[1]) == 0 && \
abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
else if (IS_BINARY_CONT_S1(tin, tout)) { \
if (abs_ptrdiff(args[2], args[1]) == 0) { \
BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
} \
else { \
BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
} \
} \
else { \
BASE_BINARY_LOOP(tin, tout, op) \
} \
} \
else if (IS_BINARY_CONT_S1(tin, tout)) { \
if (abs_ptrdiff(args[2], args[1]) == 0) { \
BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
else if (IS_BINARY_CONT_S2(tin, tout)) { \
if (abs_ptrdiff(args[2], args[0]) == 0) { \
BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
} \
else { \
BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
}\
} \
else { \
BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
} \
} \
else if (IS_BINARY_CONT_S2(tin, tout)) { \
if (abs_ptrdiff(args[2], args[0]) == 0) { \
BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
BASE_BINARY_LOOP(tin, tout, op) \
} \
else { \
BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
}\
} \
else { \
BASE_BINARY_LOOP(tin, tout, op) \
} \
} \
while (0)

Expand Down
12 changes: 10 additions & 2 deletions numpy/core/src/umath/loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,11 @@ BOOL__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN
NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
OUTPUT_LOOP_FAST(npy_bool, *out = @val@);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious - is it any slower if we just use UNARY_LOOP_FAST(npy_bool, npy_bool, *out=@val@) here? Is the compiler able to remove the unused iteration over the input?

If it is, I'd be inclined to maintain fewer macros.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I dug up the benchmark I wrote and quickly tried out your suggestion. I'll need to check my code in the morning but it looks like it causes a substantial regression:

$ asv compare upstream/master HEAD --only-changed
       before           after         ratio
     [db5fcc8e]       [0c76cb6e]
     <maximum_speedup~1>       <bool_perf>
+         177±3μs          234±2μs     1.32  bench_ufunc.IsNan.time_isnan('float16')
+     2.99±0.06μs       32.8±0.2μs    10.96  bench_ufunc.IsNan.time_isnan('int16')
+      3.09±0.5μs         33.1±1μs    10.72  bench_ufunc.IsNan.time_isnan('int32')
+     3.07±0.07μs       33.2±0.3μs    10.80  bench_ufunc.IsNan.time_isnan('int64')

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good news - there was a typo in what I tested last night and there's no performance impact if we just use UNARY_LOOP_FAST instead. I've pushed a commit that uses that and removes the OUTPUT_LOOP_FAST macros.

/*
* The (void)in; suppresses an unused variable warning raised by gcc and allows
* us to re-use this macro even though we do not depend on in
*/
UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = @val@);
}

/**end repeat**/
Expand Down Expand Up @@ -896,7 +900,11 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
OUTPUT_LOOP_FAST(npy_bool, *out = @val@);
/*
* The (void)in; suppresses an unused variable warning raised by gcc and allows
* us to re-use this macro even though we do not depend on in
*/
UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@);
}
/**end repeat1**/

Expand Down