From eaeb6f7854e2e9acec466df02e5adcf80d3d671f Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Fri, 24 Nov 2023 10:05:36 +0200 Subject: [PATCH] BUG: Fix non-contiguous memory load when ARM/Neon is enabled --- numpy/_core/src/common/simd/neon/memory.h | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/numpy/_core/src/common/simd/neon/memory.h b/numpy/_core/src/common/simd/neon/memory.h index 2dc21e5a4305..2202013f982c 100644 --- a/numpy/_core/src/common/simd/neon/memory.h +++ b/numpy/_core/src/common/simd/neon/memory.h @@ -52,21 +52,12 @@ NPYV_IMPL_NEON_MEM(f64, double) ***************************/ NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) { - switch (stride) { - case 2: - return vld2q_s32((const int32_t*)ptr).val[0]; - case 3: - return vld3q_s32((const int32_t*)ptr).val[0]; - case 4: - return vld4q_s32((const int32_t*)ptr).val[0]; - default:; - int32x2_t ax = vcreate_s32(*ptr); - int32x4_t a = vcombine_s32(ax, ax); - a = vld1q_lane_s32((const int32_t*)ptr + stride, a, 1); - a = vld1q_lane_s32((const int32_t*)ptr + stride*2, a, 2); - a = vld1q_lane_s32((const int32_t*)ptr + stride*3, a, 3); - return a; - } + int32x4_t a; + a = vld1q_lane_s32((const int32_t*)ptr, a, 0); + a = vld1q_lane_s32((const int32_t*)ptr + stride, a, 1); + a = vld1q_lane_s32((const int32_t*)ptr + stride*2, a, 2); + a = vld1q_lane_s32((const int32_t*)ptr + stride*3, a, 3); + return a; } NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)