From eaeb6f7854e2e9acec466df02e5adcf80d3d671f Mon Sep 17 00:00:00 2001
From: Sayed Adel <seiko@imavr.com>
Date: Fri, 24 Nov 2023 10:05:36 +0200
Subject: [PATCH] BUG: Fix non-contiguous memory load when ARM/Neon is enabled

---
 numpy/_core/src/common/simd/neon/memory.h | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/numpy/_core/src/common/simd/neon/memory.h b/numpy/_core/src/common/simd/neon/memory.h
index 2dc21e5a4305..2202013f982c 100644
--- a/numpy/_core/src/common/simd/neon/memory.h
+++ b/numpy/_core/src/common/simd/neon/memory.h
@@ -52,21 +52,12 @@ NPYV_IMPL_NEON_MEM(f64, double)
  ***************************/
 NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
 {
-    switch (stride) {
-    case 2:
-        return vld2q_s32((const int32_t*)ptr).val[0];
-    case 3:
-        return vld3q_s32((const int32_t*)ptr).val[0];
-    case 4:
-        return vld4q_s32((const int32_t*)ptr).val[0];
-    default:;
-        int32x2_t ax = vcreate_s32(*ptr);
-        int32x4_t a = vcombine_s32(ax, ax);
-                  a = vld1q_lane_s32((const int32_t*)ptr + stride,   a, 1);
-                  a = vld1q_lane_s32((const int32_t*)ptr + stride*2, a, 2);
-                  a = vld1q_lane_s32((const int32_t*)ptr + stride*3, a, 3);
-        return a;
-    }
+    int32x4_t a;
+    a = vld1q_lane_s32((const int32_t*)ptr,            a, 0);
+    a = vld1q_lane_s32((const int32_t*)ptr + stride,   a, 1);
+    a = vld1q_lane_s32((const int32_t*)ptr + stride*2, a, 2);
+    a = vld1q_lane_s32((const int32_t*)ptr + stride*3, a, 3);
+    return a;
 }
 
 NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)