From 1d817c3265e931004bbdbfb8784154ee9f7ca00b Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 26 Feb 2025 15:59:02 +0000 Subject: [PATCH 01/22] [AggressiveInstCombine] Shrink loads used in shufflevector rebroadcasts. Attempt to shrink the size of vector loads where only some of the incoming lanes are used for rebroadcasts in shufflevector instructions. --- .../load-shufflevector.ll | 345 ++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll new file mode 100644 index 0000000000000..3f6c8334e61cf --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll @@ -0,0 +1,345 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=aggressive-instcombine -S < %s | FileCheck %s + +define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: ret <8 x half> [[TMP1]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> + ret <8 x half> %val1 +} + +define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x 
half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: ret <8 x half> [[TMP1]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> + ret <8 x half> %val1 +} + +define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> +; CHECK-NEXT: ret <4 x half> [[TMP1]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> + ret <4 x half> %val1 +} + +define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: ret <8 x half> [[TMP1]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> + ret <8 x half> %val1 +} + +define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: 
[[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x half> [[VAL3]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> + br label %finally + +else: + %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> + br label %finally + +finally: + %val3 = phi <8 x half> [ %val1, %then ], [ %val2, %else ] + ret <8 x half> %val3 +} + +define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <4 x half> [[VAL3]] +; 
+entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> + br label %finally + +else: + %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> + br label %finally + +finally: + %val3 = phi <4 x half> [ %val1, %then ], [ %val2, %else ] + ret <4 x half> %val3 +} + +define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x half> [[VAL3]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> + br label %finally + +else: + %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> + br label %finally + +finally: + %val3 = phi <8 x half> [ %val1, %then ], [ %val2, %else ] + ret <8 x half> %val3 +} + +define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) 
local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> + ret <8 x i32> %val1 +} + +define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> + ret <8 x i32> %val1 +} + +define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> + ret <4 x i32> %val1 +} + +define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr 
{ +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> + ret <8 x i32> %val1 +} + +define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x i32> [[VAL3]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> + br label %finally + +else: + %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> + br label %finally + +finally: + %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ] + ret <8 x i32> %val3 +} + +define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) 
[[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x i32> [[VAL3]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> + br label %finally + +else: + %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> + br label %finally + +finally: + %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ] + ret <8 x i32> %val3 +} + +define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; 
CHECK-NEXT: ret <4 x i32> [[VAL3]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> + br label %finally + +else: + %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> + br label %finally + +finally: + %val3 = phi <4 x i32> [ %val1, %then ], [ %val2, %else ] + ret <4 x i32> %val3 +} + +define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x i32> [[VAL3]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> + br label %finally + +else: + %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> + br label %finally + +finally: + %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ] + ret <8 x i32> %val3 +} From 09ac59a2207babd9d5e9874a4c24b035d7c12db0 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 26 Feb 2025 21:18:30 +0000 Subject: [PATCH 02/22] Add implementation and update tests. 
--- clang/test/CodeGenOpenCL/preserve_vec3.cl | 20 ++--- .../AggressiveInstCombine.cpp | 90 +++++++++++++++++++ .../load-shufflevector.ll | 88 +++++++++--------- 3 files changed, 144 insertions(+), 54 deletions(-) diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl index 49ebae6fc7013..0538eac4029bb 100644 --- a/clang/test/CodeGenOpenCL/preserve_vec3.cl +++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl @@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4))); // CHECK-LABEL: define dso_local spir_kernel void @foo( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: ret void // @@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3( // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] 
!kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> // CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -47,8 +47,8 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 
16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> // CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) { // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3( // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> // CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 
8c156c93ba8d1..d1d85197fb6ca 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -987,6 +987,86 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
   return true;
 }
 
+// If `I` is a simple fixed-vector load whose only users are shufflevector
+// instructions that read it as their first operand (with a poison/undef
+// second operand), shrink the load to the leading lanes those shuffles
+// actually read and rewrite the shuffles to use the narrower load.
+//
+// Returns true if the IR was changed. `I` itself is never erased: the caller
+// runs further folds on `I` after this one, so the now-dead load is left in
+// place. NOTE(review): this presumes the pass deletes trivially dead
+// instructions once folding is finished -- confirm.
+static bool shrinkLoadsForBroadcast(Instruction &I) {
+  auto *OldLoad = dyn_cast<LoadInst>(&I);
+  // Volatile and atomic loads must keep their exact access width.
+  if (!OldLoad || !OldLoad->isSimple())
+    return false;
+
+  auto *VecTy = dyn_cast<FixedVectorType>(OldLoad->getType());
+  if (!VecTy)
+    return false;
+
+  // Elements that are not a whole number of bytes (e.g. i1) are bit-packed in
+  // memory, so a narrower vector is not simply a prefix of the wider one.
+  Type *ElemTy = VecTy->getElementType();
+  if (!OldLoad->getDataLayout().typeSizeEqualsStoreSize(ElemTy))
+    return false;
+
+  // Collect the users and the highest lane of the load that any of them reads.
+  const unsigned OldSize = VecTy->getNumElements();
+  std::optional<unsigned> MaxLane;
+  SmallVector<ShuffleVectorInst *, 4> Shuffles;
+  for (User *U : OldLoad->users()) {
+    // Every user must shuffle the load with a poison/undef second operand
+    // (PoisonValue is a subclass of UndefValue, so one isa<> covers both).
+    auto *Shuffle = dyn_cast<ShuffleVectorInst>(U);
+    if (!Shuffle || Shuffle->getOperand(0u) != OldLoad ||
+        !isa<UndefValue>(Shuffle->getOperand(1u)))
+      return false;
+
+    // Negative mask lanes are poison and lanes >= OldSize select the poison
+    // operand; neither reads the load.
+    for (int Lane : Shuffle->getShuffleMask())
+      if (Lane >= 0 && static_cast<unsigned>(Lane) < OldSize)
+        MaxLane = std::max(MaxLane.value_or(0u), static_cast<unsigned>(Lane));
+    Shuffles.push_back(Shuffle);
+  }
+
+  // Bail out when no lane is read at all (no users, or all-poison masks) or
+  // when the last lane is live, i.e. there is nothing to shrink.
+  if (!MaxLane || *MaxLane + 1u >= OldSize)
+    return false;
+  const unsigned NewSize = *MaxLane + 1u;
+
+  // Narrow the load. The pointer is unchanged, so the original alignment still
+  // holds, whereas the ABI alignment of the new type could exceed it.
+  IRBuilder<> Builder(OldLoad);
+  Builder.SetCurrentDebugLocation(OldLoad->getDebugLoc());
+  auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
+  LoadInst *NewLoad = Builder.CreateAlignedLoad(
+      NewVecTy, OldLoad->getPointerOperand(), OldLoad->getAlign());
+  NewLoad->copyMetadata(*OldLoad);
+
+  // Rebuild every shuffle on top of the narrow load.
+  for (ShuffleVectorInst *Shuffle : Shuffles) {
+    // Lanes that came from the poison operand become explicit poison lanes:
+    // their old indices may be out of range for the narrower operands.
+    ArrayRef<int> OldMask = Shuffle->getShuffleMask();
+    SmallVector<int, 16> NewMask(OldMask.begin(), OldMask.end());
+    for (int &Lane : NewMask)
+      if (Lane < 0 || static_cast<unsigned>(Lane) >= OldSize)
+        Lane = PoisonMaskElem;
+
+    Builder.SetInsertPoint(Shuffle);
+    Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
+    Value *NewShuffle = Builder.CreateShuffleVector(
+        NewLoad, PoisonValue::get(NewVecTy), NewMask);
+    Shuffle->replaceAllUsesWith(NewShuffle);
+    Shuffle->eraseFromParent();
+  }
+  return true;
+}
+
 namespace {
 class StrNCmpInliner {
 public:
@@ -1325,6 +1405,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
       MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
       MadeChange |= foldPatternedLoads(I, DL);
       MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
+      MadeChange |= shrinkLoadsForBroadcast(I);
       // NOTE: This function introduces erasing of the instruction `I`, so it
       // needs to be called at the end of this sequence, otherwise we may make
       // bugs.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll index 3f6c8334e61cf..57006f2c65380 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll @@ -5,8 +5,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; entry: @@ -19,8 +19,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; entry: @@ -33,8 +33,8 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr 
addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> ; CHECK-NEXT: ret <4 x half> [[TMP1]] ; entry: @@ -47,8 +47,8 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; entry: @@ -61,16 +61,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x 
half> [[TMP0]], <2 x half> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] ; CHECK-NEXT: ret <8 x half> [[VAL3]] ; entry: @@ -94,16 +94,16 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] ; CHECK-NEXT: ret <4 x half> [[VAL3]] ; entry: @@ -127,16 +127,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] ; CHECK-NEXT: ret <8 x half> [[VAL3]] ; entry: @@ -160,8 +160,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; entry: @@ -174,8 +174,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> 
poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; entry: @@ -188,8 +188,8 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; entry: @@ -202,8 +202,8 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; entry: @@ -216,16 +216,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x 
i32>, ptr addrspace(1) [[ARG0]], align 8 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] ; CHECK-NEXT: ret <8 x i32> [[VAL3]] ; entry: @@ -249,16 +249,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], 
%[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] ; CHECK-NEXT: ret <8 x i32> [[VAL3]] ; entry: @@ -282,16 +282,16 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] ; CHECK-NEXT: ret <4 x i32> [[VAL3]] ; entry: @@ -315,16 +315,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; 
CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] ; CHECK-NEXT: ret <8 x i32> [[VAL3]] ; entry: From bc8e5ce8e6cabd0a0e08fac3d51624e18161d7f8 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Fri, 28 Feb 2025 17:25:35 +0000 Subject: [PATCH 03/22] Fix broken tests. --- .../builtins-systemz-zvector-constrained.c | 4 +- .../SystemZ/builtins-systemz-zvector.c | 52 +++++++++---------- .../builtins-systemz-zvector2-constrained.c | 12 ++--- .../SystemZ/builtins-systemz-zvector2.c | 12 ++--- .../AggressiveInstCombine.cpp | 19 ++++--- 5 files changed, 49 insertions(+), 50 deletions(-) diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c index 4993df20df143..e335c363ecb48 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c @@ -79,8 +79,8 @@ void test_core(void) { vec_xstd2(vd, idx, ptrd); vd = vec_splat(vd, 0); - // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer - // CHECK-ASM: vrepg + // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer + // CHECK-ASM: vlrepg vd = vec_splat(vd, 1); // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> // CHECK-ASM: vrepg diff --git 
a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c index d5d15b4dea966..422c97a77511c 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c @@ -777,80 +777,80 @@ void test_core(void) { // CHECK: <2 x i64> splat (i64 -4503582447501313) vsc = vec_splat(vsc, 0); - // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer - // CHECK-ASM: vrepb + // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer + // CHECK-ASM: vlrepb vsc = vec_splat(vsc, 15); // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> // CHECK-ASM: vrepb vuc = vec_splat(vuc, 0); - // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer - // CHECK-ASM: vrepb + // CHECK: store volatile <16 x i8> splat (i8 {{.*}}), ptr @vuc + // CHECK-ASM: vst vuc = vec_splat(vuc, 15); // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> // CHECK-ASM: vrepb vbc = vec_splat(vbc, 0); - // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer - // CHECK-ASM: vrepb + // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer + // CHECK-ASM: vlrepb vbc = vec_splat(vbc, 15); // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> // CHECK-ASM: vrepb vss = vec_splat(vss, 0); - // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer - // CHECK-ASM: vreph + // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer + // CHECK-ASM: vlreph vss = vec_splat(vss, 7); // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> // CHECK-ASM: vreph vus = vec_splat(vus, 0); - // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer - // CHECK-ASM: vreph + // CHECK: store volatile <8 x i16> splat (i16 {{.*}}), 
ptr @vus + // CHECK-ASM: vst vus = vec_splat(vus, 7); // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> // CHECK-ASM: vreph vbs = vec_splat(vbs, 0); - // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer - // CHECK-ASM: vreph + // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer + // CHECK-ASM: vlreph vbs = vec_splat(vbs, 7); // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> // CHECK-ASM: vreph vsi = vec_splat(vsi, 0); - // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer - // CHECK-ASM: vrepf + // CHECK: shufflevector <1 x i32> %{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer + // CHECK-ASM: vlrepf vsi = vec_splat(vsi, 3); // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> // CHECK-ASM: vrepf vui = vec_splat(vui, 0); - // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer - // CHECK-ASM: vrepf + // CHECK: store volatile <4 x i32> splat (i32 {{.*}}), ptr @vui + // CHECK-ASM: vst vui = vec_splat(vui, 3); // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> // CHECK-ASM: vrepf vbi = vec_splat(vbi, 0); - // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer - // CHECK-ASM: vrepf + // CHECK: shufflevector <1 x i32> %{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer + // CHECK-ASM: vlrepf vbi = vec_splat(vbi, 3); // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> // CHECK-ASM: vrepf vsl = vec_splat(vsl, 0); - // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer + // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer // CHECK-ASM: vrepg vsl = vec_splat(vsl, 1); // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> - // CHECK-ASM: vrepg + // CHECK-ASM: vst vul = vec_splat(vul, 0); - // CHECK: shufflevector <2 x i64> 
%{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer - // CHECK-ASM: vrepg + // CHECK: store volatile <2 x i64> splat (i64 {{.*}}), ptr @vul + // CHECK-ASM: vst vul = vec_splat(vul, 1); // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> // CHECK-ASM: vrepg vbl = vec_splat(vbl, 0); - // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer - // CHECK-ASM: vrepg + // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer + // CHECK-ASM: vlrepg vbl = vec_splat(vbl, 1); // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> // CHECK-ASM: vrepg vd = vec_splat(vd, 0); - // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer - // CHECK-ASM: vrepg + // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer + // CHECK-ASM: vlrepg vd = vec_splat(vd, 1); // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> // CHECK-ASM: vrepg diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c index 25b3e0b68cd02..2b79df2a1886e 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c @@ -130,14 +130,14 @@ void test_core(void) { // CHECK-ASM: vst vf = vec_splat(vf, 0); - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer - // CHECK-ASM: vrepf + // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer + // CHECK-ASM: vlrepf vf = vec_splat(vf, 1); - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> - // CHECK-ASM: vrepf + // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> + // CHECK-ASM: vst vd = vec_splat(vd, 0); - // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> 
zeroinitializer - // CHECK-ASM: vrepg + // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer + // CHECK-ASM: vlrepg vd = vec_splat(vd, 1); // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> // CHECK-ASM: vrepg diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c index c1ef178fcfaa9..1ccbe6df5f16d 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c @@ -254,14 +254,14 @@ void test_core(void) { // CHECK-ASM: vstrlr vf = vec_splat(vf, 0); - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer - // CHECK-ASM: vrepf + // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer + // CHECK-ASM: vlrepf vf = vec_splat(vf, 1); - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> - // CHECK-ASM: vrepf + // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> + // CHECK-ASM: vst vd = vec_splat(vd, 0); - // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer - // CHECK-ASM: vrepg + // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer + // CHECK-ASM: vlrepg vd = vec_splat(vd, 1); // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> // CHECK-ASM: vrepg diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index d1d85197fb6ca..4b81f4a8e1f04 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -987,7 +987,7 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) { return true; } -// If `I` is a load instruction, used only by shufflevector 
instructions with +// If `I` is a load instruction, used only by shufflevector instructions with // poison values, attempt to shrink the load to only the lanes being used. static bool shrinkLoadsForBroadcast(Instruction &I) { auto *OldLoad = dyn_cast(&I); @@ -1008,7 +1008,7 @@ static bool shrinkLoadsForBroadcast(Instruction &I) { using IndexRange = std::pair; auto GetIndexRangeInShuffles = [&]() -> std::optional { auto OutputRange = IndexRange(VecTy->getNumElements(), 0u); - for (auto &Use: I.uses()) { + for (auto &Use : I.uses()) { // All uses must be ShuffleVector instructions. auto *Shuffle = dyn_cast(Use.getUser()); if (!Shuffle) @@ -1025,7 +1025,7 @@ static bool shrinkLoadsForBroadcast(Instruction &I) { auto *Op0Ty = cast(Op0->getType()); auto NumElems = Op0Ty->getNumElements(); - for (unsigned Index: Mask) { + for (unsigned Index : Mask) { if (Index < NumElems) { OutputRange.first = std::min(Index, OutputRange.first); OutputRange.second = std::max(Index, OutputRange.second); @@ -1047,26 +1047,25 @@ static bool shrinkLoadsForBroadcast(Instruction &I) { auto *ElemTy = VecTy->getElementType(); auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize); auto *NewLoad = cast( - Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand())); + Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand())); NewLoad->copyMetadata(I); // Replace all users. - auto OldShuffles = SmallVector{}; - for (auto &Use: I.uses()) { + auto OldShuffles = SmallVector{}; + for (auto &Use : I.uses()) { auto *Shuffle = cast(Use.getUser()); - + Builder.SetInsertPoint(Shuffle); Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc()); auto *NewShuffle = Builder.CreateShuffleVector( - NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask() - ); + NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask()); Shuffle->replaceAllUsesWith(NewShuffle); OldShuffles.push_back(Shuffle); } // Erase old users. 
- for (auto *Shuffle: OldShuffles) + for (auto *Shuffle : OldShuffles) Shuffle->eraseFromParent(); I.eraseFromParent(); From 6f6fc11f7a4d300a8dd281826533bf5d057d8131 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Tue, 4 Mar 2025 17:44:56 +0000 Subject: [PATCH 04/22] Ignore non-simple loads. --- .../builtins-systemz-zvector-constrained.c | 4 +- .../SystemZ/builtins-systemz-zvector.c | 52 +++++++++---------- .../builtins-systemz-zvector2-constrained.c | 12 ++--- .../SystemZ/builtins-systemz-zvector2.c | 12 ++--- .../AggressiveInstCombine.cpp | 2 +- .../load-shufflevector.ll | 14 +++++ 6 files changed, 55 insertions(+), 41 deletions(-) diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c index e335c363ecb48..4993df20df143 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c @@ -79,8 +79,8 @@ void test_core(void) { vec_xstd2(vd, idx, ptrd); vd = vec_splat(vd, 0); - // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer - // CHECK-ASM: vlrepg + // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer + // CHECK-ASM: vrepg vd = vec_splat(vd, 1); // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> // CHECK-ASM: vrepg diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c index 422c97a77511c..d5d15b4dea966 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c @@ -777,80 +777,80 @@ void test_core(void) { // CHECK: <2 x i64> splat (i64 -4503582447501313) vsc = vec_splat(vsc, 0); - // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer - // CHECK-ASM: vlrepb + // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, 
<16 x i32> zeroinitializer + // CHECK-ASM: vrepb vsc = vec_splat(vsc, 15); // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> // CHECK-ASM: vrepb vuc = vec_splat(vuc, 0); - // CHECK: store volatile <16 x i8> splat (i8 {{.*}}), ptr @vuc - // CHECK-ASM: vst + // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer + // CHECK-ASM: vrepb vuc = vec_splat(vuc, 15); // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> // CHECK-ASM: vrepb vbc = vec_splat(vbc, 0); - // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer - // CHECK-ASM: vlrepb + // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer + // CHECK-ASM: vrepb vbc = vec_splat(vbc, 15); // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> // CHECK-ASM: vrepb vss = vec_splat(vss, 0); - // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer - // CHECK-ASM: vlreph + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer + // CHECK-ASM: vreph vss = vec_splat(vss, 7); // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> // CHECK-ASM: vreph vus = vec_splat(vus, 0); - // CHECK: store volatile <8 x i16> splat (i16 {{.*}}), ptr @vus - // CHECK-ASM: vst + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer + // CHECK-ASM: vreph vus = vec_splat(vus, 7); // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> // CHECK-ASM: vreph vbs = vec_splat(vbs, 0); - // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer - // CHECK-ASM: vlreph + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer + // CHECK-ASM: vreph vbs = vec_splat(vbs, 7); // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> // CHECK-ASM: vreph vsi = vec_splat(vsi, 0); - // CHECK: shufflevector <1 x i32> 
%{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer - // CHECK-ASM: vlrepf + // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer + // CHECK-ASM: vrepf vsi = vec_splat(vsi, 3); // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> // CHECK-ASM: vrepf vui = vec_splat(vui, 0); - // CHECK: store volatile <4 x i32> splat (i32 {{.*}}), ptr @vui - // CHECK-ASM: vst + // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer + // CHECK-ASM: vrepf vui = vec_splat(vui, 3); // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> // CHECK-ASM: vrepf vbi = vec_splat(vbi, 0); - // CHECK: shufflevector <1 x i32> %{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer - // CHECK-ASM: vlrepf + // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer + // CHECK-ASM: vrepf vbi = vec_splat(vbi, 3); // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> // CHECK-ASM: vrepf vsl = vec_splat(vsl, 0); - // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer + // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer // CHECK-ASM: vrepg vsl = vec_splat(vsl, 1); // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> - // CHECK-ASM: vst + // CHECK-ASM: vrepg vul = vec_splat(vul, 0); - // CHECK: store volatile <2 x i64> splat (i64 {{.*}}), ptr @vul - // CHECK-ASM: vst + // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer + // CHECK-ASM: vrepg vul = vec_splat(vul, 1); // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> // CHECK-ASM: vrepg vbl = vec_splat(vbl, 0); - // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer - // CHECK-ASM: vlrepg + // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer + // CHECK-ASM: vrepg vbl = vec_splat(vbl, 1); // CHECK: 
shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> // CHECK-ASM: vrepg vd = vec_splat(vd, 0); - // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer - // CHECK-ASM: vlrepg + // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer + // CHECK-ASM: vrepg vd = vec_splat(vd, 1); // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> // CHECK-ASM: vrepg diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c index 2b79df2a1886e..25b3e0b68cd02 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c @@ -130,14 +130,14 @@ void test_core(void) { // CHECK-ASM: vst vf = vec_splat(vf, 0); - // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer - // CHECK-ASM: vlrepf + // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer + // CHECK-ASM: vrepf vf = vec_splat(vf, 1); - // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> - // CHECK-ASM: vst + // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> + // CHECK-ASM: vrepf vd = vec_splat(vd, 0); - // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer - // CHECK-ASM: vlrepg + // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer + // CHECK-ASM: vrepg vd = vec_splat(vd, 1); // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> // CHECK-ASM: vrepg diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c index 1ccbe6df5f16d..c1ef178fcfaa9 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c 
@@ -254,14 +254,14 @@ void test_core(void) { // CHECK-ASM: vstrlr vf = vec_splat(vf, 0); - // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer - // CHECK-ASM: vlrepf + // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer + // CHECK-ASM: vrepf vf = vec_splat(vf, 1); - // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> - // CHECK-ASM: vst + // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> + // CHECK-ASM: vrepf vd = vec_splat(vd, 0); - // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer - // CHECK-ASM: vlrepg + // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer + // CHECK-ASM: vrepg vd = vec_splat(vd, 1); // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> // CHECK-ASM: vrepg diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 4b81f4a8e1f04..70a8f5700043b 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -991,7 +991,7 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) { // poison values, attempt to shrink the load to only the lanes being used. 
static bool shrinkLoadsForBroadcast(Instruction &I) { auto *OldLoad = dyn_cast(&I); - if (!OldLoad) + if (!OldLoad || !OldLoad->isSimple()) return false; auto *VecTy = dyn_cast(I.getType()); diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll index 57006f2c65380..9978c15e90beb 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll @@ -1,6 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=aggressive-instcombine -S < %s | FileCheck %s +define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: ret <8 x half> [[TMP1]] +; +entry: + %val0 = load volatile <4 x half>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> + ret <8 x half> %val1 +} + define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { From 7499a48455d1327621f552fc55a7ab19ac1aee9f Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 12 Mar 2025 00:21:08 +0000 Subject: [PATCH 05/22] Move transform to VectorCombine and update tests. 
--- .../AggressiveInstCombine.cpp | 89 ----------------- .../Transforms/Vectorize/VectorCombine.cpp | 99 +++++++++++++++++++ .../PhaseOrdering/X86/vec-load-combine.ll | 8 +- .../VectorCombine/X86/load-widening.ll | 6 +- .../VectorCombine/X86/shuffle-of-shuffles.ll | 24 ++--- .../load-shufflevector.ll | 2 +- 6 files changed, 116 insertions(+), 112 deletions(-) rename llvm/test/Transforms/{AggressiveInstCombine => VectorCombine}/load-shufflevector.ll (99%) diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 70a8f5700043b..8c156c93ba8d1 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -987,94 +987,6 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) { return true; } -// If `I` is a load instruction, used only by shufflevector instructions with -// poison values, attempt to shrink the load to only the lanes being used. -static bool shrinkLoadsForBroadcast(Instruction &I) { - auto *OldLoad = dyn_cast(&I); - if (!OldLoad || !OldLoad->isSimple()) - return false; - - auto *VecTy = dyn_cast(I.getType()); - if (!VecTy) - return false; - - auto IsPoisonOrUndef = [](Value *V) -> bool { - if (auto *C = dyn_cast(V)) { - return isa(C) || isa(C); - } - return false; - }; - - using IndexRange = std::pair; - auto GetIndexRangeInShuffles = [&]() -> std::optional { - auto OutputRange = IndexRange(VecTy->getNumElements(), 0u); - for (auto &Use : I.uses()) { - // All uses must be ShuffleVector instructions. - auto *Shuffle = dyn_cast(Use.getUser()); - if (!Shuffle) - return {}; - - // Get index range for value. - auto *Op0 = Shuffle->getOperand(0u); - auto *Op1 = Shuffle->getOperand(1u); - if (!IsPoisonOrUndef(Op1)) - return {}; - - // Find the min and max indices used by the ShuffleVector instruction. 
- auto Mask = Shuffle->getShuffleMask(); - auto *Op0Ty = cast(Op0->getType()); - auto NumElems = Op0Ty->getNumElements(); - - for (unsigned Index : Mask) { - if (Index < NumElems) { - OutputRange.first = std::min(Index, OutputRange.first); - OutputRange.second = std::max(Index, OutputRange.second); - } - } - } - return OutputRange; - }; - - if (auto Indices = GetIndexRangeInShuffles()) { - auto OldSize = VecTy->getNumElements(); - auto NewSize = Indices->second + 1u; - - if (NewSize < OldSize) { - auto Builder = IRBuilder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - // Create new load of smaller vector. - auto *ElemTy = VecTy->getElementType(); - auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize); - auto *NewLoad = cast( - Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand())); - NewLoad->copyMetadata(I); - - // Replace all users. - auto OldShuffles = SmallVector{}; - for (auto &Use : I.uses()) { - auto *Shuffle = cast(Use.getUser()); - - Builder.SetInsertPoint(Shuffle); - Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc()); - auto *NewShuffle = Builder.CreateShuffleVector( - NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask()); - - Shuffle->replaceAllUsesWith(NewShuffle); - OldShuffles.push_back(Shuffle); - } - - // Erase old users. - for (auto *Shuffle : OldShuffles) - Shuffle->eraseFromParent(); - - I.eraseFromParent(); - return true; - } - } - return false; -} - namespace { class StrNCmpInliner { public: @@ -1413,7 +1325,6 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT, MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT); MadeChange |= foldPatternedLoads(I, DL); MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT); - MadeChange |= shrinkLoadsForBroadcast(I); // NOTE: This function introduces erasing of the instruction `I`, so it // needs to be called at the end of this sequence, otherwise we may make // bugs. 
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index fe8d74c43dfdc..ffcfaa5871ef6 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -137,6 +137,7 @@ class VectorCombine { bool foldSelectShuffle(Instruction &I, bool FromReduction = false); bool foldInterleaveIntrinsics(Instruction &I); bool shrinkType(Instruction &I); + bool shrinkLoadForShuffles(Instruction &I); void replaceValue(Value &Old, Value &New) { LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); @@ -3691,6 +3692,101 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) { return true; } +// If `I` is a load instruction, used only by shufflevector instructions with +// poison values, attempt to shrink the load to only the lanes being used. +bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { + auto *OldLoad = dyn_cast(&I); + if (!OldLoad || !OldLoad->isSimple()) + return false; + + auto *VecTy = dyn_cast(I.getType()); + if (!VecTy) + return false; + + auto IsPoisonOrUndef = [](Value *V) -> bool { + if (auto *C = dyn_cast(V)) { + return isa(C) || isa(C); + } + return false; + }; + + using IndexRange = std::pair; + auto GetIndexRangeInShuffles = [&]() -> std::optional { + auto OutputRange = IndexRange(VecTy->getNumElements(), -1); + for (auto &Use : I.uses()) { + // All uses must be ShuffleVector instructions. + auto *Shuffle = dyn_cast(Use.getUser()); + if (!Shuffle) + return {}; + + // Get index range for value. + auto *Op0 = Shuffle->getOperand(0u); + auto *Op1 = Shuffle->getOperand(1u); + if (!IsPoisonOrUndef(Op1)) + return {}; + + // Find the min and max indices used by the ShuffleVector instruction. 
+ auto Mask = Shuffle->getShuffleMask(); + auto *Op0Ty = cast(Op0->getType()); + auto NumElems = int(Op0Ty->getNumElements()); + + for (auto Index : Mask) { + if (Index >= 0 && Index < NumElems) { + OutputRange.first = std::min(Index, OutputRange.first); + OutputRange.second = std::max(Index, OutputRange.second); + } + } + + if (OutputRange.second < OutputRange.first) + return {}; + } + return OutputRange; + }; + + if (auto Indices = GetIndexRangeInShuffles()) { + auto OldSize = VecTy->getNumElements(); + auto NewSize = Indices->second + 1u; + + if (NewSize < OldSize) { + auto Builder = IRBuilder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + // Create new load of smaller vector. + auto *ElemTy = VecTy->getElementType(); + auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize); + auto *NewLoad = cast( + Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand())); + NewLoad->copyMetadata(I); + + // Compare cost of old and new loads. + auto OldCost = TTI.getMemoryOpCost( + Instruction::Load, OldLoad->getType(), OldLoad->getAlign(), + OldLoad->getPointerAddressSpace(), CostKind); + auto NewCost = TTI.getMemoryOpCost( + Instruction::Load, NewLoad->getType(), NewLoad->getAlign(), + NewLoad->getPointerAddressSpace(), CostKind); + + if (OldCost < NewCost || !NewCost.isValid()) + return false; + + // Replace all users. + for (auto &Use : I.uses()) { + auto *Shuffle = cast(Use.getUser()); + + Builder.SetInsertPoint(Shuffle); + Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc()); + auto *NewShuffle = Builder.CreateShuffleVector( + NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask()); + + replaceValue(*Shuffle, *NewShuffle); + } + + return true; + } + } + return false; +} + /// This is the entry point for all transforms. Pass manager differences are /// handled in the callers of this function. 
bool VectorCombine::run() { @@ -3775,6 +3871,9 @@ bool VectorCombine::run() { case Instruction::Xor: MadeChange |= foldBitOpOfBitcasts(I); break; + case Instruction::Load: + MadeChange |= shrinkLoadForShuffles(I); + break; default: MadeChange |= shrinkType(I); break; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll index 85f6fceb5bdbe..9218cc2d019f8 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll @@ -11,13 +11,13 @@ $getAt = comdat any define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 { ; SSE-LABEL: @ConvertVectors_ByRef( -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16 -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> ; SSE-NEXT: ret <4 x float> [[TMP3]] ; ; AVX-LABEL: @ConvertVectors_ByRef( -; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16 -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16 +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> ; AVX-NEXT: ret <4 x float> [[TMP3]] ; %2 = alloca ptr, align 8 diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll index 30a089818074e..07bd3966e8202 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll @@ -336,7 +336,7 @@ define <8 x float> @load_v2f32_v8f32(ptr dereferenceable(32) %p) { define <4 x i32> @load_v2i32_v4i32(ptr 
dereferenceable(16) %p) { ; CHECK-LABEL: @load_v2i32_v4i32( -; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1 +; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret <4 x i32> [[S]] ; %l = load <2 x i32>, ptr %p, align 1 @@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address { ; CHECK-LABEL: @load_v2i32_v4i32_asan( -; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1 -; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[S]] ; %l = load <2 x i32>, ptr %p, align 1 diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll index b30dc9ffdc596..9a051361e2081 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll @@ -47,21 +47,12 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) { ; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles) define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { -; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64( -; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { -; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32 -; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32 -; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> -; SSE-NEXT: ret <4 x double> [[BLEND]] -; -; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64( -; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { -; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr 
[[P0]], align 32 -; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32 -; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer -; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer -; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[BLEND]] +; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64( +; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 8 +; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> +; CHECK-NEXT: ret <4 x double> [[BLEND]] ; %ld0 = load <4 x double>, ptr %p0, align 32 %ld1 = load <4 x double>, ptr %p1, align 32 @@ -81,3 +72,6 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) { %s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> ret <2 x float> %s2 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX: {{.*}} +; SSE: {{.*}} diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll similarity index 99% rename from llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll rename to llvm/test/Transforms/VectorCombine/load-shufflevector.ll index 9978c15e90beb..44724a5f1f127 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll +++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=aggressive-instcombine -S < %s | FileCheck %s +; RUN: opt -passes=vector-combine -S < %s | FileCheck %s define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile( From c86d296d8646d4ae2a7f894683aae76782b7bf6c Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Fri, 14 Mar 2025 17:40:27 +0000 Subject: [PATCH 06/22] Address review comments and update tests. 
--- .../Transforms/Vectorize/VectorCombine.cpp | 26 ++++++++++++---- .../VectorCombine/X86/load-widening.ll | 4 +-- .../VectorCombine/X86/shuffle-of-shuffles.ll | 4 +-- .../VectorCombine/load-shufflevector.ll | 30 +++++++++---------- 4 files changed, 39 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index ffcfaa5871ef6..1ba5b1539e9a5 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3736,10 +3736,11 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { OutputRange.second = std::max(Index, OutputRange.second); } } - - if (OutputRange.second < OutputRange.first) - return {}; } + + if (OutputRange.second < OutputRange.first) + return {}; + return OutputRange; }; @@ -3754,11 +3755,12 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { // Create new load of smaller vector. auto *ElemTy = VecTy->getElementType(); auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize); + auto *PtrOp = OldLoad->getPointerOperand(); auto *NewLoad = cast( - Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand())); + Builder.CreateAlignedLoad(NewVecTy, PtrOp, OldLoad->getAlign())); NewLoad->copyMetadata(I); - // Compare cost of old and new loads. + // Compare cost of old and new ops. 
auto OldCost = TTI.getMemoryOpCost( Instruction::Load, OldLoad->getType(), OldLoad->getAlign(), OldLoad->getPointerAddressSpace(), CostKind); @@ -3766,8 +3768,20 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { Instruction::Load, NewLoad->getType(), NewLoad->getAlign(), NewLoad->getPointerAddressSpace(), CostKind); - if (OldCost < NewCost || !NewCost.isValid()) + for (auto &Use : I.uses()) { + auto *Shuffle = cast(Use.getUser()); + auto Mask = Shuffle->getShuffleMask(); + + OldCost += TTI.getShuffleCost( + TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind); + NewCost += TTI.getShuffleCost( + TTI::SK_PermuteSingleSrc, NewVecTy, Mask, CostKind); + } + + if (OldCost < NewCost || !NewCost.isValid()) { + NewLoad->eraseFromParent(); return false; + } // Replace all users. for (auto &Use : I.uses()) { diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll index 07bd3966e8202..eacc40bfa9b53 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll @@ -336,7 +336,7 @@ define <8 x float> @load_v2f32_v8f32(ptr dereferenceable(32) %p) { define <4 x i32> @load_v2i32_v4i32(ptr dereferenceable(16) %p) { ; CHECK-LABEL: @load_v2i32_v4i32( -; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1 ; CHECK-NEXT: ret <4 x i32> [[S]] ; %l = load <2 x i32>, ptr %p, align 1 @@ -443,7 +443,7 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address { ; CHECK-LABEL: @load_v2i32_v4i32_asan( -; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1 ; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[S]] ; diff 
--git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll index 9a051361e2081..eddfc57a7d256 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll @@ -49,8 +49,8 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) { define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { ; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64( ; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32 +; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32 ; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> ; CHECK-NEXT: ret <4 x double> [[BLEND]] ; diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll index 44724a5f1f127..3e302c5f43032 100644 --- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll +++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll @@ -19,7 +19,7 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; @@ -33,7 +33,7 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x 
half> @shuffle_v4_v8f16_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; @@ -47,7 +47,7 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> ; CHECK-NEXT: ret <4 x half> [[TMP1]] ; @@ -61,7 +61,7 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; @@ -75,7 +75,7 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = 
load <2 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer @@ -108,7 +108,7 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> @@ -141,7 +141,7 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> @@ -174,7 +174,7 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: 
[[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; @@ -188,7 +188,7 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; @@ -202,7 +202,7 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; @@ -216,7 +216,7 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; @@ -230,7 +230,7 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1( ; CHECK-SAME: 
ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer @@ -263,7 +263,7 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer @@ -296,7 +296,7 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> @@ -329,7 +329,7 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) 
[[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> From 16f7320ffadecf3075dd77c78a5b1257a7e24e6e Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Fri, 14 Mar 2025 17:53:21 +0000 Subject: [PATCH 07/22] Code formatting. --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 1ba5b1539e9a5..70533bd0985fc 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3772,10 +3772,10 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { auto *Shuffle = cast(Use.getUser()); auto Mask = Shuffle->getShuffleMask(); - OldCost += TTI.getShuffleCost( - TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind); - NewCost += TTI.getShuffleCost( - TTI::SK_PermuteSingleSrc, NewVecTy, Mask, CostKind); + OldCost += + TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind); + NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy, Mask, + CostKind); } if (OldCost < NewCost || !NewCost.isValid()) { From dfe34590c174f31afb21e6cc5f010c470cab4e9a Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Fri, 21 Mar 2025 05:23:37 +0000 Subject: [PATCH 08/22] Add cost analysis for shufflevector ops and update tests. 
--- clang/test/CodeGenOpenCL/preserve_vec3.cl | 20 ++--- .../Transforms/Vectorize/VectorCombine.cpp | 52 +++++++---- .../PhaseOrdering/X86/vec-load-combine.ll | 8 +- .../VectorCombine/X86/load-widening.ll | 4 +- .../VectorCombine/X86/shuffle-of-shuffles.ll | 24 +++-- .../VectorCombine/load-shufflevector.ll | 88 +++++++++---------- 6 files changed, 109 insertions(+), 87 deletions(-) diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl index 0538eac4029bb..49ebae6fc7013 100644 --- a/clang/test/CodeGenOpenCL/preserve_vec3.cl +++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl @@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4))); // CHECK-LABEL: define dso_local spir_kernel void @foo( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: ret void // @@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3( // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly 
align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> // CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -47,8 +47,8 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) { // CHECK-LABEL: define dso_local spir_kernel void 
@float3_to_double2( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> // CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) { // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3( // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> // CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]] // CHECK-NEXT: ret void 
// diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 70533bd0985fc..3231d91d79a8b 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" @@ -36,6 +37,7 @@ #include #include #include +#include #define DEBUG_TYPE "vector-combine" #include "llvm/Transforms/Utils/InstructionWorklist.h" @@ -3692,10 +3694,13 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) { return true; } -// If `I` is a load instruction, used only by shufflevector instructions with -// poison values, attempt to shrink the load to only the lanes being used. +// Attempt to shrink loads that are only used by shufflevector instructions. bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { - auto *OldLoad = dyn_cast(&I); + auto *InputShuffle = dyn_cast(&I); + if (!InputShuffle) + return {}; + + auto *OldLoad = dyn_cast(InputShuffle->getOperand(0u)); if (!OldLoad || !OldLoad->isSimple()) return false; @@ -3731,7 +3736,8 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { auto NumElems = int(Op0Ty->getNumElements()); for (auto Index : Mask) { - if (Index >= 0 && Index < NumElems) { + if (Index >= 0) { + Index %= NumElems; OutputRange.first = std::min(Index, OutputRange.first); OutputRange.second = std::max(Index, OutputRange.second); } @@ -3760,7 +3766,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { Builder.CreateAlignedLoad(NewVecTy, PtrOp, OldLoad->getAlign())); NewLoad->copyMetadata(I); - // Compare cost of old and new ops. + // Calculate costs of old and new ops. 
auto OldCost = TTI.getMemoryOpCost( Instruction::Load, OldLoad->getType(), OldLoad->getAlign(), OldLoad->getPointerAddressSpace(), CostKind); @@ -3768,14 +3774,25 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { Instruction::Load, NewLoad->getType(), NewLoad->getAlign(), NewLoad->getPointerAddressSpace(), CostKind); + using UseEntry = std::pair>; + auto NewUses = SmallVector(); + auto SizeDiff = OldSize - NewSize; + for (auto &Use : I.uses()) { auto *Shuffle = cast(Use.getUser()); - auto Mask = Shuffle->getShuffleMask(); - - OldCost += - TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind); - NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy, Mask, - CostKind); + auto OldMask = Shuffle->getShuffleMask(); + + // Create entry for new use. + NewUses.push_back({Shuffle, {}}); + auto &NewMask = NewUses.back().second; + for (auto Index : OldMask) + NewMask.push_back(Index >= int(OldSize) ? Index - SizeDiff : Index); + + // Update costs. + OldCost += TTI.getShuffleCost( + TTI::SK_PermuteSingleSrc, VecTy, OldMask, CostKind); + NewCost += TTI.getShuffleCost( + TTI::SK_PermuteSingleSrc, NewVecTy, NewMask, CostKind); } if (OldCost < NewCost || !NewCost.isValid()) { @@ -3783,14 +3800,15 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { return false; } - // Replace all users. - for (auto &Use : I.uses()) { - auto *Shuffle = cast(Use.getUser()); + // Replace all uses. 
+ for (auto &Use : NewUses) { + auto *Shuffle = Use.first; + auto &NewMask = Use.second; Builder.SetInsertPoint(Shuffle); Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc()); auto *NewShuffle = Builder.CreateShuffleVector( - NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask()); + NewLoad, PoisonValue::get(NewVecTy), NewMask); replaceValue(*Shuffle, *NewShuffle); } @@ -3876,6 +3894,7 @@ bool VectorCombine::run() { MadeChange |= foldShuffleOfIntrinsics(I); MadeChange |= foldSelectShuffle(I); MadeChange |= foldShuffleToIdentity(I); + MadeChange |= shrinkLoadForShuffles(I); break; case Instruction::BitCast: MadeChange |= foldBitcastShuffle(I); @@ -3885,9 +3904,6 @@ bool VectorCombine::run() { case Instruction::Xor: MadeChange |= foldBitOpOfBitcasts(I); break; - case Instruction::Load: - MadeChange |= shrinkLoadForShuffles(I); - break; default: MadeChange |= shrinkType(I); break; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll index 9218cc2d019f8..85f6fceb5bdbe 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll @@ -11,13 +11,13 @@ $getAt = comdat any define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 { ; SSE-LABEL: @ConvertVectors_ByRef( -; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16 -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> ; SSE-NEXT: ret <4 x float> [[TMP3]] ; ; AVX-LABEL: @ConvertVectors_ByRef( -; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16 -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> +; AVX-NEXT: 
[[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16 +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> ; AVX-NEXT: ret <4 x float> [[TMP3]] ; %2 = alloca ptr, align 8 diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll index eacc40bfa9b53..30a089818074e 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll @@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address { ; CHECK-LABEL: @load_v2i32_v4i32_asan( -; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1 -; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> +; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1 +; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[S]] ; %l = load <2 x i32>, ptr %p, align 1 diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll index eddfc57a7d256..b30dc9ffdc596 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll @@ -47,12 +47,21 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) { ; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles) define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { -; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64( -; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32 -; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32 -; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x 
double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> -; CHECK-NEXT: ret <4 x double> [[BLEND]] +; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64( +; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32 +; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32 +; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[BLEND]] +; +; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64( +; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32 +; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32 +; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> +; AVX-NEXT: ret <4 x double> [[BLEND]] ; %ld0 = load <4 x double>, ptr %p0, align 32 %ld1 = load <4 x double>, ptr %p1, align 32 @@ -72,6 +81,3 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) { %s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> ret <2 x float> %s2 } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; AVX: {{.*}} -; SSE: {{.*}} diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll index 3e302c5f43032..56c51ce7315cf 100644 --- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll +++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll @@ -19,8 +19,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; entry: @@ -33,8 +33,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; entry: @@ -47,8 +47,8 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: 
[[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> ; CHECK-NEXT: ret <4 x half> [[TMP1]] ; entry: @@ -61,8 +61,8 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; entry: @@ -75,16 +75,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> +; CHECK-NEXT: 
[[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] ; CHECK-NEXT: ret <8 x half> [[VAL3]] ; entry: @@ -108,16 +108,16 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] ; CHECK-NEXT: ret <4 x half> [[VAL3]] ; entry: @@ -141,16 +141,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, 
ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] ; CHECK-NEXT: ret <8 x half> [[VAL3]] ; entry: @@ -174,8 +174,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; entry: @@ -188,8 +188,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = 
shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; entry: @@ -202,8 +202,8 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; entry: @@ -216,8 +216,8 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; entry: @@ -230,16 +230,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 
32 +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] ; CHECK-NEXT: ret <8 x i32> [[VAL3]] ; entry: @@ -263,16 +263,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; 
CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] ; CHECK-NEXT: ret <8 x i32> [[VAL3]] ; entry: @@ -296,16 +296,16 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] ; CHECK-NEXT: ret <4 x i32> [[VAL3]] ; entry: @@ -329,16 +329,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label 
%[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: -; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] ; CHECK-NEXT: ret <8 x i32> [[VAL3]] ; entry: From 125c3f27552f47216c3a294062cd22144d2f573d Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Fri, 21 Mar 2025 05:29:14 +0000 Subject: [PATCH 09/22] Code formatting. --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 3231d91d79a8b..bb5695119a111 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3774,7 +3774,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { Instruction::Load, NewLoad->getType(), NewLoad->getAlign(), NewLoad->getPointerAddressSpace(), CostKind); - using UseEntry = std::pair>; + using UseEntry = std::pair>; auto NewUses = SmallVector(); auto SizeDiff = OldSize - NewSize; @@ -3789,10 +3789,10 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { NewMask.push_back(Index >= int(OldSize) ? Index - SizeDiff : Index); // Update costs. 
- OldCost += TTI.getShuffleCost( - TTI::SK_PermuteSingleSrc, VecTy, OldMask, CostKind); - NewCost += TTI.getShuffleCost( - TTI::SK_PermuteSingleSrc, NewVecTy, NewMask, CostKind); + OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, OldMask, + CostKind); + NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy, + NewMask, CostKind); } if (OldCost < NewCost || !NewCost.isValid()) { From 3afabd1252e28abfd35beff9c9e2a9ff6436e209 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Mon, 24 Mar 2025 23:55:57 +0000 Subject: [PATCH 10/22] Address comments. --- .../Transforms/Vectorize/VectorCombine.cpp | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index bb5695119a111..52fef14690616 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3698,7 +3698,7 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) { bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { auto *InputShuffle = dyn_cast(&I); if (!InputShuffle) - return {}; + return false; auto *OldLoad = dyn_cast(InputShuffle->getOperand(0u)); if (!OldLoad || !OldLoad->isSimple()) @@ -3708,34 +3708,31 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { if (!VecTy) return false; - auto IsPoisonOrUndef = [](Value *V) -> bool { - if (auto *C = dyn_cast(V)) { - return isa(C) || isa(C); - } - return false; - }; - + // Search all uses of `I`. If all uses are shufflevector ops, and the second + // operands are all poison values, find the minimum and maximum indices of + // the vector elements referenced by all shuffle masks. + // Otherwise return `std::nullopt`. 
using IndexRange = std::pair; auto GetIndexRangeInShuffles = [&]() -> std::optional { - auto OutputRange = IndexRange(VecTy->getNumElements(), -1); + IndexRange OutputRange = IndexRange(VecTy->getNumElements(), -1); for (auto &Use : I.uses()) { // All uses must be ShuffleVector instructions. auto *Shuffle = dyn_cast(Use.getUser()); if (!Shuffle) - return {}; + return std::nullopt; // Get index range for value. - auto *Op0 = Shuffle->getOperand(0u); - auto *Op1 = Shuffle->getOperand(1u); - if (!IsPoisonOrUndef(Op1)) - return {}; + auto *Op0 = Shuffle->getOperand(0); + auto *Op1 = Shuffle->getOperand(1); + if (!isa(Op1) && !isa(Op1)) + return std::nullopt; // Find the min and max indices used by the ShuffleVector instruction. - auto Mask = Shuffle->getShuffleMask(); + ArrayRef Mask = Shuffle->getShuffleMask(); auto *Op0Ty = cast(Op0->getType()); auto NumElems = int(Op0Ty->getNumElements()); - for (auto Index : Mask) { + for (int Index : Mask) { if (Index >= 0) { Index %= NumElems; OutputRange.first = std::min(Index, OutputRange.first); @@ -3745,15 +3742,18 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { } if (OutputRange.second < OutputRange.first) - return {}; + return std::nullopt; return OutputRange; }; + // Find the range of vector elements used by shufflevector ops, if possible. if (auto Indices = GetIndexRangeInShuffles()) { - auto OldSize = VecTy->getNumElements(); - auto NewSize = Indices->second + 1u; + unsigned OldSize = VecTy->getNumElements(); + unsigned NewSize = Indices->second + 1u; + // If the range of vector elements is smaller than the full load, attempt + // to create a smaller load. if (NewSize < OldSize) { auto Builder = IRBuilder(&I); Builder.SetCurrentDebugLocation(I.getDebugLoc()); From 875c408fe1ec643e7bf93c782e2ae16ab4421d6f Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 26 Mar 2025 12:37:30 +0000 Subject: [PATCH 11/22] Fix conflict with subvector load widening. 
--- clang/test/CodeGenOpenCL/preserve_vec3.cl | 22 +++--- .../Transforms/Vectorize/VectorCombine.cpp | 29 ++++---- .../PhaseOrdering/X86/vec-load-combine.ll | 8 +- .../VectorCombine/X86/load-inseltpoison.ll | 6 +- .../VectorCombine/X86/load-widening.ll | 4 +- .../VectorCombine/X86/shuffle-of-shuffles.ll | 24 +++--- .../VectorCombine/load-shufflevector.ll | 74 +++++++++---------- 7 files changed, 81 insertions(+), 86 deletions(-) diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl index 49ebae6fc7013..e73657e30d884 100644 --- a/clang/test/CodeGenOpenCL/preserve_vec3.cl +++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl @@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4))); // CHECK-LABEL: define dso_local spir_kernel void @foo( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: ret void // @@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3( // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) 
[[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> // CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) { // CHECK-LABEL: define 
dso_local spir_kernel void @float3_to_double2( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> -// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void kernel float3_to_double2(global float3 *a, global double2 *b) { @@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) { // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3( // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] 
= shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> // CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 52fef14690616..024d897661108 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -35,6 +35,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include +#include #include #include #include @@ -3696,38 +3697,38 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) { // Attempt to shrink loads that are only used by shufflevector instructions. bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { - auto *InputShuffle = dyn_cast(&I); - if (!InputShuffle) - return false; - - auto *OldLoad = dyn_cast(InputShuffle->getOperand(0u)); + auto *OldLoad = dyn_cast(&I); if (!OldLoad || !OldLoad->isSimple()) return false; - auto *VecTy = dyn_cast(I.getType()); + auto *VecTy = dyn_cast(OldLoad->getType()); if (!VecTy) return false; - // Search all uses of `I`. If all uses are shufflevector ops, and the second - // operands are all poison values, find the minimum and maximum indices of - // the vector elements referenced by all shuffle masks. + // Search all uses of load. If all uses are shufflevector instructions, and + // the second operands are all poison values, find the minimum and maximum + // indices of the vector elements referenced by all shuffle masks. // Otherwise return `std::nullopt`. using IndexRange = std::pair; auto GetIndexRangeInShuffles = [&]() -> std::optional { IndexRange OutputRange = IndexRange(VecTy->getNumElements(), -1); for (auto &Use : I.uses()) { - // All uses must be ShuffleVector instructions. + // All uses must be shufflevector instructions. 
auto *Shuffle = dyn_cast(Use.getUser()); if (!Shuffle) return std::nullopt; - // Get index range for value. + // Ignore shufflevector instructions that have no uses. + if (!Shuffle->hasNUsesOrMore(1u)) + continue; + + // Ensure second operand is a poison/undef value. auto *Op0 = Shuffle->getOperand(0); auto *Op1 = Shuffle->getOperand(1); if (!isa(Op1) && !isa(Op1)) return std::nullopt; - // Find the min and max indices used by the ShuffleVector instruction. + // Find the min and max indices used by the shufflevector instruction. ArrayRef Mask = Shuffle->getShuffleMask(); auto *Op0Ty = cast(Op0->getType()); auto NumElems = int(Op0Ty->getNumElements()); @@ -3747,7 +3748,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { return OutputRange; }; - // Find the range of vector elements used by shufflevector ops, if possible. + // Get the range of vector elements used by shufflevector instructions. if (auto Indices = GetIndexRangeInShuffles()) { unsigned OldSize = VecTy->getNumElements(); unsigned NewSize = Indices->second + 1u; @@ -3894,6 +3895,8 @@ bool VectorCombine::run() { MadeChange |= foldShuffleOfIntrinsics(I); MadeChange |= foldSelectShuffle(I); MadeChange |= foldShuffleToIdentity(I); + break; + case Instruction::Load: MadeChange |= shrinkLoadForShuffles(I); break; case Instruction::BitCast: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll index 85f6fceb5bdbe..9218cc2d019f8 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll @@ -11,13 +11,13 @@ $getAt = comdat any define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 { ; SSE-LABEL: @ConvertVectors_ByRef( -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16 -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; 
SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> ; SSE-NEXT: ret <4 x float> [[TMP3]] ; ; AVX-LABEL: @ConvertVectors_ByRef( -; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16 -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16 +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> ; AVX-NEXT: ret <4 x float> [[TMP3]] ; %2 = alloca ptr, align 8 diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll index 977da754ec5a7..0c2346e616e36 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll @@ -252,8 +252,7 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync { ; CHECK-LABEL: @gep01_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2 ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1 @@ -341,8 +340,7 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync { ; CHECK-LABEL: @gep10_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 
16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16 ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0 diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll index 30a089818074e..eacc40bfa9b53 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll @@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address { ; CHECK-LABEL: @load_v2i32_v4i32_asan( -; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1 -; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1 +; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[S]] ; %l = load <2 x i32>, ptr %p, align 1 diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll index b30dc9ffdc596..eddfc57a7d256 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll @@ -47,21 +47,12 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) { ; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles) define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { -; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64( -; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { -; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32 -; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32 -; SSE-NEXT: [[BLEND:%.*]] = 
shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> -; SSE-NEXT: ret <4 x double> [[BLEND]] -; -; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64( -; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { -; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32 -; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32 -; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer -; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer -; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[BLEND]] +; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64( +; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32 +; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32 +; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> +; CHECK-NEXT: ret <4 x double> [[BLEND]] ; %ld0 = load <4 x double>, ptr %p0, align 32 %ld1 = load <4 x double>, ptr %p1, align 32 @@ -81,3 +72,6 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) { %s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> ret <2 x float> %s2 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX: {{.*}} +; SSE: {{.*}} diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll index 56c51ce7315cf..c42df19a0b7a6 100644 --- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll +++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll @@ -19,8 +19,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; entry: @@ -33,8 +33,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; entry: @@ -47,8 +47,8 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: 
[[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> ; CHECK-NEXT: ret <4 x half> [[TMP1]] ; entry: @@ -61,8 +61,8 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> ; CHECK-NEXT: ret <8 x half> [[TMP1]] ; entry: @@ -75,13 +75,13 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: 
[[VAL2:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: ; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] @@ -108,13 +108,13 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: ; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] @@ -141,13 +141,13 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl ; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> 
poison, <8 x i32> +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: ; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] @@ -174,8 +174,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; entry: @@ -188,8 +188,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; entry: @@ -202,8 +202,8 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <4 x i32> 
@shuffle_v4_v4i32_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; entry: @@ -216,8 +216,8 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; entry: @@ -230,13 +230,13 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label 
%[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: ; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] @@ -263,13 +263,13 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: ; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] @@ -296,13 +296,13 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; 
CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: ; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] @@ -329,13 +329,13 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly ; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2( ; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> ; CHECK-NEXT: br label %[[FINALLY]] ; CHECK: [[FINALLY]]: ; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] From 839f7bd14172b053b2836a9d3f85d6bd200c102d Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 26 Mar 2025 12:51:14 +0000 Subject: [PATCH 12/22] Fix build bot false positive for undef. 
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 024d897661108..7173a4304a7f7 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3722,7 +3722,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { if (!Shuffle->hasNUsesOrMore(1u)) continue; - // Ensure second operand is a poison/undef value. + // Ensure second operand is a poison value. auto *Op0 = Shuffle->getOperand(0); auto *Op1 = Shuffle->getOperand(1); if (!isa(Op1) && !isa(Op1)) From 9518894f07bdd98dd2a34946871d75d84a224c9a Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 7 May 2025 11:12:44 +0100 Subject: [PATCH 13/22] Address comments. --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 7173a4304a7f7..9cfe6603b81de 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3713,23 +3713,19 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { auto GetIndexRangeInShuffles = [&]() -> std::optional { IndexRange OutputRange = IndexRange(VecTy->getNumElements(), -1); for (auto &Use : I.uses()) { - // All uses must be shufflevector instructions. - auto *Shuffle = dyn_cast(Use.getUser()); - if (!Shuffle) + // Ensure all uses match the required pattern. + User *Shuffle = Use.getUser(); + Value *Op0 = nullptr; + ArrayRef Mask; + + if (!match(Shuffle, m_Shuffle(m_Value(Op0), m_Undef(), m_Mask(Mask)))) return std::nullopt; // Ignore shufflevector instructions that have no uses. if (!Shuffle->hasNUsesOrMore(1u)) continue; - // Ensure second operand is a poison value. 
- auto *Op0 = Shuffle->getOperand(0); - auto *Op1 = Shuffle->getOperand(1); - if (!isa(Op1) && !isa(Op1)) - return std::nullopt; - // Find the min and max indices used by the shufflevector instruction. - ArrayRef Mask = Shuffle->getShuffleMask(); auto *Op0Ty = cast(Op0->getType()); auto NumElems = int(Op0Ty->getNumElements()); From 016e9a5b5e29a6b7a3c19736d227e3223fd75a85 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Mon, 19 May 2025 14:57:21 +0100 Subject: [PATCH 14/22] Address review comments. --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 9cfe6603b81de..16bc330917bac 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3722,12 +3722,12 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { return std::nullopt; // Ignore shufflevector instructions that have no uses. - if (!Shuffle->hasNUsesOrMore(1u)) + if (Shuffle->use_empty()) continue; // Find the min and max indices used by the shufflevector instruction. - auto *Op0Ty = cast(Op0->getType()); - auto NumElems = int(Op0Ty->getNumElements()); + FixedVectorType *Op0Ty = cast(Op0->getType()); + int NumElems = static_cast(Op0Ty->getNumElements()); for (int Index : Mask) { if (Index >= 0) { From 76450c90664d6ac5fcd173b136bf9874f062919e Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Fri, 30 May 2025 19:57:20 +0100 Subject: [PATCH 15/22] Address comments. 
--- .../Transforms/Vectorize/VectorCombine.cpp | 64 +++++++++---------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 16bc330917bac..96620db3be346 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3701,24 +3701,26 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { if (!OldLoad || !OldLoad->isSimple()) return false; - auto *VecTy = dyn_cast(OldLoad->getType()); - if (!VecTy) + auto *OldLoadTy = dyn_cast(OldLoad->getType()); + if (!OldLoadTy) return false; + unsigned const OldNumElements = OldLoadTy->getNumElements(); + // Search all uses of load. If all uses are shufflevector instructions, and // the second operands are all poison values, find the minimum and maximum // indices of the vector elements referenced by all shuffle masks. // Otherwise return `std::nullopt`. using IndexRange = std::pair; auto GetIndexRangeInShuffles = [&]() -> std::optional { - IndexRange OutputRange = IndexRange(VecTy->getNumElements(), -1); + IndexRange OutputRange = IndexRange(OldNumElements, -1); for (auto &Use : I.uses()) { // Ensure all uses match the required pattern. User *Shuffle = Use.getUser(); - Value *Op0 = nullptr; ArrayRef Mask; - if (!match(Shuffle, m_Shuffle(m_Value(Op0), m_Undef(), m_Mask(Mask)))) + if (!match(Shuffle, + m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask)))) return std::nullopt; // Ignore shufflevector instructions that have no uses. @@ -3726,12 +3728,8 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { continue; // Find the min and max indices used by the shufflevector instruction. 
- FixedVectorType *Op0Ty = cast(Op0->getType()); - int NumElems = static_cast(Op0Ty->getNumElements()); - for (int Index : Mask) { - if (Index >= 0) { - Index %= NumElems; + if (Index >= 0 && Index < static_cast(OldNumElements)) { OutputRange.first = std::min(Index, OutputRange.first); OutputRange.second = std::max(Index, OutputRange.second); } @@ -3746,34 +3744,29 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { // Get the range of vector elements used by shufflevector instructions. if (auto Indices = GetIndexRangeInShuffles()) { - unsigned OldSize = VecTy->getNumElements(); - unsigned NewSize = Indices->second + 1u; + unsigned const NewNumElements = Indices->second + 1u; // If the range of vector elements is smaller than the full load, attempt // to create a smaller load. - if (NewSize < OldSize) { + if (NewNumElements < OldNumElements) { auto Builder = IRBuilder(&I); Builder.SetCurrentDebugLocation(I.getDebugLoc()); - // Create new load of smaller vector. - auto *ElemTy = VecTy->getElementType(); - auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize); - auto *PtrOp = OldLoad->getPointerOperand(); - auto *NewLoad = cast( - Builder.CreateAlignedLoad(NewVecTy, PtrOp, OldLoad->getAlign())); - NewLoad->copyMetadata(I); - // Calculate costs of old and new ops. 
- auto OldCost = TTI.getMemoryOpCost( + Type *ElemTy = OldLoadTy->getElementType(); + FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements); + Value *PtrOp = OldLoad->getPointerOperand(); + + InstructionCost OldCost = TTI.getMemoryOpCost( Instruction::Load, OldLoad->getType(), OldLoad->getAlign(), OldLoad->getPointerAddressSpace(), CostKind); - auto NewCost = TTI.getMemoryOpCost( - Instruction::Load, NewLoad->getType(), NewLoad->getAlign(), - NewLoad->getPointerAddressSpace(), CostKind); + InstructionCost NewCost = TTI.getMemoryOpCost( + Instruction::Load, NewLoadTy, OldLoad->getAlign(), + OldLoad->getPointerAddressSpace(), CostKind); using UseEntry = std::pair>; auto NewUses = SmallVector(); - auto SizeDiff = OldSize - NewSize; + auto SizeDiff = OldNumElements - NewNumElements; for (auto &Use : I.uses()) { auto *Shuffle = cast(Use.getUser()); @@ -3783,19 +3776,22 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { NewUses.push_back({Shuffle, {}}); auto &NewMask = NewUses.back().second; for (auto Index : OldMask) - NewMask.push_back(Index >= int(OldSize) ? Index - SizeDiff : Index); + NewMask.push_back(Index >= int(NewNumElements) ? Index - SizeDiff : Index); // Update costs. - OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, OldMask, - CostKind); - NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy, + OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, OldLoadTy, + OldMask, CostKind); + NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewLoadTy, NewMask, CostKind); } - if (OldCost < NewCost || !NewCost.isValid()) { - NewLoad->eraseFromParent(); + if (OldCost < NewCost || !NewCost.isValid()) return false; - } + + // Create new load of smaller vector. + auto *NewLoad = cast( + Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign())); + NewLoad->copyMetadata(I); // Replace all uses. 
for (auto &Use : NewUses) { @@ -3805,7 +3801,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { Builder.SetInsertPoint(Shuffle); Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc()); auto *NewShuffle = Builder.CreateShuffleVector( - NewLoad, PoisonValue::get(NewVecTy), NewMask); + NewLoad, PoisonValue::get(NewLoadTy), NewMask); replaceValue(*Shuffle, *NewShuffle); } From dcccc9234ff4fc1aae20ade7bf1d3fe03682a2a0 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Fri, 30 May 2025 20:04:38 +0100 Subject: [PATCH 16/22] Formatting. --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 96620db3be346..fb95b4297fecf 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3719,7 +3719,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { User *Shuffle = Use.getUser(); ArrayRef Mask; - if (!match(Shuffle, + if (!match(Shuffle, m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask)))) return std::nullopt; @@ -3756,13 +3756,13 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { Type *ElemTy = OldLoadTy->getElementType(); FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements); Value *PtrOp = OldLoad->getPointerOperand(); - + InstructionCost OldCost = TTI.getMemoryOpCost( Instruction::Load, OldLoad->getType(), OldLoad->getAlign(), OldLoad->getPointerAddressSpace(), CostKind); - InstructionCost NewCost = TTI.getMemoryOpCost( - Instruction::Load, NewLoadTy, OldLoad->getAlign(), - OldLoad->getPointerAddressSpace(), CostKind); + InstructionCost NewCost = + TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(), + OldLoad->getPointerAddressSpace(), CostKind); using UseEntry = std::pair>; auto NewUses = SmallVector(); @@ -3776,7 +3776,8 @@ bool 
VectorCombine::shrinkLoadForShuffles(Instruction &I) { NewUses.push_back({Shuffle, {}}); auto &NewMask = NewUses.back().second; for (auto Index : OldMask) - NewMask.push_back(Index >= int(NewNumElements) ? Index - SizeDiff : Index); + NewMask.push_back(Index >= int(NewNumElements) ? Index - SizeDiff + : Index); // Update costs. OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, OldLoadTy, From fa2ebe88c78bc77fd584d3613659cb5d94e26a89 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Fri, 30 May 2025 22:05:01 +0100 Subject: [PATCH 17/22] Correct renaming and remove function-style casts. --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index fb95b4297fecf..40bffc0062f30 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3776,8 +3776,9 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { NewUses.push_back({Shuffle, {}}); auto &NewMask = NewUses.back().second; for (auto Index : OldMask) - NewMask.push_back(Index >= int(NewNumElements) ? Index - SizeDiff - : Index); + NewMask.push_back(Index >= static_cast(OldNumElements) + ? Index - SizeDiff + : Index); // Update costs. OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, OldLoadTy, From 863f10f2f7f145f894027986d6997b84cbf220ff Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 11 Jun 2025 10:57:15 +0100 Subject: [PATCH 18/22] Remove auto. 
--- .../Transforms/Vectorize/VectorCombine.cpp | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 40bffc0062f30..6a44ed9a007bf 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Local.h" @@ -3714,7 +3715,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { using IndexRange = std::pair; auto GetIndexRangeInShuffles = [&]() -> std::optional { IndexRange OutputRange = IndexRange(OldNumElements, -1); - for (auto &Use : I.uses()) { + for (llvm::Use &Use : I.uses()) { // Ensure all uses match the required pattern. User *Shuffle = Use.getUser(); ArrayRef Mask; @@ -3743,13 +3744,13 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { }; // Get the range of vector elements used by shufflevector instructions. - if (auto Indices = GetIndexRangeInShuffles()) { + if (std::optional Indices = GetIndexRangeInShuffles()) { unsigned const NewNumElements = Indices->second + 1u; // If the range of vector elements is smaller than the full load, attempt // to create a smaller load. if (NewNumElements < OldNumElements) { - auto Builder = IRBuilder(&I); + IRBuilder Builder(&I); Builder.SetCurrentDebugLocation(I.getDebugLoc()); // Calculate costs of old and new ops. 
@@ -3765,17 +3766,17 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { OldLoad->getPointerAddressSpace(), CostKind); using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>; - auto NewUses = SmallVector<UseEntry, 4u>(); - auto SizeDiff = OldNumElements - NewNumElements; + SmallVector<UseEntry, 4u> NewUses; + unsigned const SizeDiff = OldNumElements - NewNumElements; - for (auto &Use : I.uses()) { + for (llvm::Use &Use : I.uses()) { auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser()); - auto OldMask = Shuffle->getShuffleMask(); + ArrayRef<int> OldMask = Shuffle->getShuffleMask(); // Create entry for new use. NewUses.push_back({Shuffle, {}}); - auto &NewMask = NewUses.back().second; - for (auto Index : OldMask) + std::vector<int> &NewMask = NewUses.back().second; + for (int Index : OldMask) NewMask.push_back(Index >= static_cast<int>(OldNumElements) ? Index - SizeDiff : Index); @@ -3796,13 +3797,13 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { NewLoad->copyMetadata(I); // Replace all uses. - for (auto &Use : NewUses) { - auto *Shuffle = Use.first; - auto &NewMask = Use.second; + for (UseEntry &Use : NewUses) { + ShuffleVectorInst *Shuffle = Use.first; + std::vector<int> &NewMask = Use.second; Builder.SetInsertPoint(Shuffle); Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc()); - auto *NewShuffle = Builder.CreateShuffleVector( + Value *NewShuffle = Builder.CreateShuffleVector( NewLoad, PoisonValue::get(NewLoadTy), NewMask); replaceValue(*Shuffle, *NewShuffle); From 5f122f67244261e6bae76a52ede1fe4cf2b4371d Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Tue, 8 Jul 2025 23:23:04 +0100 Subject: [PATCH 19/22] Fix cost analysis after rebase. 
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 6a44ed9a007bf..e4fc430646c8f 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3782,10 +3782,12 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { : Index); // Update costs. - OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, OldLoadTy, - OldMask, CostKind); - NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewLoadTy, - NewMask, CostKind); + OldCost += + TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(), + OldLoadTy, OldMask, CostKind); + NewCost += + TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(), + NewLoadTy, NewMask, CostKind); } if (OldCost < NewCost || !NewCost.isValid()) From 0373b90b045b835b5d32dd53dcc8e416e54ad44f Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 9 Jul 2025 17:01:54 +0100 Subject: [PATCH 20/22] Add debug message. --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index e4fc430646c8f..fa6dc72cb82d6 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3790,6 +3790,11 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { NewLoadTy, NewMask, CostKind); } + LLVM_DEBUG( + dbgs() << "Found a load used only by shufflevector instructions: " + << I << "\n OldCost: " << OldCost + << " vs NewCost: " << NewCost << "\n"); + if (OldCost < NewCost || !NewCost.isValid()) return false; From 204d4bfbb7b5db308554c5b47fbce0421bd443ec Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 16 Jul 2025 03:37:22 +0100 Subject: [PATCH 21/22] Address comments. 
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index fa6dc72cb82d6..1c7158e47f957 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3776,10 +3776,12 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { // Create entry for new use. NewUses.push_back({Shuffle, {}}); std::vector<int> &NewMask = NewUses.back().second; - for (int Index : OldMask) + for (int Index : OldMask) { + assert(Index <= Indices->second); NewMask.push_back(Index >= static_cast<int>(OldNumElements) ? Index - SizeDiff : Index); + } // Update costs. OldCost += From bceb35785513351d593ebd77102f23df7a49f0a5 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 16 Jul 2025 05:46:50 +0100 Subject: [PATCH 22/22] Remove redundant code and update tests. --- .../Transforms/Vectorize/VectorCombine.cpp | 12 ++----- .../VectorCombine/load-shufflevector.ll | 33 +++++++++++++++++++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 1c7158e47f957..24aaf30962127 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3767,21 +3767,13 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>; SmallVector<UseEntry, 4u> NewUses; - unsigned const SizeDiff = OldNumElements - NewNumElements; for (llvm::Use &Use : I.uses()) { auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser()); ArrayRef<int> OldMask = Shuffle->getShuffleMask(); // Create entry for new use. - NewUses.push_back({Shuffle, {}}); - std::vector<int> &NewMask = NewUses.back().second; - for (int Index : OldMask) { - assert(Index <= Indices->second); - NewMask.push_back(Index >= static_cast<int>(OldNumElements) - ? 
Index - SizeDiff - : Index); - } + NewUses.push_back({Shuffle, OldMask}); // Update costs. OldCost += @@ -3789,7 +3781,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { OldLoadTy, OldMask, CostKind); NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(), - NewLoadTy, NewMask, CostKind); + NewLoadTy, OldMask, CostKind); } LLVM_DEBUG( diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll index c42df19a0b7a6..467c20c5da0c2 100644 --- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll +++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll @@ -357,3 +357,36 @@ finally: %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ] ret <8 x i32> %val3 } + +define <8 x i32> @shuffle_v4_v8i32_cond_r1_4(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_4( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x i32> [[VAL3]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> + br label %finally + +else: + %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> + br 
label %finally + +finally: + %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ] + ret <8 x i32> %val3 +}