-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[AArch64][GlobalISel] Adopt some Ld* patterns to reduce codegen regressions #135492
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64 Author: Vladislav Dzhidzhoev (dzhidzhoev) Changes: This is an update of #69607 after #101675 and #105686. Ld1Lane64Pat, Ld1Lane128Pat, LoadInsertPatterns, Neon_INS_elt_pattern from SelectionDAG didn't work for GlobalISel on v8i8 and v16i8 vector types, because vector_insert for v8i8, v16i8 in SelectionDAG expects i32 scalar argument type, whereas G_INSERT_VECTOR_ELT expects s8. Patch is 33.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135492.diff 15 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 2d2b2bee99ec4..51bb3ab1f3642 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -573,3 +573,10 @@ def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
(LD1Rv2d GPR64sp:$Rn)>;
def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
(LD1Rv1d GPR64sp:$Rn)>;
+
+defm : LoadInsertPatterns<load, v16i8, v8i8, nxv16i8, i8,
+ LDRBui, LDURBi, LDRBroW, LDRBroX,
+ ro8, am_indexed8, am_unscaled8, uimm12s1, bsub>;
+def : Ld1Lane64Pat<load, VectorIndexB, v8i8, i8, LD1i8>;
+def : Ld1Lane128Pat<load, VectorIndexB, v16i8, i8, LD1i8>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8, nxv16i8, i8, VectorIndexB, INSvi8lane>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b90792d60d102..feba474221b77 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7266,12 +7266,12 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, ValueType VTSVE
(VTScal (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))),
(i64 imm:$Immd))),
(INS V128:$src, imm:$Immd,
- (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;
+ (VT128 (SUBREG_TO_REG (i64 0), V64:$Rn, dsub)), imm:$Immn)>;
def : Pat<(VT64 (vector_insert V64:$src,
(VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))),
(i64 imm:$Immd))),
- (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+ (EXTRACT_SUBREG (INS (VT128 (SUBREG_TO_REG (i64 0), V64:$src, dsub)),
imm:$Immd, V128:$Rn, imm:$Immn),
dsub)>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
index 7a4cdd52db904..bed38b1dc5a0e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
@@ -88,24 +88,16 @@ define <8 x i8> @test_varidx_extract_v16s8(<16 x i8> %x, i32 %idx) {
; CHECK-GISEL-NEXT: mov x8, sp
; CHECK-GISEL-NEXT: str q0, [sp]
; CHECK-GISEL-NEXT: and x9, x9, #0xf
-; CHECK-GISEL-NEXT: mov b2, v0.b[1]
-; CHECK-GISEL-NEXT: mov b3, v0.b[2]
; CHECK-GISEL-NEXT: lsl x10, x9, #1
; CHECK-GISEL-NEXT: sub x9, x10, x9
; CHECK-GISEL-NEXT: ldr b1, [x8, x9]
-; CHECK-GISEL-NEXT: mov v1.b[0], v1.b[0]
-; CHECK-GISEL-NEXT: mov v1.b[1], v2.b[0]
-; CHECK-GISEL-NEXT: mov b2, v0.b[3]
-; CHECK-GISEL-NEXT: mov v1.b[2], v3.b[0]
-; CHECK-GISEL-NEXT: mov b3, v0.b[4]
-; CHECK-GISEL-NEXT: mov v1.b[3], v2.b[0]
-; CHECK-GISEL-NEXT: mov b2, v0.b[5]
-; CHECK-GISEL-NEXT: mov v1.b[4], v3.b[0]
-; CHECK-GISEL-NEXT: mov b3, v0.b[6]
-; CHECK-GISEL-NEXT: mov b0, v0.b[7]
-; CHECK-GISEL-NEXT: mov v1.b[5], v2.b[0]
-; CHECK-GISEL-NEXT: mov v1.b[6], v3.b[0]
-; CHECK-GISEL-NEXT: mov v1.b[7], v0.b[0]
+; CHECK-GISEL-NEXT: mov v1.b[1], v0.b[1]
+; CHECK-GISEL-NEXT: mov v1.b[2], v0.b[2]
+; CHECK-GISEL-NEXT: mov v1.b[3], v0.b[3]
+; CHECK-GISEL-NEXT: mov v1.b[4], v0.b[4]
+; CHECK-GISEL-NEXT: mov v1.b[5], v0.b[5]
+; CHECK-GISEL-NEXT: mov v1.b[6], v0.b[6]
+; CHECK-GISEL-NEXT: mov v1.b[7], v0.b[7]
; CHECK-GISEL-NEXT: fmov d0, d1
; CHECK-GISEL-NEXT: add sp, sp, #16
; CHECK-GISEL-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 0412aef7545e9..4d0603722c3ae 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -13326,10 +13326,9 @@ define <16 x i8> @test_v16i8_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <16
;
; CHECK-GI-LABEL: test_v16i8_post_reg_ld1lane:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr b1, [x0]
+; CHECK-GI-NEXT: ld1.b { v0 }[1], [x0]
; CHECK-GI-NEXT: add x8, x0, x2
; CHECK-GI-NEXT: str x8, [x1]
-; CHECK-GI-NEXT: mov.b v0[1], v1[0]
; CHECK-GI-NEXT: ret
%tmp1 = load i8, ptr %bar
%tmp2 = insertelement <16 x i8> %A, i8 %tmp1, i32 1
@@ -13373,11 +13372,10 @@ define <8 x i8> @test_v8i8_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <8 x i
;
; CHECK-GI-LABEL: test_v8i8_post_reg_ld1lane:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr b1, [x0]
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: add x8, x0, x2
+; CHECK-GI-NEXT: ld1.b { v0 }[1], [x0]
; CHECK-GI-NEXT: str x8, [x1]
-; CHECK-GI-NEXT: mov.b v0[1], v1[0]
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%tmp1 = load i8, ptr %bar
@@ -13891,43 +13889,20 @@ define void @test_ld1lane_build_half(ptr %a, ptr %b, ptr %c, ptr %d, <4 x half>
}
define void @test_ld1lane_build_i8(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr %f, ptr %g, ptr %h, <8 x i8> %v, ptr %p) {
-; CHECK-SD-LABEL: test_ld1lane_build_i8:
-; CHECK-SD: ; %bb.0:
-; CHECK-SD-NEXT: ldr b1, [x0]
-; CHECK-SD-NEXT: ldr x8, [sp]
-; CHECK-SD-NEXT: ld1.b { v1 }[1], [x1]
-; CHECK-SD-NEXT: ld1.b { v1 }[2], [x2]
-; CHECK-SD-NEXT: ld1.b { v1 }[3], [x3]
-; CHECK-SD-NEXT: ld1.b { v1 }[4], [x4]
-; CHECK-SD-NEXT: ld1.b { v1 }[5], [x5]
-; CHECK-SD-NEXT: ld1.b { v1 }[6], [x6]
-; CHECK-SD-NEXT: ld1.b { v1 }[7], [x7]
-; CHECK-SD-NEXT: sub.8b v0, v1, v0
-; CHECK-SD-NEXT: str d0, [x8]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_ld1lane_build_i8:
-; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr b1, [x0]
-; CHECK-GI-NEXT: ldr b2, [x1]
-; CHECK-GI-NEXT: ldr x8, [sp]
-; CHECK-GI-NEXT: mov.b v1[0], v1[0]
-; CHECK-GI-NEXT: mov.b v1[1], v2[0]
-; CHECK-GI-NEXT: ldr b2, [x2]
-; CHECK-GI-NEXT: mov.b v1[2], v2[0]
-; CHECK-GI-NEXT: ldr b2, [x3]
-; CHECK-GI-NEXT: mov.b v1[3], v2[0]
-; CHECK-GI-NEXT: ldr b2, [x4]
-; CHECK-GI-NEXT: mov.b v1[4], v2[0]
-; CHECK-GI-NEXT: ldr b2, [x5]
-; CHECK-GI-NEXT: mov.b v1[5], v2[0]
-; CHECK-GI-NEXT: ldr b2, [x6]
-; CHECK-GI-NEXT: mov.b v1[6], v2[0]
-; CHECK-GI-NEXT: ldr b2, [x7]
-; CHECK-GI-NEXT: mov.b v1[7], v2[0]
-; CHECK-GI-NEXT: sub.8b v0, v1, v0
-; CHECK-GI-NEXT: str d0, [x8]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_ld1lane_build_i8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr b1, [x0]
+; CHECK-NEXT: ldr x8, [sp]
+; CHECK-NEXT: ld1.b { v1 }[1], [x1]
+; CHECK-NEXT: ld1.b { v1 }[2], [x2]
+; CHECK-NEXT: ld1.b { v1 }[3], [x3]
+; CHECK-NEXT: ld1.b { v1 }[4], [x4]
+; CHECK-NEXT: ld1.b { v1 }[5], [x5]
+; CHECK-NEXT: ld1.b { v1 }[6], [x6]
+; CHECK-NEXT: ld1.b { v1 }[7], [x7]
+; CHECK-NEXT: sub.8b v0, v1, v0
+; CHECK-NEXT: str d0, [x8]
+; CHECK-NEXT: ret
%ld.a = load i8, ptr %a
%ld.b = load i8, ptr %b
%ld.c = load i8, ptr %c
diff --git a/llvm/test/CodeGen/AArch64/arm64-ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
index eaa545473b2e0..0b22fa49cb5c1 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
@@ -1004,16 +1004,10 @@ declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0(ptr) nounwin
declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0(ptr) nounwind readonly
define <16 x i8> @ld1_16b(<16 x i8> %V, ptr %bar) {
-; CHECK-SD-LABEL: ld1_16b:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1.b { v0 }[0], [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ld1_16b:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr b1, [x0]
-; CHECK-GI-NEXT: mov.b v0[0], v1[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ld1_16b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1.b { v0 }[0], [x0]
+; CHECK-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i8, ptr %bar
%tmp2 = insertelement <16 x i8> %V, i8 %tmp1, i32 0
@@ -1086,20 +1080,12 @@ define <1 x i64> @ld1_1d(ptr %p) {
}
define <8 x i8> @ld1_8b(<8 x i8> %V, ptr %bar) {
-; CHECK-SD-LABEL: ld1_8b:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: ld1.b { v0 }[0], [x0]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ld1_8b:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr b1, [x0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov.b v0[0], v1[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ld1_8b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: ld1.b { v0 }[0], [x0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i8, ptr %bar
%tmp2 = insertelement <8 x i8> %V, i8 %tmp1, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index c0d91c1e0c836..ef6061f897638 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -75,18 +75,11 @@ define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) {
}
define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) {
-; CHECK-SD-LABEL: ins16b16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov v1.b[15], v0.b[2]
-; CHECK-SD-NEXT: mov v0.16b, v1.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins16b16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.b[15], v2.b[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins16b16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.b[15], v0.b[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
%tmp3 = extractelement <16 x i8> %tmp1, i32 2
%tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
ret <16 x i8> %tmp4
@@ -148,20 +141,12 @@ define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) {
}
define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) {
-; CHECK-SD-LABEL: ins8b16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov v1.b[15], v0.b[2]
-; CHECK-SD-NEXT: mov v0.16b, v1.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins8b16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.b[15], v2.b[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins8b16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v1.b[15], v0.b[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
%tmp3 = extractelement <8 x i8> %tmp1, i32 2
%tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
ret <16 x i8> %tmp4
@@ -239,20 +224,12 @@ define <2 x double> @ins1f2_args_flipped(<2 x double> %tmp2, <1 x double> %tmp1)
}
define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
-; CHECK-SD-LABEL: ins16b8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: mov v1.b[7], v0.b[2]
-; CHECK-SD-NEXT: fmov d0, d1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins16b8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: fmov d0, d1
-; CHECK-GI-NEXT: mov v0.b[7], v2.b[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins16b8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v1.b[7], v0.b[2]
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
%tmp3 = extractelement <16 x i8> %tmp1, i32 2
%tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7
ret <8 x i8> %tmp4
@@ -321,22 +298,13 @@ define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
}
define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) {
-; CHECK-SD-LABEL: ins8b8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov v1.b[4], v0.b[2]
-; CHECK-SD-NEXT: fmov d0, d1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins8b8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: fmov d0, d1
-; CHECK-GI-NEXT: mov v0.b[4], v2.b[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins8b8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v1.b[4], v0.b[2]
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
%tmp3 = extractelement <8 x i8> %tmp1, i32 2
%tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4
ret <8 x i8> %tmp4
@@ -617,37 +585,22 @@ define i64 @smovx2s(<2 x i32> %tmp1) {
}
define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) {
-; CHECK-SD-LABEL: test_vcopy_lane_s8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: mov v0.b[5], v1.b[3]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vcopy_lane_s8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b1, v1.b[3]
-; CHECK-GI-NEXT: mov v0.b[5], v1.b[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vcopy_lane_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.b[5], v1.b[3]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7>
ret <8 x i8> %vset_lane
}
define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) {
-; CHECK-SD-LABEL: test_vcopyq_laneq_s8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov v0.b[14], v1.b[6]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vcopyq_laneq_s8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov b1, v1.b[6]
-; CHECK-GI-NEXT: mov v0.b[14], v1.b[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vcopyq_laneq_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.b[14], v1.b[6]
+; CHECK-NEXT: ret
%vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15>
ret <16 x i8> %vset_lane
}
@@ -665,18 +618,11 @@ define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) {
}
define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) {
-; CHECK-SD-LABEL: test_vcopyq_laneq_swap_s8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov v1.b[0], v0.b[15]
-; CHECK-SD-NEXT: mov v0.16b, v1.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vcopyq_laneq_swap_s8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov b2, v0.b[15]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.b[0], v2.b[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vcopyq_laneq_swap_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.b[0], v0.b[15]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
%vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <16 x i8> %vset_lane
}
@@ -1358,21 +1304,14 @@ define <8 x i8> @getl(<16 x i8> %x) #0 {
;
; CHECK-GI-LABEL: getl:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov b2, v0.b[1]
; CHECK-GI-NEXT: mov v1.b[0], v0.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[2]
-; CHECK-GI-NEXT: mov v1.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v0.b[3]
-; CHECK-GI-NEXT: mov v1.b[2], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[4]
-; CHECK-GI-NEXT: mov v1.b[3], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v0.b[5]
-; CHECK-GI-NEXT: mov v1.b[4], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[6]
-; CHECK-GI-NEXT: mov b0, v0.b[7]
-; CHECK-GI-NEXT: mov v1.b[5], v2.b[0]
-; CHECK-GI-NEXT: mov v1.b[6], v3.b[0]
-; CHECK-GI-NEXT: mov v1.b[7], v0.b[0]
+; CHECK-GI-NEXT: mov v1.b[1], v0.b[1]
+; CHECK-GI-NEXT: mov v1.b[2], v0.b[2]
+; CHECK-GI-NEXT: mov v1.b[3], v0.b[3]
+; CHECK-GI-NEXT: mov v1.b[4], v0.b[4]
+; CHECK-GI-NEXT: mov v1.b[5], v0.b[5]
+; CHECK-GI-NEXT: mov v1.b[6], v0.b[6]
+; CHECK-GI-NEXT: mov v1.b[7], v0.b[7]
; CHECK-GI-NEXT: fmov d0, d1
; CHECK-GI-NEXT: ret
%vecext = extractelement <16 x i8> %x, i32 0
@@ -1804,22 +1743,15 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov v2.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b3, v0.b[1]
; CHECK-GI-NEXT: adrp x8, .LCPI127_0
; CHECK-GI-NEXT: mov v1.b[0], v0.b[0]
-; CHECK-GI-NEXT: mov b4, v0.b[2]
-; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov v1.b[2], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov v1.b[3], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[5]
-; CHECK-GI-NEXT: mov v1.b[4], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v0.b[6]
-; CHECK-GI-NEXT: mov b0, v0.b[7]
-; CHECK-GI-NEXT: mov v1.b[5], v3.b[0]
-; CHECK-GI-NEXT: mov v1.b[6], v4.b[0]
-; CHECK-GI-NEXT: mov v1.b[7], v0.b[0]
+; CHECK-GI-NEXT: mov v1.b[1], v0.b[1]
+; CHECK-GI-NEXT: mov v1.b[2], v0.b[2]
+; CHECK-GI-NEXT: mov v1.b[3], v0.b[3]
+; CHECK-GI-NEXT: mov v1.b[4], v0.b[4]
+; CHECK-GI-NEXT: mov v1.b[5], v0.b[5]
+; CHECK-GI-NEXT: mov v1.b[6], v0.b[6]
+; CHECK-GI-NEXT: mov v1.b[7], v0.b[7]
; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI127_0]
; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECK-GI-NEXT: ret
@@ -1853,37 +1785,23 @@ define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v16i8_v16i8_v8i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov b3, v0.b[1]
; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov b4, v0.b[2]
-; CHECK-GI-NEXT: mov v2.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov v2.b[2], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov v2.b[3], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[5]
-; CHECK-GI-NEXT: mov v2.b[4], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v0.b[6]
-; CHECK-GI-NEXT: mov b0, v0.b[7]
-; CHECK-GI-NEXT: mov v2.b[5], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v1.b[2]
-; CHECK-GI-NEXT: mov v2.b[6], v4.b[0]
-; CHECK-GI-NEXT: mov v2.b[7], v0.b[0]
-; CHECK-GI-NEXT: mov b0, v1.b[1]
+; CHECK-GI-NEXT: mov v2.b[1], v0.b[1]
+; CHECK-GI-NEXT: mov v2.b[2], v0.b[2]
+; CHECK-GI-NEXT: mov v2.b[3], v0.b[3]
+; CHECK-GI-NEXT: mov v2.b[4], v0.b[4]
+; CHECK-GI-NEXT: mov v2.b[5], v0.b[5]
+; CHECK-GI-NEXT: mov v2.b[6], v0.b[6]
+; CHECK-GI-NEXT: mov v2.b[7], v0.b[7]
; CHECK-GI-NEXT: mov v2.b[8], v1.b[0]
-; CHECK-GI-NEXT: mov v2.b[9], v0.b[0]
-; CHECK-GI-NEXT: mov b0, v1.b[3]
-; CHECK-GI-NEXT: mov v2.b[10], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v1.b[4]
-; CHECK-GI-NEXT: mov v2.b[11], v0.b[0]
-; CHECK-GI-NEXT: mov b0, v1.b[5]
-; CHECK-GI-NEXT: mov v2.b[12], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v1.b[6]
-; CHECK-GI-NEXT: mov v2.b[13], v0.b[0]
-; CHECK-GI-NEXT: mov b0, v1.b[7]
-; CHECK-GI-NEXT: mov v2.b[14], v3.b[0]
-; CHECK-GI-NEXT: mov v2.b[15], v0.b[0]
+; CHECK-GI-NEXT: mov v2.b[9], v1.b[1]
+; CHECK-GI-NEXT: mov v2.b[10], v1.b[2]
+; CHECK-GI-NEXT: mov v2.b[11], v1.b[3]
+; CHECK-GI-NEXT: mov v...
[truncated]
|
Should we fix the legalization behavior instead, to fix the index type? Is there some inherent reason to prefer the illegal register-typed index? |
If I understood everything correctly, this commit fixes the type of a scalar inserted into a vector, not the index type.
Other patterns are fixed/used similarly. |
I haven't looked at this deeply, but would it be possible to fix the SDAG patterns instead of adding new GISel specific ones? |
Patterns in question don't work for all types because i8 type is illegal in SDAG (for most instructions at least), and extract_vector_elt/insert_vector_elt operate on i32 scalars even if they extract from vXi8/vYi16. As far as I understand, it's not possible to make a pattern that matches either one type or another. Lines that are most important for this PR (that cause the biggest change in tests) are
They are meant to be an alternative for SDAG's
Ld1Lane64Pat here matches I could try making a Please correct me if I've overlooked something. |
Same principle applies. I expect to see legal types (in the DAG legal sense). The MIR the selector sees has the same type signature as the DAG node; this shouldn't require adjusting the patterns, but rather the input legalization. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In Arm/AArch64 SDAG an integer extract and an fp extract can be treated differently. That is not as simple in GISel without type info to distinguish them (although it might be possible with i8 as there are no fp i8 types at the moment). Currently I believe GISel is (correctly) marking the load and vector insert on a fp regbank, so not expanding the types to i32. It would feel a bit artificial to do that.
There were some patches on the SDAG side recently to make some store-byte patterns work for i8. They go through a aarch64mfp8 type, which isn't the best but is of the correct size.
@@ -573,3 +573,10 @@ def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), | |||
(LD1Rv2d GPR64sp:$Rn)>; | |||
def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), | |||
(LD1Rv1d GPR64sp:$Rn)>; | |||
|
|||
defm : LoadInsertPatterns<load, v16i8, v8i8, nxv16i8, i8, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we do go this route I would put these next to the existing LoadInsertPatterns, maybe with a comment about it being used for GISel. Same for the others below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've moved these definitions back to their peers, and added OnlyGISel
predicate to
- Ignore them for SDAG
- Indicate that they are meant for GlobalISel
…ssions. This is an update of llvm#69607 after llvm#101675 and llvm#105686. Ld1Lane64Pat, Ld1Lane128Pat, LoadInsertPatterns, Neon_INS_elt_pattern from SelectionDAG didn't work for GlobalISel on v8i8 and v16i8 vector types, because vector_insert for v8i8, v16i8 in SelectionDAG expects i32 scalar argument type, whereas G_INSERT_VECTOR_ELT expects s8.
From what I see, it will require many changes. For example, If we are certain that we need it, I can try implementing that. Will it be beneficial? |
Gentle ping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is OnlyGISel necessary, or would a comment explaining why they are present be enough?
I’ve added a comment. Strictly speaking, it’s not a must, I’m ok to remove it. |
I think I would remove OnlyGISel if it will not otherwise affect anything. Does it alter SDAG to have them enabled? |
This is an update of #69607 after #101675 and #105686.
Ld1Lane64Pat, Ld1Lane128Pat, LoadInsertPatterns, Neon_INS_elt_pattern from SelectionDAG didn't work for GlobalISel on v8i8 and v16i8 vector types, because vector_insert for v8i8, v16i8 in SelectionDAG expects i32 scalar argument type, whereas G_INSERT_VECTOR_ELT expects s8.