[AArch64][SVE] Add patterns for bit-select instructions. #138689
base: main
Conversation
We are not selecting BSL/NBSL/BSL1N/BSL2N in some cases, e.g.:

```cpp
svuint64_t bsl(svuint64_t a, svuint64_t b, svuint64_t c) {
  return (a & c) | (b & ~c);
}
```

Currently generates:

```gas
bsl:
  and z0.d, z2.d, z0.d
  bic z1.d, z1.d, z2.d
  orr z0.d, z0.d, z1.d
  ret
```

Instead of:

```gas
bsl:
  bsl z0.d, z0.d, z1.d, z2.d
  ret
```

This patch adds patterns to match (or (and a, c), (and b, (vnot c))) to BSL, and similar derivative patterns for the other bit-sel instructions.
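For reference, the other bit-select forms covered by this patch follow the same shape as the macros used in the new tests below. A minimal sketch, assuming a compiler that accepts C/C++ operators on SVE ACLE vector types (function names are illustrative):

```cpp
#include <arm_sve.h>

// Each function mirrors one of the macros exercised by the new tests and,
// with this patch, is expected to select a single SVE2 bit-select instruction.
svuint64_t bsl_ex(svuint64_t x, svuint64_t y, svuint64_t z)   { return  (x & z) | ( y & ~z);  } // bsl
svuint64_t nbsl_ex(svuint64_t x, svuint64_t y, svuint64_t z)  { return ~((x & z) | ( y & ~z)); } // nbsl
svuint64_t bsl1n_ex(svuint64_t x, svuint64_t y, svuint64_t z) { return (~x & z) | ( y & ~z);  } // bsl1n
svuint64_t bsl2n_ex(svuint64_t x, svuint64_t y, svuint64_t z) { return  (x & z) | (~y & ~z);  } // bsl2n
```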
@llvm/pr-subscribers-backend-aarch64

Author: Ricardo Jesus (rj-jesus)

Changes

This patch adds patterns to select bit-sel instructions such as BSL from (or (and a, c), (and b, (vnot c))) and other similar patterns. For example:

```cpp
svuint64_t bsl(svuint64_t a, svuint64_t b, svuint64_t c) {
  return (a & c) | (b & ~c);
}
```

Currently:

```gas
bsl:
  and z0.d, z2.d, z0.d
  bic z1.d, z1.d, z2.d
  orr z0.d, z0.d, z1.d
  ret
```

Becomes:

```gas
bsl:
  bsl z0.d, z0.d, z1.d, z2.d
  ret
```

Full diff: https://github.com/llvm/llvm-project/pull/138689.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index d13728ec930c8..515e580ff5d78 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -546,11 +546,21 @@ def AArch64umulh : PatFrag<(ops node:$op1, node:$op2),
def AArch64bsl : PatFrags<(ops node:$Op1, node:$Op2, node:$Op3),
[(int_aarch64_sve_bsl node:$Op1, node:$Op2, node:$Op3),
- (AArch64bsp node:$Op3, node:$Op1, node:$Op2)]>;
+ (AArch64bsp node:$Op3, node:$Op1, node:$Op2),
+ (or (and node:$Op1, node:$Op3), (and node:$Op2, (vnot node:$Op3)))]>;
+
+def AArch64bsl1n : PatFrags<(ops node:$Op1, node:$Op2, node:$Op3),
+ [(int_aarch64_sve_bsl1n node:$Op1, node:$Op2, node:$Op3),
+ (AArch64bsl (vnot node:$Op1), node:$Op2, node:$Op3)]>;
+
+def AArch64bsl2n : PatFrags<(ops node:$Op1, node:$Op2, node:$Op3),
+ [(int_aarch64_sve_bsl2n node:$Op1, node:$Op2, node:$Op3),
+ (or (and node:$Op1, node:$Op3), (vnot (or node:$Op2, node:$Op3)))]>;
def AArch64nbsl : PatFrags<(ops node:$Op1, node:$Op2, node:$Op3),
[(int_aarch64_sve_nbsl node:$Op1, node:$Op2, node:$Op3),
- (vnot (AArch64bsp node:$Op3, node:$Op1, node:$Op2))]>;
+ (vnot (AArch64bsp node:$Op3, node:$Op1, node:$Op2)),
+ (vnot (AArch64bsl node:$Op1, node:$Op2, node:$Op3))]>;
let Predicates = [HasSVE] in {
@@ -3923,8 +3933,8 @@ let Predicates = [HasSVE2_or_SME] in {
defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", AArch64eor3>;
defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", AArch64bcax>;
defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", AArch64bsl>;
- defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;
- defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>;
+ defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", AArch64bsl1n>;
+ defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", AArch64bsl2n>;
defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", AArch64nbsl>;
// SVE2 bitwise xor and rotate right by immediate
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index ef7d4abe5c5f4..e524c5d6b453e 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -93,3 +93,209 @@ define <vscale x 2 x i64> @nbsl_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b
%4 = xor <vscale x 2 x i64> %3, splat(i64 -1)
ret <vscale x 2 x i64> %4
}
+
+; Test BSL/NBSL/BSL1N/BSL2N code generation for:
+; #define BSL(x,y,z) ( ((x) & (z)) | ( (y) & ~(z)))
+; #define NBSL(x,y,z) (~(((x) & (z)) | ( (y) & ~(z))))
+; #define BSL1N(x,y,z) ( (~(x) & (z)) | ( (y) & ~(z)))
+; #define BSL2N(x,y,z) ( ((x) & (z)) | (~(y) & ~(z)))
+
+define <vscale x 16 x i8> @codegen_bsl_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2) {
+; CHECK-LABEL: codegen_bsl_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 16 x i8> %2, %0
+ %5 = xor <vscale x 16 x i8> %2, splat (i8 -1)
+ %6 = and <vscale x 16 x i8> %1, %5
+ %7 = or <vscale x 16 x i8> %4, %6
+ ret <vscale x 16 x i8> %7
+}
+
+define <vscale x 16 x i8> @codegen_nbsl_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2) {
+; CHECK-LABEL: codegen_nbsl_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 16 x i8> %2, %0
+ %5 = xor <vscale x 16 x i8> %2, splat (i8 -1)
+ %6 = and <vscale x 16 x i8> %1, %5
+ %7 = or <vscale x 16 x i8> %4, %6
+ %8 = xor <vscale x 16 x i8> %7, splat (i8 -1)
+ ret <vscale x 16 x i8> %8
+}
+
+define <vscale x 16 x i8> @codegen_bsl1n_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2) {
+; CHECK-LABEL: codegen_bsl1n_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl1n z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = xor <vscale x 16 x i8> %0, splat (i8 -1)
+ %5 = and <vscale x 16 x i8> %2, %4
+ %6 = xor <vscale x 16 x i8> %2, splat (i8 -1)
+ %7 = and <vscale x 16 x i8> %1, %6
+ %8 = or <vscale x 16 x i8> %5, %7
+ ret <vscale x 16 x i8> %8
+}
+
+define <vscale x 16 x i8> @codegen_bsl2n_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2) {
+; CHECK-LABEL: codegen_bsl2n_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl2n z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 16 x i8> %2, %0
+ %5 = or <vscale x 16 x i8> %2, %1
+ %6 = xor <vscale x 16 x i8> %5, splat (i8 -1)
+ %7 = or <vscale x 16 x i8> %4, %6
+ ret <vscale x 16 x i8> %7
+}
+
+define <vscale x 8 x i16> @codegen_bsl_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2) {
+; CHECK-LABEL: codegen_bsl_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 8 x i16> %2, %0
+ %5 = xor <vscale x 8 x i16> %2, splat (i16 -1)
+ %6 = and <vscale x 8 x i16> %1, %5
+ %7 = or <vscale x 8 x i16> %4, %6
+ ret <vscale x 8 x i16> %7
+}
+
+define <vscale x 8 x i16> @codegen_nbsl_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2) {
+; CHECK-LABEL: codegen_nbsl_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 8 x i16> %2, %0
+ %5 = xor <vscale x 8 x i16> %2, splat (i16 -1)
+ %6 = and <vscale x 8 x i16> %1, %5
+ %7 = or <vscale x 8 x i16> %4, %6
+ %8 = xor <vscale x 8 x i16> %7, splat (i16 -1)
+ ret <vscale x 8 x i16> %8
+}
+
+define <vscale x 8 x i16> @codegen_bsl1n_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2) {
+; CHECK-LABEL: codegen_bsl1n_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl1n z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = xor <vscale x 8 x i16> %0, splat (i16 -1)
+ %5 = and <vscale x 8 x i16> %2, %4
+ %6 = xor <vscale x 8 x i16> %2, splat (i16 -1)
+ %7 = and <vscale x 8 x i16> %1, %6
+ %8 = or <vscale x 8 x i16> %5, %7
+ ret <vscale x 8 x i16> %8
+}
+
+define <vscale x 8 x i16> @codegen_bsl2n_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2) {
+; CHECK-LABEL: codegen_bsl2n_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl2n z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 8 x i16> %2, %0
+ %5 = or <vscale x 8 x i16> %2, %1
+ %6 = xor <vscale x 8 x i16> %5, splat (i16 -1)
+ %7 = or <vscale x 8 x i16> %4, %6
+ ret <vscale x 8 x i16> %7
+}
+
+define <vscale x 4 x i32> @codegen_bsl_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2) {
+; CHECK-LABEL: codegen_bsl_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 4 x i32> %2, %0
+ %5 = xor <vscale x 4 x i32> %2, splat (i32 -1)
+ %6 = and <vscale x 4 x i32> %1, %5
+ %7 = or <vscale x 4 x i32> %4, %6
+ ret <vscale x 4 x i32> %7
+}
+
+define <vscale x 4 x i32> @codegen_nbsl_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2) {
+; CHECK-LABEL: codegen_nbsl_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 4 x i32> %2, %0
+ %5 = xor <vscale x 4 x i32> %2, splat (i32 -1)
+ %6 = and <vscale x 4 x i32> %1, %5
+ %7 = or <vscale x 4 x i32> %4, %6
+ %8 = xor <vscale x 4 x i32> %7, splat (i32 -1)
+ ret <vscale x 4 x i32> %8
+}
+
+define <vscale x 4 x i32> @codegen_bsl1n_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2) {
+; CHECK-LABEL: codegen_bsl1n_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl1n z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = xor <vscale x 4 x i32> %0, splat (i32 -1)
+ %5 = and <vscale x 4 x i32> %2, %4
+ %6 = xor <vscale x 4 x i32> %2, splat (i32 -1)
+ %7 = and <vscale x 4 x i32> %1, %6
+ %8 = or <vscale x 4 x i32> %5, %7
+ ret <vscale x 4 x i32> %8
+}
+
+define <vscale x 4 x i32> @codegen_bsl2n_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2) {
+; CHECK-LABEL: codegen_bsl2n_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl2n z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 4 x i32> %2, %0
+ %5 = or <vscale x 4 x i32> %2, %1
+ %6 = xor <vscale x 4 x i32> %5, splat (i32 -1)
+ %7 = or <vscale x 4 x i32> %4, %6
+ ret <vscale x 4 x i32> %7
+}
+
+define <vscale x 2 x i64> @codegen_bsl_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2) {
+; CHECK-LABEL: codegen_bsl_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 2 x i64> %2, %0
+ %5 = xor <vscale x 2 x i64> %2, splat (i64 -1)
+ %6 = and <vscale x 2 x i64> %1, %5
+ %7 = or <vscale x 2 x i64> %4, %6
+ ret <vscale x 2 x i64> %7
+}
+
+define <vscale x 2 x i64> @codegen_nbsl_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2) {
+; CHECK-LABEL: codegen_nbsl_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 2 x i64> %2, %0
+ %5 = xor <vscale x 2 x i64> %2, splat (i64 -1)
+ %6 = and <vscale x 2 x i64> %1, %5
+ %7 = or <vscale x 2 x i64> %4, %6
+ %8 = xor <vscale x 2 x i64> %7, splat (i64 -1)
+ ret <vscale x 2 x i64> %8
+}
+
+define <vscale x 2 x i64> @codegen_bsl1n_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2) {
+; CHECK-LABEL: codegen_bsl1n_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl1n z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = xor <vscale x 2 x i64> %0, splat (i64 -1)
+ %5 = and <vscale x 2 x i64> %2, %4
+ %6 = xor <vscale x 2 x i64> %2, splat (i64 -1)
+ %7 = and <vscale x 2 x i64> %1, %6
+ %8 = or <vscale x 2 x i64> %5, %7
+ ret <vscale x 2 x i64> %8
+}
+
+define <vscale x 2 x i64> @codegen_bsl2n_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2) {
+; CHECK-LABEL: codegen_bsl2n_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl2n z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %4 = and <vscale x 2 x i64> %2, %0
+ %5 = or <vscale x 2 x i64> %2, %1
+ %6 = xor <vscale x 2 x i64> %5, splat (i64 -1)
+ %7 = or <vscale x 2 x i64> %4, %6
+ ret <vscale x 2 x i64> %7
+}
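One non-obvious step in the new PatFrags: BSL2N is matched as (or (and a, c), (vnot (or b, c))), whereas the test macro (and the C a user would typically write) expresses it as (a & c) | (~b & ~c); the two agree by De Morgan's law. As an editorial aside (illustrative code, not part of the patch), a per-bit exhaustive check of that equivalence:

```cpp
#include <cassert>

int main() {
  // Verify (x & z) | (~y & ~z) == (x & z) | ~(y | z) over single-bit operands;
  // this is the De Morgan step the BSL2N PatFrag relies on.
  for (unsigned x = 0; x < 2; ++x)
    for (unsigned y = 0; y < 2; ++y)
      for (unsigned z = 0; z < 2; ++z) {
        unsigned macro_form = (x & z) | (~y & ~z & 1u);
        unsigned frag_form  = (x & z) | (~(y | z) & 1u);
        assert(macro_form == frag_form);
      }
  return 0;
}
```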
Note you can use these instructions for Neon modes as well: https://godbolt.org/z/ccdEoE1G7
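A sketch of the fixed-width case referred to above, assuming GCC/Clang vector operators on Neon types (function name is illustrative):

```cpp
#include <arm_neon.h>

// The same bit-select shape on a fixed-width Neon vector. Base Neon has BSL
// for the plain select; the suggestion above is that the SVE2 variants
// (e.g. NBSL) could also cover the negated forms for these 128-bit types.
uint64x2_t nbsl_neon(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
  return ~((x & z) | (y & ~z));
}
```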
I was thinking we could add these cases in a subsequent PR, but I'm happy to do it here if others agree.
If agreeable, I think now is the time for a bit of refactoring. Given …
I agree it would be better for a subsequent PR to extend the fixed length support.
I think that sounds like a good idea. I suppose the other option would be to lower the intrinsics to elementary IR in … Would you like me to add the lowering to …?
Up to you. I don't mind the refactoring and the improved code generation being within this one PR.
Thanks, that sounds good - I've just updated the PR with the lowering to …
This patch adds patterns to select bit-sel instructions such as BSL from (or (and a, c), (and b, (vnot c))) and other similar patterns. For example, the bsl function above currently compiles to an and/bic/orr sequence and with this patch becomes a single bsl, as shown in the listings at the top.
(EDIT: Comparison with GCC: https://godbolt.org/z/es7W5rKrY.)