Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit cbf814d

Browse files
[CIR][AArch64] Lower vfmaq_v f32/f64 (#195602)
Lower `BI__builtin_neon_vfmaq_v` in CIR for the `vfmaq_f32` and `vfmaq_f64` ACLE wrappers. This is split out from the broader fused multiply-accumulate work and only covers `BI__builtin_neon_vfmaq_v`. The related `vfma_v`, `vfmaq_f16`, lane, laneq, and scalar forms remain outside this PR. Tests move the existing `vfmaq_f32` and `vfmaq_f64` coverage from `neon-intrinsics.c` into `neon/vfmaq.c`, preserve the original LLVM checks, and add ClangIR coverage. Validation: rebuilt `clang` and ran the focused `vfmaq.c` lit test. Part of #185382. Split from feedback on #188190.
1 parent b018d3a commit cbf814d

5 files changed

Lines changed: 140 additions & 69 deletions

File tree

clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,17 @@ findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> intrinsicMap,
8585
//===----------------------------------------------------------------------===//
8686
// Generic helpers
8787
//===----------------------------------------------------------------------===//
88+
// Emit an intrinsic where all operands are of the same type as the result.
89+
// Depending on mode, this may be a constrained floating-point intrinsic.
90+
static mlir::Value
91+
emitCallMaybeConstrainedBuiltin(CIRGenBuilderTy &builder, mlir::Location loc,
92+
StringRef intrName, mlir::Type retTy,
93+
llvm::SmallVector<mlir::Value> &ops) {
94+
assert(!cir::MissingFeatures::emitConstrainedFPCall());
95+
96+
return builder.emitIntrinsicCallOp(loc, intrName, retTy, ops);
97+
}
98+
8899
static llvm::StringRef getLLVMIntrNameNoPrefix(llvm::Intrinsic::ID intrID) {
89100
llvm::StringRef llvmIntrName = llvm::Intrinsic::getBaseName(intrID);
90101
assert(llvmIntrName.starts_with("llvm.") && "Not an LLVM intrinsic!");
@@ -703,7 +714,21 @@ static mlir::Value emitCommonNeonBuiltinExpr(
703714
case NEON::BI__builtin_neon_vext_v:
704715
case NEON::BI__builtin_neon_vextq_v:
705716
case NEON::BI__builtin_neon_vfma_v:
706-
case NEON::BI__builtin_neon_vfmaq_v:
717+
cgf.cgm.errorNYI(expr->getSourceRange(),
718+
std::string("unimplemented AArch64 builtin call: ") +
719+
ctx.BuiltinInfo.getName(builtinID));
720+
return mlir::Value{};
721+
case NEON::BI__builtin_neon_vfmaq_v: {
722+
// NEON intrinsic: vfmaq(accumulator, multiplicand1, multiplicand2)
723+
// LLVM intrinsic: fma(multiplicand1, multiplicand2, accumulator)
724+
// Reorder arguments to match LLVM fma signature
725+
mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty);
726+
mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty);
727+
mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty);
728+
llvm::SmallVector<mlir::Value> fmaOps = {op1, op2, op0};
729+
return emitCallMaybeConstrainedBuiltin(cgf.getBuilder(), loc, "fma", ty,
730+
fmaOps);
731+
}
707732
case NEON::BI__builtin_neon_vld1_v:
708733
case NEON::BI__builtin_neon_vld1q_v:
709734
case NEON::BI__builtin_neon_vld1_x2_v:
@@ -888,17 +913,6 @@ static mlir::Value emitCommonNeonBuiltinExpr(
888913
}
889914
}
890915

891-
// Emit an intrinsic where all operands are of the same type as the result.
892-
// Depending on mode, this may be a constrained floating-point intrinsic.
893-
static mlir::Value
894-
emitCallMaybeConstrainedBuiltin(CIRGenBuilderTy &builder, mlir::Location loc,
895-
StringRef intrName, mlir::Type retTy,
896-
llvm::SmallVector<mlir::Value> &ops) {
897-
assert(!cir::MissingFeatures::emitConstrainedFPCall());
898-
899-
return builder.emitIntrinsicCallOp(loc, intrName, retTy, ops);
900-
}
901-
902916
bool CIRGenFunction::getAArch64SVEProcessedOperands(
903917
unsigned builtinID, const CallExpr *expr, SmallVectorImpl<mlir::Value> &ops,
904918
SVETypeFlags typeFlags) {

clang/test/CodeGen/AArch64/neon-intrinsics.c

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -886,44 +886,6 @@ float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
886886
return vfma_f32(v1, v2, v3);
887887
}
888888

889-
// CHECK-LABEL: define dso_local <4 x float> @test_vfmaq_f32(
890-
// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
891-
// CHECK-NEXT: [[ENTRY:.*:]]
892-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32>
893-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
894-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32>
895-
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
896-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
897-
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
898-
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
899-
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
900-
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
901-
// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]])
902-
// CHECK-NEXT: ret <4 x float> [[TMP9]]
903-
//
904-
float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
905-
return vfmaq_f32(v1, v2, v3);
906-
}
907-
908-
// CHECK-LABEL: define dso_local <2 x double> @test_vfmaq_f64(
909-
// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
910-
// CHECK-NEXT: [[ENTRY:.*:]]
911-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64>
912-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64>
913-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64>
914-
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
915-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
916-
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
917-
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
918-
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
919-
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
920-
// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]])
921-
// CHECK-NEXT: ret <2 x double> [[TMP9]]
922-
//
923-
float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
924-
return vfmaq_f64(v1, v2, v3);
925-
}
926-
927889
// CHECK-LABEL: define dso_local <2 x float> @test_vfms_f32(
928890
// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
929891
// CHECK-NEXT: [[ENTRY:.*:]]
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// REQUIRES: aarch64-registered-target || arm-registered-target
2+
3+
// RUN: %clang_cc1_cg_arm64_neon -target-feature +fullfp16 -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM
4+
// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 -fclangir -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM %}
5+
// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 -fclangir -emit-cir %s -disable-O0-optnone | FileCheck %s --check-prefixes=ALL,CIR %}
6+
7+
// ALL: {{[Mm]}}odule
8+
9+
//=============================================================================
10+
// NOTES
11+
//
12+
// This file contains fullfp16 tests that were originally located in:
13+
// * clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
14+
// The main difference is the use of RUN lines that enable ClangIR lowering.
15+
// This file currently covers the f16 wrapper that lowers through
16+
// BI__builtin_neon_vfmaq_v.
17+
//
18+
// ACLE section headings based on v2025Q2 of the ACLE specification:
19+
// * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate-2
20+
//
21+
//=============================================================================
22+
23+
#include <arm_neon.h>
24+
25+
//===------------------------------------------------------===//
26+
// 2.6.1.9.3 Fused multiply-accumulate, vector quad forms
27+
//===------------------------------------------------------===//
28+
29+
// LLVM-LABEL: @test_vfmaq_f16(
30+
// CIR-LABEL: @vfmaq_f16(
31+
float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
32+
// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>) -> !cir.vector<8 x !cir.f16>
33+
34+
// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x half> {{.*}} [[C:%.*]]) {{.*}} {
35+
// LLVM: [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
36+
// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
37+
// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
38+
// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
39+
// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
40+
// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
41+
// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
42+
// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
43+
// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
44+
// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B_CAST]], <8 x half> [[C_CAST]], <8 x half> [[A_CAST]])
45+
// LLVM-NEXT: ret <8 x half> [[FMA]]
46+
return vfmaq_f16(a, b, c);
47+
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// REQUIRES: aarch64-registered-target || arm-registered-target
2+
3+
// RUN: %clang_cc1_cg_arm64_neon -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM
4+
// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -fclangir -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM %}
5+
// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -fclangir -emit-cir %s -disable-O0-optnone | FileCheck %s --check-prefixes=ALL,CIR %}
6+
7+
// ALL: {{[Mm]}}odule
8+
9+
//=============================================================================
10+
// NOTES
11+
//
12+
// This file contains tests that were originally located in:
13+
// * clang/test/CodeGen/AArch64/neon-intrinsics.c
14+
// The main difference is the use of RUN lines that enable ClangIR lowering.
15+
// This file currently covers the f32/f64 wrappers that lower through
16+
// BI__builtin_neon_vfmaq_v.
17+
//
18+
// ACLE section headings based on v2025Q2 of the ACLE specification:
19+
// * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate
20+
//
21+
//=============================================================================
22+
23+
#include <arm_neon.h>
24+
25+
//===------------------------------------------------------===//
26+
// 2.1.1.2.5 Fused multiply-accumulate, vector quad forms
27+
//===------------------------------------------------------===//
28+
29+
// LLVM-LABEL: @test_vfmaq_f32(
30+
// CIR-LABEL: @vfmaq_f32(
31+
float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
32+
// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float>
33+
34+
// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[C:%.*]]) {{.*}} {
35+
// LLVM: [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
36+
// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
37+
// LLVM-NEXT: [[C_I:%.*]] = bitcast <4 x float> [[C]] to <4 x i32>
38+
// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
39+
// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
40+
// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <4 x i32> [[C_I]] to <16 x i8>
41+
// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float>
42+
// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float>
43+
// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <4 x float>
44+
// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B_CAST]], <4 x float> [[C_CAST]], <4 x float> [[A_CAST]])
45+
// LLVM-NEXT: ret <4 x float> [[FMA]]
46+
return vfmaq_f32(a, b, c);
47+
}
48+
49+
// LLVM-LABEL: @test_vfmaq_f64(
50+
// CIR-LABEL: @vfmaq_f64(
51+
float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) {
52+
// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double>
53+
54+
// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[C:%.*]]) {{.*}} {
55+
// LLVM: [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
56+
// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
57+
// LLVM-NEXT: [[C_I:%.*]] = bitcast <2 x double> [[C]] to <2 x i64>
58+
// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8>
59+
// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8>
60+
// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <2 x i64> [[C_I]] to <16 x i8>
61+
// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double>
62+
// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double>
63+
// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <2 x double>
64+
// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B_CAST]], <2 x double> [[C_CAST]], <2 x double> [[A_CAST]])
65+
// LLVM-NEXT: ret <2 x double> [[FMA]]
66+
return vfmaq_f64(a, b, c);
67+
}

clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1621,25 +1621,6 @@ float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
16211621
return vfma_f16(a, b, c);
16221622
}
16231623

1624-
// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_f16
1625-
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
1626-
// CHECK-NEXT: entry:
1627-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
1628-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
1629-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
1630-
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
1631-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
1632-
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
1633-
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
1634-
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
1635-
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
1636-
// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]])
1637-
// CHECK-NEXT: ret <8 x half> [[TMP9]]
1638-
//
1639-
float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
1640-
return vfmaq_f16(a, b, c);
1641-
}
1642-
16431624
// CHECK-LABEL: define {{[^@]+}}@test_vfms_f16
16441625
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
16451626
// CHECK-NEXT: entry:

0 commit comments

Comments (0)