[OpenACC][CIR] Implement 'worker'/'vector' lowering #138765
Conversation
This patch implements 'worker' and 'vector' lowering for the loop construct. Both are fairly simple clauses, except that they also have a no-argument form, which requires a touch more work. Otherwise, they behave like a handful of other clauses where we just keep the device_type array and the operands in sync.
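For orientation, here is a minimal sketch of the dispatch the new clause visitors perform: with an int-expr the value is attached for each active device_type, and without one only the device_type list is recorded. This is a simplified, standalone illustration rather than code from the patch; the function name `emitWorkerClause` and its parameter names are made up, while `addWorkerNumOperand`/`addEmptyWorker` are the LoopOp helpers added in the MLIR part of this PR.

```cpp
// Simplified sketch only; the real logic lives in OpenACCClauseCIREmitter
// (clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h) in the diff below.
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "llvm/ADT/ArrayRef.h"

// 'emitWorkerClause' and its parameters are hypothetical names for this sketch.
static void emitWorkerClause(mlir::acc::LoopOp loopOp, mlir::MLIRContext *ctx,
                             mlir::Value workerCount, // null for bare 'worker'
                             llvm::ArrayRef<mlir::acc::DeviceType> deviceTypes) {
  if (workerCount)
    // 'worker(N)' form: the operand is recorded for each active device_type.
    loopOp.addWorkerNumOperand(ctx, workerCount, deviceTypes);
  else
    // Bare 'worker' form: only the device_type attribute list is updated.
    loopOp.addEmptyWorker(ctx, deviceTypes);
}
```

The 'vector' clause follows the same pattern with `addVectorOperand`/`addEmptyVector`.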
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-clang Author: Erich Keane (erichkeane). Full diff: https://github.com/llvm/llvm-project/pull/138765.diff (4 files affected):
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
index fa4ce5efc39ad..ef4f64a167742 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
@@ -392,6 +392,38 @@ class OpenACCClauseCIREmitter final
return clauseNotImplemented(clause);
}
}
+
+ void VisitWorkerClause(const OpenACCWorkerClause &clause) {
+ if constexpr (isOneOfTypes<OpTy, mlir::acc::LoopOp>) {
+ if (clause.hasIntExpr())
+ operation.addWorkerNumOperand(builder.getContext(),
+ createIntExpr(clause.getIntExpr()),
+ lastDeviceTypeValues);
+ else
+ operation.addEmptyWorker(builder.getContext(), lastDeviceTypeValues);
+
+ } else {
+ // TODO: When we've implemented this for everything, switch this to an
+ // unreachable. Combined constructs remain.
+ return clauseNotImplemented(clause);
+ }
+ }
+
+ void VisitVectorClause(const OpenACCVectorClause &clause) {
+ if constexpr (isOneOfTypes<OpTy, mlir::acc::LoopOp>) {
+ if (clause.hasIntExpr())
+ operation.addVectorOperand(builder.getContext(),
+ createIntExpr(clause.getIntExpr()),
+ lastDeviceTypeValues);
+ else
+ operation.addEmptyVector(builder.getContext(), lastDeviceTypeValues);
+
+ } else {
+ // TODO: When we've implemented this for everything, switch this to an
+ // unreachable. Combined constructs remain.
+ return clauseNotImplemented(clause);
+ }
+ }
};
template <typename OpTy>
diff --git a/clang/test/CIR/CodeGenOpenACC/loop.cpp b/clang/test/CIR/CodeGenOpenACC/loop.cpp
index b255a01adda0e..d636d1b37d969 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop.cpp
@@ -193,4 +193,134 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
// CHECK: acc.yield
// CHECK-NEXT: } loc
+
+#pragma acc kernels
+ {
+
+#pragma acc loop worker
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK: acc.loop worker {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker device_type(nvidia, radeon) worker
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.loop worker([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) device_type(nvidia, radeon) worker
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker device_type(nvidia, radeon) worker(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker([#acc.device_type<none>], %[[N_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) device_type(nvidia, radeon) worker(N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop device_type(nvidia, radeon) worker(num:N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+
+#pragma acc loop vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK: acc.loop vector {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop vector(%[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop vector device_type(nvidia, radeon) vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.loop vector([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N) device_type(nvidia, radeon) vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop vector([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N) device_type(nvidia, radeon) vector(N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop vector(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop device_type(nvidia, radeon) vector(length:N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop vector(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker vector device_type(nvidia) worker vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.loop worker([#acc.device_type<none>, #acc.device_type<nvidia>]) vector([#acc.device_type<none>, #acc.device_type<nvidia>])
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) vector(N) device_type(nvidia) worker(N) vector(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV3:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD3]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD4:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV4:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD4]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32, %[[N_CONV3]] : si32 [#acc.device_type<nvidia>]) vector(%[[N_CONV2]] : si32, %[[N_CONV4]] : si32 [#acc.device_type<nvidia>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ }
}
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 41b01a14a6498..ca564037fad19 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2216,6 +2216,21 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
// values should be integral constants, with the '*' represented as a '-1'.
void setTileForDeviceTypes(MLIRContext *, llvm::ArrayRef<DeviceType>,
mlir::ValueRange);
+
+ // Add a value to the 'vector' list with a current list of device_types.
+ void addVectorOperand(MLIRContext *, mlir::Value,
+ llvm::ArrayRef<DeviceType>);
+ // Add an empty value to the 'vector' list with a current list of
+ // device_types. This is for the case where there is no expression specified
+ // in a 'vector'.
+ void addEmptyVector(MLIRContext *, llvm::ArrayRef<DeviceType>);
+ // Add a value to the 'worker' list with a current list of device_types.
+ void addWorkerNumOperand(MLIRContext *, mlir::Value,
+ llvm::ArrayRef<DeviceType>);
+ // Add an empty value to the 'worker' list with a current list of
+ // device_types. This is for the case where there is no expression specified
+ // in a 'worker'.
+ void addEmptyWorker(MLIRContext *, llvm::ArrayRef<DeviceType>);
}];
let hasCustomAssemblyFormat = 1;
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index f26b3a5143c0b..9f4645a4a7ca8 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2720,6 +2720,34 @@ void acc::LoopOp::setTileForDeviceTypes(
setTileOperandsSegments(segments);
}
+void acc::LoopOp::addVectorOperand(
+ MLIRContext *context, mlir::Value newValue,
+ llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+ setVectorOperandsDeviceTypeAttr(addDeviceTypeAffectedOperandHelper(
+ context, getVectorOperandsDeviceTypeAttr(), effectiveDeviceTypes,
+ newValue, getVectorOperandsMutable()));
+}
+
+void acc::LoopOp::addEmptyVector(
+ MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+ setVectorAttr(addDeviceTypeAffectedOperandHelper(context, getVectorAttr(),
+ effectiveDeviceTypes));
+}
+
+void acc::LoopOp::addWorkerNumOperand(
+ MLIRContext *context, mlir::Value newValue,
+ llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+ setWorkerNumOperandsDeviceTypeAttr(addDeviceTypeAffectedOperandHelper(
+ context, getWorkerNumOperandsDeviceTypeAttr(), effectiveDeviceTypes,
+ newValue, getWorkerNumOperandsMutable()));
+}
+
+void acc::LoopOp::addEmptyWorker(
+ MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+ setWorkerAttr(addDeviceTypeAffectedOperandHelper(context, getWorkerAttr(),
+ effectiveDeviceTypes));
+}
+
//===----------------------------------------------------------------------===//
// DataOp
//===----------------------------------------------------------------------===//
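On the MLIR side, all four new LoopOp methods funnel into the existing addDeviceTypeAffectedOperandHelper, whose body is outside this diff. As a rough mental model only (this is a hypothetical stand-in, not the actual helper, and its real signature and details differ), the invariant being maintained is that the per-device_type attribute array and the operand segment stay index-aligned, with a bare clause defaulting to device_type none:

```cpp
// Hypothetical illustration, not MLIR code: it only shows the invariant that
// the device_type list and the operand list grow in lockstep.
#include <cassert>
#include <cstdint>
#include <vector>

enum class DeviceType { None, Nvidia, Radeon };

struct WorkerOperandState {
  std::vector<DeviceType> deviceTypes; // parallels workerNumOperandsDeviceType
  std::vector<int64_t> operands;       // stands in for the SSA operand segment

  // 'worker(N)' under zero or more active device_type clauses: one entry per
  // device_type, defaulting to 'none' when no device_type clause is active.
  void addWorkerNum(int64_t value, std::vector<DeviceType> effective) {
    if (effective.empty())
      effective.push_back(DeviceType::None);
    for (DeviceType dt : effective) {
      deviceTypes.push_back(dt);
      operands.push_back(value);
    }
    assert(deviceTypes.size() == operands.size() && "lists out of sync");
  }
};
```

This matches what the loop.cpp tests above check: for example, `worker(N) device_type(nvidia, radeon) worker(N + 1)` prints one operand for the default (none) device_type and one per listed device_type.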
@llvm/pr-subscribers-clangir Author: Erich Keane (erichkeane)
LGTM