-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[flang] Postpone hlfir.end_associate generation for calls. #138786
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
If we generate hlfir.end_associate at the end of the statement, we get more easily optimizable HLFIR, because there are no compiler-generated operations with side effects in between the call and the consumers. This allows more hlfir.eval_in_mem operations to reuse the LHS instead of allocating a temporary buffer. I do not think the same can always be done for hlfir.copy_out, e.g.: ``` subroutine test2(x) interface function array_func2(x,y) real:: x(*), array_func2(10), y end function array_func2 end interface real :: x(:) x = array_func2(x, 1.0) end subroutine test2 ``` If we postpone the copy-out until after the assignment, then the result may be wrong.
@llvm/pr-subscribers-flang-fir-hlfir Author: Slava Zakharin (vzakhari) Changes: If we generate hlfir.end_associate at the end of the statement, we get more easily optimizable HLFIR. I do not think the same can always be done for hlfir.copy_out, e.g.:
If we postpone the copy-out until after the assignment, then the result may be wrong. Full diff: https://github.com/llvm/llvm-project/pull/138786.diff 2 Files Affected:
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index a5b85e25b1af0..d37d51f6ec634 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -960,9 +960,26 @@ struct CallCleanUp {
mlir::Value tempVar;
mlir::Value mustFree;
};
- void genCleanUp(mlir::Location loc, fir::FirOpBuilder &builder) {
- Fortran::common::visit([&](auto &c) { c.genCleanUp(loc, builder); },
+
+ /// Generate clean-up code.
+ /// If \p postponeAssociates is true, the ExprAssociate clean-up
+ /// is not generated, and instead the corresponding CallCleanUp
+ /// object is returned as the result.
+ std::optional<CallCleanUp> genCleanUp(mlir::Location loc,
+ fir::FirOpBuilder &builder,
+ bool postponeAssociates) {
+ std::optional<CallCleanUp> postponed;
+ Fortran::common::visit(Fortran::common::visitors{
+ [&](CopyIn &c) { c.genCleanUp(loc, builder); },
+ [&](ExprAssociate &c) {
+ if (postponeAssociates)
+ postponed = CallCleanUp{c};
+ else
+ c.genCleanUp(loc, builder);
+ },
+ },
cleanUp);
+ return postponed;
}
std::variant<CopyIn, ExprAssociate> cleanUp;
};
@@ -1729,10 +1746,23 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
caller, callSiteType, callContext.resultType,
callContext.isElementalProcWithArrayArgs());
- /// Clean-up associations and copy-in.
- for (auto cleanUp : callCleanUps)
- cleanUp.genCleanUp(loc, builder);
-
+ // Clean-up associations and copy-in.
+ // The association clean-ups are postponed to the end of the statement
+ // lowering. The copy-in clean-ups may be delayed as well,
+ // but they are done immediately after the call currently.
+ llvm::SmallVector<CallCleanUp> associateCleanups;
+ for (auto cleanUp : callCleanUps) {
+ auto postponed =
+ cleanUp.genCleanUp(loc, builder, /*postponeAssociates=*/true);
+ if (postponed)
+ associateCleanups.push_back(*postponed);
+ }
+
+ fir::FirOpBuilder *bldr = &builder;
+ callContext.stmtCtx.attachCleanup([=]() {
+ for (auto cleanUp : associateCleanups)
+ (void)cleanUp.genCleanUp(loc, *bldr, /*postponeAssociates=*/false);
+ });
if (auto *entity = std::get_if<hlfir::EntityWithAttributes>(&loweredResult))
return *entity;
diff --git a/flang/test/Lower/HLFIR/call-postponed-associate.f90 b/flang/test/Lower/HLFIR/call-postponed-associate.f90
new file mode 100644
index 0000000000000..18df62b44324b
--- /dev/null
+++ b/flang/test/Lower/HLFIR/call-postponed-associate.f90
@@ -0,0 +1,85 @@
+! RUN: bbc -emit-hlfir -o - %s -I nowhere | FileCheck %s
+
+subroutine test1
+ interface
+ function array_func1(x)
+ real:: x, array_func1(10)
+ end function array_func1
+ end interface
+ real :: x(10)
+ x = array_func1(1.0)
+end subroutine test1
+! CHECK-LABEL: func.func @_QPtest1() {
+! CHECK: %[[VAL_5:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK: %[[VAL_6:.*]]:3 = hlfir.associate %[[VAL_5]] {adapt.valuebyref} : (f32) -> (!fir.ref<f32>, !fir.ref<f32>, i1)
+! CHECK: %[[VAL_17:.*]] = hlfir.eval_in_mem shape %{{.*}} : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+! CHECK: fir.call @_QParray_func1
+! CHECK: fir.save_result
+! CHECK: }
+! CHECK: hlfir.assign %[[VAL_17]] to %{{.*}} : !hlfir.expr<10xf32>, !fir.ref<!fir.array<10xf32>>
+! CHECK: hlfir.end_associate %[[VAL_6]]#1, %[[VAL_6]]#2 : !fir.ref<f32>, i1
+
+subroutine test2(x)
+ interface
+ function array_func2(x,y)
+ real:: x(*), array_func2(10), y
+ end function array_func2
+ end interface
+ real :: x(:)
+ x = array_func2(x, 1.0)
+end subroutine test2
+! CHECK-LABEL: func.func @_QPtest2(
+! CHECK: %[[VAL_3:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %{{.*}} to %{{.*}} : (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.box<!fir.array<?xf32>>, i1)
+! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]]#0 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+! CHECK: %[[VAL_6:.*]]:3 = hlfir.associate %[[VAL_3]] {adapt.valuebyref} : (f32) -> (!fir.ref<f32>, !fir.ref<f32>, i1)
+! CHECK: %[[VAL_17:.*]] = hlfir.eval_in_mem shape %{{.*}} : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+! CHECK: ^bb0(%[[VAL_18:.*]]: !fir.ref<!fir.array<10xf32>>):
+! CHECK: %[[VAL_19:.*]] = fir.call @_QParray_func2(%[[VAL_5]], %[[VAL_6]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf32>>, !fir.ref<f32>) -> !fir.array<10xf32>
+! CHECK: fir.save_result %[[VAL_19]] to %[[VAL_18]](%{{.*}}) : !fir.array<10xf32>, !fir.ref<!fir.array<10xf32>>, !fir.shape<1>
+! CHECK: }
+! CHECK: hlfir.copy_out %{{.*}}, %[[VAL_4]]#1 to %{{.*}} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, i1, !fir.box<!fir.array<?xf32>>) -> ()
+! CHECK: hlfir.assign %[[VAL_17]] to %{{.*}} : !hlfir.expr<10xf32>, !fir.box<!fir.array<?xf32>>
+! CHECK: hlfir.end_associate %[[VAL_6]]#1, %[[VAL_6]]#2 : !fir.ref<f32>, i1
+! CHECK: hlfir.destroy %[[VAL_17]] : !hlfir.expr<10xf32>
+
+subroutine test3(x)
+ interface
+ function array_func3(x)
+ real :: x, array_func3(10)
+ end function array_func3
+ end interface
+ logical :: x
+ if (any(array_func3(1.0).le.array_func3(2.0))) x = .true.
+end subroutine test3
+! CHECK-LABEL: func.func @_QPtest3(
+! CHECK: %[[VAL_2:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK: %[[VAL_3:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (f32) -> (!fir.ref<f32>, !fir.ref<f32>, i1)
+! CHECK: %[[VAL_14:.*]] = hlfir.eval_in_mem shape %{{.*}} : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+! CHECK: ^bb0(%[[VAL_15:.*]]: !fir.ref<!fir.array<10xf32>>):
+! CHECK: %[[VAL_16:.*]] = fir.call @_QParray_func3(%[[VAL_3]]#0) fastmath<contract> : (!fir.ref<f32>) -> !fir.array<10xf32>
+! CHECK: fir.save_result %[[VAL_16]] to %[[VAL_15]](%{{.*}}) : !fir.array<10xf32>, !fir.ref<!fir.array<10xf32>>, !fir.shape<1>
+! CHECK: }
+! CHECK: %[[VAL_17:.*]] = arith.constant 2.000000e+00 : f32
+! CHECK: %[[VAL_18:.*]]:3 = hlfir.associate %[[VAL_17]] {adapt.valuebyref} : (f32) -> (!fir.ref<f32>, !fir.ref<f32>, i1)
+! CHECK: %[[VAL_29:.*]] = hlfir.eval_in_mem shape %{{.*}} : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+! CHECK: ^bb0(%[[VAL_30:.*]]: !fir.ref<!fir.array<10xf32>>):
+! CHECK: %[[VAL_31:.*]] = fir.call @_QParray_func3(%[[VAL_18]]#0) fastmath<contract> : (!fir.ref<f32>) -> !fir.array<10xf32>
+! CHECK: fir.save_result %[[VAL_31]] to %[[VAL_30]](%{{.*}}) : !fir.array<10xf32>, !fir.ref<!fir.array<10xf32>>, !fir.shape<1>
+! CHECK: }
+! CHECK: %[[VAL_32:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+! CHECK: ^bb0(%[[VAL_33:.*]]: index):
+! CHECK: %[[VAL_34:.*]] = hlfir.apply %[[VAL_14]], %[[VAL_33]] : (!hlfir.expr<10xf32>, index) -> f32
+! CHECK: %[[VAL_35:.*]] = hlfir.apply %[[VAL_29]], %[[VAL_33]] : (!hlfir.expr<10xf32>, index) -> f32
+! CHECK: %[[VAL_36:.*]] = arith.cmpf ole, %[[VAL_34]], %[[VAL_35]] fastmath<contract> : f32
+! CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (i1) -> !fir.logical<4>
+! CHECK: hlfir.yield_element %[[VAL_37]] : !fir.logical<4>
+! CHECK: }
+! CHECK: %[[VAL_38:.*]] = hlfir.any %[[VAL_32]] : (!hlfir.expr<?x!fir.logical<4>>) -> !fir.logical<4>
+! CHECK: hlfir.destroy %[[VAL_32]] : !hlfir.expr<?x!fir.logical<4>>
+! CHECK: hlfir.end_associate %[[VAL_18]]#1, %[[VAL_18]]#2 : !fir.ref<f32>, i1
+! CHECK: hlfir.destroy %[[VAL_29]] : !hlfir.expr<10xf32>
+! CHECK: hlfir.end_associate %[[VAL_3]]#1, %[[VAL_3]]#2 : !fir.ref<f32>, i1
+! CHECK: hlfir.destroy %[[VAL_14]] : !hlfir.expr<10xf32>
+! CHECK: %[[VAL_39:.*]] = fir.convert %[[VAL_38]] : (!fir.logical<4>) -> i1
+! CHECK: fir.if %[[VAL_39]] {
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks great to me, thanks!
I agree delaying the copy-out is more tricky and best to not do without more thinking.
If we generate hlfir.end_associate at the end of the statement,
we get more easily optimizable HLFIR, because there are no
compiler-generated operations with side effects in between the call
and the consumers. This allows more hlfir.eval_in_mem operations to
reuse the LHS instead of allocating a temporary buffer.
I do not think the same can be done for hlfir.copy_out always, e.g.:
If we postpone the copy-out until after the assignment, then
the result may be wrong.