-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[mlir][tosa] Canonicalise slice over overlapped or inside a pad. #138270
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-tosa Author: Georgios Pinitas (GeorgeARM). Changes: update the paddings and/or the slice parameters when a tosa.slice after a tosa.pad accesses only an overlapping region of, or a region entirely inside, the padded tensor. Full diff: https://github.com/llvm/llvm-project/pull/138270.diff 2 Files Affected:
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index 47368532df169..5347fb1c16698 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -731,6 +731,127 @@ struct ConcatSliceOptimization : public OpRewritePattern<tosa::SliceOp> {
}
};
+struct PadSliceOptimization : public OpRewritePattern<tosa::SliceOp> {
+ using OpRewritePattern<tosa::SliceOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(tosa::SliceOp sliceOp,
+ PatternRewriter &rewriter) const override {
+ Value sliceInput = sliceOp.getInput1();
+
+ // Check if producer is a PadOp
+ auto padOp = sliceInput.getDefiningOp<tosa::PadOp>();
+ if (!padOp)
+ return rewriter.notifyMatchFailure(sliceOp,
+ "slice input must be a pad operation");
+
+ // Check PadOp has a single consumer
+ if (!padOp->hasOneUse())
+ return rewriter.notifyMatchFailure(sliceOp,
+ "pad shall have a single consumer");
+
+ // Check input is statically ranked
+ auto inputTy = dyn_cast<RankedTensorType>(padOp.getInput1().getType());
+ auto padTy = dyn_cast<RankedTensorType>(padOp.getType());
+ if (!inputTy || !padTy)
+ return rewriter.notifyMatchFailure(
+ sliceOp, "slice input must be a static ranked tensor");
+
+ // Validate and extract tosa::PadOp padding
+ DenseIntElementsAttr paddingElems;
+ if (!matchPattern(padOp.getPadding(), m_Constant(&paddingElems))) {
+ return rewriter.notifyMatchFailure(
+ sliceOp,
+ "The `padding` input specified on the tosa::PadOp must be constant.");
+ }
+ llvm::SmallVector<int64_t> padPaddings =
+ llvm::to_vector(paddingElems.getValues<int64_t>());
+
+ // Extract slice parameters
+ DenseElementsAttr startElems;
+ if (!matchPattern(sliceOp.getStart(), m_Constant(&startElems)))
+ return rewriter.notifyMatchFailure(
+ sliceOp, "start of slice must be a static ranked shape");
+ llvm::SmallVector<int64_t> sliceStarts =
+ llvm::to_vector(startElems.getValues<int64_t>());
+
+ DenseElementsAttr sizeElems;
+ if (!matchPattern(sliceOp.getSize(), m_Constant(&sizeElems)))
+ return rewriter.notifyMatchFailure(
+ sliceOp, "size of slice must be a static ranked shape");
+ llvm::SmallVector<int64_t> sliceSizes =
+ llvm::to_vector(sizeElems.getValues<int64_t>());
+
+ // Update the paddings
+ int64_t rank = inputTy.getRank();
+ llvm::SmallVector<int64_t> newSliceStarts(rank, 0);
+ llvm::SmallVector<int64_t> newPadPaddings(2 * rank, 0);
+ llvm::SmallVector<int64_t> newPadShape(rank, 0);
+ bool updated = false;
+ for (int64_t i = 0; i < rank; ++i) {
+ const int64_t padLo = padPaddings[i * 2];
+ const int64_t padHi = padPaddings[i * 2 + 1];
+ const int64_t sliceStart = sliceStarts[i];
+ const int64_t sliceSize = sliceSizes[i];
+ const int64_t sliceEnd = sliceStart + sliceSize;
+
+ const int64_t dimSize = inputTy.getShape()[i];
+ const int64_t dimStart = padLo;
+ const int64_t dimEnd = padLo + dimSize;
+ const int64_t dimTotal = padLo + dimSize + padHi;
+
+ // Check slice within bounds
+ if (sliceStart < 0 || sliceEnd > dimTotal)
+ return rewriter.notifyMatchFailure(sliceOp, "slice out-of-bounds");
+
+ const int64_t newPadLo = std::max<int64_t>(padLo - sliceStart, 0);
+ const int64_t newPadHi =
+ std::max<int64_t>(sliceEnd - (padLo + dimSize), 0);
+ const int64_t newSliceStart = std::max<int64_t>(sliceStart - padLo, 0);
+
+ // Compute update slice/pad parameters
+ if (sliceStart < dimStart || sliceEnd > dimEnd) {
+ // Handle slice when not within the original input entirely
+ updated |= (newPadLo != padLo) || (newPadHi != padHi) ||
+ (newSliceStart != sliceStart);
+ newPadPaddings[i * 2] = newPadLo;
+ newPadPaddings[i * 2 + 1] = newPadHi;
+ newSliceStarts[i] = newSliceStart;
+ } else {
+ // Slice is within the original input
+ updated |= newSliceStart != sliceStart;
+ newSliceStarts[i] = newSliceStart;
+ }
+
+ // Calculate new pad output shape
+ newPadShape[i] =
+ newPadPaddings[i * 2] + dimSize + newPadPaddings[i * 2 + 1];
+ }
+
+ // Check that we actually need to proceed with the rewrite
+ if (!updated)
+ return rewriter.notifyMatchFailure(
+ sliceOp, "terminate condition; nothing to rewrite");
+
+ // Create a PadOp with updated padding
+ auto newPaddingsOp =
+ getTosaConstShape(rewriter, sliceOp.getLoc(), newPadPaddings);
+ auto newPadTy =
+ RankedTensorType::get(newPadShape, inputTy.getElementType());
+ auto newPadOp = rewriter.create<tosa::PadOp>(
+ padOp.getLoc(), newPadTy, padOp.getInput1(), newPaddingsOp,
+ padOp.getPadConst());
+
+ // Update SliceOp and point to new PadOp
+ auto newStartOp =
+ getTosaConstShape(rewriter, sliceOp.getLoc(), newSliceStarts);
+ rewriter.replaceOpWithNewOp<tosa::SliceOp>(sliceOp, sliceOp.getType(),
+ newPadOp.getResult(), newStartOp,
+ sliceOp.getSize());
+
+ return success();
+ }
+};
+
// Update size operand of tosa.slice if size has dynamic dims but corresponding
// output dim is static
struct SliceDynamicSizeCanonicalization
@@ -779,8 +900,8 @@ struct SliceDynamicSizeCanonicalization
void SliceOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
- results.add<ConcatSliceOptimization, SliceDynamicSizeCanonicalization>(
- context);
+ results.add<ConcatSliceOptimization, PadSliceOptimization,
+ SliceDynamicSizeCanonicalization>(context);
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index 59fd490330691..6e99f57341982 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -985,6 +985,42 @@ func.func @canonicalize_concat_slice_on_non_concat_axis(%arg0 : tensor<1x12x12xf
// -----
+// CHECK-LABEL: @canonicalize_pad_slice_overlap
+// CHECK-DAG: %[[PAD_CONST:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+// CHECK-DAG: %[[ZERO:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4>
+// CHECK-DAG: %[[PADDING:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 0, 1, 1, 0, 0]> : tensor<8xindex>}
+// CHECK-DAG: %[[SLICE_SIZE:.*]] = tosa.const_shape {values = dense<[1, 14, 18, 3]> : tensor<4xindex>}
+// CHECK: %[[PADDED:.*]] = tosa.pad %arg0, %[[PADDING]], %[[PAD_CONST]]
+// CHECK: %[[SLICED:.*]] = tosa.slice %[[PADDED]], %[[ZERO]], %[[SLICE_SIZE]]
+func.func @canonicalize_pad_slice_overlap(%arg0: tensor<1x16x16x3xf32>) -> tensor<1x14x18x3xf32> {
+ %pad_const = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %padding = tosa.const_shape {values = dense<[0, 0, 0, 0, 2, 2, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8>
+ %padded = tosa.pad %arg0, %padding, %pad_const : (tensor<1x16x16x3xf32>, !tosa.shape<8>, tensor<1xf32>) -> tensor<1x16x20x3xf32>
+ %start = tosa.const_shape {values = dense<[0, 0, 1, 0]> : tensor<4xindex>} : () -> !tosa.shape<4>
+ %size = tosa.const_shape {values = dense<[1, 14, 18, 3]> : tensor<4xindex>} : () -> !tosa.shape<4>
+ %sliced = tosa.slice %padded, %start, %size : (tensor<1x16x20x3xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<1x14x18x3xf32>
+ return %sliced : tensor<1x14x18x3xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @canonicalize_pad_slice_inside
+// CHECK-DAG: %[[SLICE_START:.*]] = tosa.const_shape {values = dense<[0, 1, 2, 0]> : tensor<4xindex>}
+// CHECK-DAG: %[[SLICE_SIZE:.*]] = tosa.const_shape {values = dense<[1, 14, 10, 3]> : tensor<4xindex>}
+// CHECK-NOT: tosa.pad
+// CHECK: %[[SLICED:.*]] = tosa.slice %arg0, %[[SLICE_START]], %[[SLICE_SIZE]]
+func.func @canonicalize_pad_slice_inside(%arg0: tensor<1x16x16x3xf32>) -> tensor<1x14x14x3xf32> {
+ %pad_const = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %padding = tosa.const_shape {values = dense<[0, 0, 0, 0, 2, 2, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8>
+ %padded = tosa.pad %arg0, %padding, %pad_const : (tensor<1x16x16x3xf32>, !tosa.shape<8>, tensor<1xf32>) -> tensor<1x16x20x3xf32>
+ %start = tosa.const_shape {values = dense<[0, 1, 4, 0]> : tensor<4xindex>} : () -> !tosa.shape<4>
+ %size = tosa.const_shape {values = dense<[1, 14, 10, 3]> : tensor<4xindex>} : () -> !tosa.shape<4>
+ %sliced = tosa.slice %padded, %start, %size : (tensor<1x16x20x3xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<1x14x14x3xf32>
+ return %sliced : tensor<1x14x14x3xf32>
+}
+
+// -----
+
// CHECK-LABEL: @fold_log_exp
func.func @fold_log_exp(%arg0: tensor<?x1xf32>) -> tensor<?x1xf32> {
// CHECK: return %arg{{.*}} : tensor<?x1xf32>
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks great, thanks @GeorgeARM! Had some small comments, otherwise LGTM
(newSliceStart != sliceStart); | ||
newPadPaddings[i * 2] = newPadLo; | ||
newPadPaddings[i * 2 + 1] = newPadHi; | ||
newSliceStarts[i] = newSliceStart; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: this can be moved out the if statement
const int64_t newPadLo = std::max<int64_t>(padLo - sliceStart, 0); | ||
const int64_t newPadHi = | ||
std::max<int64_t>(sliceEnd - (padLo + dimSize), 0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: I think these can be moved under if (sliceStart < dimStart || sliceEnd > dimEnd)
newSliceStarts[i] = newSliceStart; | ||
} else { | ||
// Slice is within the original input | ||
updated |= newSliceStart != sliceStart; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we could remove this else statement if this update is moved out as well
auto inputTy = dyn_cast<RankedTensorType>(padOp.getInput1().getType()); | ||
auto padTy = dyn_cast<RankedTensorType>(padOp.getType()); | ||
if (!inputTy || !padTy) | ||
return rewriter.notifyMatchFailure( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we extend this to allow dynamic input dims of inputs not on the sliced axis? (I'm thinking it could still be useful to optimize cases where we have a dynamic batch size etc)
return %sliced : tensor<1x14x14x3xf32> | ||
} | ||
|
||
// ----- |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it worth checking the case where there is no update?
This code was already changed to make use of UseCC/ResultCC. We can't restrict the check to provenance or address only, as both are relevant here.
Extend the verifier to ensure the number of predecessors and operands match for VPIRPhis.
Thanks to changes to type compatibility rules via WG14 N3007, these functions can now be called with a compatible type even within the same TU, which makes the -Wvisibility diagnostic too chatty to have on by default. So in C23 mode, -Wvisibility will only diagnose an incomplete tag type declared in a function prototype. If the tag is defined in the prototype, the diagnostic is silenced.
Static analysis flagged the use of Detail because we were not using std::move when returning values. Modified the returns to use std::move.
…138500) The assertion should be raised only for the NF instructions with GOTTPOFF relocation.
This commit fixes the nondeterminism issue in C++ header module enabled builds which were observed after llvm#132401. The issue was related to the fact that the hash set operation in MemberPointerType::Profile() was triggering getMostRecentDecl(). As the latter may trigger the loading of new entities from the external AST source, this was presumably causing reentrant modification of data structure or some other issue that affects compiler's output in a nondeterministic way (likely depending on specific values hashes/pointers have). The change should otherwise be a no-op, because whether we take a "most recent" or "any" Decl shouldn't matter since `getCanonicalDecl()` is called on it anyway inside `MemberPointerType::Profile()`. We haven't been able to come up with a deterministic regression test for this fix.
…lvm#137993) We found some tests checking for loops assigning between Cray pointer handles and their pointees which produced "incorrect" results with optimizations enabled; this is because the compiler expects Cray pointers not to alias with any other entity. [The HPE documentation for Cray Fortran extensions specifies:](https://support.hpe.com/hpesc/public/docDisplay?docId=a00113911en_us&docLocale=en_US&page=Types.html#cray-poiter-type) > the compiler assumes that the storage of a pointee is > never overlaid on the storage of another variable Jean pointed out that if a user's code uses entities that alias via Cray pointers, they may add the TARGET attribute to inform Flang of this aliasing, but that Flang's behavior is in line with Cray's own documentation and we should not make any changes to our alias analysis to try and detect this case. Updating documentation so that users that encounter this situation have a way to allow their code to compile as they intend.
…tags. (llvm#137916) Values sourced from binutils.
`ValueObject::AddressOf()` used to return address as a value which has it's own address, allowing to do `value.AddressOf().AddressOf()`. This patch makes the return address a simple const value.
…8011) This commit refactors the getStridesAndOffet() method on MemRefType to just call `MemRefLayoutAttrInterface::getStridesAndOffset(shape, strides& offset&)`, allowing downstream users and future layouts (ex, a potential contiguous layout) to implement it without needing to patch BuiltinTypes or without needing them to conform their affine maps to the canonical strided form.
…lvm#135424) This patch handles the global operand type properly, fixing the bug : Assertion `(isFI() || isCPI() || isTargetIndex() || isJTI()) && "Wrong MachineOperand accessor"` failed. Fixes SWDEV-504645 --------- Co-authored-by: Matt Arsenault <[email protected]>
The VOP3 form of the V_CNDMASK_B32 instruction takes a carry-in operand. The conversion to SDWA implies a conversion to VOP2 form which reads from VCC instead. Convert V_CNDMASK_B32_e64 instructions that might be converted to SDWA to V_CNDMASK_B32_e32 first and introduce a copy of the carry-in operand to VCC. Closes llvm#133431. --------- Co-authored-by: Matt Arsenault <[email protected]>
* adjustFixupValue is called even when a R_SPARC_HIX22/R_SPARC_LOX10 relocation is generated. This will be fixed shortly. * Enhanced the %h44 test to show that we don't check overflow. * Test R_SPARC_DISP32 in .gcc_except_table and .eh_frame . The original support did not test -filetype=obj output.
The computed R_SPARC_HIX22/R_SPARC_LOX10 value is non-zero even when the input is 0. We should suppress applyFixup when a relocation is generated.
Essentially revert the evaluateAsRelocatableImpl part from f39696e. Ensure that absolute relocation evaluation is in one place. SparcAsmBackend.cpp enables better diagnostics if needed.
This batch reveals two missed optimizations, but only one of which is regression as compared to the memset_patternN libcall family.
We were converting a CXXParenListInitExpr to a ParenListExpr in TreeTransform. However, ParenListExpr is typeless, so Clang could not rebuild the correct initialization sequence in some contexts. Fixes llvm#72880
…lvm#133484) Given the same branch condition in `a` and `c` SimplifyCFG converts: +> b -+ | v --> a --> c --> e --> | ^ +> d -+ into: +--> bcd ---+ | v --> a --> c --> e --> Remap source atoms on instructions duplicated from `c` into `bcd`. RFC: https://discourse.llvm.org/t/rfc-improving-is-stmt-placement-for-better-interactive-debugging/82668
Support is added for parsing. Basic semantics support is added to forward the code to Lowering. Lowering will emit a TODO error. Detailed semantics checks and lowering is further work.
This reverts commit ba29e60. As it broke tests on Windows on Arm: https://lab.llvm.org/buildbot/#/builders/141/builds/8500 ******************** Unresolved Tests (2): lldb-api :: tools/lldb-dap/completions/TestDAP_completions.py lldb-api :: tools/lldb-dap/startDebugging/TestDAP_startDebugging.py ******************** Timed Out Tests (1): lldb-api :: tools/lldb-dap/send-event/TestDAP_sendEvent.py ******************** Failed Tests (6): lldb-api :: tools/lldb-dap/console/TestDAP_console.py lldb-api :: tools/lldb-dap/console/TestDAP_redirection_to_console.py lldb-api :: tools/lldb-dap/launch/TestDAP_launch.py lldb-api :: tools/lldb-dap/stackTrace/TestDAP_stackTrace.py lldb-api :: tools/lldb-dap/stackTraceDisassemblyDisplay/TestDAP_stackTraceDisassemblyDisplay.py lldb-api :: tools/lldb-dap/variables/children/TestDAP_variables_children.py
llvm#137955) Continuing the theme from llvm#116777 and llvm#124931, this patch ensures we compute the correct address when a functions is spread across multiple sections. Due to this, it's not sufficient to adjust the offset in the section+offset pair (Address::Slide). We must actually slide the file offset and then recompute the section using the result. I found this out due to a failure to disassemble some parts of the function, so I'm testing with that, although it's likely there are other things that were broken due to this.
Fixes llvm#108136 In llvm#108136 (the new testcase), flang was missing the length parameter required for the variable length string when boxing the global variable. The code that is initializing global variables for OpenMP did not support types with length parameters. Instead of duplicating this initialization logic in OpenMP, I decided to use the exact same initialization as is used in the base language because this will already be well tested and will be updated for any new types. The difference for OpenMP is that the global variables will be zero initialized instead of left undefined. Previously `Fortran::lower::createGlobalInitialization` was used to share a smaller amount of the logic with the base language lowering. I think this bug has demonstrated that helper was too low level to be helpful, and it was only used in OpenMP so I have made it static inside of ConvertVariable.cpp.
…) class. The ELF type for i386 is always ELF32LE so we can pass ELF32LE directly to the base class template (ELFLinkGraphBuilder).
…vm#138262) Renames LIBMVEC-X86 to LIBMVEC and updates TLI to only add the existing x86 specific mapping when targeting x86.
…inline attribute (llvm#137769) OpenCL kernel bodies are emitted as stubs, and each kernel is emitted as a call to its respective stub (llvm#115821). The stub function should be marked alwaysinline, since a call to the stub can cause a performance drop. Co-authored-by: anikelal <[email protected]>
Signed-off-by: Georgios Pinitas <[email protected]>
Update the paddings and/or the slice parameters when a
tosa.slice
after a tosa.pad
is accessing only an overlapping region of, or a region entirely inside, the padded tensor.