Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit cd26e99

Browse files
authored
[mlir][memref][NVGPU] Move NVGPU ops to IndexedAccessOpInterface (#190430)
This removes the need for the memref dialect to know about nvgpu operations (though we still haven't converted ExtractAddressComputations to the new interface, so we can't remove the dependency just yet). ldmatrix is defined to access a 1-D region of memory in order to enable folding in arbitrary expand_ and collapse_shapes, as its underlying lowering is just a scalar getStridedElementPtr()
1 parent 25ad2ee commit cd26e99

9 files changed

Lines changed: 220 additions & 155 deletions

File tree

mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ include "mlir/Dialect/NVGPU/IR/NVGPUTypes.td"
3030
class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
3131
Op<NVGPU_Dialect, mnemonic, traits> {}
3232

33+
// Promises IndexedAccessOpInterface.
3334
def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
3435
MemoryEffects<[MemRead]>,
3536
PredOpTrait<"srcMemref and res have same element type",
@@ -183,6 +184,7 @@ def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
183184
let extraClassDeclaration = extraBaseClassDeclaration;
184185
}
185186

187+
// Promises IndexedMemCopyOpInterface.
186188
def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
187189
AttrSizedOperandSegments]> {
188190
let summary = "device-side asynchronous copy";
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//===- MemoryAccessOpInterfacesImpl.h -------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
10+
#define MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
11+
12+
namespace mlir {
13+
14+
class DialectRegistry;
15+
16+
namespace nvgpu {
17+
void registerMemoryAccessOpInterfacesExternalModels(DialectRegistry &registry);
18+
} // namespace nvgpu
19+
} // namespace mlir
20+
21+
#endif // MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H

mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp

Lines changed: 1 addition & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
1919
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
2020
#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
21-
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
2221
#include "mlir/Dialect/Vector/IR/VectorOps.h"
2322
#include "mlir/IR/AffineExpr.h"
2423
#include "mlir/IR/AffineMap.h"
@@ -75,10 +74,6 @@ static Value getMemRefOperand(vector::TransferReadOp op) {
7574
return op.getBase();
7675
}
7776

78-
static Value getMemRefOperand(nvgpu::LdMatrixOp op) {
79-
return op.getSrcMemref();
80-
}
81-
8277
static Value getMemRefOperand(vector::LoadOp op) { return op.getBase(); }
8378

8479
static Value getMemRefOperand(vector::StoreOp op) { return op.getBase(); }
@@ -181,17 +176,6 @@ class SubViewOfSubViewFolder : public OpRewritePattern<memref::SubViewOp> {
181176
}
182177
};
183178

184-
/// Folds nvgpu.device_async_copy subviews into the copy itself. This pattern
185-
/// is folds subview on src and dst memref of the copy.
186-
class NVGPUAsyncCopyOpSubViewOpFolder final
187-
: public OpRewritePattern<nvgpu::DeviceAsyncCopyOp> {
188-
public:
189-
using OpRewritePattern<nvgpu::DeviceAsyncCopyOp>::OpRewritePattern;
190-
191-
LogicalResult matchAndRewrite(nvgpu::DeviceAsyncCopyOp copyOp,
192-
PatternRewriter &rewriter) const override;
193-
};
194-
195179
/// Merges subview operations with load/store like operations unless such a
196180
/// merger would cause the strides between dimensions accessed by that operaton
197181
/// to change.
@@ -345,11 +329,6 @@ LogicalResult LoadOpOfSubViewOpFolder<OpTy>::matchAndRewrite(
345329
subViewOp.getDroppedDims())),
346330
op.getPadding(), op.getMask(), op.getInBoundsAttr());
347331
})
348-
.Case([&](nvgpu::LdMatrixOp op) {
349-
rewriter.replaceOpWithNewOp<nvgpu::LdMatrixOp>(
350-
op, op.getType(), subViewOp.getSource(), sourceIndices,
351-
op.getTranspose(), op.getNumTiles());
352-
})
353332
.DefaultUnreachable("unexpected operation");
354333
return success();
355334
}
@@ -785,57 +764,6 @@ LogicalResult IndexedMemCopyOpOfCollapseShapeOpFolder::matchAndRewrite(
785764
return success();
786765
}
787766

788-
LogicalResult NVGPUAsyncCopyOpSubViewOpFolder::matchAndRewrite(
789-
nvgpu::DeviceAsyncCopyOp copyOp, PatternRewriter &rewriter) const {
790-
791-
LLVM_DEBUG(DBGS() << "copyOp : " << copyOp << "\n");
792-
793-
auto srcSubViewOp =
794-
copyOp.getSrc().template getDefiningOp<memref::SubViewOp>();
795-
auto dstSubViewOp =
796-
copyOp.getDst().template getDefiningOp<memref::SubViewOp>();
797-
798-
if (!(srcSubViewOp || dstSubViewOp))
799-
return rewriter.notifyMatchFailure(copyOp, "does not use subview ops for "
800-
"source or destination");
801-
802-
// If the source is a subview, we need to resolve the indices.
803-
SmallVector<Value> foldedSrcIndices(copyOp.getSrcIndices().begin(),
804-
copyOp.getSrcIndices().end());
805-
806-
if (srcSubViewOp) {
807-
LLVM_DEBUG(DBGS() << "srcSubViewOp : " << srcSubViewOp << "\n");
808-
affine::resolveIndicesIntoOpWithOffsetsAndStrides(
809-
rewriter, copyOp.getLoc(), srcSubViewOp.getMixedOffsets(),
810-
srcSubViewOp.getMixedStrides(), srcSubViewOp.getDroppedDims(),
811-
copyOp.getSrcIndices(), foldedSrcIndices);
812-
}
813-
814-
// If the destination is a subview, we need to resolve the indices.
815-
SmallVector<Value> foldedDstIndices(copyOp.getDstIndices().begin(),
816-
copyOp.getDstIndices().end());
817-
818-
if (dstSubViewOp) {
819-
LLVM_DEBUG(DBGS() << "dstSubViewOp : " << dstSubViewOp << "\n");
820-
affine::resolveIndicesIntoOpWithOffsetsAndStrides(
821-
rewriter, copyOp.getLoc(), dstSubViewOp.getMixedOffsets(),
822-
dstSubViewOp.getMixedStrides(), dstSubViewOp.getDroppedDims(),
823-
copyOp.getDstIndices(), foldedDstIndices);
824-
}
825-
826-
// Replace the copy op with a new copy op that uses the source and destination
827-
// of the subview.
828-
rewriter.replaceOpWithNewOp<nvgpu::DeviceAsyncCopyOp>(
829-
copyOp, nvgpu::DeviceAsyncTokenType::get(copyOp.getContext()),
830-
(dstSubViewOp ? dstSubViewOp.getSource() : copyOp.getDst()),
831-
foldedDstIndices,
832-
(srcSubViewOp ? srcSubViewOp.getSource() : copyOp.getSrc()),
833-
foldedSrcIndices, copyOp.getDstElements(), copyOp.getSrcElements(),
834-
copyOp.getBypassL1Attr());
835-
836-
return success();
837-
}
838-
839767
void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
840768
patterns.add<
841769
// Interface-based patterns to which we will be migrating.
@@ -844,7 +772,6 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
844772
IndexedMemCopyOpOfExpandShapeOpFolder,
845773
IndexedMemCopyOpOfCollapseShapeOpFolder,
846774
// The old way of doing things. Don't add more of these.
847-
LoadOpOfSubViewOpFolder<nvgpu::LdMatrixOp>,
848775
LoadOpOfSubViewOpFolder<vector::LoadOp>,
849776
LoadOpOfSubViewOpFolder<vector::MaskedLoadOp>,
850777
LoadOpOfSubViewOpFolder<vector::TransferReadOp>,
@@ -860,8 +787,7 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
860787
LoadOpOfCollapseShapeOpFolder<vector::MaskedLoadOp>,
861788
StoreOpOfCollapseShapeOpFolder<vector::StoreOp>,
862789
StoreOpOfCollapseShapeOpFolder<vector::MaskedStoreOp>,
863-
SubViewOfSubViewFolder, NVGPUAsyncCopyOpSubViewOpFolder>(
864-
patterns.getContext());
790+
SubViewOfSubViewFolder>(patterns.getContext());
865791
}
866792

867793
//===----------------------------------------------------------------------===//

mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
1414
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
15+
#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
1516
#include "mlir/IR/Builders.h"
1617
#include "mlir/IR/BuiltinAttributes.h"
1718
#include "mlir/IR/BuiltinTypes.h"
@@ -40,6 +41,9 @@ void NVGPUDialect::initialize() {
4041
#define GET_OP_LIST
4142
#include "mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc"
4243
>();
44+
declarePromisedInterfaces<memref::IndexedAccessOpInterface, LdMatrixOp>();
45+
declarePromisedInterfaces<memref::IndexedMemCopyOpInterface,
46+
DeviceAsyncCopyOp>();
4347
}
4448

4549
bool NVGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {

mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
add_mlir_dialect_library(MLIRNVGPUTransforms
22
CreateAsyncGroups.cpp
3+
MemoryAccessOpInterfacesImpl.cpp
34
OptimizeSharedMemory.cpp
45
MmaSyncTF32Transform.cpp
56
Utils.cpp
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
//===- MemoryAccessOpInterfacesImpl.cpp -----------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// Implement memref dialect interfaces that enable manipulating memref indexing
9+
// in passes like FoldMemRefAliasOps.
10+
//===----------------------------------------------------------------------===//
11+
12+
#include "mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h"
13+
14+
#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
15+
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
16+
#include "mlir/IR/Dialect.h"
17+
#include "mlir/IR/Operation.h"
18+
#include "mlir/IR/PatternMatch.h"
19+
20+
using namespace mlir;
21+
using namespace mlir::memref;
22+
using namespace mlir::nvgpu;
23+
24+
namespace {
25+
struct LdMatrixOpInterface final
26+
: IndexedAccessOpInterface::ExternalModel<LdMatrixOpInterface, LdMatrixOp> {
27+
TypedValue<MemRefType> getAccessedMemref(Operation *op) const {
28+
return cast<LdMatrixOp>(op).getSrcMemref();
29+
}
30+
31+
Operation::operand_range getIndices(Operation *op) const {
32+
return cast<LdMatrixOp>(op).getIndices();
33+
}
34+
35+
SmallVector<int64_t> getAccessedShape(Operation *op) const {
36+
VectorType vecTy = cast<LdMatrixOp>(op).getRes().getType();
37+
// The 2-D nature of the result is an artifact of this operation returning
38+
// a struct of vectors and doesn't reflect any strides that need to be
39+
// preserved.
40+
return SmallVector<int64_t>{vecTy.getNumElements()};
41+
}
42+
43+
std::optional<SmallVector<Value>>
44+
updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value newMemref,
45+
ValueRange newIndices) const {
46+
auto ldMatrixOp = cast<LdMatrixOp>(op);
47+
rewriter.modifyOpInPlace(ldMatrixOp, [&]() {
48+
ldMatrixOp.getSrcMemrefMutable().assign(newMemref);
49+
ldMatrixOp.getIndicesMutable().assign(newIndices);
50+
});
51+
return std::nullopt;
52+
}
53+
54+
bool hasInboundsIndices(Operation *) const { return true; }
55+
};
56+
57+
struct DeviceAsyncCopyOpInterface final
58+
: IndexedMemCopyOpInterface::ExternalModel<DeviceAsyncCopyOpInterface,
59+
DeviceAsyncCopyOp> {
60+
TypedValue<MemRefType> getSrc(Operation *op) const {
61+
return cast<DeviceAsyncCopyOp>(op).getSrc();
62+
}
63+
64+
Operation::operand_range getSrcIndices(Operation *op) const {
65+
return cast<DeviceAsyncCopyOp>(op).getSrcIndices();
66+
}
67+
68+
TypedValue<MemRefType> getDst(Operation *op) const {
69+
return cast<DeviceAsyncCopyOp>(op).getDst();
70+
}
71+
72+
Operation::operand_range getDstIndices(Operation *op) const {
73+
return cast<DeviceAsyncCopyOp>(op).getDstIndices();
74+
}
75+
76+
void setMemrefsAndIndices(Operation *op, RewriterBase &rewriter, Value newSrc,
77+
ValueRange newSrcIndices, Value newDst,
78+
ValueRange newDstIndices) const {
79+
auto copyOp = cast<DeviceAsyncCopyOp>(op);
80+
rewriter.modifyOpInPlace(copyOp, [&]() {
81+
copyOp.getSrcMutable().assign(newSrc);
82+
copyOp.getSrcIndicesMutable().assign(newSrcIndices);
83+
copyOp.getDstMutable().assign(newDst);
84+
copyOp.getDstIndicesMutable().assign(newDstIndices);
85+
});
86+
}
87+
};
88+
} // namespace
89+
90+
void mlir::nvgpu::registerMemoryAccessOpInterfacesExternalModels(
91+
DialectRegistry &registry) {
92+
registry.addExtension(+[](MLIRContext *ctx, nvgpu::NVGPUDialect *dialect) {
93+
LdMatrixOp::attachInterface<LdMatrixOpInterface>(*ctx);
94+
DeviceAsyncCopyOp::attachInterface<DeviceAsyncCopyOpInterface>(*ctx);
95+
});
96+
}

mlir/lib/RegisterAllDialects.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
#include "mlir/Dialect/MemRef/Transforms/BufferViewFlowOpInterfaceImpl.h"
6161
#include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h"
6262
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
63+
#include "mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h"
6364
#include "mlir/Dialect/OpenACC/OpenACC.h"
6465
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
6566
#include "mlir/Dialect/PDL/IR/PDL.h"
@@ -181,6 +182,7 @@ void mlir::registerAllDialects(DialectRegistry &registry) {
181182
memref::registerValueBoundsOpInterfaceExternalModels(registry);
182183
memref::registerMemorySlotExternalModels(registry);
183184
ml_program::registerBufferizableOpInterfaceExternalModels(registry);
185+
nvgpu::registerMemoryAccessOpInterfacesExternalModels(registry);
184186
scf::registerBufferDeallocationOpInterfaceExternalModels(registry);
185187
scf::registerBufferizableOpInterfaceExternalModels(registry);
186188
scf::registerValueBoundsOpInterfaceExternalModels(registry);

mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir

Lines changed: 0 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -616,86 +616,6 @@ func.func @fold_gpu_subgroup_mma_load_matrix_2d(%arg0 : memref<128x128xf32>, %ar
616616

617617
// -----
618618

619-
620-
func.func @fold_nvgpu_device_async_copy_zero_sub_idx(%gmem_memref_3d : memref<2x128x768xf16>, %idx_1 : index, %idx_2 : index, %idx_3 : index) {
621-
622-
%c0 = arith.constant 0 : index
623-
%smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
624-
%gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%idx_1, %idx_2, %idx_3] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
625-
%async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%c0, %c0], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
626-
return
627-
}
628-
629-
// CHECK-LABEL: func.func @fold_nvgpu_device_async_copy_zero_sub_idx
630-
// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[IDX_1:.+]]: index, %[[IDX_2:.+]]: index, %[[IDX_3:.+]]: index)
631-
// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
632-
// CHECK-DAG: %[[SMEM_MEMREF_4d:.+]] = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
633-
// CHECK: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[IDX_1]], %[[IDX_2]], %[[IDX_3]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
634-
635-
// -----
636-
637-
638-
func.func @fold_src_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index) {
639-
%c0 = arith.constant 0 : index
640-
%smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
641-
%gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
642-
%async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
643-
return
644-
}
645-
646-
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
647-
// CHECK: func.func @fold_src_nvgpu_device_async_copy
648-
// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index)
649-
// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
650-
// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
651-
// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
652-
// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
653-
654-
// -----
655-
656-
657-
func.func @fold_src_fold_dest_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index, %dest_idx_0 : index, %dest_idx_1 : index, %dest_idx_2 : index, %dest_idx_3 : index, %dest_sub_idx_0 : index, %dest_sub_idx_1 : index) {
658-
%c0 = arith.constant 0 : index
659-
%smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
660-
%gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
661-
%smem_memref_2d = memref.subview %smem_memref_4d[%dest_idx_0, %dest_idx_1, %dest_idx_2, %dest_idx_3] [1, 1, 1, 8] [1, 1, 1, 1] : memref<5x1x64x64xf16, #gpu.address_space<workgroup>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
662-
%async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_2d[%dest_sub_idx_0, %dest_sub_idx_1], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
663-
return
664-
}
665-
666-
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
667-
// CHECK: func.func @fold_src_fold_dest_nvgpu_device_async_copy
668-
// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index, %[[DEST_IDX_0:.+]]: index, %[[DEST_IDX_1:.+]]: index, %[[DEST_IDX_2:.+]]: index, %[[DEST_IDX_3:.+]]: index, %[[DEST_SUB_IDX_0:.+]]: index, %[[DEST_SUB_IDX_1:.+]]: index)
669-
// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
670-
// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
671-
// CHECK-DAG: %[[RESOLVED_DST_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_1]], %[[DEST_SUB_IDX_0]]]
672-
// CHECK-DAG: %[[RESOLVED_DST_IDX_3:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_3]], %[[DEST_SUB_IDX_1]]]
673-
// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[DEST_IDX_0]], %[[RESOLVED_DST_IDX_1]], %[[DEST_IDX_2]], %[[RESOLVED_DST_IDX_3]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
674-
675-
// -----
676-
677-
#map = affine_map<()[s0] -> (-s0 + 4)>
678-
#map1 = affine_map<()[s0] -> (-s0 + 32)>
679-
680-
func.func @test_ldmatrix(%arg0: memref<4x32x32xf16, 3>, %arg1: index, %arg2: index, %arg3: index) -> vector<4x2xf16> {
681-
%c0 = arith.constant 0 : index
682-
%0 = affine.apply #map()[%arg1]
683-
%1 = affine.apply #map1()[%arg2]
684-
%2 = affine.apply #map1()[%arg3]
685-
%subview = memref.subview %arg0[%arg1, %arg2, %arg3] [%0, %1, %2] [1, 1, 1] : memref<4x32x32xf16, 3> to memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3>
686-
%3 = nvgpu.ldmatrix %subview[%c0, %c0, %c0] {numTiles = 4 : i32, transpose = false} : memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3> -> vector<4x2xf16>
687-
return %3 : vector<4x2xf16>
688-
}
689-
690-
// CHECK: func @test_ldmatrix
691-
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x32x32xf16, 3>
692-
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index
693-
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index
694-
// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: index
695-
// CHECK: nvgpu.ldmatrix %[[ARG0]][%[[ARG1]], %[[ARG2]], %[[ARG3]]] {numTiles = 4 : i32, transpose = false} : memref<4x32x32xf16, 3> -> vector<4x2xf16>
696-
697-
// -----
698-
699619
func.func @fold_vector_load_subview(%src : memref<24x64xf32>,
700620
%off1 : index,
701621
%off2 : index,

0 commit comments

Comments
 (0)