@@ -616,86 +616,6 @@ func.func @fold_gpu_subgroup_mma_load_matrix_2d(%arg0 : memref<128x128xf32>, %ar
 
 // -----
 
-
-func.func @fold_nvgpu_device_async_copy_zero_sub_idx(%gmem_memref_3d : memref<2x128x768xf16>, %idx_1 : index, %idx_2 : index, %idx_3 : index) {
-
-  %c0 = arith.constant 0 : index
-  %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-  %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%idx_1, %idx_2, %idx_3] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
-  %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%c0, %c0], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-  return
-}
-
-// CHECK-LABEL: func.func @fold_nvgpu_device_async_copy_zero_sub_idx
-// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[IDX_1:.+]]: index, %[[IDX_2:.+]]: index, %[[IDX_3:.+]]: index)
-// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[SMEM_MEMREF_4d:.+]] = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-// CHECK: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[IDX_1]], %[[IDX_2]], %[[IDX_3]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-
-// -----
-
-
-func.func @fold_src_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index) {
-  %c0 = arith.constant 0 : index
-  %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-  %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
-  %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-  return
-}
-
-// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
-// CHECK: func.func @fold_src_nvgpu_device_async_copy
-// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index)
-// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
-// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
-// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-
-// -----
-
-
-func.func @fold_src_fold_dest_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index, %dest_idx_0 : index, %dest_idx_1 : index, %dest_idx_2 : index, %dest_idx_3 : index, %dest_sub_idx_0 : index, %dest_sub_idx_1 : index) {
-  %c0 = arith.constant 0 : index
-  %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-  %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
-  %smem_memref_2d = memref.subview %smem_memref_4d[%dest_idx_0, %dest_idx_1, %dest_idx_2, %dest_idx_3] [1, 1, 1, 8] [1, 1, 1, 1] : memref<5x1x64x64xf16, #gpu.address_space<workgroup>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
-  %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_2d[%dest_sub_idx_0, %dest_sub_idx_1], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
-  return
-}
-
-// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
-// CHECK: func.func @fold_src_fold_dest_nvgpu_device_async_copy
-// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index, %[[DEST_IDX_0:.+]]: index, %[[DEST_IDX_1:.+]]: index, %[[DEST_IDX_2:.+]]: index, %[[DEST_IDX_3:.+]]: index, %[[DEST_SUB_IDX_0:.+]]: index, %[[DEST_SUB_IDX_1:.+]]: index)
-// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
-// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
-// CHECK-DAG: %[[RESOLVED_DST_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_1]], %[[DEST_SUB_IDX_0]]]
-// CHECK-DAG: %[[RESOLVED_DST_IDX_3:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_3]], %[[DEST_SUB_IDX_1]]]
-// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[DEST_IDX_0]], %[[RESOLVED_DST_IDX_1]], %[[DEST_IDX_2]], %[[RESOLVED_DST_IDX_3]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-
-// -----
-
-#map = affine_map<()[s0] -> (-s0 + 4)>
-#map1 = affine_map<()[s0] -> (-s0 + 32)>
-
-func.func @test_ldmatrix(%arg0: memref<4x32x32xf16, 3>, %arg1: index, %arg2: index, %arg3: index) -> vector<4x2xf16> {
-  %c0 = arith.constant 0 : index
-  %0 = affine.apply #map()[%arg1]
-  %1 = affine.apply #map1()[%arg2]
-  %2 = affine.apply #map1()[%arg3]
-  %subview = memref.subview %arg0[%arg1, %arg2, %arg3] [%0, %1, %2] [1, 1, 1] : memref<4x32x32xf16, 3> to memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3>
-  %3 = nvgpu.ldmatrix %subview[%c0, %c0, %c0] {numTiles = 4 : i32, transpose = false} : memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3> -> vector<4x2xf16>
-  return %3 : vector<4x2xf16>
-}
-
-// CHECK: func @test_ldmatrix
-// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x32x32xf16, 3>
-// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index
-// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index
-// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: index
-// CHECK: nvgpu.ldmatrix %[[ARG0]][%[[ARG1]], %[[ARG2]], %[[ARG3]]] {numTiles = 4 : i32, transpose = false} : memref<4x32x32xf16, 3> -> vector<4x2xf16>
-
-// -----
-
 func.func @fold_vector_load_subview(%src : memref<24x64xf32>,
                                     %off1 : index,
                                     %off2 : index,