diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index e2a8fb485850f..275e52726f093 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1264,9 +1264,9 @@ def FeaturePredictableSelectIsExpensive : SubtargetFeature<"predictable-select-expensive", "PredictableSelectIsExpensive", "true", "Prefer likely predicted branches over selects">; -def TuneNoOptimizedZeroStrideLoad - : SubtargetFeature<"no-optimized-zero-stride-load", "HasOptimizedZeroStrideLoad", - "false", "Hasn't optimized (perform fewer memory operations)" +def TuneOptimizedZeroStrideLoad + : SubtargetFeature<"optimized-zero-stride-load", "HasOptimizedZeroStrideLoad", + "true", "Optimized (perform fewer memory operations) " zero-stride vector load">; def Experimental diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 13a2491116b5d..6eed2ae01f646 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -231,7 +231,8 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model, FeatureStdExtZbb], [TuneSiFive7, FeaturePostRAScheduler, - TuneDLenFactor2]>; + TuneDLenFactor2, + TuneOptimizedZeroStrideLoad]>; def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, [Feature64Bit, diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index eb7f6b1bb6540..26ed4595ca758 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -1137,37 +1137,67 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7, double %e8, double %e9, double %e10, double %e11, double %e12, double %e13, double %e14, 
double %e15, double %e16, double %e17, double %e18, double %e19, double %e20, double %e21, double %e22, double %e23, double %e24, double %e25, double %e26, double %e27, double %e28, double %e29, double %e30, double %e31) vscale_range(2,2) { ; RV32-LABEL: buildvec_v32f64_exact_vlen: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: fsd fs0, 24(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs1, 16(sp) # 8-byte Folded Spill +; RV32-NEXT: addi sp, sp, -112 +; RV32-NEXT: .cfi_def_cfa_offset 112 +; RV32-NEXT: fsd fs0, 104(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs1, 96(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs2, 88(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs3, 80(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs4, 72(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs5, 64(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs6, 56(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs7, 48(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs8, 40(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs9, 32(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs10, 24(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs11, 16(sp) # 8-byte Folded Spill ; RV32-NEXT: .cfi_offset fs0, -8 ; RV32-NEXT: .cfi_offset fs1, -16 +; RV32-NEXT: .cfi_offset fs2, -24 +; RV32-NEXT: .cfi_offset fs3, -32 +; RV32-NEXT: .cfi_offset fs4, -40 +; RV32-NEXT: .cfi_offset fs5, -48 +; RV32-NEXT: .cfi_offset fs6, -56 +; RV32-NEXT: .cfi_offset fs7, -64 +; RV32-NEXT: .cfi_offset fs8, -72 +; RV32-NEXT: .cfi_offset fs9, -80 +; RV32-NEXT: .cfi_offset fs10, -88 +; RV32-NEXT: .cfi_offset fs11, -96 ; RV32-NEXT: sw a6, 8(sp) ; RV32-NEXT: sw a7, 12(sp) -; RV32-NEXT: fld ft4, 8(sp) +; RV32-NEXT: fld ft6, 8(sp) ; RV32-NEXT: sw a4, 8(sp) ; RV32-NEXT: sw a5, 12(sp) -; RV32-NEXT: fld ft5, 8(sp) +; RV32-NEXT: fld ft7, 8(sp) ; RV32-NEXT: sw a2, 8(sp) ; RV32-NEXT: sw a3, 12(sp) -; RV32-NEXT: fld ft6, 8(sp) +; RV32-NEXT: fld ft8, 8(sp) ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: fld ft7, 
8(sp) -; RV32-NEXT: fld ft0, 184(sp) -; RV32-NEXT: fld ft1, 168(sp) -; RV32-NEXT: fld ft2, 152(sp) -; RV32-NEXT: fld ft3, 136(sp) -; RV32-NEXT: fld ft8, 120(sp) -; RV32-NEXT: fld ft9, 104(sp) -; RV32-NEXT: fld ft10, 72(sp) -; RV32-NEXT: fld ft11, 88(sp) -; RV32-NEXT: fld fs0, 56(sp) -; RV32-NEXT: fld fs1, 40(sp) +; RV32-NEXT: fld ft9, 8(sp) +; RV32-NEXT: fld ft0, 264(sp) +; RV32-NEXT: fld ft1, 256(sp) +; RV32-NEXT: fld ft2, 248(sp) +; RV32-NEXT: fld ft3, 240(sp) +; RV32-NEXT: fld ft4, 232(sp) +; RV32-NEXT: fld ft5, 224(sp) +; RV32-NEXT: fld ft10, 216(sp) +; RV32-NEXT: fld ft11, 208(sp) +; RV32-NEXT: fld fs0, 200(sp) +; RV32-NEXT: fld fs1, 192(sp) +; RV32-NEXT: fld fs2, 184(sp) +; RV32-NEXT: fld fs3, 176(sp) +; RV32-NEXT: fld fs4, 152(sp) +; RV32-NEXT: fld fs5, 144(sp) +; RV32-NEXT: fld fs6, 168(sp) +; RV32-NEXT: fld fs7, 160(sp) +; RV32-NEXT: fld fs8, 136(sp) +; RV32-NEXT: fld fs9, 128(sp) +; RV32-NEXT: fld fs10, 120(sp) +; RV32-NEXT: fld fs11, 112(sp) ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vfmv.v.f v8, ft7 -; RV32-NEXT: vfslide1down.vf v12, v8, ft6 +; RV32-NEXT: vfmv.v.f v8, ft9 +; RV32-NEXT: vfslide1down.vf v12, v8, ft8 ; RV32-NEXT: vfmv.v.f v8, fa2 ; RV32-NEXT: vfslide1down.vf v9, v8, fa3 ; RV32-NEXT: vfmv.v.f v8, fa0 @@ -1176,55 +1206,71 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV32-NEXT: vfslide1down.vf v10, v10, fa5 ; RV32-NEXT: vfmv.v.f v11, fa6 ; RV32-NEXT: vfslide1down.vf v11, v11, fa7 -; RV32-NEXT: addi a0, sp, 32 -; RV32-NEXT: vlse64.v v14, (a0), zero -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vlse64.v v15, (a0), zero -; RV32-NEXT: vfmv.v.f v13, ft5 -; RV32-NEXT: vfslide1down.vf v13, v13, ft4 -; RV32-NEXT: vfslide1down.vf v14, v14, fs1 -; RV32-NEXT: vfslide1down.vf v15, v15, fs0 -; RV32-NEXT: addi a0, sp, 80 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: addi a0, sp, 64 -; RV32-NEXT: vlse64.v v18, (a0), zero -; RV32-NEXT: addi a0, sp, 96 -; RV32-NEXT: vlse64.v v19, (a0), zero -; 
RV32-NEXT: addi a0, sp, 112 -; RV32-NEXT: vlse64.v v20, (a0), zero -; RV32-NEXT: vfslide1down.vf v17, v16, ft11 -; RV32-NEXT: vfslide1down.vf v16, v18, ft10 -; RV32-NEXT: vfslide1down.vf v18, v19, ft9 -; RV32-NEXT: vfslide1down.vf v19, v20, ft8 -; RV32-NEXT: addi a0, sp, 128 -; RV32-NEXT: vlse64.v v20, (a0), zero -; RV32-NEXT: addi a0, sp, 144 -; RV32-NEXT: vlse64.v v21, (a0), zero -; RV32-NEXT: addi a0, sp, 160 -; RV32-NEXT: vlse64.v v22, (a0), zero -; RV32-NEXT: addi a0, sp, 176 -; RV32-NEXT: vlse64.v v23, (a0), zero -; RV32-NEXT: vfslide1down.vf v20, v20, ft3 -; RV32-NEXT: vfslide1down.vf v21, v21, ft2 -; RV32-NEXT: vfslide1down.vf v22, v22, ft1 +; RV32-NEXT: vfmv.v.f v13, ft7 +; RV32-NEXT: vfslide1down.vf v13, v13, ft6 +; RV32-NEXT: vfmv.v.f v14, fs11 +; RV32-NEXT: vfslide1down.vf v14, v14, fs10 +; RV32-NEXT: vfmv.v.f v15, fs9 +; RV32-NEXT: vfslide1down.vf v15, v15, fs8 +; RV32-NEXT: vfmv.v.f v16, fs7 +; RV32-NEXT: vfslide1down.vf v17, v16, fs6 +; RV32-NEXT: vfmv.v.f v16, fs5 +; RV32-NEXT: vfslide1down.vf v16, v16, fs4 +; RV32-NEXT: vfmv.v.f v18, fs3 +; RV32-NEXT: vfslide1down.vf v18, v18, fs2 +; RV32-NEXT: vfmv.v.f v19, fs1 +; RV32-NEXT: vfslide1down.vf v19, v19, fs0 +; RV32-NEXT: vfmv.v.f v20, ft11 +; RV32-NEXT: vfslide1down.vf v20, v20, ft10 +; RV32-NEXT: vfmv.v.f v21, ft5 +; RV32-NEXT: vfslide1down.vf v21, v21, ft4 +; RV32-NEXT: vfmv.v.f v22, ft3 +; RV32-NEXT: vfslide1down.vf v22, v22, ft2 +; RV32-NEXT: vfmv.v.f v23, ft1 ; RV32-NEXT: vfslide1down.vf v23, v23, ft0 -; RV32-NEXT: fld fs0, 24(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs1, 16(sp) # 8-byte Folded Reload -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: fld fs0, 104(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs1, 96(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs2, 88(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs3, 80(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs4, 72(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs5, 64(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs6, 56(sp) # 8-byte Folded 
Reload +; RV32-NEXT: fld fs7, 48(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs8, 40(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs9, 32(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs10, 24(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs11, 16(sp) # 8-byte Folded Reload +; RV32-NEXT: addi sp, sp, 112 ; RV32-NEXT: ret ; ; RV64-LABEL: buildvec_v32f64_exact_vlen: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -32 -; RV64-NEXT: .cfi_def_cfa_offset 32 -; RV64-NEXT: fsd fs0, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs1, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs2, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs3, 0(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -96 +; RV64-NEXT: .cfi_def_cfa_offset 96 +; RV64-NEXT: fsd fs0, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs1, 80(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs2, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs3, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs4, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs5, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs6, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs7, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs8, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs9, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs10, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs11, 0(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset fs0, -8 ; RV64-NEXT: .cfi_offset fs1, -16 ; RV64-NEXT: .cfi_offset fs2, -24 ; RV64-NEXT: .cfi_offset fs3, -32 +; RV64-NEXT: .cfi_offset fs4, -40 +; RV64-NEXT: .cfi_offset fs5, -48 +; RV64-NEXT: .cfi_offset fs6, -56 +; RV64-NEXT: .cfi_offset fs7, -64 +; RV64-NEXT: .cfi_offset fs8, -72 +; RV64-NEXT: .cfi_offset fs9, -80 +; RV64-NEXT: .cfi_offset fs10, -88 +; RV64-NEXT: .cfi_offset fs11, -96 ; RV64-NEXT: fmv.d.x ft4, a7 ; RV64-NEXT: fmv.d.x ft5, a6 ; RV64-NEXT: fmv.d.x ft6, a5 @@ -1233,14 +1279,22 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV64-NEXT: fmv.d.x ft9, a2 ; RV64-NEXT: 
fmv.d.x ft10, a1 ; RV64-NEXT: fmv.d.x ft11, a0 -; RV64-NEXT: fld ft0, 152(sp) -; RV64-NEXT: fld ft1, 136(sp) -; RV64-NEXT: fld ft2, 120(sp) -; RV64-NEXT: fld ft3, 104(sp) -; RV64-NEXT: fld fs0, 88(sp) -; RV64-NEXT: fld fs1, 72(sp) -; RV64-NEXT: fld fs2, 40(sp) -; RV64-NEXT: fld fs3, 56(sp) +; RV64-NEXT: fld ft0, 216(sp) +; RV64-NEXT: fld ft1, 208(sp) +; RV64-NEXT: fld ft2, 200(sp) +; RV64-NEXT: fld ft3, 192(sp) +; RV64-NEXT: fld fs0, 184(sp) +; RV64-NEXT: fld fs1, 176(sp) +; RV64-NEXT: fld fs2, 168(sp) +; RV64-NEXT: fld fs3, 160(sp) +; RV64-NEXT: fld fs4, 152(sp) +; RV64-NEXT: fld fs5, 144(sp) +; RV64-NEXT: fld fs6, 136(sp) +; RV64-NEXT: fld fs7, 128(sp) +; RV64-NEXT: fld fs8, 104(sp) +; RV64-NEXT: fld fs9, 96(sp) +; RV64-NEXT: fld fs10, 120(sp) +; RV64-NEXT: fld fs11, 112(sp) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vfmv.v.f v8, fa2 ; RV64-NEXT: vfslide1down.vf v9, v8, fa3 @@ -1258,35 +1312,35 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV64-NEXT: vfslide1down.vf v14, v14, ft6 ; RV64-NEXT: vfmv.v.f v15, ft5 ; RV64-NEXT: vfslide1down.vf v15, v15, ft4 -; RV64-NEXT: addi a0, sp, 48 -; RV64-NEXT: vlse64.v v16, (a0), zero -; RV64-NEXT: addi a0, sp, 32 -; RV64-NEXT: vlse64.v v18, (a0), zero -; RV64-NEXT: addi a0, sp, 64 -; RV64-NEXT: vlse64.v v19, (a0), zero -; RV64-NEXT: addi a0, sp, 80 -; RV64-NEXT: vlse64.v v20, (a0), zero -; RV64-NEXT: vfslide1down.vf v17, v16, fs3 -; RV64-NEXT: vfslide1down.vf v16, v18, fs2 -; RV64-NEXT: vfslide1down.vf v18, v19, fs1 -; RV64-NEXT: vfslide1down.vf v19, v20, fs0 -; RV64-NEXT: addi a0, sp, 96 -; RV64-NEXT: vlse64.v v20, (a0), zero -; RV64-NEXT: addi a0, sp, 112 -; RV64-NEXT: vlse64.v v21, (a0), zero -; RV64-NEXT: addi a0, sp, 128 -; RV64-NEXT: vlse64.v v22, (a0), zero -; RV64-NEXT: addi a0, sp, 144 -; RV64-NEXT: vlse64.v v23, (a0), zero -; RV64-NEXT: vfslide1down.vf v20, v20, ft3 -; RV64-NEXT: vfslide1down.vf v21, v21, ft2 -; RV64-NEXT: vfslide1down.vf v22, v22, ft1 +; 
RV64-NEXT: vfmv.v.f v16, fs11 +; RV64-NEXT: vfslide1down.vf v17, v16, fs10 +; RV64-NEXT: vfmv.v.f v16, fs9 +; RV64-NEXT: vfslide1down.vf v16, v16, fs8 +; RV64-NEXT: vfmv.v.f v18, fs7 +; RV64-NEXT: vfslide1down.vf v18, v18, fs6 +; RV64-NEXT: vfmv.v.f v19, fs5 +; RV64-NEXT: vfslide1down.vf v19, v19, fs4 +; RV64-NEXT: vfmv.v.f v20, fs3 +; RV64-NEXT: vfslide1down.vf v20, v20, fs2 +; RV64-NEXT: vfmv.v.f v21, fs1 +; RV64-NEXT: vfslide1down.vf v21, v21, fs0 +; RV64-NEXT: vfmv.v.f v22, ft3 +; RV64-NEXT: vfslide1down.vf v22, v22, ft2 +; RV64-NEXT: vfmv.v.f v23, ft1 ; RV64-NEXT: vfslide1down.vf v23, v23, ft0 -; RV64-NEXT: fld fs0, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs1, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs2, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs3, 0(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: fld fs0, 88(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs1, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs2, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs3, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs4, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs5, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs6, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs7, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs8, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs9, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs10, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs11, 0(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 96 ; RV64-NEXT: ret %v0 = insertelement <32 x double> poison, double %e0, i64 0 %v1 = insertelement <32 x double> %v0, double %e1, i64 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll index 6408402ef787f..958321f6c46d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-ZVFHMIN define <4 x half> @shuffle_v4f16(<4 x half> %x, <4 x half> %y) { ; CHECK-LABEL: shuffle_v4f16: @@ -110,13 +110,13 @@ define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) { ; CHECK-LABEL: vrgather_shuffle_xv_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI7_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI7_0)(a0) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrsub.vi v12, v10, 4 ; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vrsub.vi v12, v12, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vfmv.v.f v10, fa5 ; CHECK-NEXT: vrgatherei16.vv v10, v8, v12, v0.t ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -128,14 +128,14 @@ define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) { ; CHECK-LABEL: vrgather_shuffle_vx_v4f64: ; CHECK: # 
%bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: lui a0, %hi(.LCPI8_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) -; CHECK-NEXT: vlse64.v v10, (a0), zero +; CHECK-NEXT: fld fa5, %lo(.LCPI8_0)(a0) ; CHECK-NEXT: li a0, 3 +; CHECK-NEXT: vmul.vx v12, v10, a0 ; CHECK-NEXT: vmv.v.i v0, 3 -; CHECK-NEXT: vmul.vx v12, v12, a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vfmv.v.f v10, fa5 ; CHECK-NEXT: vrgatherei16.vv v10, v8, v12, v0.t ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -298,12 +298,33 @@ define <4 x half> @vrgather_shuffle_vv_v4f16(<4 x half> %x, <4 x half> %y) { } define <4 x half> @vrgather_shuffle_vx_v4f16_load(ptr %p) { -; CHECK-LABEL: vrgather_shuffle_vx_v4f16_load: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vlse16.v v8, (a0), zero -; CHECK-NEXT: ret +; RV32-ZVFH-LABEL: vrgather_shuffle_vx_v4f16_load: +; RV32-ZVFH: # %bb.0: +; RV32-ZVFH-NEXT: flh fa5, 2(a0) +; RV32-ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-ZVFH-NEXT: vfmv.v.f v8, fa5 +; RV32-ZVFH-NEXT: ret +; +; RV64-ZVFH-LABEL: vrgather_shuffle_vx_v4f16_load: +; RV64-ZVFH: # %bb.0: +; RV64-ZVFH-NEXT: flh fa5, 2(a0) +; RV64-ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-ZVFH-NEXT: vfmv.v.f v8, fa5 +; RV64-ZVFH-NEXT: ret +; +; RV32-ZVFHMIN-LABEL: vrgather_shuffle_vx_v4f16_load: +; RV32-ZVFHMIN: # %bb.0: +; RV32-ZVFHMIN-NEXT: lh a0, 2(a0) +; RV32-ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-ZVFHMIN-NEXT: vmv.v.x v8, a0 +; RV32-ZVFHMIN-NEXT: ret +; +; RV64-ZVFHMIN-LABEL: vrgather_shuffle_vx_v4f16_load: +; RV64-ZVFHMIN: # %bb.0: +; RV64-ZVFHMIN-NEXT: lh a0, 2(a0) +; RV64-ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-ZVFHMIN-NEXT: vmv.v.x v8, a0 +; RV64-ZVFHMIN-NEXT: ret %v = load <4 x half>, ptr %p %s = shufflevector <4 x half> %v, <4 x half> undef, <4 x i32> ret <4 x half> %s diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll index de7dfab1dfcff..58b0a17cdccd6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll @@ -5,9 +5,9 @@ define void @gather_const_v8f16(ptr %x) { ; CHECK-LABEL: gather_const_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 10 +; CHECK-NEXT: flh fa5, 10(a0) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vfmv.v.f v8, fa5 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <8 x half>, ptr %x @@ -21,9 +21,9 @@ define void @gather_const_v8f16(ptr %x) { define void @gather_const_v4f32(ptr %x) { ; CHECK-LABEL: gather_const_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 8 +; CHECK-NEXT: flw fa5, 8(a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vfmv.v.f v8, fa5 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x @@ -37,8 +37,9 @@ define void @gather_const_v4f32(ptr %x) { define void @gather_const_v2f64(ptr %x) { ; CHECK-LABEL: gather_const_v2f64: ; CHECK: # %bb.0: +; CHECK-NEXT: fld fa5, 0(a0) ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v8, (a0), zero +; CHECK-NEXT: vfmv.v.f v8, fa5 ; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x @@ -52,10 +53,10 @@ define void @gather_const_v2f64(ptr %x) { define void @gather_const_v64f16(ptr %x) { ; CHECK-LABEL: gather_const_v64f16: ; CHECK: # %bb.0: +; CHECK-NEXT: flh fa5, 94(a0) ; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: addi a2, a0, 94 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; CHECK-NEXT: vlse16.v v8, (a2), zero +; CHECK-NEXT: vfmv.v.f v8, fa5 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <64 x half>, ptr %x @@ -69,10 +70,10 @@ define void @gather_const_v64f16(ptr %x) { define void @gather_const_v32f32(ptr %x) 
{ ; CHECK-LABEL: gather_const_v32f32: ; CHECK: # %bb.0: +; CHECK-NEXT: flw fa5, 68(a0) ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 68 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vlse32.v v8, (a2), zero +; CHECK-NEXT: vfmv.v.f v8, fa5 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x @@ -86,9 +87,9 @@ define void @gather_const_v32f32(ptr %x) { define void @gather_const_v16f64(ptr %x) { ; CHECK-LABEL: gather_const_v16f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 80 +; CHECK-NEXT: fld fa5, 80(a0) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vlse64.v v8, (a1), zero +; CHECK-NEXT: vfmv.v.f v8, fa5 ; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret %a = load <16 x double>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index 6ca96d3551583..be602e36c4e85 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -614,9 +614,9 @@ define void @buildvec_seq_v16i8_v2i64(ptr %x) { ; RV64V-LABEL: buildvec_seq_v16i8_v2i64: ; RV64V: # %bb.0: ; RV64V-NEXT: lui a1, %hi(.LCPI42_0) -; RV64V-NEXT: addi a1, a1, %lo(.LCPI42_0) +; RV64V-NEXT: ld a1, %lo(.LCPI42_0)(a1) ; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64V-NEXT: vlse64.v v8, (a1), zero +; RV64V-NEXT: vmv.v.x v8, a1 ; RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64V-NEXT: vse8.v v8, (a0) ; RV64V-NEXT: ret @@ -999,12 +999,18 @@ define <4 x i64> @v4xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d) vscale_range(2,2) define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h) vscale_range(2,2) { ; RV32-LABEL: v8xi64_exact: ; RV32: # %bb.0: -; RV32-NEXT: lw t0, 28(sp) -; RV32-NEXT: lw t1, 24(sp) -; RV32-NEXT: lw t2, 20(sp) -; RV32-NEXT: lw t3, 12(sp) -; RV32-NEXT: lw t4, 8(sp) -; RV32-NEXT: lw t5, 4(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: 
.cfi_def_cfa_offset 16 +; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: lw t0, 44(sp) +; RV32-NEXT: lw t1, 40(sp) +; RV32-NEXT: lw t2, 36(sp) +; RV32-NEXT: lw t3, 32(sp) +; RV32-NEXT: lw t4, 28(sp) +; RV32-NEXT: lw t5, 24(sp) +; RV32-NEXT: lw t6, 20(sp) +; RV32-NEXT: lw s0, 16(sp) ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 ; RV32-NEXT: vslide1down.vx v8, v8, a5 @@ -1013,16 +1019,17 @@ define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i ; RV32-NEXT: vmv.v.x v8, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vlse32.v v10, (sp), zero ; RV32-NEXT: vslide1down.vx v8, v8, a3 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vlse32.v v11, (a0), zero +; RV32-NEXT: vmv.v.x v10, s0 +; RV32-NEXT: vslide1down.vx v10, v10, t6 ; RV32-NEXT: vslide1down.vx v10, v10, t5 ; RV32-NEXT: vslide1down.vx v10, v10, t4 -; RV32-NEXT: vslide1down.vx v10, v10, t3 +; RV32-NEXT: vmv.v.x v11, t3 ; RV32-NEXT: vslide1down.vx v11, v11, t2 ; RV32-NEXT: vslide1down.vx v11, v11, t1 ; RV32-NEXT: vslide1down.vx v11, v11, t0 +; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64V-LABEL: v8xi64_exact: @@ -1188,7 +1195,11 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV32-ONLY-LABEL: buildvec_v16i8_loads_contigous: ; RV32-ONLY: # %bb.0: -; RV32-ONLY-NEXT: addi a1, a0, 8 +; RV32-ONLY-NEXT: addi sp, sp, -16 +; RV32-ONLY-NEXT: .cfi_def_cfa_offset 16 +; RV32-ONLY-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-ONLY-NEXT: .cfi_offset s0, -4 +; RV32-ONLY-NEXT: lbu a1, 0(a0) ; RV32-ONLY-NEXT: lbu a2, 1(a0) ; RV32-ONLY-NEXT: lbu a3, 2(a0) ; RV32-ONLY-NEXT: lbu a4, 3(a0) @@ -1196,35 +1207,38 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV32-ONLY-NEXT: lbu a6, 5(a0) ; RV32-ONLY-NEXT: lbu a7, 6(a0) ; 
RV32-ONLY-NEXT: lbu t0, 7(a0) -; RV32-ONLY-NEXT: lbu t1, 9(a0) -; RV32-ONLY-NEXT: lbu t2, 10(a0) -; RV32-ONLY-NEXT: lbu t3, 11(a0) -; RV32-ONLY-NEXT: lbu t4, 12(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-ONLY-NEXT: vlse8.v v8, (a0), zero -; RV32-ONLY-NEXT: lbu t5, 13(a0) -; RV32-ONLY-NEXT: lbu t6, 14(a0) +; RV32-ONLY-NEXT: lbu t1, 8(a0) +; RV32-ONLY-NEXT: lbu t2, 9(a0) +; RV32-ONLY-NEXT: lbu t3, 10(a0) +; RV32-ONLY-NEXT: lbu t4, 11(a0) +; RV32-ONLY-NEXT: lbu t5, 12(a0) +; RV32-ONLY-NEXT: lbu t6, 13(a0) +; RV32-ONLY-NEXT: lbu s0, 14(a0) ; RV32-ONLY-NEXT: lbu a0, 15(a0) +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-ONLY-NEXT: vmv.v.x v8, a1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV32-ONLY-NEXT: vlse8.v v9, (a1), zero ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a6 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV32-ONLY-NEXT: vslide1down.vx v10, v8, t0 -; RV32-ONLY-NEXT: vslide1down.vx v8, v9, t1 +; RV32-ONLY-NEXT: vslide1down.vx v9, v8, t0 +; RV32-ONLY-NEXT: vmv.v.x v8, t1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t2 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t3 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t4 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t5 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t6 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, s0 ; RV32-ONLY-NEXT: li a1, 255 ; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-ONLY-NEXT: vmv.s.x v0, a1 ; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV32-ONLY-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-ONLY-NEXT: addi sp, sp, 16 ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v16i8_loads_contigous: @@ -1315,7 +1329,11 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; ; 
RV64V-ONLY-LABEL: buildvec_v16i8_loads_contigous: ; RV64V-ONLY: # %bb.0: -; RV64V-ONLY-NEXT: addi a1, a0, 8 +; RV64V-ONLY-NEXT: addi sp, sp, -16 +; RV64V-ONLY-NEXT: .cfi_def_cfa_offset 16 +; RV64V-ONLY-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64V-ONLY-NEXT: .cfi_offset s0, -8 +; RV64V-ONLY-NEXT: lbu a1, 0(a0) ; RV64V-ONLY-NEXT: lbu a2, 1(a0) ; RV64V-ONLY-NEXT: lbu a3, 2(a0) ; RV64V-ONLY-NEXT: lbu a4, 3(a0) @@ -1323,35 +1341,38 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV64V-ONLY-NEXT: lbu a6, 5(a0) ; RV64V-ONLY-NEXT: lbu a7, 6(a0) ; RV64V-ONLY-NEXT: lbu t0, 7(a0) -; RV64V-ONLY-NEXT: lbu t1, 9(a0) -; RV64V-ONLY-NEXT: lbu t2, 10(a0) -; RV64V-ONLY-NEXT: lbu t3, 11(a0) -; RV64V-ONLY-NEXT: lbu t4, 12(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64V-ONLY-NEXT: vlse8.v v8, (a0), zero -; RV64V-ONLY-NEXT: lbu t5, 13(a0) -; RV64V-ONLY-NEXT: lbu t6, 14(a0) +; RV64V-ONLY-NEXT: lbu t1, 8(a0) +; RV64V-ONLY-NEXT: lbu t2, 9(a0) +; RV64V-ONLY-NEXT: lbu t3, 10(a0) +; RV64V-ONLY-NEXT: lbu t4, 11(a0) +; RV64V-ONLY-NEXT: lbu t5, 12(a0) +; RV64V-ONLY-NEXT: lbu t6, 13(a0) +; RV64V-ONLY-NEXT: lbu s0, 14(a0) ; RV64V-ONLY-NEXT: lbu a0, 15(a0) +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.v.x v8, a1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV64V-ONLY-NEXT: vlse8.v v9, (a1), zero ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a6 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, t0 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, t1 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v8, t0 +; RV64V-ONLY-NEXT: vmv.v.x v8, t1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t2 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t3 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t4 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t5 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t6 +; 
RV64V-ONLY-NEXT: vslide1down.vx v8, v8, s0 ; RV64V-ONLY-NEXT: li a1, 255 ; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64V-ONLY-NEXT: vmv.s.x v0, a1 ; RV64V-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64V-ONLY-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64V-ONLY-NEXT: addi sp, sp, 16 ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v16i8_loads_contigous: @@ -1444,7 +1465,11 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; ; RV64ZVE32-LABEL: buildvec_v16i8_loads_contigous: ; RV64ZVE32: # %bb.0: -; RV64ZVE32-NEXT: addi a1, a0, 8 +; RV64ZVE32-NEXT: addi sp, sp, -16 +; RV64ZVE32-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64ZVE32-NEXT: .cfi_offset s0, -8 +; RV64ZVE32-NEXT: lbu a1, 0(a0) ; RV64ZVE32-NEXT: lbu a2, 1(a0) ; RV64ZVE32-NEXT: lbu a3, 2(a0) ; RV64ZVE32-NEXT: lbu a4, 3(a0) @@ -1452,35 +1477,38 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV64ZVE32-NEXT: lbu a6, 5(a0) ; RV64ZVE32-NEXT: lbu a7, 6(a0) ; RV64ZVE32-NEXT: lbu t0, 7(a0) -; RV64ZVE32-NEXT: lbu t1, 9(a0) -; RV64ZVE32-NEXT: lbu t2, 10(a0) -; RV64ZVE32-NEXT: lbu t3, 11(a0) -; RV64ZVE32-NEXT: lbu t4, 12(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64ZVE32-NEXT: vlse8.v v8, (a0), zero -; RV64ZVE32-NEXT: lbu t5, 13(a0) -; RV64ZVE32-NEXT: lbu t6, 14(a0) +; RV64ZVE32-NEXT: lbu t1, 8(a0) +; RV64ZVE32-NEXT: lbu t2, 9(a0) +; RV64ZVE32-NEXT: lbu t3, 10(a0) +; RV64ZVE32-NEXT: lbu t4, 11(a0) +; RV64ZVE32-NEXT: lbu t5, 12(a0) +; RV64ZVE32-NEXT: lbu t6, 13(a0) +; RV64ZVE32-NEXT: lbu s0, 14(a0) ; RV64ZVE32-NEXT: lbu a0, 15(a0) +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32-NEXT: vmv.v.x v8, a1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 ; 
RV64ZVE32-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32-NEXT: vlse8.v v9, (a1), zero ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a6 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32-NEXT: vslide1down.vx v10, v8, t0 -; RV64ZVE32-NEXT: vslide1down.vx v8, v9, t1 +; RV64ZVE32-NEXT: vslide1down.vx v9, v8, t0 +; RV64ZVE32-NEXT: vmv.v.x v8, t1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t2 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t3 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t4 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t5 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t6 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, s0 ; RV64ZVE32-NEXT: li a1, 255 ; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32-NEXT: vmv.s.x v0, a1 ; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64ZVE32-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64ZVE32-NEXT: addi sp, sp, 16 ; RV64ZVE32-NEXT: ret %p2 = getelementptr i8, ptr %p, i32 1 %p3 = getelementptr i8, ptr %p, i32 2 @@ -1538,7 +1566,11 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV32-ONLY-LABEL: buildvec_v16i8_loads_gather: ; RV32-ONLY: # %bb.0: -; RV32-ONLY-NEXT: addi a1, a0, 82 +; RV32-ONLY-NEXT: addi sp, sp, -16 +; RV32-ONLY-NEXT: .cfi_def_cfa_offset 16 +; RV32-ONLY-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-ONLY-NEXT: .cfi_offset s0, -4 +; RV32-ONLY-NEXT: lbu a1, 0(a0) ; RV32-ONLY-NEXT: lbu a2, 1(a0) ; RV32-ONLY-NEXT: lbu a3, 22(a0) ; RV32-ONLY-NEXT: lbu a4, 31(a0) @@ -1546,35 +1578,38 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV32-ONLY-NEXT: lbu a6, 55(a0) ; RV32-ONLY-NEXT: lbu a7, 623(a0) ; RV32-ONLY-NEXT: lbu t0, 75(a0) -; RV32-ONLY-NEXT: lbu t1, 93(a0) -; RV32-ONLY-NEXT: lbu t2, 105(a0) -; RV32-ONLY-NEXT: lbu t3, 161(a0) -; RV32-ONLY-NEXT: lbu t4, 124(a0) -; RV32-ONLY-NEXT: vsetivli 
zero, 16, e8, m1, ta, ma -; RV32-ONLY-NEXT: vlse8.v v8, (a0), zero -; RV32-ONLY-NEXT: lbu t5, 163(a0) -; RV32-ONLY-NEXT: lbu t6, 144(a0) +; RV32-ONLY-NEXT: lbu t1, 82(a0) +; RV32-ONLY-NEXT: lbu t2, 93(a0) +; RV32-ONLY-NEXT: lbu t3, 105(a0) +; RV32-ONLY-NEXT: lbu t4, 161(a0) +; RV32-ONLY-NEXT: lbu t5, 124(a0) +; RV32-ONLY-NEXT: lbu t6, 163(a0) +; RV32-ONLY-NEXT: lbu s0, 144(a0) ; RV32-ONLY-NEXT: lbu a0, 154(a0) +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-ONLY-NEXT: vmv.v.x v8, a1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV32-ONLY-NEXT: vlse8.v v9, (a1), zero ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a6 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV32-ONLY-NEXT: vslide1down.vx v10, v8, t0 -; RV32-ONLY-NEXT: vslide1down.vx v8, v9, t1 +; RV32-ONLY-NEXT: vslide1down.vx v9, v8, t0 +; RV32-ONLY-NEXT: vmv.v.x v8, t1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t2 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t3 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t4 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t5 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t6 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, s0 ; RV32-ONLY-NEXT: li a1, 255 ; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-ONLY-NEXT: vmv.s.x v0, a1 ; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV32-ONLY-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-ONLY-NEXT: addi sp, sp, 16 ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v16i8_loads_gather: @@ -1665,7 +1700,11 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; ; RV64V-ONLY-LABEL: buildvec_v16i8_loads_gather: ; RV64V-ONLY: # %bb.0: -; RV64V-ONLY-NEXT: addi a1, a0, 82 +; RV64V-ONLY-NEXT: addi sp, sp, -16 +; RV64V-ONLY-NEXT: .cfi_def_cfa_offset 16 +; 
RV64V-ONLY-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64V-ONLY-NEXT: .cfi_offset s0, -8 +; RV64V-ONLY-NEXT: lbu a1, 0(a0) ; RV64V-ONLY-NEXT: lbu a2, 1(a0) ; RV64V-ONLY-NEXT: lbu a3, 22(a0) ; RV64V-ONLY-NEXT: lbu a4, 31(a0) @@ -1673,35 +1712,38 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV64V-ONLY-NEXT: lbu a6, 55(a0) ; RV64V-ONLY-NEXT: lbu a7, 623(a0) ; RV64V-ONLY-NEXT: lbu t0, 75(a0) -; RV64V-ONLY-NEXT: lbu t1, 93(a0) -; RV64V-ONLY-NEXT: lbu t2, 105(a0) -; RV64V-ONLY-NEXT: lbu t3, 161(a0) -; RV64V-ONLY-NEXT: lbu t4, 124(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64V-ONLY-NEXT: vlse8.v v8, (a0), zero -; RV64V-ONLY-NEXT: lbu t5, 163(a0) -; RV64V-ONLY-NEXT: lbu t6, 144(a0) +; RV64V-ONLY-NEXT: lbu t1, 82(a0) +; RV64V-ONLY-NEXT: lbu t2, 93(a0) +; RV64V-ONLY-NEXT: lbu t3, 105(a0) +; RV64V-ONLY-NEXT: lbu t4, 161(a0) +; RV64V-ONLY-NEXT: lbu t5, 124(a0) +; RV64V-ONLY-NEXT: lbu t6, 163(a0) +; RV64V-ONLY-NEXT: lbu s0, 144(a0) ; RV64V-ONLY-NEXT: lbu a0, 154(a0) +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.v.x v8, a1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV64V-ONLY-NEXT: vlse8.v v9, (a1), zero ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a6 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, t0 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, t1 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v8, t0 +; RV64V-ONLY-NEXT: vmv.v.x v8, t1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t2 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t3 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t4 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t5 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t6 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, s0 ; RV64V-ONLY-NEXT: li a1, 255 ; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64V-ONLY-NEXT: vmv.s.x v0, a1 ; RV64V-ONLY-NEXT: vsetvli 
zero, zero, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64V-ONLY-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64V-ONLY-NEXT: addi sp, sp, 16 ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v16i8_loads_gather: @@ -1794,7 +1836,11 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; ; RV64ZVE32-LABEL: buildvec_v16i8_loads_gather: ; RV64ZVE32: # %bb.0: -; RV64ZVE32-NEXT: addi a1, a0, 82 +; RV64ZVE32-NEXT: addi sp, sp, -16 +; RV64ZVE32-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64ZVE32-NEXT: .cfi_offset s0, -8 +; RV64ZVE32-NEXT: lbu a1, 0(a0) ; RV64ZVE32-NEXT: lbu a2, 1(a0) ; RV64ZVE32-NEXT: lbu a3, 22(a0) ; RV64ZVE32-NEXT: lbu a4, 31(a0) @@ -1802,35 +1848,38 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV64ZVE32-NEXT: lbu a6, 55(a0) ; RV64ZVE32-NEXT: lbu a7, 623(a0) ; RV64ZVE32-NEXT: lbu t0, 75(a0) -; RV64ZVE32-NEXT: lbu t1, 93(a0) -; RV64ZVE32-NEXT: lbu t2, 105(a0) -; RV64ZVE32-NEXT: lbu t3, 161(a0) -; RV64ZVE32-NEXT: lbu t4, 124(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64ZVE32-NEXT: vlse8.v v8, (a0), zero -; RV64ZVE32-NEXT: lbu t5, 163(a0) -; RV64ZVE32-NEXT: lbu t6, 144(a0) +; RV64ZVE32-NEXT: lbu t1, 82(a0) +; RV64ZVE32-NEXT: lbu t2, 93(a0) +; RV64ZVE32-NEXT: lbu t3, 105(a0) +; RV64ZVE32-NEXT: lbu t4, 161(a0) +; RV64ZVE32-NEXT: lbu t5, 124(a0) +; RV64ZVE32-NEXT: lbu t6, 163(a0) +; RV64ZVE32-NEXT: lbu s0, 144(a0) ; RV64ZVE32-NEXT: lbu a0, 154(a0) +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32-NEXT: vmv.v.x v8, a1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32-NEXT: vlse8.v v9, (a1), zero ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a6 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32-NEXT: 
vslide1down.vx v10, v8, t0 -; RV64ZVE32-NEXT: vslide1down.vx v8, v9, t1 +; RV64ZVE32-NEXT: vslide1down.vx v9, v8, t0 +; RV64ZVE32-NEXT: vmv.v.x v8, t1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t2 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t3 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t4 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t5 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t6 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, s0 ; RV64ZVE32-NEXT: li a1, 255 ; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32-NEXT: vmv.s.x v0, a1 ; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64ZVE32-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64ZVE32-NEXT: addi sp, sp, 16 ; RV64ZVE32-NEXT: ret %p2 = getelementptr i8, ptr %p, i32 1 %p3 = getelementptr i8, ptr %p, i32 22 @@ -1887,22 +1936,22 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; RV32-ONLY-LABEL: buildvec_v16i8_undef_low_half: ; RV32-ONLY: # %bb.0: -; RV32-ONLY-NEXT: addi a1, a0, 82 +; RV32-ONLY-NEXT: lbu a1, 82(a0) ; RV32-ONLY-NEXT: lbu a2, 93(a0) ; RV32-ONLY-NEXT: lbu a3, 105(a0) ; RV32-ONLY-NEXT: lbu a4, 161(a0) ; RV32-ONLY-NEXT: lbu a5, 124(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-ONLY-NEXT: vlse8.v v8, (a1), zero -; RV32-ONLY-NEXT: lbu a1, 163(a0) -; RV32-ONLY-NEXT: lbu a6, 144(a0) +; RV32-ONLY-NEXT: lbu a6, 163(a0) +; RV32-ONLY-NEXT: lbu a7, 144(a0) ; RV32-ONLY-NEXT: lbu a0, 154(a0) +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-ONLY-NEXT: vmv.v.x v8, a1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a6 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 ; 
RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 ; RV32-ONLY-NEXT: ret ; @@ -1962,22 +2011,22 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; ; RV64V-ONLY-LABEL: buildvec_v16i8_undef_low_half: ; RV64V-ONLY: # %bb.0: -; RV64V-ONLY-NEXT: addi a1, a0, 82 +; RV64V-ONLY-NEXT: lbu a1, 82(a0) ; RV64V-ONLY-NEXT: lbu a2, 93(a0) ; RV64V-ONLY-NEXT: lbu a3, 105(a0) ; RV64V-ONLY-NEXT: lbu a4, 161(a0) ; RV64V-ONLY-NEXT: lbu a5, 124(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64V-ONLY-NEXT: vlse8.v v8, (a1), zero -; RV64V-ONLY-NEXT: lbu a1, 163(a0) -; RV64V-ONLY-NEXT: lbu a6, 144(a0) +; RV64V-ONLY-NEXT: lbu a6, 163(a0) +; RV64V-ONLY-NEXT: lbu a7, 144(a0) ; RV64V-ONLY-NEXT: lbu a0, 154(a0) +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.v.x v8, a1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a6 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 ; RV64V-ONLY-NEXT: ret ; @@ -2037,22 +2086,22 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; ; RV64ZVE32-LABEL: buildvec_v16i8_undef_low_half: ; RV64ZVE32: # %bb.0: -; RV64ZVE32-NEXT: addi a1, a0, 82 +; RV64ZVE32-NEXT: lbu a1, 82(a0) ; RV64ZVE32-NEXT: lbu a2, 93(a0) ; RV64ZVE32-NEXT: lbu a3, 105(a0) ; RV64ZVE32-NEXT: lbu a4, 161(a0) ; RV64ZVE32-NEXT: lbu a5, 124(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64ZVE32-NEXT: vlse8.v v8, (a1), zero -; RV64ZVE32-NEXT: lbu a1, 163(a0) -; RV64ZVE32-NEXT: lbu a6, 144(a0) +; RV64ZVE32-NEXT: lbu a6, 163(a0) +; RV64ZVE32-NEXT: lbu a7, 144(a0) ; RV64ZVE32-NEXT: lbu a0, 154(a0) +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32-NEXT: vmv.v.x v8, a1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 ; 
RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32-NEXT: ret %p9 = getelementptr i8, ptr %p, i32 82 @@ -2087,21 +2136,22 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; RV32-ONLY-LABEL: buildvec_v16i8_undef_high_half: ; RV32-ONLY: # %bb.0: -; RV32-ONLY-NEXT: lbu a1, 1(a0) -; RV32-ONLY-NEXT: lbu a2, 22(a0) -; RV32-ONLY-NEXT: lbu a3, 31(a0) -; RV32-ONLY-NEXT: lbu a4, 44(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-ONLY-NEXT: vlse8.v v8, (a0), zero -; RV32-ONLY-NEXT: lbu a5, 55(a0) -; RV32-ONLY-NEXT: lbu a6, 623(a0) +; RV32-ONLY-NEXT: lbu a1, 0(a0) +; RV32-ONLY-NEXT: lbu a2, 1(a0) +; RV32-ONLY-NEXT: lbu a3, 22(a0) +; RV32-ONLY-NEXT: lbu a4, 31(a0) +; RV32-ONLY-NEXT: lbu a5, 44(a0) +; RV32-ONLY-NEXT: lbu a6, 55(a0) +; RV32-ONLY-NEXT: lbu a7, 623(a0) ; RV32-ONLY-NEXT: lbu a0, 75(a0) -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a1 +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-ONLY-NEXT: vmv.v.x v8, a1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a5 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a6 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 ; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 8 ; RV32-ONLY-NEXT: ret @@ -2162,21 +2212,22 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; ; RV64V-ONLY-LABEL: buildvec_v16i8_undef_high_half: ; RV64V-ONLY: # %bb.0: -; RV64V-ONLY-NEXT: lbu a1, 1(a0) -; RV64V-ONLY-NEXT: lbu a2, 22(a0) -; RV64V-ONLY-NEXT: lbu a3, 31(a0) -; RV64V-ONLY-NEXT: lbu a4, 44(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64V-ONLY-NEXT: vlse8.v v8, 
(a0), zero -; RV64V-ONLY-NEXT: lbu a5, 55(a0) -; RV64V-ONLY-NEXT: lbu a6, 623(a0) +; RV64V-ONLY-NEXT: lbu a1, 0(a0) +; RV64V-ONLY-NEXT: lbu a2, 1(a0) +; RV64V-ONLY-NEXT: lbu a3, 22(a0) +; RV64V-ONLY-NEXT: lbu a4, 31(a0) +; RV64V-ONLY-NEXT: lbu a5, 44(a0) +; RV64V-ONLY-NEXT: lbu a6, 55(a0) +; RV64V-ONLY-NEXT: lbu a7, 623(a0) ; RV64V-ONLY-NEXT: lbu a0, 75(a0) -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a1 +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.v.x v8, a1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a5 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a6 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 ; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 8 ; RV64V-ONLY-NEXT: ret @@ -2237,21 +2288,22 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; ; RV64ZVE32-LABEL: buildvec_v16i8_undef_high_half: ; RV64ZVE32: # %bb.0: -; RV64ZVE32-NEXT: lbu a1, 1(a0) -; RV64ZVE32-NEXT: lbu a2, 22(a0) -; RV64ZVE32-NEXT: lbu a3, 31(a0) -; RV64ZVE32-NEXT: lbu a4, 44(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64ZVE32-NEXT: vlse8.v v8, (a0), zero -; RV64ZVE32-NEXT: lbu a5, 55(a0) -; RV64ZVE32-NEXT: lbu a6, 623(a0) +; RV64ZVE32-NEXT: lbu a1, 0(a0) +; RV64ZVE32-NEXT: lbu a2, 1(a0) +; RV64ZVE32-NEXT: lbu a3, 22(a0) +; RV64ZVE32-NEXT: lbu a4, 31(a0) +; RV64ZVE32-NEXT: lbu a5, 44(a0) +; RV64ZVE32-NEXT: lbu a6, 55(a0) +; RV64ZVE32-NEXT: lbu a7, 623(a0) ; RV64ZVE32-NEXT: lbu a0, 75(a0) -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32-NEXT: vmv.v.x v8, a1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a5 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, 
a7 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 8 ; RV64ZVE32-NEXT: ret @@ -2286,31 +2338,31 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV32-ONLY-LABEL: buildvec_v16i8_undef_edges: ; RV32-ONLY: # %bb.0: -; RV32-ONLY-NEXT: addi a1, a0, 31 -; RV32-ONLY-NEXT: addi a2, a0, 82 -; RV32-ONLY-NEXT: lbu a3, 44(a0) -; RV32-ONLY-NEXT: lbu a4, 55(a0) -; RV32-ONLY-NEXT: lbu a5, 623(a0) -; RV32-ONLY-NEXT: lbu a6, 75(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-ONLY-NEXT: vlse8.v v8, (a1), zero -; RV32-ONLY-NEXT: lbu a1, 93(a0) -; RV32-ONLY-NEXT: lbu a7, 105(a0) +; RV32-ONLY-NEXT: lbu a1, 31(a0) +; RV32-ONLY-NEXT: lbu a2, 44(a0) +; RV32-ONLY-NEXT: lbu a3, 55(a0) +; RV32-ONLY-NEXT: lbu a4, 623(a0) +; RV32-ONLY-NEXT: lbu a5, 75(a0) +; RV32-ONLY-NEXT: lbu a6, 82(a0) +; RV32-ONLY-NEXT: lbu a7, 93(a0) +; RV32-ONLY-NEXT: lbu t0, 105(a0) ; RV32-ONLY-NEXT: lbu a0, 161(a0) +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-ONLY-NEXT: vmv.v.x v8, a1 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV32-ONLY-NEXT: vlse8.v v9, (a2), zero ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a6 -; RV32-ONLY-NEXT: vslide1down.vx v8, v9, a1 +; RV32-ONLY-NEXT: vslide1down.vx v9, v8, a5 +; RV32-ONLY-NEXT: vmv.v.x v8, a6 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t0 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 ; RV32-ONLY-NEXT: li a0, 255 ; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-ONLY-NEXT: vmv.s.x v0, a0 ; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 4 -; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v16i8_undef_edges: @@ -2374,31 
+2426,31 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; ; RV64V-ONLY-LABEL: buildvec_v16i8_undef_edges: ; RV64V-ONLY: # %bb.0: -; RV64V-ONLY-NEXT: addi a1, a0, 31 -; RV64V-ONLY-NEXT: addi a2, a0, 82 -; RV64V-ONLY-NEXT: lbu a3, 44(a0) -; RV64V-ONLY-NEXT: lbu a4, 55(a0) -; RV64V-ONLY-NEXT: lbu a5, 623(a0) -; RV64V-ONLY-NEXT: lbu a6, 75(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64V-ONLY-NEXT: vlse8.v v8, (a1), zero -; RV64V-ONLY-NEXT: lbu a1, 93(a0) -; RV64V-ONLY-NEXT: lbu a7, 105(a0) +; RV64V-ONLY-NEXT: lbu a1, 31(a0) +; RV64V-ONLY-NEXT: lbu a2, 44(a0) +; RV64V-ONLY-NEXT: lbu a3, 55(a0) +; RV64V-ONLY-NEXT: lbu a4, 623(a0) +; RV64V-ONLY-NEXT: lbu a5, 75(a0) +; RV64V-ONLY-NEXT: lbu a6, 82(a0) +; RV64V-ONLY-NEXT: lbu a7, 93(a0) +; RV64V-ONLY-NEXT: lbu t0, 105(a0) ; RV64V-ONLY-NEXT: lbu a0, 161(a0) +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.v.x v8, a1 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV64V-ONLY-NEXT: vlse8.v v9, (a2), zero ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a6 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a1 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v8, a5 +; RV64V-ONLY-NEXT: vmv.v.x v8, a6 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t0 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 ; RV64V-ONLY-NEXT: li a0, 255 ; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64V-ONLY-NEXT: vmv.s.x v0, a0 ; RV64V-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 4 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v16i8_undef_edges: @@ -2462,31 +2514,31 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; ; RV64ZVE32-LABEL: buildvec_v16i8_undef_edges: ; RV64ZVE32: # %bb.0: 
-; RV64ZVE32-NEXT: addi a1, a0, 31 -; RV64ZVE32-NEXT: addi a2, a0, 82 -; RV64ZVE32-NEXT: lbu a3, 44(a0) -; RV64ZVE32-NEXT: lbu a4, 55(a0) -; RV64ZVE32-NEXT: lbu a5, 623(a0) -; RV64ZVE32-NEXT: lbu a6, 75(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64ZVE32-NEXT: vlse8.v v8, (a1), zero -; RV64ZVE32-NEXT: lbu a1, 93(a0) -; RV64ZVE32-NEXT: lbu a7, 105(a0) +; RV64ZVE32-NEXT: lbu a1, 31(a0) +; RV64ZVE32-NEXT: lbu a2, 44(a0) +; RV64ZVE32-NEXT: lbu a3, 55(a0) +; RV64ZVE32-NEXT: lbu a4, 623(a0) +; RV64ZVE32-NEXT: lbu a5, 75(a0) +; RV64ZVE32-NEXT: lbu a6, 82(a0) +; RV64ZVE32-NEXT: lbu a7, 93(a0) +; RV64ZVE32-NEXT: lbu t0, 105(a0) ; RV64ZVE32-NEXT: lbu a0, 161(a0) +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32-NEXT: vmv.v.x v8, a1 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32-NEXT: vlse8.v v9, (a2), zero ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a6 -; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a1 +; RV64ZVE32-NEXT: vslide1down.vx v9, v8, a5 +; RV64ZVE32-NEXT: vmv.v.x v8, a6 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t0 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32-NEXT: li a0, 255 ; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32-NEXT: vmv.s.x v0, a0 ; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64ZVE32-NEXT: ret %p4 = getelementptr i8, ptr %p, i32 31 %p5 = getelementptr i8, ptr %p, i32 44 @@ -2523,35 +2575,36 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV32-ONLY-LABEL: buildvec_v16i8_loads_undef_scattered: ; RV32-ONLY: # %bb.0: -; RV32-ONLY-NEXT: addi a1, a0, 82 +; RV32-ONLY-NEXT: lbu a1, 0(a0) ; 
RV32-ONLY-NEXT: lbu a2, 1(a0) ; RV32-ONLY-NEXT: lbu a3, 44(a0) ; RV32-ONLY-NEXT: lbu a4, 55(a0) ; RV32-ONLY-NEXT: lbu a5, 75(a0) -; RV32-ONLY-NEXT: lbu a6, 93(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-ONLY-NEXT: vlse8.v v8, (a0), zero -; RV32-ONLY-NEXT: lbu a7, 124(a0) -; RV32-ONLY-NEXT: lbu t0, 144(a0) +; RV32-ONLY-NEXT: lbu a6, 82(a0) +; RV32-ONLY-NEXT: lbu a7, 93(a0) +; RV32-ONLY-NEXT: lbu t0, 124(a0) +; RV32-ONLY-NEXT: lbu t1, 144(a0) ; RV32-ONLY-NEXT: lbu a0, 154(a0) +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-ONLY-NEXT: vmv.v.x v8, a1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 2 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV32-ONLY-NEXT: vlse8.v v9, (a1), zero ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 ; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 1 -; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a5 -; RV32-ONLY-NEXT: vslide1down.vx v8, v9, a6 -; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 2 +; RV32-ONLY-NEXT: vslide1down.vx v9, v8, a5 +; RV32-ONLY-NEXT: vmv.v.x v8, a6 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 1 +; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 2 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t0 +; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 1 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t1 ; RV32-ONLY-NEXT: li a1, 255 ; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-ONLY-NEXT: vmv.s.x v0, a1 ; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v16i8_loads_undef_scattered: @@ -2617,35 +2670,36 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; ; RV64V-ONLY-LABEL: buildvec_v16i8_loads_undef_scattered: ; RV64V-ONLY: # %bb.0: -; RV64V-ONLY-NEXT: addi a1, a0, 82 +; RV64V-ONLY-NEXT: lbu a1, 0(a0) ; RV64V-ONLY-NEXT: lbu a2, 1(a0) ; 
RV64V-ONLY-NEXT: lbu a3, 44(a0) ; RV64V-ONLY-NEXT: lbu a4, 55(a0) ; RV64V-ONLY-NEXT: lbu a5, 75(a0) -; RV64V-ONLY-NEXT: lbu a6, 93(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64V-ONLY-NEXT: vlse8.v v8, (a0), zero -; RV64V-ONLY-NEXT: lbu a7, 124(a0) -; RV64V-ONLY-NEXT: lbu t0, 144(a0) +; RV64V-ONLY-NEXT: lbu a6, 82(a0) +; RV64V-ONLY-NEXT: lbu a7, 93(a0) +; RV64V-ONLY-NEXT: lbu t0, 124(a0) +; RV64V-ONLY-NEXT: lbu t1, 144(a0) ; RV64V-ONLY-NEXT: lbu a0, 154(a0) +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.v.x v8, a1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 2 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV64V-ONLY-NEXT: vlse8.v v9, (a1), zero ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 ; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 1 -; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a5 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a6 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 2 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v8, a5 +; RV64V-ONLY-NEXT: vmv.v.x v8, a6 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 1 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 2 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t0 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 1 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t1 ; RV64V-ONLY-NEXT: li a1, 255 ; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64V-ONLY-NEXT: vmv.s.x v0, a1 ; RV64V-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v16i8_loads_undef_scattered: @@ -2713,35 +2767,36 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; ; RV64ZVE32-LABEL: buildvec_v16i8_loads_undef_scattered: ; RV64ZVE32: # %bb.0: -; RV64ZVE32-NEXT: addi a1, a0, 82 +; RV64ZVE32-NEXT: lbu a1, 0(a0) ; RV64ZVE32-NEXT: lbu a2, 1(a0) ; 
RV64ZVE32-NEXT: lbu a3, 44(a0) ; RV64ZVE32-NEXT: lbu a4, 55(a0) ; RV64ZVE32-NEXT: lbu a5, 75(a0) -; RV64ZVE32-NEXT: lbu a6, 93(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64ZVE32-NEXT: vlse8.v v8, (a0), zero -; RV64ZVE32-NEXT: lbu a7, 124(a0) -; RV64ZVE32-NEXT: lbu t0, 144(a0) +; RV64ZVE32-NEXT: lbu a6, 82(a0) +; RV64ZVE32-NEXT: lbu a7, 93(a0) +; RV64ZVE32-NEXT: lbu t0, 124(a0) +; RV64ZVE32-NEXT: lbu t1, 144(a0) ; RV64ZVE32-NEXT: lbu a0, 154(a0) +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32-NEXT: vmv.v.x v8, a1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32-NEXT: vlse8.v v9, (a1), zero ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 ; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a5 -; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a6 -; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32-NEXT: vslide1down.vx v9, v8, a5 +; RV64ZVE32-NEXT: vmv.v.x v8, a6 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t0 +; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t1 ; RV64ZVE32-NEXT: li a1, 255 ; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32-NEXT: vmv.s.x v0, a1 ; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64ZVE32-NEXT: ret %p2 = getelementptr i8, ptr %p, i32 1 %p3 = getelementptr i8, ptr %p, i32 22 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll index 2c0b1d09b52d9..32c1f2ca32fab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll 
@@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s define void @gather_const_v16i8(ptr %x) { ; CHECK-LABEL: gather_const_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 12 +; CHECK-NEXT: lbu a1, 12(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vlse8.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x @@ -21,9 +21,9 @@ define void @gather_const_v16i8(ptr %x) { define void @gather_const_v8i16(ptr %x) { ; CHECK-LABEL: gather_const_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 10 +; CHECK-NEXT: lh a1, 10(a0) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x @@ -37,9 +37,9 @@ define void @gather_const_v8i16(ptr %x) { define void @gather_const_v4i32(ptr %x) { ; CHECK-LABEL: gather_const_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 12 +; CHECK-NEXT: lw a1, 12(a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x @@ -51,13 +51,21 @@ define void @gather_const_v4i32(ptr %x) { } define void @gather_const_v2i64(ptr %x) { -; CHECK-LABEL: gather_const_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 8 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v8, (a1), zero -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret +; RV32-LABEL: 
gather_const_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi a1, a0, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: gather_const_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: ld a1, 8(a0) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, ptr %x %b = extractelement <2 x i64> %a, i32 1 %c = insertelement <2 x i64> poison, i64 %b, i32 0 @@ -69,10 +77,10 @@ define void @gather_const_v2i64(ptr %x) { define void @gather_const_v64i8(ptr %x) { ; CHECK-LABEL: gather_const_v64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: addi a2, a0, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vlse8.v v8, (a2), zero +; CHECK-NEXT: lbu a1, 32(a0) +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x @@ -86,10 +94,10 @@ define void @gather_const_v64i8(ptr %x) { define void @gather_const_v16i16(ptr %x) { ; CHECK-LABEL: gather_const_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 50 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vlse16.v v8, (a2), zero +; CHECK-NEXT: lh a1, 50(a0) +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x @@ -103,9 +111,9 @@ define void @gather_const_v16i16(ptr %x) { define void @gather_const_v16i32(ptr %x) { ; CHECK-LABEL: gather_const_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 36 +; CHECK-NEXT: lw a1, 36(a0) ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x @@ -117,13 +125,21 @@ define 
void @gather_const_v16i32(ptr %x) { } define void @gather_const_v8i64(ptr %x) { -; CHECK-LABEL: gather_const_v8i64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 24 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vlse64.v v8, (a1), zero -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret +; RV32-LABEL: gather_const_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi a1, a0, 24 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: gather_const_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: ld a1, 24(a0) +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: ret %a = load <8 x i64>, ptr %x %b = extractelement <8 x i64> %a, i32 3 %c = insertelement <8 x i64> poison, i64 %b, i32 0 @@ -135,9 +151,9 @@ define void @gather_const_v8i64(ptr %x) { define void @splat_concat_low(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: splat_concat_low: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, a0, 2 +; CHECK-NEXT: lh a0, 2(a0) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v8, (a0), zero +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vse16.v v8, (a2) ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x @@ -151,9 +167,9 @@ define void @splat_concat_low(ptr %x, ptr %y, ptr %z) { define void @splat_concat_high(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: splat_concat_high: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a1, 2 +; CHECK-NEXT: lh a0, 2(a1) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vse16.v v8, (a2) ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index f4d7074c7f6b2..ea2cdae903e5a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1401,12 +1401,12 @@ 
define void @mulhs_v4i32(ptr %x) { ; ; RV64-LABEL: mulhs_v4i32: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, %hi(.LCPI73_0) +; RV64-NEXT: ld a1, %lo(.LCPI73_0)(a1) ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: lui a1, %hi(.LCPI73_0) -; RV64-NEXT: addi a1, a1, %lo(.LCPI73_0) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vlse64.v v9, (a1), zero +; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmulh.vv v8, v8, v9 ; RV64-NEXT: vsra.vi v8, v8, 1 @@ -3371,15 +3371,15 @@ define void @mulhu_v16i16(ptr %x) { ; RV64-NEXT: vsrl.vv v14, v8, v16 ; RV64-NEXT: vmulhu.vv v12, v14, v12 ; RV64-NEXT: lui a1, %hi(.LCPI182_1) -; RV64-NEXT: addi a1, a1, %lo(.LCPI182_1) -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vlse64.v v14, (a1), zero -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-NEXT: ld a1, %lo(.LCPI182_1)(a1) ; RV64-NEXT: vsub.vv v8, v8, v12 ; RV64-NEXT: vmulhu.vv v8, v8, v10 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: vsext.vf2 v10, v14 -; RV64-NEXT: vsrl.vv v8, v8, v10 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-NEXT: vsext.vf2 v12, v10 +; RV64-NEXT: vsrl.vv v8, v8, v12 ; RV64-NEXT: vse16.v v8, (a0) ; RV64-NEXT: ret %a = load <16 x i16>, ptr %x @@ -3557,12 +3557,12 @@ define void @mulhs_v8i32(ptr %x) { ; ; RV64-LABEL: mulhs_v8i32: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, %hi(.LCPI187_0) +; RV64-NEXT: ld a1, %lo(.LCPI187_0)(a1) ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: lui a1, %hi(.LCPI187_0) -; RV64-NEXT: addi a1, a1, %lo(.LCPI187_0) ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vlse64.v v10, (a1), zero +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64-NEXT: vmulh.vv v8, v8, v10 ; RV64-NEXT: vsra.vi v8, v8, 1 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll index 7608349ef7aef..47cbb2509441a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll @@ -527,13 +527,13 @@ define <128 x i1> @buildvec_mask_v128i1() { ; RV64-LABEL: buildvec_mask_v128i1: ; RV64: # %bb.0: ; RV64-NEXT: lui a0, %hi(.LCPI20_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI20_0) +; RV64-NEXT: ld a0, %lo(.LCPI20_0)(a0) +; RV64-NEXT: lui a1, %hi(.LCPI20_1) +; RV64-NEXT: ld a1, %lo(.LCPI20_1)(a1) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vlse64.v v0, (a0), zero -; RV64-NEXT: lui a0, %hi(.LCPI20_1) -; RV64-NEXT: ld a0, %lo(.LCPI20_1)(a0) +; RV64-NEXT: vmv.v.x v0, a0 ; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma -; RV64-NEXT: vmv.s.x v0, a0 +; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_v128i1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index a4f9eeb59cd5b..14f4f44049c53 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -526,17 +526,18 @@ define <4 x i8> @mgather_truemask_v4i8(<4 x ptr> %ptrs, <4 x i8> %passthru) { ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: ld a1, 0(a0) +; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: ld a3, 16(a0) ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lbu a1, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vlse8.v v8, (a2), zero -; RV64ZVE32F-NEXT: lbu a2, 0(a3) +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: lbu a3, 0(a3) ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma 
+; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: ret %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 1), <4 x i8> %passthru) @@ -1222,17 +1223,18 @@ define <4 x i16> @mgather_truemask_v4i16(<4 x ptr> %ptrs, <4 x i16> %passthru) { ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: ld a1, 0(a0) +; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: ld a3, 16(a0) ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lh a1, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero -; RV64ZVE32F-NEXT: lh a2, 0(a3) +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a3, 0(a3) ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: ret %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1), <4 x i16> %passthru) @@ -2279,17 +2281,18 @@ define <4 x i32> @mgather_truemask_v4i32(<4 x ptr> %ptrs, <4 x i32> %passthru) { ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: ld a1, 0(a0) +; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: ld a3, 16(a0) ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lw a1, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV64ZVE32F-NEXT: lw a2, 0(a3) +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: lw a3, 0(a3) ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; 
RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: ret %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 1), <4 x i32> %passthru) @@ -6617,16 +6620,17 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: lw a4, 56(a2) ; RV32ZVE32F-NEXT: lw a5, 48(a2) ; RV32ZVE32F-NEXT: lw a6, 40(a2) -; RV32ZVE32F-NEXT: lw a7, 8(a2) +; RV32ZVE32F-NEXT: lw a7, 32(a2) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw t1, 8(a2) +; RV32ZVE32F-NEXT: lw t2, 16(a2) +; RV32ZVE32F-NEXT: lw a2, 24(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: lw t0, 16(a2) -; RV32ZVE32F-NEXT: lw t1, 24(a2) -; RV32ZVE32F-NEXT: lw a2, 32(a2) -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; RV32ZVE32F-NEXT: vmv.v.x v8, t0 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 @@ -7046,18 +7050,19 @@ define <4 x half> @mgather_truemask_v4f16(<4 x ptr> %ptrs, <4 x half> %passthru) ; ; RV64ZVE32F-LABEL: mgather_truemask_v4f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: ld a1, 0(a0) +; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: ld a3, 16(a0) ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a1) +; RV64ZVE32F-NEXT: flh fa4, 0(a2) +; RV64ZVE32F-NEXT: flh fa3, 0(a3) +; RV64ZVE32F-NEXT: flh fa2, 0(a0) ; RV64ZVE32F-NEXT: vsetivli 
zero, 4, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero -; RV64ZVE32F-NEXT: flh fa4, 0(a3) -; RV64ZVE32F-NEXT: flh fa3, 0(a0) -; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa5 +; RV64ZVE32F-NEXT: vfmv.v.f v8, fa5 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa4 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa3 +; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa2 ; RV64ZVE32F-NEXT: ret %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1), <4 x half> %passthru) ret <4 x half> %v @@ -7975,18 +7980,19 @@ define <4 x float> @mgather_truemask_v4f32(<4 x ptr> %ptrs, <4 x float> %passthr ; ; RV64ZVE32F-LABEL: mgather_truemask_v4f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: ld a1, 0(a0) +; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: ld a3, 16(a0) ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a1) +; RV64ZVE32F-NEXT: flw fa4, 0(a2) +; RV64ZVE32F-NEXT: flw fa3, 0(a3) +; RV64ZVE32F-NEXT: flw fa2, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV64ZVE32F-NEXT: flw fa4, 0(a3) -; RV64ZVE32F-NEXT: flw fa3, 0(a0) -; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa5 +; RV64ZVE32F-NEXT: vfmv.v.f v8, fa5 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa4 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa3 +; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa2 ; RV64ZVE32F-NEXT: ret %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 1), <4 x float> %passthru) ret <4 x float> %v @@ -11673,16 +11679,17 @@ define <8 x double> @mgather_baseidx_v8f64(ptr %base, <8 x i64> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: lw a3, 56(a2) ; RV32ZVE32F-NEXT: lw a4, 48(a2) ; RV32ZVE32F-NEXT: lw a5, 40(a2) -; RV32ZVE32F-NEXT: lw a6, 8(a2) +; RV32ZVE32F-NEXT: lw a6, 32(a2) +; RV32ZVE32F-NEXT: lw a7, 0(a2) +; RV32ZVE32F-NEXT: lw t0, 8(a2) +; RV32ZVE32F-NEXT: lw t1, 16(a2) +; RV32ZVE32F-NEXT: lw a2, 
24(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: lw a7, 16(a2) -; RV32ZVE32F-NEXT: lw t0, 24(a2) -; RV32ZVE32F-NEXT: lw a2, 32(a2) -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vmv.v.x v8, a7 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a3 @@ -12587,8 +12594,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m define <4 x i32> @mgather_broadcast_load_unmasked(ptr %base) { ; CHECK-LABEL: mgather_broadcast_load_unmasked: ; CHECK: # %bb.0: +; CHECK-NEXT: lw a0, 0(a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v8, (a0), zero +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: ret %ptrs = getelementptr inbounds i8, ptr %base, <4 x i32> zeroinitializer %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) @@ -12599,8 +12607,9 @@ define <4 x i32> @mgather_broadcast_load_unmasked(ptr %base) { define <4 x i32> @mgather_broadcast_load_unmasked2(ptr %base) { ; CHECK-LABEL: mgather_broadcast_load_unmasked2: ; CHECK: # %bb.0: +; CHECK-NEXT: lw a0, 0(a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v8, (a0), zero +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: ret %ptrhead = insertelement <4 x ptr> poison, ptr %base, i32 0 %ptrs = shufflevector <4 x ptr> %ptrhead, <4 x ptr> poison, <4 x i32> zeroinitializer @@ -12690,11 +12699,11 @@ define <4 x i32> @mgather_narrow_edge_case(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_narrow_edge_case: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, -512 +; RV64ZVE32F-NEXT: lw a1, -512(a0) ; 
RV64ZVE32F-NEXT: lw a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vlse32.v v8, (a1), zero ; RV64ZVE32F-NEXT: vmv.v.i v0, 5 +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i32, ptr %base, <4 x i8> @@ -12919,24 +12928,25 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_strided_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 16 +; RV64ZVE32F-NEXT: lh a1, 0(a0) ; RV64ZVE32F-NEXT: lh a2, 2(a0) ; RV64ZVE32F-NEXT: lh a3, 8(a0) ; RV64ZVE32F-NEXT: lh a4, 10(a0) -; RV64ZVE32F-NEXT: lh a5, 18(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: lh a6, 24(a0) +; RV64ZVE32F-NEXT: lh a5, 16(a0) +; RV64ZVE32F-NEXT: lh a6, 18(a0) +; RV64ZVE32F-NEXT: lh a7, 24(a0) ; RV64ZVE32F-NEXT: lh a0, 26(a0) -; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -12963,25 +12973,25 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_strided_2xSEW_with_offset: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, 
a0, 20 -; RV64ZVE32F-NEXT: addi a2, a0, 4 -; RV64ZVE32F-NEXT: lh a3, 6(a0) -; RV64ZVE32F-NEXT: lh a4, 12(a0) -; RV64ZVE32F-NEXT: lh a5, 14(a0) +; RV64ZVE32F-NEXT: lh a1, 4(a0) +; RV64ZVE32F-NEXT: lh a2, 6(a0) +; RV64ZVE32F-NEXT: lh a3, 12(a0) +; RV64ZVE32F-NEXT: lh a4, 14(a0) +; RV64ZVE32F-NEXT: lh a5, 20(a0) ; RV64ZVE32F-NEXT: lh a6, 22(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero -; RV64ZVE32F-NEXT: lh a2, 28(a0) +; RV64ZVE32F-NEXT: lh a7, 28(a0) ; RV64ZVE32F-NEXT: lh a0, 30(a0) -; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -13008,25 +13018,25 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_reverse_unit_strided_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 20 -; RV64ZVE32F-NEXT: addi a2, a0, 28 -; RV64ZVE32F-NEXT: lh a3, 30(a0) -; RV64ZVE32F-NEXT: lh a4, 24(a0) -; RV64ZVE32F-NEXT: lh a5, 26(a0) +; RV64ZVE32F-NEXT: lh a1, 28(a0) +; RV64ZVE32F-NEXT: lh a2, 30(a0) +; RV64ZVE32F-NEXT: lh a3, 24(a0) +; RV64ZVE32F-NEXT: lh a4, 26(a0) +; 
RV64ZVE32F-NEXT: lh a5, 20(a0) ; RV64ZVE32F-NEXT: lh a6, 22(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero -; RV64ZVE32F-NEXT: lh a2, 16(a0) +; RV64ZVE32F-NEXT: lh a7, 16(a0) ; RV64ZVE32F-NEXT: lh a0, 18(a0) -; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -13053,25 +13063,25 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_reverse_strided_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 12 -; RV64ZVE32F-NEXT: addi a2, a0, 28 -; RV64ZVE32F-NEXT: lh a3, 30(a0) -; RV64ZVE32F-NEXT: lh a4, 20(a0) -; RV64ZVE32F-NEXT: lh a5, 22(a0) +; RV64ZVE32F-NEXT: lh a1, 28(a0) +; RV64ZVE32F-NEXT: lh a2, 30(a0) +; RV64ZVE32F-NEXT: lh a3, 20(a0) +; RV64ZVE32F-NEXT: lh a4, 22(a0) +; RV64ZVE32F-NEXT: lh a5, 12(a0) ; RV64ZVE32F-NEXT: lh a6, 14(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero -; RV64ZVE32F-NEXT: lh a2, 4(a0) +; RV64ZVE32F-NEXT: lh a7, 4(a0) ; RV64ZVE32F-NEXT: lh a0, 6(a0) -; RV64ZVE32F-NEXT: vlse16.v v9, (a1), 
zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -13097,24 +13107,25 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a1, 0(a0) ; RV64ZVE32F-NEXT: lh a2, 2(a0) ; RV64ZVE32F-NEXT: lh a3, 16(a0) ; RV64ZVE32F-NEXT: lh a4, 18(a0) -; RV64ZVE32F-NEXT: lh a5, 10(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: lh a6, 4(a0) +; RV64ZVE32F-NEXT: lh a5, 8(a0) +; RV64ZVE32F-NEXT: lh a6, 10(a0) +; RV64ZVE32F-NEXT: lh a7, 4(a0) ; RV64ZVE32F-NEXT: lh a0, 6(a0) -; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: 
vslide1down.vx v8, v8, a7 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -13143,24 +13154,25 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_2xSEW_unaligned: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a1, 0(a0) ; RV64ZVE32F-NEXT: lh a2, 2(a0) ; RV64ZVE32F-NEXT: lh a3, 18(a0) ; RV64ZVE32F-NEXT: lh a4, 20(a0) -; RV64ZVE32F-NEXT: lh a5, 10(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: lh a6, 4(a0) +; RV64ZVE32F-NEXT: lh a5, 8(a0) +; RV64ZVE32F-NEXT: lh a6, 10(a0) +; RV64ZVE32F-NEXT: lh a7, 4(a0) ; RV64ZVE32F-NEXT: lh a0, 6(a0) -; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -13190,24 +13202,24 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned2(ptr %base) { ; ; 
RV64ZVE32F-LABEL: mgather_gather_2xSEW_unaligned2: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 8 -; RV64ZVE32F-NEXT: addi a2, a0, 2 -; RV64ZVE32F-NEXT: lh a3, 4(a0) -; RV64ZVE32F-NEXT: lh a4, 18(a0) -; RV64ZVE32F-NEXT: lh a5, 20(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero -; RV64ZVE32F-NEXT: lh a2, 10(a0) +; RV64ZVE32F-NEXT: lh a1, 2(a0) +; RV64ZVE32F-NEXT: lh a2, 4(a0) +; RV64ZVE32F-NEXT: lh a3, 18(a0) +; RV64ZVE32F-NEXT: lh a4, 20(a0) +; RV64ZVE32F-NEXT: lh a5, 8(a0) +; RV64ZVE32F-NEXT: lh a6, 10(a0) ; RV64ZVE32F-NEXT: lh a0, 6(a0) -; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -13240,24 +13252,25 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_4xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 16 +; RV64ZVE32F-NEXT: lh a1, 0(a0) ; RV64ZVE32F-NEXT: lh a2, 2(a0) ; RV64ZVE32F-NEXT: lh a3, 4(a0) ; RV64ZVE32F-NEXT: lh a4, 6(a0) -; RV64ZVE32F-NEXT: lh a5, 18(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v 
v8, (a0), zero -; RV64ZVE32F-NEXT: lh a6, 20(a0) +; RV64ZVE32F-NEXT: lh a5, 16(a0) +; RV64ZVE32F-NEXT: lh a6, 18(a0) +; RV64ZVE32F-NEXT: lh a7, 20(a0) ; RV64ZVE32F-NEXT: lh a0, 22(a0) -; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -13287,24 +13300,25 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_4xSEW_partial_align: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 16 +; RV64ZVE32F-NEXT: lh a1, 0(a0) ; RV64ZVE32F-NEXT: lh a2, 2(a0) ; RV64ZVE32F-NEXT: lh a3, 4(a0) ; RV64ZVE32F-NEXT: lh a4, 6(a0) -; RV64ZVE32F-NEXT: lh a5, 18(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: lh a6, 20(a0) +; RV64ZVE32F-NEXT: lh a5, 16(a0) +; RV64ZVE32F-NEXT: lh a6, 18(a0) +; RV64ZVE32F-NEXT: lh a7, 20(a0) ; RV64ZVE32F-NEXT: lh a0, 22(a0) -; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, 
v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -13343,23 +13357,24 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_shuffle_rotate: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a1, 8(a0) ; RV64ZVE32F-NEXT: lh a2, 10(a0) ; RV64ZVE32F-NEXT: lh a3, 12(a0) ; RV64ZVE32F-NEXT: lh a4, 14(a0) -; RV64ZVE32F-NEXT: lh a5, 2(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: lh a6, 4(a0) +; RV64ZVE32F-NEXT: lh a5, 0(a0) +; RV64ZVE32F-NEXT: lh a6, 2(a0) +; RV64ZVE32F-NEXT: lh a7, 4(a0) ; RV64ZVE32F-NEXT: lh a0, 6(a0) -; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x 
i64> @@ -13390,24 +13405,25 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_shuffle_vrgather: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a1, 0(a0) ; RV64ZVE32F-NEXT: lh a2, 4(a0) ; RV64ZVE32F-NEXT: lh a3, 6(a0) ; RV64ZVE32F-NEXT: lh a4, 2(a0) -; RV64ZVE32F-NEXT: lh a5, 10(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: lh a6, 12(a0) +; RV64ZVE32F-NEXT: lh a5, 8(a0) +; RV64ZVE32F-NEXT: lh a6, 10(a0) +; RV64ZVE32F-NEXT: lh a7, 12(a0) ; RV64ZVE32F-NEXT: lh a0, 14(a0) -; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -13889,14 +13905,14 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { ; ; RV64ZVE32F-LABEL: masked_gather_widen_sew_negative_stride: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 136 +; RV64ZVE32F-NEXT: lw a1, 136(a0) ; RV64ZVE32F-NEXT: lw a2, 140(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vlse32.v v8, (a1), zero -; RV64ZVE32F-NEXT: lw a1, 0(a0) +; RV64ZVE32F-NEXT: lw a3, 0(a0) ; RV64ZVE32F-NEXT: lw a0, 4(a0) +; RV64ZVE32F-NEXT: 
vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr i32, ptr %base, <4 x i64> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 1dd74a7c9dd1b..fe037a5af57c0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -1615,10 +1615,11 @@ define void @mscatter_v2i64_truncstore_v2i32(<2 x i64> %val, <2 x ptr> %ptrs, <2 ; ; RV32ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i32: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lw a1, 8(a0) +; RV32ZVE32F-NEXT: lw a1, 0(a0) +; RV32ZVE32F-NEXT: lw a0, 8(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vlse32.v v9, (a0), zero -; RV32ZVE32F-NEXT: vslide1down.vx v9, v9, a1 +; RV32ZVE32F-NEXT: vmv.v.x v9, a1 +; RV32ZVE32F-NEXT: vslide1down.vx v9, v9, a0 ; RV32ZVE32F-NEXT: vsoxei32.v v9, (zero), v8, v0.t ; RV32ZVE32F-NEXT: ret ; @@ -5606,16 +5607,17 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -32 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 32 -; RV32ZVE32F-NEXT: sw s0, 28(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s1, 24(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s2, 20(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s3, 16(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s4, 12(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s5, 8(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s6, 4(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s7, 0(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: addi sp, sp, -48 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 48 +; 
RV32ZVE32F-NEXT: sw s0, 44(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 40(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 36(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 32(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s4, 28(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s6, 20(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s7, 16(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s8, 12(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 @@ -5624,6 +5626,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: .cfi_offset s5, -24 ; RV32ZVE32F-NEXT: .cfi_offset s6, -28 ; RV32ZVE32F-NEXT: .cfi_offset s7, -32 +; RV32ZVE32F-NEXT: .cfi_offset s8, -36 ; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: lw a4, 56(a0) ; RV32ZVE32F-NEXT: lw a5, 52(a0) @@ -5641,16 +5644,17 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: lw s2, 56(a2) ; RV32ZVE32F-NEXT: lw s3, 48(a2) ; RV32ZVE32F-NEXT: lw s4, 40(a2) -; RV32ZVE32F-NEXT: lw s5, 8(a2) +; RV32ZVE32F-NEXT: lw s5, 32(a2) +; RV32ZVE32F-NEXT: lw s6, 0(a2) +; RV32ZVE32F-NEXT: lw s7, 8(a2) +; RV32ZVE32F-NEXT: lw s8, 16(a2) +; RV32ZVE32F-NEXT: lw a2, 24(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: lw s6, 16(a2) -; RV32ZVE32F-NEXT: lw s7, 24(a2) -; RV32ZVE32F-NEXT: lw a2, 32(a2) -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s6 +; RV32ZVE32F-NEXT: vmv.v.x v8, s6 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3 ; RV32ZVE32F-NEXT: 
vslide1down.vx v8, v8, s2 @@ -5689,15 +5693,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB51_9: # %else14 -; RV32ZVE32F-NEXT: lw s0, 28(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s1, 24(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s2, 20(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s3, 16(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s4, 12(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s5, 8(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s6, 4(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s7, 0(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: addi sp, sp, 32 +; RV32ZVE32F-NEXT: lw s0, 44(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 40(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 36(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 32(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s4, 28(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s5, 24(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s6, 20(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s7, 16(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s8, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 48 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB51_10: # %cond.store ; RV32ZVE32F-NEXT: lw a1, 4(a0) @@ -10238,16 +10243,17 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, ptr %base, <8 x i64> %idx ; RV32ZVE32F-NEXT: lw a2, 56(a1) ; RV32ZVE32F-NEXT: lw a3, 48(a1) ; RV32ZVE32F-NEXT: lw a4, 40(a1) -; RV32ZVE32F-NEXT: lw a5, 8(a1) +; RV32ZVE32F-NEXT: lw a5, 32(a1) +; RV32ZVE32F-NEXT: lw a6, 0(a1) +; RV32ZVE32F-NEXT: lw a7, 8(a1) +; RV32ZVE32F-NEXT: lw t0, 16(a1) +; RV32ZVE32F-NEXT: lw a1, 24(a1) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vlse32.v v8, (a1), zero -; RV32ZVE32F-NEXT: lw a6, 16(a1) -; RV32ZVE32F-NEXT: lw a7, 24(a1) -; RV32ZVE32F-NEXT: lw a1, 32(a1) -; RV32ZVE32F-NEXT: 
vslide1down.vx v8, v8, a5 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV32ZVE32F-NEXT: vmv.v.x v8, a6 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll index e95b0bf3497fd..e57b6a22dd6ea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvl256b | FileCheck %s --check-prefixes=CHECK,V ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+zvl256b | FileCheck %s --check-prefixes=CHECK,ZVE32F -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+no-optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+no-optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED %struct.foo = type { i32, i32, i32, i32 } @@ -221,9 +221,10 @@ define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias noc ; V-NEXT: vsetvli zero, a3, e8, m1, ta, ma ; V-NEXT: .LBB5_1: # %vector.body ; V-NEXT: # =>This Inner Loop Header: Depth=1 -; V-NEXT: vlse8.v v8, (a1), zero -; V-NEXT: vle8.v v9, (a0) -; V-NEXT: vdivu.vv v8, v8, v9 +; V-NEXT: lbu a3, 
0(a1) +; V-NEXT: vle8.v v8, (a0) +; V-NEXT: vmv.v.x v9, a3 +; V-NEXT: vdivu.vv v8, v9, v8 ; V-NEXT: vse8.v v8, (a0) ; V-NEXT: addi a0, a0, 32 ; V-NEXT: addi a1, a1, 160 @@ -238,9 +239,10 @@ define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias noc ; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma ; ZVE32F-NEXT: .LBB5_1: # %vector.body ; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 -; ZVE32F-NEXT: vlse8.v v8, (a1), zero -; ZVE32F-NEXT: vle8.v v9, (a0) -; ZVE32F-NEXT: vdivu.vv v8, v8, v9 +; ZVE32F-NEXT: lbu a3, 0(a1) +; ZVE32F-NEXT: vle8.v v8, (a0) +; ZVE32F-NEXT: vmv.v.x v9, a3 +; ZVE32F-NEXT: vdivu.vv v8, v9, v8 ; ZVE32F-NEXT: vse8.v v8, (a0) ; ZVE32F-NEXT: addi a0, a0, 32 ; ZVE32F-NEXT: addi a1, a1, 160 @@ -248,23 +250,22 @@ define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias noc ; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup ; ZVE32F-NEXT: ret ; -; NOT-OPTIMIZED-LABEL: gather_zero_stride_unfold: -; NOT-OPTIMIZED: # %bb.0: # %entry -; NOT-OPTIMIZED-NEXT: addi a2, a0, 1024 -; NOT-OPTIMIZED-NEXT: li a3, 32 -; NOT-OPTIMIZED-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; NOT-OPTIMIZED-NEXT: .LBB5_1: # %vector.body -; NOT-OPTIMIZED-NEXT: # =>This Inner Loop Header: Depth=1 -; NOT-OPTIMIZED-NEXT: lbu a3, 0(a1) -; NOT-OPTIMIZED-NEXT: vle8.v v8, (a0) -; NOT-OPTIMIZED-NEXT: vmv.v.x v9, a3 -; NOT-OPTIMIZED-NEXT: vdivu.vv v8, v9, v8 -; NOT-OPTIMIZED-NEXT: vse8.v v8, (a0) -; NOT-OPTIMIZED-NEXT: addi a0, a0, 32 -; NOT-OPTIMIZED-NEXT: addi a1, a1, 160 -; NOT-OPTIMIZED-NEXT: bne a0, a2, .LBB5_1 -; NOT-OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup -; NOT-OPTIMIZED-NEXT: ret +; OPTIMIZED-LABEL: gather_zero_stride_unfold: +; OPTIMIZED: # %bb.0: # %entry +; OPTIMIZED-NEXT: addi a2, a0, 1024 +; OPTIMIZED-NEXT: li a3, 32 +; OPTIMIZED-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; OPTIMIZED-NEXT: .LBB5_1: # %vector.body +; OPTIMIZED-NEXT: # =>This Inner Loop Header: Depth=1 +; OPTIMIZED-NEXT: vlse8.v v8, (a1), zero +; OPTIMIZED-NEXT: vle8.v v9, (a0) 
+; OPTIMIZED-NEXT: vdivu.vv v8, v8, v9 +; OPTIMIZED-NEXT: vse8.v v8, (a0) +; OPTIMIZED-NEXT: addi a0, a0, 32 +; OPTIMIZED-NEXT: addi a1, a1, 160 +; OPTIMIZED-NEXT: bne a0, a2, .LBB5_1 +; OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup +; OPTIMIZED-NEXT: ret entry: br label %vector.body diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll index d6ca6c5a4b83d..2a31ff5ab3f8c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -695,9 +695,11 @@ define <8 x i16> @vwadd_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwadd_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: lh a1, 0(a1) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwadd.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x @@ -748,9 +750,11 @@ define <4 x i32> @vwadd_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwadd_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vwadd.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x @@ -864,9 +868,11 @@ define <2 x i64> @vwadd_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwadd_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse64.v v8, (a1), zero +; RV64-NEXT: vmv.v.x v8, a1 +; 
RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwadd.wv v8, v8, v9 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll index 61378a424ecba..1fc6af2d4cc1c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -695,9 +695,11 @@ define <8 x i16> @vwaddu_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwaddu_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: lh a1, 0(a1) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwaddu.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x @@ -748,9 +750,11 @@ define <4 x i32> @vwaddu_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwaddu_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vwaddu.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x @@ -876,9 +880,11 @@ define <2 x i64> @vwaddu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwaddu_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse64.v v8, (a1), zero +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwaddu.wv v8, v8, v9 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll index ee114350a4323..c73b3a0dce6be 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -734,9 +734,10 @@ define <8 x i16> @vwmulsu_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwmulsu_vx_v8i16_i8_swap(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_vx_v8i16_i8_swap: ; CHECK: # %bb.0: +; CHECK-NEXT: lb a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse8.v v10, (a1), zero +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vwmulsu.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll index a2675d59ade93..c3353a2df4912 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -677,9 +677,10 @@ define <16 x i64> @vwsub_vx_v16i64(ptr %x, i32 %y) { define <8 x i16> @vwsub_vx_v8i16_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v8i16_i8: ; CHECK: # %bb.0: +; CHECK-NEXT: lb a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse8.v v10, (a1), zero +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vwsub.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x @@ -695,9 +696,11 @@ define <8 x i16> @vwsub_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwsub_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: lh a1, 0(a1) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwsub.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x @@ -731,9 +734,10 @@ define 
<4 x i32> @vwsub_vx_v4i32_i8(ptr %x, ptr %y) { define <4 x i32> @vwsub_vx_v4i32_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v4i32_i16: ; CHECK: # %bb.0: +; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse16.v v10, (a1), zero +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vwsub.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x @@ -749,9 +753,11 @@ define <4 x i32> @vwsub_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwsub_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vwsub.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x @@ -834,9 +840,10 @@ define <2 x i64> @vwsub_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwsub_vx_v2i64_i32: ; RV64: # %bb.0: +; RV64-NEXT: lw a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse32.v v10, (a1), zero +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vwsub.vv v8, v10, v9 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x @@ -867,9 +874,11 @@ define <2 x i64> @vwsub_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwsub_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse64.v v8, (a1), zero +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwsub.wv v8, v8, v9 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll index 1a9e3aac00341..9b5f4a5012f4e 100644 
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -677,9 +677,10 @@ define <16 x i64> @vwsubu_vx_v16i64(ptr %x, i32 %y) { define <8 x i16> @vwsubu_vx_v8i16_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v8i16_i8: ; CHECK: # %bb.0: +; CHECK-NEXT: lbu a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse8.v v10, (a1), zero +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vwsubu.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x @@ -695,9 +696,11 @@ define <8 x i16> @vwsubu_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwsubu_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: lh a1, 0(a1) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwsubu.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x @@ -731,9 +734,10 @@ define <4 x i32> @vwsubu_vx_v4i32_i8(ptr %x, ptr %y) { define <4 x i32> @vwsubu_vx_v4i32_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v4i32_i16: ; CHECK: # %bb.0: +; CHECK-NEXT: lhu a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse16.v v10, (a1), zero +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vwsubu.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x @@ -749,9 +753,11 @@ define <4 x i32> @vwsubu_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwsubu_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, 
e16, mf2, ta, ma ; CHECK-NEXT: vwsubu.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x @@ -846,9 +852,10 @@ define <2 x i64> @vwsubu_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwsubu_vx_v2i64_i32: ; RV64: # %bb.0: +; RV64-NEXT: lwu a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse32.v v10, (a1), zero +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vwsubu.vv v8, v10, v9 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x @@ -879,9 +886,11 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwsubu_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse64.v v8, (a1), zero +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwsubu.wv v8, v8, v9 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll index a0f269b59bfe5..ab8a595dde5d7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll @@ -27,13 +27,13 @@ define @test2( %a, ; CHECK-LABEL: test2: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI1_0) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI1_0) -; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v9, (a1), zero +; CHECK-NEXT: fld fa5, %lo(.LCPI1_0)(a1) ; CHECK-NEXT: lui a1, %hi(.LCPI1_1) -; CHECK-NEXT: fld fa5, %lo(.LCPI1_1)(a1) +; CHECK-NEXT: fld fa4, %lo(.LCPI1_1)(a1) +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v9, fa5 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfadd.vf v9, v9, fa5, v0.t +; CHECK-NEXT: vfadd.vf v9, v9, fa4, v0.t ; CHECK-NEXT: vfmul.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %t = call @llvm.vp.fmul.nxv1f64( %a, splat (double 2.0), %m, i32 %evl) @@ -46,13 
+46,13 @@ define @test3( %a, %v) { ; CHECK-LABEL: vreduce_fmin_nxv10f16: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI73_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI73_0) -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v12, (a0), zero +; CHECK-NEXT: flh fa5, %lo(.LCPI73_0)(a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v12, fa5 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v10, v12, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll index f93022c9d132d..1ae20c37d11e3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll @@ -124,12 +124,12 @@ define @test4(i64 %avl, i8 zeroext %cond, @test6(i64 %avl, i8 zeroext %cond, @vsplat_nxv8f16(half %f) {