Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
[AArch64][SME] Spill p-regs as z-regs when streaming hazards are poss…
…ible

This patch adds a new option `-aarch64-enable-zpr-predicate-spills`
(which is disabled by default). This option replaces predicate spills
with vector spills in streaming[-compatible] functions.

For example:

```
str	p8, [sp, #7, mul vl]            // 2-byte Folded Spill
// ...
ldr	p8, [sp, #7, mul vl]            // 2-byte Folded Reload
```

Becomes:

```
mov	z0.b, p8/z, #1
str	z0, [sp]                        // 16-byte Folded Spill
// ...
ldr	z0, [sp]                        // 16-byte Folded Reload
ptrue	p4.b
cmpne	p8.b, p4/z, z0.b, #0
```

This is done to avoid streaming memory hazards between FPR/vector and
predicate spills, which currently occupy the same stack area even when
the `-aarch64-stack-hazard-size` flag is set.

This is implemented with two new pseudos SPILL_PPR_TO_ZPR_SLOT_PSEUDO
and FILL_PPR_FROM_ZPR_SLOT_PSEUDO. The expansion of these pseudos
handles scavenging the required registers (z0 in the above example) and,
in the worst case, spilling a register to an emergency stack slot in the
expansion. The condition flags are also preserved around the `cmpne`
in case they are live at the expansion point.
  • Loading branch information
MacDue committed Feb 3, 2025
commit b39e20256454e9b27a1348ed0e30277b80a52a26
335 changes: 331 additions & 4 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Large diffs are not rendered by default.

16 changes: 15 additions & 1 deletion llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ static cl::opt<unsigned>
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
: AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
AArch64::CATCHRET),
RI(STI.getTargetTriple()), Subtarget(STI) {}
RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
Expand Down Expand Up @@ -2438,6 +2438,8 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::STZ2Gi:
case AArch64::STZGi:
case AArch64::TAGPstack:
case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
return 2;
case AArch64::LD1B_D_IMM:
case AArch64::LD1B_H_IMM:
Expand Down Expand Up @@ -4223,6 +4225,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
MinOffset = -256;
MaxOffset = 254;
break;
case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
case AArch64::LDR_ZXI:
case AArch64::STR_ZXI:
Scale = TypeSize::getScalable(16);
Expand Down Expand Up @@ -5355,6 +5359,11 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZXI;
StackID = TargetStackID::ScalableVector;
} else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected predicate store without SVE store instructions");
Opc = AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO;
StackID = TargetStackID::ScalableVector;
}
break;
case 24:
Expand Down Expand Up @@ -5527,6 +5536,11 @@ void AArch64InstrInfo::loadRegFromStackSlot(
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZXI;
StackID = TargetStackID::ScalableVector;
} else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected predicate load without SVE load instructions");
Opc = AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO;
StackID = TargetStackID::ScalableVector;
}
break;
case 24:
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "AArch64GenRegisterInfo.inc"

AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
: AArch64GenRegisterInfo(AArch64::LR), TT(TT) {
AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT, unsigned HwMode)
: AArch64GenRegisterInfo(AArch64::LR, 0, 0, 0, HwMode), TT(TT) {
AArch64_MC::initLLVMToCVRegMapping(this);
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64RegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
const Triple &TT;

public:
AArch64RegisterInfo(const Triple &TT);
AArch64RegisterInfo(const Triple &TT, unsigned HwMode);

// FIXME: This should be tablegen'd like getDwarfRegNum is
int getSEHRegNum(unsigned i) const {
Expand Down
11 changes: 10 additions & 1 deletion llvm/lib/Target/AArch64/AArch64RegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -979,10 +979,19 @@ class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size,
//******************************************************************************

// SVE predicate register classes.

// Note: This hardware mode is enabled in AArch64Subtarget::getHwModeSet()
// (without the use of the table-gen'd predicates).
def SMEWithStreamingMemoryHazards : HwMode<"", [Predicate<"false">]>;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this not have to check the bitmask to verify bit 0 is set? (as you've set it in AArch64Subtarget)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The predicate and feature flags of the hardware mode are not used; the implementation in AArch64Subtarget overrides the default implementation (which only checks CPU features). The predicate is only used for hardware-mode specific DAG patterns (of which we have none; see https://reviews.llvm.org/D146012).

Copy link
Member Author

@MacDue MacDue Jan 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(I initially attempted to use the predicate in table-gen to enable this mode, but was surprised to find out it's not actually used to enable the hardware mode).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The confusing part to me is that I don't see how the value of 1 << 0 then relates to RegInfo<16, 16, 16>.
What if AArch64Subtarget::getHwModeSet would set 1 << 1 instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can remove the << 0 (I was just doing it like table-gen does). But really, this is just selecting between hardware mode 0 (the default with 2 x vscale predicate spills) and hardware mode 1 (with 16 x vscale predicate spills).

I think getHwModeSet returns a bitset (no bits set = default), bit 0 set = mode 1, bit 1 = mode 2 (and I think multiple bits can be set). The bits are chosen by table-gen, which does not seem to give them names.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So:

What if AArch64Subtarget::getHwModeSet would set 1 << 1 instead?

That'd activate mode 2 and something would crash, because that does not exist.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe what you're saying, but it's odd to me that TableGen doesn't generate an enum for this, because this means we need to make the implicit assumption that SMEWithStreamingMemoryHazards == 1, even though this is not expressed anywhere.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've updated table-gen to make an AArch64HwModeBits::SMEWithZPRPredicateSpills enum automatically, which makes this: return to_underlying(AArch64HwModeBits::SMEWithZPRPredicateSpills);, which is much less magic :)


// Per-hardware-mode register info for the SVE predicate register classes.
// DefaultMode uses RegInfo<16,16,16>; SMEWithStreamingMemoryHazards uses
// RegInfo<16,128,128>, which gives predicates a larger spill size/alignment.
// NOTE(review): the RegInfo fields are presumably <RegSize, SpillSize,
// SpillAlignment> in bits -- confirm against the RegInfo class definition.
def PPRSpillFillRI : RegInfoByHwMode<
[DefaultMode, SMEWithStreamingMemoryHazards],
[RegInfo<16,16,16>, RegInfo<16,128,128>]>;

class PPRClass<int firstreg, int lastreg, int step = 1> : RegisterClass<"AArch64",
[ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16,
(sequence "P%u", firstreg, lastreg, step)> {
let Size = 16;
let RegInfos = PPRSpillFillRI;
}

def PPR : PPRClass<0, 15> {
Expand Down
22 changes: 22 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Subtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ static cl::alias AArch64StreamingStackHazardSize(
cl::desc("alias for -aarch64-streaming-hazard-size"),
cl::aliasopt(AArch64StreamingHazardSize));

// Hidden command-line flag, off by default. When set, it is consulted by
// AArch64Subtarget::getHwModeSet() to select the hardware mode in which SVE
// predicates are spilled/reloaded as data vectors (ZPRs).
static cl::opt<bool> EnableZPRPredicateSpills(
"aarch64-enable-zpr-predicate-spills", cl::init(false), cl::Hidden,
cl::desc(
"Enables spilling/reloading SVE predicates as data vectors (ZPRs)"));

// Subreg liveness tracking is disabled by default for now until all issues
// are ironed out. This option allows the feature to be used in tests.
static cl::opt<bool>
Expand Down Expand Up @@ -400,6 +405,23 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
EnableSubregLiveness = EnableSubregLivenessTracking.getValue();
}

/// Returns the bitset of hardware modes active for this subtarget.
///
/// The result is 0 (no mode bits set) unless ZPR predicate spills are enabled
/// on the command line and the function is streaming or streaming-compatible,
/// in which case bit 0 is set.
unsigned AArch64Subtarget::getHwModeSet() const {
  // FIXME: This overrides the table-gen'd `getHwModeSet()` which only looks at
  // CPU features.

  // Without the opt-in flag no special hardware mode is requested.
  if (!EnableZPRPredicateSpills.getValue())
    return 0;

  // The special mode is only used in streaming[-compatible] functions.
  if (!isStreaming() && !isStreamingCompatible())
    return 0;

  // Bit 0 selects the special hardware mode that changes the spill size (and
  // alignment) for the predicate register class.
  constexpr unsigned ZPRPredicateSpillModeBit = 1u << 0;
  return ZPRPredicateSpillModeBit;
}

/// Returns the pointer obtained from CallLoweringInfo.get().
const CallLowering *AArch64Subtarget::getCallLowering() const {
return CallLoweringInfo.get();
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Subtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool IsStreaming = false, bool IsStreamingCompatible = false,
bool HasMinSize = false);

virtual unsigned getHwModeSet() const override;

// Getters for SubtargetFeatures defined in tablegen
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
bool GETTER() const { return ATTRIBUTE; }
Expand Down
14 changes: 14 additions & 0 deletions llvm/lib/Target/AArch64/SMEInstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,20 @@ def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO :
let hasPostISelHook = 1;
}

// Pseudo used to spill the predicate register $Pt to a vector-sized (ZPR)
// stack slot addressed by base $Rn and immediate $imm9. It has no outputs and
// is marked as a store without other side effects. Its expansion converts the
// predicate to a data vector and stores that vector, scavenging the required
// Z register.
def SPILL_PPR_TO_ZPR_SLOT_PSEUDO :
Pseudo<(outs), (ins PPRorPNRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), []>, Sched<[]>
{
let mayStore = 1;
let hasSideEffects = 0;
}

// Pseudo used to reload the predicate register $Pt from a vector-sized (ZPR)
// stack slot addressed by base $Rn and immediate $imm9. It is marked as a load
// without other side effects. Its expansion loads the data vector and
// converts it back to a predicate with a compare, scavenging the required
// registers and preserving the condition flags around the compare.
def FILL_PPR_FROM_ZPR_SLOT_PSEUDO :
Pseudo<(outs PPRorPNRAny:$Pt), (ins GPR64sp:$Rn, simm9:$imm9), []>, Sched<[]>
{
let mayLoad = 1;
let hasSideEffects = 0;
}

def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>;
def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore,
[SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>;
Expand Down
Loading