Thanks for visiting codestin.com
Credit goes to llvm.org

LLVM 22.0.0git
AMDGPURegBankLegalizeHelper.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
26
27#define DEBUG_TYPE "amdgpu-regbanklegalize"
28
29using namespace llvm;
30using namespace AMDGPU;
31
34 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
35 : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
36 MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
37 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
38 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
39 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
40
42 const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI);
43 const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI);
44
45 SmallSet<Register, 4> WaterfallSgprs;
46 unsigned OpIdx = 0;
47 if (Mapping.DstOpMapping.size() > 0) {
48 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
49 applyMappingDst(MI, OpIdx, Mapping.DstOpMapping);
50 }
51 if (Mapping.SrcOpMapping.size() > 0) {
52 B.setInstr(MI);
53 applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs);
54 }
55
56 lower(MI, Mapping, WaterfallSgprs);
57}
58
59bool RegBankLegalizeHelper::executeInWaterfallLoop(
61 SmallSet<Register, 4> &SGPROperandRegs) {
62 // Track use registers which have already been expanded with a readfirstlane
63 // sequence. This may have multiple uses if moving a sequence.
64 DenseMap<Register, Register> WaterfalledRegMap;
65
66 MachineBasicBlock &MBB = B.getMBB();
67 MachineFunction &MF = B.getMF();
68
70 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
71 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
72 if (IsWave32) {
73 MovExecOpc = AMDGPU::S_MOV_B32;
74 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
75 XorTermOpc = AMDGPU::S_XOR_B32_term;
76 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
77 ExecReg = AMDGPU::EXEC_LO;
78 } else {
79 MovExecOpc = AMDGPU::S_MOV_B64;
80 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
81 XorTermOpc = AMDGPU::S_XOR_B64_term;
82 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
83 ExecReg = AMDGPU::EXEC;
84 }
85
86#ifndef NDEBUG
87 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
88#endif
89
90 MachineRegisterInfo &MRI = *B.getMRI();
91 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
92 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
93
94 // Don't bother using generic instructions/registers for the exec mask.
95 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
96
97 Register SavedExec = MRI.createVirtualRegister(WaveRC);
98
99 // To insert the loop we need to split the block. Move everything before
100 // this point to a new block, and insert a new empty block before this
101 // instruction.
104 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
105 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
107 ++MBBI;
108 MF.insert(MBBI, LoopBB);
109 MF.insert(MBBI, BodyBB);
110 MF.insert(MBBI, RestoreExecBB);
111 MF.insert(MBBI, RemainderBB);
112
113 LoopBB->addSuccessor(BodyBB);
114 BodyBB->addSuccessor(RestoreExecBB);
115 BodyBB->addSuccessor(LoopBB);
116
117 // Move the rest of the block into a new block.
119 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
120
121 MBB.addSuccessor(LoopBB);
122 RestoreExecBB->addSuccessor(RemainderBB);
123
124 B.setInsertPt(*LoopBB, LoopBB->end());
125
126 // +-MBB:------------+
127 // | ... |
128 // | %0 = G_INST_1 |
129 // | %Dst = MI %Vgpr |
130 // | %1 = G_INST_2 |
131 // | ... |
132 // +-----------------+
133 // ->
134 // +-MBB-------------------------------+
135 // | ... |
136 // | %0 = G_INST_1 |
137 // | %SaveExecReg = S_MOV_B32 $exec_lo |
138 // +----------------|------------------+
139 // | /------------------------------|
140 // V V |
141 // +-LoopBB---------------------------------------------------------------+ |
142 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
143 // | instead of executing for each lane, see if other lanes had | |
144 // | same value for %Vgpr and execute for them also. | |
145 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
146 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
147 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
148 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
149 // +----------------|-----------------------------------------------------+ |
150 // V |
151 // +-BodyBB------------------------------------------------------------+ |
152 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
153 // | executed only for active lanes and written to Dst | |
154 // | $exec = S_XOR_B32 $exec, %SavedExec | |
155 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
156 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
157 // | SI_WATERFALL_LOOP LoopBB |-----|
158 // +----------------|--------------------------------------------------+
159 // V
160 // +-RestoreExecBB--------------------------+
161 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
162 // +----------------|-----------------------+
163 // V
164 // +-RemainderBB:----------------------+
165 // | %1 = G_INST_2 |
166 // | ... |
167 // +---------------------------------- +
168
169 // Move the instruction into the loop body. Note we moved everything after
170 // Range.end() already into a new block, so Range.end() is no longer valid.
171 BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
172
173 // Figure out the iterator range after splicing the instructions.
174 MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
175 auto NewEnd = BodyBB->end();
176 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
177
178 B.setMBB(*LoopBB);
179 Register CondReg;
180
181 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
182 for (MachineOperand &Op : MI.all_uses()) {
183 Register OldReg = Op.getReg();
184 if (!SGPROperandRegs.count(OldReg))
185 continue;
186
187 // See if we already processed this register in another instruction in
188 // the sequence.
189 auto OldVal = WaterfalledRegMap.find(OldReg);
190 if (OldVal != WaterfalledRegMap.end()) {
191 Op.setReg(OldVal->second);
192 continue;
193 }
194
195 Register OpReg = Op.getReg();
196 LLT OpTy = MRI.getType(OpReg);
197
198 // TODO: support for agpr
199 assert(MRI.getRegBank(OpReg) == VgprRB);
200 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
201 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
202
203 // Build the comparison(s), CurrentLaneReg == OpReg.
204 unsigned OpSize = OpTy.getSizeInBits();
205 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
206 LLT PartTy = LLT::scalar(PartSize);
207 unsigned NumParts = OpSize / PartSize;
209 SmallVector<Register, 8> CurrentLaneParts;
210
211 if (NumParts == 1) {
212 OpParts.push_back(OpReg);
213 CurrentLaneParts.push_back(CurrentLaneReg);
214 } else {
215 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
216 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
217 for (unsigned i = 0; i < NumParts; ++i) {
218 OpParts.push_back(UnmergeOp.getReg(i));
219 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
220 }
221 }
222
223 for (unsigned i = 0; i < NumParts; ++i) {
224 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
225 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
226
227 if (!CondReg)
228 CondReg = CmpReg;
229 else
230 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
231 }
232
233 Op.setReg(CurrentLaneReg);
234
235 // Make sure we don't re-process this register again.
236 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
237 }
238 }
239
240 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
241 Register CondRegLM =
242 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
243 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
244
245 // Update EXEC, save the original EXEC value to SavedExec.
246 B.buildInstr(AndSaveExecOpc)
247 .addDef(SavedExec)
248 .addReg(CondRegLM, RegState::Kill);
249 MRI.setSimpleHint(SavedExec, CondRegLM);
250
251 B.setInsertPt(*BodyBB, BodyBB->end());
252
253 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
254 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
255
256 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
257 // s_cbranch_scc0?
258
259 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
260 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
261
262 // Save the EXEC mask before the loop.
263 B.setInsertPt(MBB, MBB.end());
264 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
265
266 // Restore the EXEC mask after the loop.
267 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
268 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
269
270 // Set the insert point after the original instruction, so any new
271 // instructions will be in the remainder.
272 B.setInsertPt(*RemainderBB, RemainderBB->begin());
273
274 return true;
275}
276
277void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
278 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
279 MachineFunction &MF = B.getMF();
280 assert(MI.getNumMemOperands() == 1);
281 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
282 Register Dst = MI.getOperand(0).getReg();
283 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
284 Register Base = MI.getOperand(1).getReg();
285 LLT PtrTy = MRI.getType(Base);
286 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
287 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
288 SmallVector<Register, 4> LoadPartRegs;
289
290 unsigned ByteOffset = 0;
291 for (LLT PartTy : LLTBreakdown) {
292 Register BasePlusOffset;
293 if (ByteOffset == 0) {
294 BasePlusOffset = Base;
295 } else {
296 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
297 BasePlusOffset =
298 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
299 }
300 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
301 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
302 LoadPartRegs.push_back(LoadPart.getReg(0));
303 ByteOffset += PartTy.getSizeInBytes();
304 }
305
306 if (!MergeTy.isValid()) {
307 // Loads are of same size, concat or merge them together.
308 B.buildMergeLikeInstr(Dst, LoadPartRegs);
309 } else {
310 // Loads are not all of same size, need to unmerge them to smaller pieces
311 // of MergeTy type, then merge pieces to Dst.
312 SmallVector<Register, 4> MergeTyParts;
313 for (Register Reg : LoadPartRegs) {
314 if (MRI.getType(Reg) == MergeTy) {
315 MergeTyParts.push_back(Reg);
316 } else {
317 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
318 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
319 MergeTyParts.push_back(Unmerge.getReg(i));
320 }
321 }
322 B.buildMergeLikeInstr(Dst, MergeTyParts);
323 }
324 MI.eraseFromParent();
325}
326
327void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
328 LLT MergeTy) {
329 MachineFunction &MF = B.getMF();
330 assert(MI.getNumMemOperands() == 1);
331 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
332 Register Dst = MI.getOperand(0).getReg();
333 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
334 Register Base = MI.getOperand(1).getReg();
335
336 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
337 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
338
339 if (WideTy.isScalar()) {
340 B.buildTrunc(Dst, WideLoad);
341 } else {
342 SmallVector<Register, 4> MergeTyParts;
343 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
344
345 LLT DstTy = MRI.getType(Dst);
346 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
347 for (unsigned i = 0; i < NumElts; ++i) {
348 MergeTyParts.push_back(Unmerge.getReg(i));
349 }
350 B.buildMergeLikeInstr(Dst, MergeTyParts);
351 }
352 MI.eraseFromParent();
353}
354
355void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
356 Register Dst = MI.getDstReg();
357 Register Ptr = MI.getPointerReg();
358 MachineMemOperand &MMO = MI.getMMO();
359 unsigned MemSize = 8 * MMO.getSize().getValue();
360
361 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
362
363 if (MI.getOpcode() == G_LOAD) {
364 B.buildLoad(Dst, Ptr, *WideMMO);
365 } else {
366 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
367
368 if (MI.getOpcode() == G_ZEXTLOAD) {
369 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
370 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
371 B.buildAnd(Dst, Load, MaskCst);
372 } else {
373 assert(MI.getOpcode() == G_SEXTLOAD);
374 B.buildSExtInReg(Dst, Load, MemSize);
375 }
376 }
377
378 MI.eraseFromParent();
379}
380
381void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
382 Register Dst = MI.getOperand(0).getReg();
383 LLT Ty = MRI.getType(Dst);
384 Register Src = MI.getOperand(1).getReg();
385 unsigned Opc = MI.getOpcode();
386 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
387 if (Ty == S32 || Ty == S16) {
388 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
389 auto False = B.buildConstant({VgprRB, Ty}, 0);
390 B.buildSelect(Dst, Src, True, False);
391 } else if (Ty == S64) {
392 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
393 auto False = B.buildConstant({VgprRB_S32}, 0);
394 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
395 MachineInstrBuilder Hi;
396 switch (Opc) {
397 case G_SEXT:
398 Hi = Lo;
399 break;
400 case G_ZEXT:
401 Hi = False;
402 break;
403 case G_ANYEXT:
404 Hi = B.buildUndef({VgprRB_S32});
405 break;
406 default:
407 llvm_unreachable("Opcode not supported");
408 }
409
410 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
411 } else {
412 llvm_unreachable("Type not supported");
413 }
414
415 MI.eraseFromParent();
416}
417
418std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
419 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
420 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
421 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
422 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
423 return {Lo.getReg(0), Hi.getReg(0)};
424}
425
426std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
427 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
428 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
429 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
430 return {Lo.getReg(0), Hi.getReg(0)};
431}
432
433std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
434 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
435 auto Lo = PackedS32;
436 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
437 return {Lo.getReg(0), Hi.getReg(0)};
438}
439
440void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
441 Register Lo, Hi;
442 switch (MI.getOpcode()) {
443 case AMDGPU::G_SHL: {
444 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
445 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
446 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
447 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
448 break;
449 }
450 case AMDGPU::G_LSHR: {
451 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
452 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
453 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
454 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
455 break;
456 }
457 case AMDGPU::G_ASHR: {
458 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
459 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
460 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
461 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
462 break;
463 }
464 default:
465 llvm_unreachable("Unpack lowering not implemented");
466 }
467 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
468 MI.eraseFromParent();
469}
470
473 return (GI->is(Intrinsic::amdgcn_sbfe));
474
475 return MI.getOpcode() == AMDGPU::G_SBFX;
476}
477
// Lower a 64-bit divergent (VGPR) bitfield extract. There is no 64-bit VALU
// BFE instruction, so this expands to a 64-bit shift plus either another
// shift pair (variable width) or a 32-bit BFE on one half (constant width).
void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == LLT::scalar(64));
  bool Signed = isSignedBFE(MI);
  // Intrinsic form carries the intrinsic ID in operand 1, so data operands
  // start one slot later.
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract bitfield from Src, LSBit is the least-significant bit for the
  // extraction (field offset) and Width is size of bitfield.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Comments are for signed bitfield extract, similar for unsigned. x is sign
  // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.

  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
  // ashr for signed extract (fills with sign), lshr for unsigned (zeros).
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return;
  }

  // Constant width: split the shifted value into halves and use a 32-bit BFE
  // on whichever half still contains unextracted field bits.
  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // The whole field fits in the low half.
    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    // Field straddles both halves: the low half is already correct after the
    // shift; extract the remaining Width-32 bits from the high half.
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
}
536
537void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
538 Register DstReg = MI.getOperand(0).getReg();
539 LLT Ty = MRI.getType(DstReg);
540 bool Signed = isSignedBFE(MI);
541 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
542 Register Src = MI.getOperand(FirstOpnd).getReg();
543 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
544 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
545 // For uniform bit field extract there are 4 available instructions, but
546 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
547 // field offset in low and size in high 16 bits.
548
549 // Src1 Hi16|Lo16 = Size|FieldOffset
550 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
551 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
552 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
553 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
554 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
555 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
556 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
557
558 // Select machine instruction, because of reg class constraining, insert
559 // copies from reg class to reg bank.
560 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
561 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
562 if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
563 *ST.getRegisterInfo(), RBI))
564 llvm_unreachable("failed to constrain BFE");
565
566 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
567 MI.eraseFromParent();
568}
569
570void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
571 Register Dst = MI.getOperand(0).getReg();
572 LLT DstTy = MRI.getType(Dst);
573 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
574 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
575 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
576 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
577 unsigned Opc = MI.getOpcode();
578 auto Flags = MI.getFlags();
579 auto Lo =
580 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
581 auto Hi =
582 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
583 B.buildMergeLikeInstr(Dst, {Lo, Hi});
584 MI.eraseFromParent();
585}
586
587void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
588 Register Dst = MI.getOperand(0).getReg();
589 LLT DstTy = MRI.getType(Dst);
590 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
591 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
592 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
593 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
594 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
595 Register Cond = MI.getOperand(1).getReg();
596 auto Flags = MI.getFlags();
597 auto Lo =
598 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
599 auto Hi =
600 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
601
602 B.buildMergeLikeInstr(Dst, {Lo, Hi});
603 MI.eraseFromParent();
604}
605
606void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
607 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
608 int Amt = MI.getOperand(2).getImm();
609 Register Lo, Hi;
610 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
611 if (Amt <= 32) {
612 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
613 if (Amt == 32) {
614 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
615 Lo = Freeze.getReg(0);
616 } else {
617 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
618 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
619 }
620
621 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
622 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
623 } else {
624 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
625 Lo = Op1.getReg(0);
626 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
627 }
628
629 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
630 MI.eraseFromParent();
631}
632
633void RegBankLegalizeHelper::lower(MachineInstr &MI,
634 const RegBankLLTMapping &Mapping,
635 SmallSet<Register, 4> &WaterfallSgprs) {
636
637 switch (Mapping.LoweringMethod) {
638 case DoNotLower:
639 break;
640 case VccExtToSel:
641 return lowerVccExtToSel(MI);
642 case UniExtToSel: {
643 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
644 auto True = B.buildConstant({SgprRB, Ty},
645 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
646 auto False = B.buildConstant({SgprRB, Ty}, 0);
647 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
648 // We are making select here. S1 cond was already 'any-extended to S32' +
649 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
650 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
651 False);
652 MI.eraseFromParent();
653 return;
654 }
655 case UnpackBitShift:
656 return lowerUnpackBitShift(MI);
657 case Ext32To64: {
658 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
659 MachineInstrBuilder Hi;
660 switch (MI.getOpcode()) {
661 case AMDGPU::G_ZEXT: {
662 Hi = B.buildConstant({RB, S32}, 0);
663 break;
664 }
665 case AMDGPU::G_SEXT: {
666 // Replicate sign bit from 32-bit extended part.
667 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
668 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
669 break;
670 }
671 case AMDGPU::G_ANYEXT: {
672 Hi = B.buildUndef({RB, S32});
673 break;
674 }
675 default:
676 llvm_unreachable("Unsuported Opcode in Ext32To64");
677 }
678
679 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
680 {MI.getOperand(1).getReg(), Hi});
681 MI.eraseFromParent();
682 return;
683 }
684 case UniCstExt: {
685 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
686 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
687
688 MI.eraseFromParent();
689 return;
690 }
691 case VgprToVccCopy: {
692 Register Src = MI.getOperand(1).getReg();
693 LLT Ty = MRI.getType(Src);
694 // Take lowest bit from each lane and put it in lane mask.
695 // Lowering via compare, but we need to clean high bits first as compare
696 // compares all bits in register.
697 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
698 if (Ty == S64) {
699 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
700 auto One = B.buildConstant(VgprRB_S32, 1);
701 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
702 auto Zero = B.buildConstant(VgprRB_S32, 0);
703 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
704 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
705 } else {
706 assert(Ty == S32 || Ty == S16);
707 auto One = B.buildConstant({VgprRB, Ty}, 1);
708 B.buildAnd(BoolSrc, Src, One);
709 }
710 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
711 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
712 MI.eraseFromParent();
713 return;
714 }
715 case V_BFE:
716 return lowerV_BFE(MI);
717 case S_BFE:
718 return lowerS_BFE(MI);
719 case SplitTo32:
720 return lowerSplitTo32(MI);
721 case SplitTo32Select:
722 return lowerSplitTo32Select(MI);
724 return lowerSplitTo32SExtInReg(MI);
725 case SplitLoad: {
726 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
727 unsigned Size = DstTy.getSizeInBits();
728 // Even split to 128-bit loads
729 if (Size > 128) {
730 LLT B128;
731 if (DstTy.isVector()) {
732 LLT EltTy = DstTy.getElementType();
733 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
734 } else {
735 B128 = LLT::scalar(128);
736 }
737 if (Size / 128 == 2)
738 splitLoad(MI, {B128, B128});
739 else if (Size / 128 == 4)
740 splitLoad(MI, {B128, B128, B128, B128});
741 else {
742 LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
743 llvm_unreachable("SplitLoad type not supported for MI");
744 }
745 }
746 // 64 and 32 bit load
747 else if (DstTy == S96)
748 splitLoad(MI, {S64, S32}, S32);
749 else if (DstTy == V3S32)
750 splitLoad(MI, {V2S32, S32}, S32);
751 else if (DstTy == V6S16)
752 splitLoad(MI, {V4S16, V2S16}, V2S16);
753 else {
754 LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
755 llvm_unreachable("SplitLoad type not supported for MI");
756 }
757 break;
758 }
759 case WidenLoad: {
760 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
761 if (DstTy == S96)
762 widenLoad(MI, S128);
763 else if (DstTy == V3S32)
764 widenLoad(MI, V4S32, S32);
765 else if (DstTy == V6S16)
766 widenLoad(MI, V8S16, V2S16);
767 else {
768 LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
769 llvm_unreachable("WidenLoad type not supported for MI");
770 }
771 break;
772 }
773 case WidenMMOToS32:
774 return widenMMOToS32(cast<GAnyLoad>(MI));
775 }
776
777 if (!WaterfallSgprs.empty()) {
778 MachineBasicBlock::iterator I = MI.getIterator();
779 executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
780 }
781}
782
// Map a register-bank/LLT mapping ID to the concrete LLT it requires.
// Returns an invalid LLT for IDs whose type is not fixed (e.g. the B-form
// IDs handled by getBTyFromID).
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
  case UniInVgprS16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Vgpr32:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
    return LLT::fixed_vector(2, 32);
  case SgprV4S32:
  case SgprV4S32_WF:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  default:
    return LLT();
  }
}
840
// Map a B-form (bank + bit-size only) mapping ID to a concrete LLT given the
// instruction's current type Ty. Returns Ty unchanged when Ty is one of the
// layouts allowed for that size, or an invalid LLT otherwise.
LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case UniInVgprB32:
    // 32-bit: scalar, 2x16 vector, or any 32-bit pointer.
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  default:
    return LLT();
  }
}
898
// Map a mapping ID to the register bank it requires (VCC, SGPR or VGPR).
// Returns nullptr for IDs that do not name a fixed bank.
const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP1:
  case SgprP3:
  case SgprP4:
  case SgprP5:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprV2S16:
  case UniInVgprV4S32:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV4S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB256:
  case VgprB512:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return VgprRB;
  default:
    return nullptr;
  }
}
971
/// Apply the mapping's per-def methods to MI's def operands.
/// For plain Sgpr*/Vgpr* IDs this only asserts that the register already has
/// the mapped LLT and bank. For UniIn* and Sgpr32Trunc IDs it redirects the
/// def to a freshly created register and emits the glue (trunc / any-ext /
/// read-any-lane / G_AMDGPU_COPY_SCC_VCC) that reproduces the original value.
/// The builder's insert point must already be right after MI (set by the
/// caller, see findRuleAndApplyMapping). OpIdx is shared with
/// applyMappingSrc and is left pointing one past the last def on return.
void RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    // RB is only read inside asserts; silence -Wunused in release builds.
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      // Exact-type IDs: nothing to rewrite, the def must already match.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      // B-type IDs accept several LLTs of the same size; getBTyFromID
      // returns Ty itself when it is one of the allowed forms.
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      // Uniform S1 def computed in vcc: instruction writes a new vcc S1,
      // which is copied to scc as S32 and truncated back to the sgpr S1.
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      auto CopyS32_Vcc =
          B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
      B.buildTrunc(Reg, CopyS32_Vcc);
      break;
    }
    case UniInVgprS16: {
      // Uniform S16 def computed in a vgpr: any-ext to 32 bits, read a lane
      // into an sgpr, then truncate back to the original sgpr S16.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
      buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
      B.buildTrunc(Reg, NewSgprDstS32);
      break;
    }
    case UniInVgprS32:
    case UniInVgprV2S16:
    case UniInVgprV4S32: {
      // Uniform value computed in a vgpr: read a lane back into the sgpr def.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB256:
    case UniInVgprB512: {
      // Same as above but for B-type (size-only) defs.
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      // Narrow sgpr def: instruction produces S32, truncate to the
      // original (< 32-bit) register.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      B.buildTrunc(Reg, NewDst);
      break;
    }
    case InvalidMapping: {
      LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump(););
      llvm_unreachable("missing fast rule for MI");
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}
1101
/// Apply the mapping's per-use methods to MI's use operands.
/// Sgpr* IDs assert the operand already matches; Vgpr* IDs insert a
/// sgpr->vgpr copy when needed; *_WF IDs collect registers that must be made
/// uniform by a waterfall loop; the *Ext IDs widen sub-32-bit scalars to S32.
/// The builder's insert point must be at MI (set by the caller). OpIdx
/// arrives pointing at the first use operand (past the defs handled by
/// applyMappingDst) and advances with each MethodID, including skipped ones.
void RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    // None/IntrId/Imm operands are not registers; leave them untouched.
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      // S1 use in vcc: an sgpr S1 (scc-like bool) is widened to S32 and
      // copied into a lane mask via G_AMDGPU_COPY_VCC_SCC.
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      // Sgpr uses must already be in the right type and bank.
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      // Vgpr uses accept any bank; insert a copy to vgpr when needed.
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // sgpr waterfall, scalars and vectors
    case Sgpr32_WF:
    case SgprV4S32_WF: {
      // Operand must be uniform; a divergent register is recorded so the
      // caller can wrap MI in a waterfall loop (readfirstlane per iteration).
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB)
        SgprWaterfallOperandRegs.insert(Reg);
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
      // most of times meant to be combined away in AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }
    case Sgpr32SExt: {
      // S1 is excluded: sign-extending an sgpr bool is handled via
      // Sgpr32AExtBoolInReg instead.
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Sgpr32ZExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    case Vgpr32SExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Vgpr32ZExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}
1258
1260 Register Dst = MI.getOperand(0).getReg();
1261 LLT Ty = MRI.getType(Dst);
1262
1263 if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
1264 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1265
1266 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1267 MI.getOperand(0).setReg(NewDst);
1268 B.buildTrunc(Dst, NewDst);
1269
1270 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1271 Register UseReg = MI.getOperand(i).getReg();
1272
1273 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1274 MachineBasicBlock *DefMBB = DefMI->getParent();
1275
1276 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1277
1278 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1279 MI.getOperand(i).setReg(NewUse.getReg(0));
1280 }
1281
1282 return;
1283 }
1284
1285 // ALL divergent i1 phis should be already lowered and inst-selected into PHI
1286 // with sgpr reg class and S1 LLT.
1287 // Note: this includes divergent phis that don't require lowering.
1288 if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
1289 LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump(););
1290 llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering "
1291 "before RegBankLegalize to lower lane mask(vcc) phis");
1292 }
1293
1294 // We accept all types that can fit in some register class.
1295 // Uniform G_PHIs have all sgpr registers.
1296 // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
1297 if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
1298 Ty == LLT::pointer(4, 64)) {
1299 return;
1300 }
1301
1302 LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump(););
1303 llvm_unreachable("type not supported");
1304}
1305
1306[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1307 const RegisterBank *RB,
1309 unsigned StartOpIdx,
1310 unsigned EndOpIdx) {
1311 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1312 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
1313 return false;
1314 }
1315 return true;
1316}
1317
1319 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1320 // Put RB on all registers
1321 unsigned NumDefs = MI.getNumDefs();
1322 unsigned NumOperands = MI.getNumOperands();
1323
1324 assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
1325 if (RB == SgprRB)
1326 assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));
1327
1328 if (RB == VgprRB) {
1329 B.setInstr(MI);
1330 for (unsigned i = NumDefs; i < NumOperands; ++i) {
1331 Register Reg = MI.getOperand(i).getReg();
1332 if (MRI.getRegBank(Reg) != RB) {
1333 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1334 MI.getOperand(i).setReg(Copy.getReg(0));
1335 }
1336 }
1337 }
1338}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:58
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
const SmallVectorImpl< MachineOperand > & Cond
#define LLVM_DEBUG(...)
Definition Debug.h:114
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping & findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
@ ICMP_NE
not equal
Definition InstrTypes.h:700
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:165
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:214
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void dump() const
Definition Pass.cpp:146
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool empty() const
Definition SmallSet.h:168
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
void push_back(const T &Elt)
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isAnyPtr(LLT Ty, unsigned Width)
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Kill
The last use of a register.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
@ Offset
Definition DWP.cpp:477
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
LLVM_ABI bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:155
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping