1//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the NVPTX target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "NVPTXISelDAGToDAG.h"
14#include "NVPTX.h"
15#include "NVPTXUtilities.h"
16#include "llvm/ADT/APInt.h"
21#include "llvm/IR/GlobalValue.h"
23#include "llvm/IR/IntrinsicsNVPTX.h"
30#include <optional>
31
32using namespace llvm;
33
34#define DEBUG_TYPE "nvptx-isel"
35#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
36
37static cl::opt<bool>
38 EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
39 cl::desc("Enable reciprocal sqrt optimization"));
40
41// FIXME: This is a WAR to recover lost performance from #155024.
42// We still need to investigate the regression and find a more permanent
43// solution.
44static cl::opt<bool> EnableMADWide("nvptx-mad-wide-opt", cl::init(false),
45 cl::Hidden,
46 cl::desc("Enable MAD wide optimization"));
47
48/// createNVPTXISelDag - This pass converts a legalized DAG into an
49/// NVPTX-specific DAG, ready for instruction scheduling.
54
59
61
63
67 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
69 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
70 Scopes = NVPTXScopes(MF.getFunction().getContext());
71 return SelectionDAGISel::runOnMachineFunction(MF);
72}
73
75NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
77}
78
79bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
81}
82
83bool NVPTXDAGToDAGISel::useF32FTZ() const {
84 return Subtarget->getTargetLowering()->useF32FTZ(*MF);
85}
86
87bool NVPTXDAGToDAGISel::allowFMA() const {
88 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
89 return TL->allowFMA(*MF, OptLevel);
90}
91
92bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
93
94bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; }
95
96/// Select - Select instructions not customized! Used for
97/// expanded, promoted and normal instructions.
98void NVPTXDAGToDAGISel::Select(SDNode *N) {
99
100 if (N->isMachineOpcode()) {
101 N->setNodeId(-1);
102 return; // Already selected.
103 }
104
105 switch (N->getOpcode()) {
106 case ISD::LOAD:
107 case ISD::ATOMIC_LOAD:
108 if (tryLoad(N))
109 return;
110 break;
111 case ISD::STORE:
112 case ISD::ATOMIC_STORE:
113 if (tryStore(N))
114 return;
115 break;
116 case ISD::ATOMIC_FENCE:
117 if (tryFence(N))
118 return;
119 break;
120 case NVPTXISD::UNPACK_VECTOR:
121 tryUNPACK_VECTOR(N);
122 return;
123 case ISD::EXTRACT_VECTOR_ELT:
124 if (tryEXTRACT_VECTOR_ELEMENT(N))
125 return;
126 break;
127 case NVPTXISD::SETP_F16X2:
128 SelectSETP_F16X2(N);
129 return;
130 case NVPTXISD::SETP_BF16X2:
131 SelectSETP_BF16X2(N);
132 return;
133 case NVPTXISD::LoadV2:
134 case NVPTXISD::LoadV4:
135 case NVPTXISD::LoadV8:
136 if (tryLoadVector(N))
137 return;
138 break;
139 case NVPTXISD::LDUV2:
140 case NVPTXISD::LDUV4:
141 if (tryLDU(N))
142 return;
143 break;
144 case NVPTXISD::StoreV2:
145 case NVPTXISD::StoreV4:
146 case NVPTXISD::StoreV8:
147 if (tryStoreVector(N))
148 return;
149 break;
150 case ISD::INTRINSIC_W_CHAIN:
151 if (tryIntrinsicChain(N))
152 return;
153 break;
154 case ISD::INTRINSIC_VOID:
155 if (tryIntrinsicVoid(N))
156 return;
157 break;
158 case ISD::AND:
159 case ISD::SRA:
160 case ISD::SRL:
161 // Try to select BFE
162 if (tryBFE(N))
163 return;
164 break;
165 case ISD::ADDRSPACECAST:
166 SelectAddrSpaceCast(N);
167 return;
168 case ISD::CopyToReg: {
169 if (N->getOperand(1).getValueType() == MVT::i128) {
170 SelectV2I64toI128(N);
171 return;
172 }
173 break;
174 }
175 case ISD::CopyFromReg: {
176 if (N->getOperand(1).getValueType() == MVT::i128) {
177 SelectI128toV2I64(N);
178 return;
179 }
180 break;
181 }
184 selectAtomicSwap128(N);
185 return;
186 case ISD::FADD:
187 case ISD::FMUL:
188 case ISD::FSUB:
189 if (tryBF16ArithToFMA(N))
190 return;
191 break;
192 default:
193 break;
194 }
195 SelectCode(N);
196}
197
198#define TCGEN05_LD_OPCODE(SHAPE, NUM) \
199 (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK \
200 : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
201
202static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
203 switch (IID) {
204 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
205 return TCGEN05_LD_OPCODE(16x64b, x1);
206 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
207 return TCGEN05_LD_OPCODE(16x64b, x2);
208 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
209 return TCGEN05_LD_OPCODE(16x64b, x4);
210 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
211 return TCGEN05_LD_OPCODE(16x64b, x8);
212 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
213 return TCGEN05_LD_OPCODE(16x64b, x16);
214 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
215 return TCGEN05_LD_OPCODE(16x64b, x32);
216 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
217 return TCGEN05_LD_OPCODE(16x64b, x64);
218 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
219 return TCGEN05_LD_OPCODE(16x64b, x128);
220 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
221 return TCGEN05_LD_OPCODE(16x128b, x1);
222 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
223 return TCGEN05_LD_OPCODE(16x128b, x2);
224 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
225 return TCGEN05_LD_OPCODE(16x128b, x4);
226 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
227 return TCGEN05_LD_OPCODE(16x128b, x8);
228 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
229 return TCGEN05_LD_OPCODE(16x128b, x16);
230 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
231 return TCGEN05_LD_OPCODE(16x128b, x32);
232 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
233 return TCGEN05_LD_OPCODE(16x128b, x64);
234 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
235 return TCGEN05_LD_OPCODE(16x256b, x1);
236 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
237 return TCGEN05_LD_OPCODE(16x256b, x2);
238 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
239 return TCGEN05_LD_OPCODE(16x256b, x4);
240 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
241 return TCGEN05_LD_OPCODE(16x256b, x8);
242 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
243 return TCGEN05_LD_OPCODE(16x256b, x16);
244 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
245 return TCGEN05_LD_OPCODE(16x256b, x32);
246 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
247 return TCGEN05_LD_OPCODE(16x32bx2, x1);
248 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
249 return TCGEN05_LD_OPCODE(16x32bx2, x2);
250 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
251 return TCGEN05_LD_OPCODE(16x32bx2, x4);
252 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
253 return TCGEN05_LD_OPCODE(16x32bx2, x8);
254 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
255 return TCGEN05_LD_OPCODE(16x32bx2, x16);
256 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
257 return TCGEN05_LD_OPCODE(16x32bx2, x32);
258 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
259 return TCGEN05_LD_OPCODE(16x32bx2, x64);
260 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
261 return TCGEN05_LD_OPCODE(16x32bx2, x128);
262 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
263 return TCGEN05_LD_OPCODE(32x32b, x1);
264 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
265 return TCGEN05_LD_OPCODE(32x32b, x2);
266 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
267 return TCGEN05_LD_OPCODE(32x32b, x4);
268 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
269 return TCGEN05_LD_OPCODE(32x32b, x8);
270 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
271 return TCGEN05_LD_OPCODE(32x32b, x16);
272 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
273 return TCGEN05_LD_OPCODE(32x32b, x32);
274 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
275 return TCGEN05_LD_OPCODE(32x32b, x64);
276 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
277 return TCGEN05_LD_OPCODE(32x32b, x128);
278 }
279 llvm_unreachable("unhandled tcgen05.ld lowering");
280}
281
282void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
283 SDLoc DL(N);
284 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
285
286 if (hasOffset) {
287 bool enablePack = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue();
288 auto OffsetNode = CurDAG->getTargetConstant(
289 cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL, MVT::i32);
290 ReplaceNode(N, CurDAG->getMachineNode(
291 getTcgen05LdOpcode(IID, enablePack), DL, N->getVTList(),
292 {N->getOperand(2), OffsetNode, N->getOperand(0)}));
293 } else {
294 bool enablePack = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
295 ReplaceNode(N, CurDAG->getMachineNode(
296 getTcgen05LdOpcode(IID, enablePack), DL, N->getVTList(),
297 {N->getOperand(2), N->getOperand(0)}));
298 }
299}
300
301bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
302 unsigned IID = N->getConstantOperandVal(1);
303 switch (IID) {
304 default:
305 return false;
306 case Intrinsic::nvvm_ldu_global_f:
307 case Intrinsic::nvvm_ldu_global_i:
308 case Intrinsic::nvvm_ldu_global_p:
309 return tryLDU(N);
310
311 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
312 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
313 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
314 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
315 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
316 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
317 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
318 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
319 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
320 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
321 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
322 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
323 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
324 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
325 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
326 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
327 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
328 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
329 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
330 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
331 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
332 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
333 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
334 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
335 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
336 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
337 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
338 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
339 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
340 SelectTcgen05Ld(N);
341 return true;
342 }
343
344 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
345 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
346 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
347 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
348 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
349 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
350 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
351 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
352 SelectTcgen05Ld(N, /* hasOffset */ true);
353 return true;
354 }
355 }
356}
357
358// Map ISD::CONDCODE value to appropriate CmpMode expected by
359// NVPTXInstPrinter::printCmpMode()
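// Editor's illustration (not part of the upstream source): an ordered compare
// such as ISD::SETOLT maps to CmpMode::LT, while its unordered counterpart
// ISD::SETULT maps to CmpMode::LTU; the printer then emits the corresponding
// PTX comparison suffix (roughly ".lt" vs. ".ltu" on setp).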
360SDValue NVPTXDAGToDAGISel::getPTXCmpMode(const CondCodeSDNode &CondCode) {
361 using NVPTX::PTXCmpMode::CmpMode;
362 const unsigned PTXCmpMode = [](ISD::CondCode CC) {
363 switch (CC) {
364 default:
365 llvm_unreachable("Unexpected condition code.");
366 case ISD::SETOEQ:
367 case ISD::SETEQ:
368 return CmpMode::EQ;
369 case ISD::SETOGT:
370 case ISD::SETGT:
371 return CmpMode::GT;
372 case ISD::SETOGE:
373 case ISD::SETGE:
374 return CmpMode::GE;
375 case ISD::SETOLT:
376 case ISD::SETLT:
377 return CmpMode::LT;
378 case ISD::SETOLE:
379 case ISD::SETLE:
380 return CmpMode::LE;
381 case ISD::SETONE:
382 case ISD::SETNE:
383 return CmpMode::NE;
384 case ISD::SETO:
385 return CmpMode::NUM;
386 case ISD::SETUO:
387 return CmpMode::NotANumber;
388 case ISD::SETUEQ:
389 return CmpMode::EQU;
390 case ISD::SETUGT:
391 return CmpMode::GTU;
392 case ISD::SETUGE:
393 return CmpMode::GEU;
394 case ISD::SETULT:
395 return CmpMode::LTU;
396 case ISD::SETULE:
397 return CmpMode::LEU;
398 case ISD::SETUNE:
399 return CmpMode::NEU;
400 }
401 }(CondCode.get());
402 return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
403}
404
405bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
406 SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
407 SDLoc DL(N);
408 SDNode *SetP = CurDAG->getMachineNode(
409 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
410 {N->getOperand(0), N->getOperand(1), PTXCmpMode,
411 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
412 ReplaceNode(N, SetP);
413 return true;
414}
415
416bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
417 SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
418 SDLoc DL(N);
419 SDNode *SetP = CurDAG->getMachineNode(
420 NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
421 {N->getOperand(0), N->getOperand(1), PTXCmpMode,
422 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
423 ReplaceNode(N, SetP);
424 return true;
425}
426
427bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
428 SDValue Vector = N->getOperand(0);
429 MVT EltVT = N->getSimpleValueType(0);
430
431 MachineSDNode *N2 =
432 CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
433
434 ReplaceNode(N, N2);
435 return true;
436}
437
438// Find all instances of extract_vector_elt that use this v2f16 vector
439// and coalesce them into a scattering move instruction.
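// Editor's illustration of the rewrite performed below, assuming a v2f16
// value %v whose lanes are both extracted:
//   %a = extractelt %v, 0
//   %b = extractelt %v, 1
// is replaced by a single split,
//   %a, %b = I32toV2I16 %v
// so the two extracts become the two results of one machine node.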
440bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
441 SDValue Vector = N->getOperand(0);
442
443 MVT VT = Vector.getSimpleValueType();
444 if (!(NVPTX::isPackedVectorTy(VT) && VT.getVectorNumElements() == 2))
445 return false;
446
447 unsigned Opcode;
448 if (VT.is32BitVector())
449 Opcode = NVPTX::I32toV2I16;
450 else if (VT.is64BitVector())
451 Opcode = NVPTX::I64toV2I32;
452 else
453 llvm_unreachable("Unhandled packed type");
454
455 // Find and record all uses of this vector that extract element 0 or 1.
456 SmallVector<SDNode *, 4> E0, E1;
457 for (auto *U : Vector.getNode()->users()) {
458 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
459 continue;
460 if (U->getOperand(0) != Vector)
461 continue;
462 if (const ConstantSDNode *IdxConst =
463 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
464 if (IdxConst->getZExtValue() == 0)
465 E0.push_back(U);
466 else if (IdxConst->getZExtValue() == 1)
467 E1.push_back(U);
468 else
469 llvm_unreachable("Invalid vector index.");
470 }
471 }
472
473 // There's no point scattering f16x2 if we only ever access one
474 // element of it.
475 if (E0.empty() || E1.empty())
476 return false;
477
478 // Merge (EltTy extractelt(V, 0), EltTy extractelt(V,1))
479 // into EltTy,EltTy Split[EltTy]x2(V)
480 MVT EltVT = VT.getVectorElementType();
481 SDNode *ScatterOp =
482 CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);
483 for (auto *Node : E0)
484 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
485 for (auto *Node : E1)
486 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
487
488 return true;
489}
490
491static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {
492 switch (AS) {
507 default:
508 return std::nullopt;
509 }
510}
511
512 NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) {
513 return convertAS(N->getMemOperand()->getAddrSpace())
514 .value_or(NVPTX::AddressSpace::Generic);
515}
516
517NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const {
518 // No "sem" orderings for SM/PTX versions which do not support memory ordering
521 auto Ordering = N->getMergedOrdering();
522 switch (Ordering) {
536 }
537 llvm_unreachable("Invalid atomic ordering");
538}
539
540NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const {
541 // No "scope" modifier for SM/PTX versions which do not support scoped atomics
542 // Functionally, these atomics are at device scope
543 if (!Subtarget->hasAtomScope())
545 return Scopes[N->getSyncScopeID()];
546}
547
548namespace {
549
550struct OperationOrderings {
551 NVPTX::Ordering InstructionOrdering, FenceOrdering;
552 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
553 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
554 : InstructionOrdering(IO), FenceOrdering(FO) {}
555};
556
557static OperationOrderings
558getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
559 AtomicOrdering Ordering = N->getSuccessOrdering();
560 auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N);
561
562 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
563 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
564
565 // clang-format off
566
567 // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
568 // Note: uses of Relaxed in the Atomic column of this table refer
569 // to LLVM AtomicOrdering::Monotonic.
570 //
571 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
572 // |---------|----------|--------------------|------------|------------------------------|
573 // | No | No | All | plain | .weak |
574 // | No | Yes | Generic,Shared, | .volatile | .volatile |
575 // | | | Global [0] | | |
576 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
577 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
578 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
579 // | | | Global [0] | | |
580 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
581 // | | | Global [0] | | |
582 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
583 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
584 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
585 // | | | | | or .volatile (PTX 8.1-) |
586 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
587 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
588 // | | | / Global [0] | | |
589
590 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
591 // by following the ABI proven sound in:
592 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
593 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
594 //
595 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
596 // |------------------------------------------------------|-------------------------------|
597 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
598 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
599 // |------------------------------------------------------|-------------------------------|
600 // | cuda::atomic_load | fence.sc.<scope>; |
601 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
602 // |------------------------------------------------------|-------------------------------|
603 // | cuda::atomic_store | fence.sc.<scope>; |
604 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
605 // |------------------------------------------------------|-------------------------------|
606 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
607 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
608
609 // clang-format on
610
611 // [0]: volatile and atomics are only supported on global or shared
612 // memory locations, accessed via generic/shared/global pointers.
613 // MMIO is only supported on global memory locations,
614 // accessed via generic/global pointers.
615 // TODO: Implement MMIO access via generic pointer to global.
616 // Currently implemented for global pointers only.
617
618 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
619 // PTX instructions fails to preserve their C++ side-effects.
620 //
621 // Example (https://github.com/llvm/llvm-project/issues/62057):
622 //
623 // void example() {
624 // std::atomic<bool> True = true;
625 // while (True.load(std::memory_order_relaxed));
626 // }
627 //
628 // A C++ program that calls "example" is well-defined: the infinite loop
629 // performs an atomic operation. By lowering volatile/atomics to
630 // "weak" memory operations, we are transforming the above into:
631 //
632 // void undefined_behavior() {
633 // bool True = true;
634 // while (True);
635 // }
636 //
637 // which exhibits undefined behavior in both C++ and PTX.
638 //
639 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
640 // behavior due to lack of Independent Forward Progress. Lowering these
641 // to weak memory operations in sm_60- is therefore fine.
642 //
643 // TODO: lower atomic and volatile operations to memory locations
644 // in local, const, and param to two PTX instructions in sm_70+:
645 // - the "weak" memory instruction we are currently lowering to, and
646 // - some other instruction that preserves the side-effect, e.g.,
647 // a dead dummy volatile load.
648 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
649 CodeAddrSpace == NVPTX::AddressSpace::Const ||
650 CodeAddrSpace == NVPTX::AddressSpace::Param) {
651 return NVPTX::Ordering::NotAtomic;
652 }
653
654 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
655 // supported on sm_60 and older; this includes volatile atomics.
656 if (!(Ordering == AtomicOrdering::NotAtomic ||
657 Ordering == AtomicOrdering::Unordered ||
658 Ordering == AtomicOrdering::Monotonic) &&
659 !HasMemoryOrdering) {
661 formatv("PTX does not support \"atomic\" for orderings different than"
662 "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
663 "is: \"{}\".",
664 toIRString(Ordering)));
665 }
666
667 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
668 // the volatile semantics and preserve the atomic ones.
669
670 // PTX volatile and PTX atomics are not available for statespaces that differ
671 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
672 // atomics is undefined if the generic address does not refer to a .global or
673 // .shared memory location.
674 bool AddrGenericOrGlobalOrShared =
675 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
676 CodeAddrSpace == NVPTX::AddressSpace::Global ||
677 CodeAddrSpace == NVPTX::AddressSpace::Shared ||
678 CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
679 if (!AddrGenericOrGlobalOrShared)
681 return NVPTX::Ordering::NotAtomic;
682 bool UseRelaxedMMIO =
683 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
684
685 switch (Ordering) {
687 return N->isVolatile() ? NVPTX::Ordering::Volatile
690 // We lower unordered in the exact same way as 'monotonic' to respect
691 // LLVM IR atomicity requirements.
693 if (N->isVolatile())
694 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
696 else
697 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
699 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
700 // Acquire.
702 if (!N->readMem())
704 formatv("PTX only supports Acquire Ordering on reads: {}",
705 N->getOperationName()));
708 if (!N->writeMem())
710 formatv("PTX only supports Release Ordering on writes: {}",
711 N->getOperationName()));
715 formatv("NVPTX does not support AcquireRelease Ordering on "
716 "read-modify-write "
717 "yet and PTX does not support it on loads or stores: {}",
718 N->getOperationName()));
719 }
721 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
722 // sequence including a "fence.sc.sco" and the memory instruction with an
723 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
724 // whether the memory operation is a read, write, or read-modify-write.
725 //
726 // This sets the ordering of the fence to SequentiallyConsistent, and
727 // sets the corresponding ordering for the instruction.
728 NVPTX::Ordering InstrOrder;
729 if (N->readMem())
730 InstrOrder = NVPTX::Ordering::Acquire;
731 else if (N->writeMem())
732 InstrOrder = NVPTX::Ordering::Release;
733 else
735 formatv("NVPTX does not support SequentiallyConsistent Ordering on "
736 "read-modify-writes yet: {}",
737 N->getOperationName()));
738 return OperationOrderings(InstrOrder,
740 }
741 }
743 formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
744 toIRString(Ordering)));
745}
746
747} // namespace
748
749NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
750 NVPTX::Ordering O) const {
751 switch (O) {
753 case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
754 // NVPTX uses Thread scope as the scope of non-atomic operations.
757 // RelaxedMMIO operations are always system scope.
758 // If a RelaxedMMIO order was generated from an atomic volatile operation
759 // with a smaller thread scope, we bump it here to system scope.
766 auto S = Scopes[N->getSyncScopeID()];
767
768 // Atomic operations must have a scope greater than thread.
769 if (S == NVPTX::Scope::Thread)
771 formatv("Atomics need scope > \"{}\".", ScopeToString(S)));
772
773 // If scope is cluster, clusters must be supported.
774 if (S == NVPTX::Scope::Cluster)
775 Subtarget->failIfClustersUnsupported("cluster scope");
776
777 // If operation is volatile, then its scope is system.
778 return N->isVolatile() ? NVPTX::Scope::System : S;
779 }
780 llvm_unreachable("unhandled ordering");
781}
782
783static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
784 NVPTX::AddressSpace CodeAddrSpace) {
785 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
786 // space.
787 return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
788 N.isInvariant();
789}
790
791static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
792 NVPTXSubtarget const *T) {
793 if (S == NVPTX::Scope::Cluster)
794 T->failIfClustersUnsupported(".cluster scope fence");
795
796 // Fall back to .acq_rel if .acquire, .release is not supported.
797 if (!T->hasSplitAcquireAndReleaseFences() &&
800
801 switch (O) {
803 switch (S) {
805 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
806 : NVPTX::INT_MEMBAR_SYS;
808 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
809 : NVPTX::INT_MEMBAR_CTA;
811 return NVPTX::atomic_thread_fence_acquire_cluster;
813 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
814 : NVPTX::INT_MEMBAR_GL;
818 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
819 ScopeToString(S)));
820 }
821 break;
823 switch (S) {
825 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
826 : NVPTX::INT_MEMBAR_SYS;
828 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
829 : NVPTX::INT_MEMBAR_CTA;
831 return NVPTX::atomic_thread_fence_release_cluster;
833 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
834 : NVPTX::INT_MEMBAR_GL;
838 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
839 ScopeToString(S)));
840 }
841 break;
843 switch (S) {
845 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
846 : NVPTX::INT_MEMBAR_SYS;
848 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
849 : NVPTX::INT_MEMBAR_CTA;
851 return NVPTX::atomic_thread_fence_acq_rel_cluster;
853 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
854 : NVPTX::INT_MEMBAR_GL;
858 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
859 ScopeToString(S)));
860 }
861 break;
862 }
864 switch (S) {
866 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
867 : NVPTX::INT_MEMBAR_SYS;
869 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
870 : NVPTX::INT_MEMBAR_CTA;
872 return NVPTX::atomic_thread_fence_seq_cst_cluster;
874 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
875 : NVPTX::INT_MEMBAR_GL;
878 report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
879 ScopeToString(S)));
880 }
881 break;
882 }
888 formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
889 OrderingToString(O), ScopeToString(S)));
890 }
891 llvm_unreachable("unhandled ordering");
892}
893
894// Returns Memory Order and Scope of a memory instruction, and
895// inserts any fence before the instruction that's required to
896// implement its memory ordering.
897std::pair<NVPTX::Ordering, NVPTX::Scope>
898NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
899 MemSDNode *N) {
900 auto [InstructionOrdering, FenceOrdering] =
901 getOperationOrderings(N, Subtarget);
902 auto Scope = getOperationScope(N, InstructionOrdering);
903
904 // If a fence is required before the operation, insert it:
905 switch (NVPTX::Ordering(FenceOrdering)) {
906 case NVPTX::Ordering::NotAtomic:
907 break;
908 case NVPTX::Ordering::SequentiallyConsistent: {
909 auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
910 Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
911 break;
912 }
913 default:
915 formatv("Unexpected fence ordering: \"{}\".",
916 OrderingToString(NVPTX::Ordering(FenceOrdering))));
917 }
918 return {InstructionOrdering, Scope};
919}
920
921void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
922 SDValue Src = N->getOperand(0);
923 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
924 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
925 unsigned DstAddrSpace = CastN->getDestAddressSpace();
926 SDLoc DL(N);
927 assert(SrcAddrSpace != DstAddrSpace &&
928 "addrspacecast must be between different address spaces");
929
930 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
931 // Specific to generic
932
933 if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
934 SDValue CvtNone =
935 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
936 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,
937 Src, CvtNone);
938 Src = SDValue(Cvt, 0);
939 }
940
941 unsigned Opc;
942 switch (SrcAddrSpace) {
943 default: report_fatal_error("Bad address space in addrspacecast");
944 case ADDRESS_SPACE_GLOBAL:
945 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
946 break;
947 case ADDRESS_SPACE_SHARED:
948 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
949 break;
950 case ADDRESS_SPACE_SHARED_CLUSTER:
951 if (!TM.is64Bit())
952 report_fatal_error(
953 "Shared cluster address space is only supported in 64-bit mode");
954 Opc = NVPTX::cvta_shared_cluster_64;
955 break;
956 case ADDRESS_SPACE_CONST:
957 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
958 break;
959 case ADDRESS_SPACE_LOCAL:
960 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
961 break;
962 case ADDRESS_SPACE_PARAM:
963 Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
964 break;
965 }
966 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src));
967 return;
968 } else {
969 // Generic to specific
970 if (SrcAddrSpace != 0)
971 report_fatal_error("Cannot cast between two non-generic address spaces");
972 unsigned Opc;
973 switch (DstAddrSpace) {
974 default: report_fatal_error("Bad address space in addrspacecast");
975 case ADDRESS_SPACE_GLOBAL:
976 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
977 break;
978 case ADDRESS_SPACE_SHARED:
979 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
980 break;
981 case ADDRESS_SPACE_SHARED_CLUSTER:
982 if (!TM.is64Bit())
983 report_fatal_error(
984 "Shared cluster address space is only supported in 64-bit mode");
985 Opc = NVPTX::cvta_to_shared_cluster_64;
986 break;
987 case ADDRESS_SPACE_CONST:
988 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
989 break;
990 case ADDRESS_SPACE_LOCAL:
991 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
992 break;
993 case ADDRESS_SPACE_PARAM:
994 Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
995 break;
996 }
997
998 SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
999 if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
1000 SDValue CvtNone =
1001 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
1002 CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
1003 SDValue(CVTA, 0), CvtNone);
1004 }
1005
1006 ReplaceNode(N, CVTA);
1007 return;
1008 }
1009}
1010
1011// Helper function template to reduce amount of boilerplate code for
1012// opcode selection.
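// Editor's note for readability: the grouping in pickOpcodeForVT below is by
// PTX register size rather than by element semantics. 16-bit scalars
// (i16/f16/bf16) pick the *_i16 opcode, anything held in a 32-bit register
// (i32, f32, and the packed v2f16/v2bf16/v2i16/v4i8 types) picks *_i32, and
// 64-bit values (i64, f64, v2f32, v2i32) pick *_i64.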
1013static std::optional<unsigned>
1014pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
1015 std::optional<unsigned> Opcode_i32,
1016 std::optional<unsigned> Opcode_i64) {
1017 switch (VT) {
1018 case MVT::f16:
1019 case MVT::i16:
1020 case MVT::bf16:
1021 return Opcode_i16;
1022 case MVT::v2f16:
1023 case MVT::v2bf16:
1024 case MVT::v2i16:
1025 case MVT::v4i8:
1026 case MVT::i32:
1027 case MVT::f32:
1028 return Opcode_i32;
1029 case MVT::v2f32:
1030 case MVT::v2i32:
1031 case MVT::i64:
1032 case MVT::f64:
1033 return Opcode_i64;
1034 default:
1035 return std::nullopt;
1036 }
1037}
1038
1039static inline bool isAddLike(const SDValue V) {
1040 return V.getOpcode() == ISD::ADD ||
1041 (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
1042}
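// Editor's note: an (or x, y) carrying the "disjoint" flag is guaranteed to
// have no overlapping set bits between x and y, so it computes the same value
// as (add x, y); treating it as add-like lets constant offsets hidden behind
// such ORs participate in the address folding below.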
1043
1044 static SDValue stripAssertAlign(SDValue N) {
1045 if (N.getOpcode() == ISD::AssertAlign)
1046 N = N.getOperand(0);
1047 return N;
1048}
1049
1050// selectBaseADDR - Match a dag node which will serve as the base address for an
1051// ADDR operand pair.
1052 static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
1053 N = stripAssertAlign(N);
1054 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
1055 return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N),
1056 GA->getValueType(0), GA->getOffset(),
1057 GA->getTargetFlags());
1058 if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N))
1059 return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
1060 ES->getTargetFlags());
1061 if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N))
1062 return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0));
1063
1064 return N;
1065}
1066
1067 static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
1068 Addr = stripAssertAlign(Addr);
1069 APInt AccumulatedOffset(64u, 0);
1070 while (isAddLike(Addr)) {
1071 const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1072 if (!CN)
1073 break;
1074
1075 const APInt CI = CN->getAPIntValue().sext(64);
1076 if (!(CI + AccumulatedOffset).isSignedIntN(32))
1077 break;
1078
1079 AccumulatedOffset += CI;
1080 Addr = stripAssertAlign(Addr->getOperand(0));
1081 }
1082 return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL,
1083 MVT::i32);
1084}
1085
1086static std::pair<SDValue, SDValue> selectADDR(SDValue Addr, SelectionDAG *DAG) {
1087 SDValue Offset = accumulateOffset(Addr, SDLoc(Addr), DAG);
1088 SDValue Base = selectBaseADDR(Addr, DAG);
1089 return {Base, Offset};
1090}
1091
1092// Select a pair of operands which represent a valid PTX address, this could be
1093// one of the following things:
1094// - [var] - Offset is simply set to 0
1095// - [reg] - Offset is simply set to 0
1096// - [reg+immOff]
1097// - [var+immOff]
1098// Note that immOff must fit into a 32-bit signed integer.
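// Editor's illustration: given Addr = (add (add %r1, 8), 4), accumulateOffset
// folds the constants into Offset = 12 and selectBaseADDR returns %r1, so the
// pair prints as an operand roughly of the form [%r1+12]; a plain global @g
// with no arithmetic becomes Base = @g, Offset = 0, i.e. [g].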
1099bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
1100 SDValue &Offset) {
1101 std::tie(Base, Offset) = selectADDR(Addr, CurDAG);
1102 return true;
1103}
1104
1105bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1106 MemSDNode *LD = cast<MemSDNode>(N);
1107 assert(LD->readMem() && "Expected load");
1108
1109 // do not support pre/post inc/dec
1110 const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(LD);
1111 if (PlainLoad && PlainLoad->isIndexed())
1112 return false;
1113
1114 // Address Space Setting
1115 const auto CodeAddrSpace = getAddrSpace(LD);
1116 if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
1117 return tryLDG(LD);
1118
1119 SDLoc DL(LD);
1120 SDValue Chain = N->getOperand(0);
1121 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
1122
1123 const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
1124
1125 // Vector Setting
1126 const unsigned FromType =
1127 (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
1130
1131 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1132 FromTypeWidth <= 128 && "Invalid width for load");
1133
1134 // Create the machine instruction DAG
1135 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
1136 SDValue Ops[] = {getI32Imm(Ordering, DL),
1137 getI32Imm(Scope, DL),
1138 getI32Imm(CodeAddrSpace, DL),
1139 getI32Imm(FromType, DL),
1140 getI32Imm(FromTypeWidth, DL),
1141 Base,
1142 Offset,
1143 Chain};
1144
1145 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1146 const std::optional<unsigned> Opcode =
1147 pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);
1148 if (!Opcode)
1149 return false;
1150
1151 SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1152 if (!NVPTXLD)
1153 return false;
1154
1155 MachineMemOperand *MemRef = LD->getMemOperand();
1156 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1157
1158 ReplaceNode(LD, NVPTXLD);
1159 return true;
1160}
1161
1162static unsigned getStoreVectorNumElts(SDNode *N) {
1163 switch (N->getOpcode()) {
1164 case NVPTXISD::StoreV2:
1165 return 2;
1166 case NVPTXISD::StoreV4:
1167 return 4;
1168 case NVPTXISD::StoreV8:
1169 return 8;
1170 default:
1171 llvm_unreachable("Unexpected opcode");
1172 }
1173}
1174
1175bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1176 MemSDNode *LD = cast<MemSDNode>(N);
1177
1178 // Address Space Setting
1179 const auto CodeAddrSpace = getAddrSpace(LD);
1180 if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
1181 return tryLDG(LD);
1182
1183 const MVT EltVT = LD->getSimpleValueType(0);
1184 SDLoc DL(LD);
1185 SDValue Chain = LD->getChain();
1186 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
1187
1188 // Type Setting: fromType + fromTypeWidth
1189 //
1190 // Sign : ISD::SEXTLOAD
1191 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1192 // type is integer
1193 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1194 // Read at least 8 bits (predicates are stored as 8-bit values)
1195 // The last operand holds the original LoadSDNode::getExtensionType() value
1196 const unsigned ExtensionType =
1197 N->getConstantOperandVal(N->getNumOperands() - 1);
1198 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1200 : NVPTX::PTXLdStInstCode::Untyped;
1201
1202 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1203
1204 assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
1205
1206 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
1207 SDValue Ops[] = {getI32Imm(Ordering, DL),
1208 getI32Imm(Scope, DL),
1209 getI32Imm(CodeAddrSpace, DL),
1210 getI32Imm(FromType, DL),
1211 getI32Imm(FromTypeWidth, DL),
1212 Base,
1213 Offset,
1214 Chain};
1215
1216 std::optional<unsigned> Opcode;
1217 switch (N->getOpcode()) {
1218 default:
1219 llvm_unreachable("Unexpected opcode");
1220 case NVPTXISD::LoadV2:
1221 Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v2,
1222 NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
1223 break;
1224 case NVPTXISD::LoadV4:
1225 Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v4,
1226 NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
1227 break;
1228 case NVPTXISD::LoadV8:
1229 Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i16 */},
1230 NVPTX::LDV_i32_v8, {/* no v8i64 */});
1231 break;
1232 }
1233 if (!Opcode)
1234 return false;
1235
1236 SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1237
1238 MachineMemOperand *MemRef = LD->getMemOperand();
1239 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1240
1241 ReplaceNode(LD, NVPTXLD);
1242 return true;
1243}
1244
1245bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
1246 SDLoc DL(LD);
1247
1248 unsigned ExtensionType;
1249 if (const auto *Load = dyn_cast<LoadSDNode>(LD)) {
1250 ExtensionType = Load->getExtensionType();
1251 } else {
1252 ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
1253 }
1254 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1256 : NVPTX::PTXLdStInstCode::Untyped;
1257
1258 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1259
1260 assert(!(LD->getSimpleValueType(0).isVector() &&
1261 ExtensionType != ISD::NON_EXTLOAD));
1262
1263 const auto [Base, Offset] = selectADDR(LD->getOperand(1), CurDAG);
1264 SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base,
1265 Offset, LD->getChain()};
1266
1267 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1268 std::optional<unsigned> Opcode;
1269 switch (LD->getOpcode()) {
1270 default:
1271 llvm_unreachable("Unexpected opcode");
1272 case ISD::LOAD:
1273 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i16,
1274 NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
1275 break;
1276 case NVPTXISD::LoadV2:
1277 Opcode =
1278 pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v2i16,
1279 NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
1280 break;
1281 case NVPTXISD::LoadV4:
1282 Opcode =
1283 pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v4i16,
1284 NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
1285 break;
1286 case NVPTXISD::LoadV8:
1287 Opcode = pickOpcodeForVT(TargetVT, {/* no v8i16 */},
1288 NVPTX::LD_GLOBAL_NC_v8i32, {/* no v8i64 */});
1289 break;
1290 }
1291 if (!Opcode)
1292 return false;
1293
1294 SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1295
1296 ReplaceNode(LD, NVPTXLDG);
1297 return true;
1298}
1299
1300 static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem) {
1301 auto TotalWidth = Mem->getMemoryVT().getSizeInBits();
1302 auto NumElts = Mem->getNumValues() - 1;
1303 auto ElementBitWidth = TotalWidth / NumElts;
1304 assert(isPowerOf2_32(ElementBitWidth) && ElementBitWidth >= 8 &&
1305 ElementBitWidth <= 128 && TotalWidth <= 256 &&
1306 "Invalid width for load");
1307 return ElementBitWidth;
1308}
1309
1310bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
1311 auto *LD = cast<MemSDNode>(N);
1312
1313 SDLoc DL(N);
1314 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1315 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1316
1317 // If this is an LDU intrinsic, the address is the third operand. If it's an
1318 // LDU SD node (from custom vector handling), then it's the second operand.
1319 SDValue Addr =
1320 LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
1321
1322 const auto [Base, Offset] = selectADDR(Addr, CurDAG);
1323 SDValue Ops[] = {getI32Imm(FromTypeWidth, DL), Base, Offset, LD->getChain()};
1324
1325 std::optional<unsigned> Opcode;
1326 switch (N->getOpcode()) {
1327 default:
1328 llvm_unreachable("Unexpected opcode");
1329 case ISD::INTRINSIC_W_CHAIN:
1330 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_i16,
1331 NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
1332 break;
1333 case NVPTXISD::LDUV2:
1334 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v2i16,
1335 NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
1336 break;
1337 case NVPTXISD::LDUV4:
1338 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v4i16,
1339 NVPTX::LDU_GLOBAL_v4i32, {/* no v4i64 */});
1340 break;
1341 }
1342 if (!Opcode)
1343 return false;
1344
1345 SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1346
1347 ReplaceNode(LD, NVPTXLDU);
1348 return true;
1349}
1350
1351bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1352 MemSDNode *ST = cast<MemSDNode>(N);
1353 assert(ST->writeMem() && "Expected store");
1354 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(ST);
1355 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(ST);
1356 assert((PlainStore || AtomicStore) && "Expected store");
1357
1358 // do not support pre/post inc/dec
1359 if (PlainStore && PlainStore->isIndexed())
1360 return false;
1361
1362 // Address Space Setting
1363 const auto CodeAddrSpace = getAddrSpace(ST);
1364
1365 SDLoc DL(ST);
1366 SDValue Chain = ST->getChain();
1367 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1368
1369 // Vector Setting
1370 const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
1371
1372 // Create the machine instruction DAG
1373 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1374
1375 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1376 "Invalid width for store");
1377
1378 const auto [Base, Offset] = selectADDR(ST->getBasePtr(), CurDAG);
1379 SDValue Ops[] = {selectPossiblyImm(Value),
1380 getI32Imm(Ordering, DL),
1381 getI32Imm(Scope, DL),
1382 getI32Imm(CodeAddrSpace, DL),
1383 getI32Imm(ToTypeWidth, DL),
1384 Base,
1385 Offset,
1386 Chain};
1387
1388 const std::optional<unsigned> Opcode =
1389 pickOpcodeForVT(Value.getSimpleValueType().SimpleTy, NVPTX::ST_i16,
1390 NVPTX::ST_i32, NVPTX::ST_i64);
1391 if (!Opcode)
1392 return false;
1393
1394 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1395
1396 if (!NVPTXST)
1397 return false;
1398
1399 MachineMemOperand *MemRef = ST->getMemOperand();
1400 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1401 ReplaceNode(ST, NVPTXST);
1402 return true;
1403}
1404
1405bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1406 MemSDNode *ST = cast<MemSDNode>(N);
1407 const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();
1408
1409 // Address Space Setting
1410 const auto CodeAddrSpace = getAddrSpace(ST);
1411 if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
1412 report_fatal_error("Cannot store to pointer that points to constant "
1413 "memory space");
1414 }
1415
1416 SDLoc DL(ST);
1417 SDValue Chain = ST->getChain();
1418 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1419
1420 const unsigned NumElts = getStoreVectorNumElts(ST);
1421
1422 SmallVector<SDValue, 16> Ops;
1423 for (auto &V : ST->ops().slice(1, NumElts))
1424 Ops.push_back(selectPossiblyImm(V));
1425 SDValue Addr = N->getOperand(NumElts + 1);
1426 const unsigned ToTypeWidth = TotalWidth / NumElts;
1427
1428 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1429 TotalWidth <= 256 && "Invalid width for store");
1430
1431 const auto [Base, Offset] = selectADDR(Addr, CurDAG);
1432 Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
1433 getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,
1434 Offset, Chain});
1435
1436 const MVT::SimpleValueType EltVT =
1437 ST->getOperand(1).getSimpleValueType().SimpleTy;
1438 std::optional<unsigned> Opcode;
1439 switch (ST->getOpcode()) {
1440 default:
1441 return false;
1442 case NVPTXISD::StoreV2:
1443 Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v2, NVPTX::STV_i32_v2,
1444 NVPTX::STV_i64_v2);
1445 break;
1446 case NVPTXISD::StoreV4:
1447 Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v4, NVPTX::STV_i32_v4,
1448 NVPTX::STV_i64_v4);
1449 break;
1450 case NVPTXISD::StoreV8:
1451 Opcode = pickOpcodeForVT(EltVT, {/* no v8i16 */}, NVPTX::STV_i32_v8,
1452 {/* no v8i64 */});
1453 break;
1454 }
1455
1456 if (!Opcode)
1457 return false;
1458
1459 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1460
1461 MachineMemOperand *MemRef = ST->getMemOperand();
1462 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1463
1464 ReplaceNode(ST, NVPTXST);
1465 return true;
1466}
1467
1468/// SelectBFE - Look for instruction sequences that can be made more efficient
1469/// by using the 'bfe' (bit-field extract) PTX instruction
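// Editor's illustration of the patterns handled below (i32 operands assumed):
//   (and (srl %x, 4), 0xff)   ==> bfe.u32 %r, %x, 4, 8
//   (srl (shl %x, 16), 24)    ==> bfe.u32 %r, %x, 8, 8
//   (sra (shl %x, 16), 24)    ==> bfe.s32 %r, %x, 8, 8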
1470bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
1471 SDLoc DL(N);
1472 SDValue LHS = N->getOperand(0);
1473 SDValue RHS = N->getOperand(1);
1474 SDValue Len;
1475 SDValue Start;
1476 SDValue Val;
1477 bool IsSigned = false;
1478
1479 if (N->getOpcode() == ISD::AND) {
1480 // Canonicalize the operands
1481 // We want 'and %val, %mask'
1482 if (isa<ConstantSDNode>(LHS)) {
1483 std::swap(LHS, RHS);
1484 }
1485
1486 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
1487 if (!Mask) {
1488 // We need a constant mask on the RHS of the AND
1489 return false;
1490 }
1491
1492 // Extract the mask bits
1493 uint64_t MaskVal = Mask->getZExtValue();
1494 if (!isMask_64(MaskVal)) {
1495 // We *could* handle shifted masks here, but doing so would require an
1496 // 'and' operation to fix up the low-order bits so we would trade
1497 // shr+and for bfe+and, which has the same throughput
1498 return false;
1499 }
1500
1501 // How many bits are in our mask?
1502 int64_t NumBits = countr_one(MaskVal);
1503 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
1504
1505 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
1506 // We have a 'srl/and' pair, extract the effective start bit and length
1507 Val = LHS.getNode()->getOperand(0);
1508 Start = LHS.getNode()->getOperand(1);
1509 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
1510 if (StartConst) {
1511 uint64_t StartVal = StartConst->getZExtValue();
1512 // How many "good" bits do we have left? "good" is defined here as bits
1513 // that exist in the original value, not shifted in.
1514 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
1515 if (NumBits > GoodBits) {
1516 // Do not handle the case where bits have been shifted in. In theory
1517 // we could handle this, but the cost is likely higher than just
1518 // emitting the srl/and pair.
1519 return false;
1520 }
1521 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
1522 } else {
1523 // Do not handle the case where the shift amount (can be zero if no srl
1524 // was found) is not constant. We could handle this case, but it would
1525 // require run-time logic that would be more expensive than just
1526 // emitting the srl/and pair.
1527 return false;
1528 }
1529 } else {
1530 // Do not handle the case where the LHS of the and is not a shift. While
1531 // it would be trivial to handle this case, it would just transform
1532 // 'and' -> 'bfe', but 'and' has higher throughput.
1533 return false;
1534 }
1535 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
1536 if (LHS->getOpcode() == ISD::AND) {
1537 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
1538 if (!ShiftCnst) {
1539 // Shift amount must be constant
1540 return false;
1541 }
1542
1543 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
1544
1545 SDValue AndLHS = LHS->getOperand(0);
1546 SDValue AndRHS = LHS->getOperand(1);
1547
1548 // Canonicalize the AND to have the mask on the RHS
1549 if (isa<ConstantSDNode>(AndLHS)) {
1550 std::swap(AndLHS, AndRHS);
1551 }
1552
1553 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
1554 if (!MaskCnst) {
1555 // Mask must be constant
1556 return false;
1557 }
1558
1559 uint64_t MaskVal = MaskCnst->getZExtValue();
1560 uint64_t NumZeros;
1561 uint64_t NumBits;
1562 if (isMask_64(MaskVal)) {
1563 NumZeros = 0;
1564 // The number of bits in the result bitfield will be the number of
1565 // trailing ones (the AND) minus the number of bits we shift off
1566 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
1567 } else if (isShiftedMask_64(MaskVal)) {
1568 NumZeros = llvm::countr_zero(MaskVal);
1569 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
1570 // The number of bits in the result bitfield will be the number of
1571 // trailing zeros plus the number of set bits in the mask minus the
1572 // number of bits we shift off
1573 NumBits = NumZeros + NumOnes - ShiftAmt;
1574 } else {
1575 // This is not a mask we can handle
1576 return false;
1577 }
1578
1579 if (ShiftAmt < NumZeros) {
1580 // Handling this case would require extra logic that would make this
1581 // transformation non-profitable
1582 return false;
1583 }
1584
1585 Val = AndLHS;
1586 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
1587 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
1588
1589 // If pre-shift AND includes the sign bit in the bitfield, we must use
1590 // signed BFE to replicate that bit during bitfield extraction. If the
1591 // sign bit is not part of the mask, unsigned BFE will zero out upper bits
1592 // of the result
1593 if (N->getOpcode() == ISD::SRA)
1594 IsSigned = (ShiftAmt + NumBits) == Val.getValueSizeInBits();
1595 } else if (LHS->getOpcode() == ISD::SHL) {
1596 // Here, we have a pattern like:
1597 //
1598 // (sra (shl val, NN), MM)
1599 // or
1600 // (srl (shl val, NN), MM)
1601 //
1602 // If MM >= NN, we can efficiently optimize this with bfe
1603 Val = LHS->getOperand(0);
1604
1605 SDValue ShlRHS = LHS->getOperand(1);
1606 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
1607 if (!ShlCnst) {
1608 // Shift amount must be constant
1609 return false;
1610 }
1611 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
1612
1613 SDValue ShrRHS = RHS;
1614 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
1615 if (!ShrCnst) {
1616 // Shift amount must be constant
1617 return false;
1618 }
1619 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
1620
1621 // To avoid extra codegen and be profitable, we need Outer >= Inner
1622 if (OuterShiftAmt < InnerShiftAmt) {
1623 return false;
1624 }
1625
1626 // If the outer shift is more than the type size, we have no bitfield to
1627 // extract (since we also check that the inner shift is <= the outer shift
1628 // then this also implies that the inner shift is < the type size)
1629 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
1630 return false;
1631 }
1632
1633 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
1634 MVT::i32);
1635 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
1636 DL, MVT::i32);
1637
1638 if (N->getOpcode() == ISD::SRA) {
1639 // If we have an arithmetic right shift, we need to use the signed bfe
1640 // variant
1641 IsSigned = true;
1642 }
1643 } else {
1644 // No can do...
1645 return false;
1646 }
1647 } else {
1648 // No can do...
1649 return false;
1650 }
1651
1652
1653 unsigned Opc;
1654 // For the BFE operations we form here from "and" and "srl", always use the
1655 // unsigned variants.
1656 if (Val.getValueType() == MVT::i32) {
1657 if (IsSigned) {
1658 Opc = NVPTX::BFE_S32rii;
1659 } else {
1660 Opc = NVPTX::BFE_U32rii;
1661 }
1662 } else if (Val.getValueType() == MVT::i64) {
1663 if (IsSigned) {
1664 Opc = NVPTX::BFE_S64rii;
1665 } else {
1666 Opc = NVPTX::BFE_U64rii;
1667 }
1668 } else {
1669 // We cannot handle this type
1670 return false;
1671 }
1672
1673 SDValue Ops[] = {
1674 Val, Start, Len
1675 };
1676
1677 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
1678 return true;
1679}
1680
1681// Select bf16/bf16v2 FADD, FSUB, FMUL as fma on targets with only fma
1682bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
1683 EVT VT = SDValue(N, 0).getValueType();
1684 if (VT.getScalarType() != MVT::bf16)
1685 return false;
1686
1687 const NVPTXSubtarget *STI = TM.getSubtargetImpl();
1688 if (STI->hasNativeBF16Support(N->getOpcode()))
1689 return false;
1690
1691 const bool IsVec = VT.isVector();
1692 assert(!IsVec || VT.getVectorNumElements() == 2);
1693 SDLoc DL(N);
1694 SDValue N0 = N->getOperand(0);
1695 SDValue N1 = N->getOperand(1);
1696 SmallVector<SDValue, 3> Operands;
1697 auto GetConstant = [&](float Value) -> SDValue {
1698 // BF16 immediates must be legalized to integer register values
1699 APFloat APF(Value);
1700 bool LosesInfo;
1701 APF.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &LosesInfo);
1702 assert(!LosesInfo);
1703 if (IsVec) {
1704 auto API = APF.bitcastToAPInt();
1705 API = API.concat(API);
1706 auto Const = CurDAG->getTargetConstant(API, DL, MVT::i32);
1707 return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const),
1708 0);
1709 }
1710 auto Const = CurDAG->getTargetConstantFP(APF, DL, VT);
1711 return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0);
1712 };
1713
1714 switch (N->getOpcode()) {
1715 case ISD::FADD:
1716 // add(a, b) -> fma(a, 1.0, b)
1717 Operands = {N0, GetConstant(1.0), N1};
1718 break;
1719 case ISD::FSUB:
1720 // sub(a, b) -> fma(b, -1.0, a)
1721 Operands = {N1, GetConstant(-1.0), N0};
1722 break;
1723 case ISD::FMUL:
1724 // mul(a, b) -> fma(a, b, -0.0)
1725 // NOTE: The identity is -0, not 0, because -0 + 0 == 0 for floats
1726 Operands = {N0, N1, GetConstant(-0.0)};
1727 break;
1728 default:
1729 llvm_unreachable("Unexpected opcode");
1730 };
1731
1732 int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
1733 MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, DL, VT, Operands);
1734 ReplaceNode(N, FMA);
1735 return true;
1736}
1737
1738SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
1739 if (V.getOpcode() == ISD::BITCAST)
1740 V = V.getOperand(0);
1741
1742 if (auto *CN = dyn_cast<ConstantSDNode>(V))
1743 return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
1744 V.getValueType());
1745 if (auto *CN = dyn_cast<ConstantFPSDNode>(V))
1746 return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),
1747 V.getValueType());
1748 return V;
1749}
1750
1751/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
1752/// inline asm expressions.
1753 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
1754 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
1755 std::vector<SDValue> &OutOps) {
1756 switch (ConstraintID) {
1757 default:
1758 return true;
1759 case InlineAsm::ConstraintCode::m: { // memory
1760 const auto [Base, Offset] = selectADDR(Op, CurDAG);
1761 OutOps.push_back(Base);
1762 OutOps.push_back(Offset);
1763 return false;
1764 }
1765 }
1766 return true;
1767}
1768
1769void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
1770 // Lower a CopyToReg with two 64-bit inputs
1771 // Dst:i128, lo:i64, hi:i64
1772 //
1773 // CopyToReg Dst, lo, hi;
1774 //
1775 // ==>
1776 //
1777 // tmp = V2I64toI128 {lo, hi};
1778 // CopyToReg Dst, tmp;
1779 SDValue Dst = N->getOperand(1);
1780 SDValue Lo = N->getOperand(2);
1781 SDValue Hi = N->getOperand(3);
1782
1783 SDLoc DL(N);
1784 SDNode *Mov =
1785 CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
1786
1787 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
1788 NewOps[0] = N->getOperand(0);
1789 NewOps[1] = Dst;
1790 NewOps[2] = SDValue(Mov, 0);
1791 if (N->getNumOperands() == 5)
1792 NewOps[3] = N->getOperand(4);
1793 SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
1794
1795 ReplaceNode(N, NewValue.getNode());
1796}
1797
1798void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
1799 // Lower CopyFromReg from a 128-bit reg to two 64-bit regs
1800 // Dst:i128, Src:i128
1801 //
1802 // {lo, hi} = CopyFromReg Src
1803 //
1804 // ==>
1805 //
1806 // {lo, hi} = I128toV2I64 Src
1807 //
1808 SDValue Ch = N->getOperand(0);
1809 SDValue Src = N->getOperand(1);
1810 SDValue Glue = N->getOperand(2);
1811 SDLoc DL(N);
1812
1813 // Add Glue and Ch to the operands and results to avoid breaking the
1814 // execution order
1815 SDNode *Mov = CurDAG->getMachineNode(
1816 NVPTX::I128toV2I64, DL,
1817 {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
1818 {Src, Ch, Glue});
1819
1820 ReplaceNode(N, Mov);
1821}
1822
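// Select the PTX fence instruction that matches the ordering and sync scope
// carried by an ISD::ATOMIC_FENCE node.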
1823bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
1824 SDLoc DL(N);
1825 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
1826 unsigned int FenceOp =
1827 getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
1828 Scopes[N->getConstantOperandVal(2)], Subtarget);
1829 SDValue Chain = N->getOperand(0);
1830 SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
1831 ReplaceNode(N, FenceNode);
1832 return true;
1833}
1834
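// Populate the mapping from LLVM sync-scope IDs to the PTX scope levels used
// when emitting fences and atomics.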
1835 NVPTXScopes::NVPTXScopes(LLVMContext &C) {
1836 Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
1837 Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
1838 Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
1839 Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
1840 Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
1841}
1842
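// Translate a sync-scope ID into its PTX scope, reporting a fatal error for
// IDs that were not registered in the constructor.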
1843 NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
1844 if (Scopes.empty())
1845 llvm_unreachable("NVPTX Scopes must be initialized before calling "
1846 "NVPTXScopes::operator[]");
1847
1848 auto S = Scopes.find(ID);
1849 if (S == Scopes.end()) {
1850 // TODO:
1851 // - Add API to LLVMContext to get the name of a single scope.
1852 // - Use that API here to print an error containing the name
1853 // of this Unknown ID.
1854 report_fatal_error(formatv("Could not find scope ID={}.", int(ID)));
1855 }
1856 return S->second;
1857}
1858
1859bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
1860
1861#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \
1862 (is_s32 \
1863 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
1864 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
1865
1866#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32) \
1867 (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
1868 : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
1869
1870#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
1871 [&]() -> auto { \
1872 if (is_mc && is_ch) \
1873 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \
1874 if (is_ch) \
1875 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \
1876 if (is_mc) \
1877 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \
1878 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
1879 }()
1880
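// Pick the shared-to-global bulk-tensor reduction opcode for the given
// dimensionality, tile/im2col mode, cache-hint use, and shared-pointer width.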
1881 static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
1882 bool IsShared32,
1883 bool IsCacheHint,
1884 bool IsIm2Col) {
1885 if (IsIm2Col) {
1886 switch (Dim) {
1887 case 3:
1888 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
1889 IsShared32);
1890 case 4:
1891 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
1892 IsShared32);
1893 case 5:
1894 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
1895 IsShared32);
1896 default:
1897 llvm_unreachable("Invalid Dimension in im2col mode for "
1898 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1899 }
1900 } else {
1901 switch (Dim) {
1902 case 1:
1903 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
1904 IsShared32);
1905 case 2:
1906 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
1907 IsShared32);
1908 case 3:
1909 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
1910 IsShared32);
1911 case 4:
1912 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
1913 IsShared32);
1914 case 5:
1915 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
1916 IsShared32);
1917 default:
1918 llvm_unreachable("Invalid Dimension in tile mode for "
1919 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1920 }
1921 }
1922}
1923
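// Pick the global-to-shared bulk-tensor copy opcode for the given
// dimensionality, tile/im2col mode, multicast/cache-hint use, and
// shared-pointer width.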
1924static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
1925 bool IsMultiCast,
1926 bool IsCacheHint, bool IsIm2Col) {
1927 if (IsIm2Col) {
1928 switch (Dim) {
1929 case 3:
1930 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
1931 IsCacheHint, IsShared32);
1932 case 4:
1933 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
1934 IsCacheHint, IsShared32);
1935 case 5:
1936 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
1937 IsCacheHint, IsShared32);
1938 default:
1939 llvm_unreachable("Invalid Dimension in im2col mode for "
1940 "GetCpAsyncBulkTensorG2SOpcode.");
1941 }
1942 } else {
1943 switch (Dim) {
1944 case 1:
1945 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
1946 IsCacheHint, IsShared32);
1947 case 2:
1948 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
1949 IsCacheHint, IsShared32);
1950 case 3:
1951 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
1952 IsCacheHint, IsShared32);
1953 case 4:
1954 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
1955 IsCacheHint, IsShared32);
1956 case 5:
1957 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
1958 IsCacheHint, IsShared32);
1959 default:
1961 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
1962 }
1963 }
1964}
1965
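// Recover the tensor dimensionality encoded in an im2col g2s/prefetch
// intrinsic ID.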
1966static size_t GetDimsFromIntrinsic(unsigned IID) {
1967 switch (IID) {
1968 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
1969 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
1970 return 3;
1971 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
1972 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
1973 return 4;
1974 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
1975 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
1976 return 5;
1977 default:
1978 llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
1979 }
1980}
1981
1982void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
1983 bool IsIm2Col) {
1984 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
1985 // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2},
1986 // multicast, cache_hint,
1987 // multicast_flag, cache_hint_flag, cta_group_flag}
1988 // NumOperands = {Chain, IID} + {Actual intrinsic args}
1989 // = {2} + {8 + dims + im2col_offsets}
1990 size_t NumOps = N->getNumOperands();
1991 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
1992 : (NumOps - 10);
1993 // The number of offsets is always 'NumDims - 2', and offsets are used only in im2col mode
1994 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
1995 bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
1996 bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
1997 size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
1998 size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
1999
2000 unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
2001 if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
2003 formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
2004 Subtarget->getSmVersion()));
2005
2006 SDLoc DL(N);
2007 SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
2008
2009 // Push MultiCast operand, if available
2010 if (IsMultiCast)
2011 Ops.push_back(N->getOperand(MultiCastIdx));
2012
2013 // Push CacheHint operand, if available
2014 if (IsCacheHint)
2015 Ops.push_back(N->getOperand(MultiCastIdx + 1));
2016
2017 // Flag for CTA Group
2018 Ops.push_back(getI32Imm(CTAGroupVal, DL));
2019
2020 // Finally, the chain operand
2021 Ops.push_back(N->getOperand(0));
2022
2023 bool IsShared32 =
2024 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2025 unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
2026 NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
2027 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2028}
2029
2030void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
2031 unsigned RedOp,
2032 bool IsIm2Col) {
2033 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2034 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2035 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2036 // = {2} + {4 + dims}
2037 size_t NumOps = N->getNumOperands();
2038 size_t NumDims = NumOps - 6;
2039 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2040 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // {src, dst} + optional cache_hint
2041
2042 SDLoc DL(N);
2043 SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
2044 Ops.push_back(getI32Imm(RedOp, DL)); // Reduction Op
2045 Ops.push_back(N->getOperand(0)); // Chain operand
2046
2047 bool IsShared32 =
2048 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2049 unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
2050 NumDims, IsShared32, IsCacheHint, IsIm2Col);
2051 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2052}
2053
2054#define TCGEN05_ST_OPCODE(SHAPE, NUM) \
2055 (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK \
2056 : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
2057
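// Map a tcgen05.st intrinsic ID to the corresponding machine opcode,
// optionally selecting the .unpack variant.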
2058static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
2059 switch (IID) {
2060 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2061 return TCGEN05_ST_OPCODE(16x64b, x1);
2062 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2063 return TCGEN05_ST_OPCODE(16x64b, x2);
2064 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2065 return TCGEN05_ST_OPCODE(16x64b, x4);
2066 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2067 return TCGEN05_ST_OPCODE(16x64b, x8);
2068 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2069 return TCGEN05_ST_OPCODE(16x64b, x16);
2070 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2071 return TCGEN05_ST_OPCODE(16x64b, x32);
2072 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2073 return TCGEN05_ST_OPCODE(16x64b, x64);
2074 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2075 return TCGEN05_ST_OPCODE(16x64b, x128);
2076 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2077 return TCGEN05_ST_OPCODE(16x128b, x1);
2078 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2079 return TCGEN05_ST_OPCODE(16x128b, x2);
2080 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2081 return TCGEN05_ST_OPCODE(16x128b, x4);
2082 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2083 return TCGEN05_ST_OPCODE(16x128b, x8);
2084 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2085 return TCGEN05_ST_OPCODE(16x128b, x16);
2086 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2087 return TCGEN05_ST_OPCODE(16x128b, x32);
2088 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2089 return TCGEN05_ST_OPCODE(16x128b, x64);
2090 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2091 return TCGEN05_ST_OPCODE(16x256b, x1);
2092 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2093 return TCGEN05_ST_OPCODE(16x256b, x2);
2094 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2095 return TCGEN05_ST_OPCODE(16x256b, x4);
2096 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2097 return TCGEN05_ST_OPCODE(16x256b, x8);
2098 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2099 return TCGEN05_ST_OPCODE(16x256b, x16);
2100 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2101 return TCGEN05_ST_OPCODE(16x256b, x32);
2102 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2103 return TCGEN05_ST_OPCODE(16x32bx2, x1);
2104 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2105 return TCGEN05_ST_OPCODE(16x32bx2, x2);
2106 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2107 return TCGEN05_ST_OPCODE(16x32bx2, x4);
2108 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2109 return TCGEN05_ST_OPCODE(16x32bx2, x8);
2110 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2111 return TCGEN05_ST_OPCODE(16x32bx2, x16);
2112 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2113 return TCGEN05_ST_OPCODE(16x32bx2, x32);
2114 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2115 return TCGEN05_ST_OPCODE(16x32bx2, x64);
2116 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2117 return TCGEN05_ST_OPCODE(16x32bx2, x128);
2118 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2119 return TCGEN05_ST_OPCODE(32x32b, x1);
2120 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2121 return TCGEN05_ST_OPCODE(32x32b, x2);
2122 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2123 return TCGEN05_ST_OPCODE(32x32b, x4);
2124 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2125 return TCGEN05_ST_OPCODE(32x32b, x8);
2126 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2127 return TCGEN05_ST_OPCODE(32x32b, x16);
2128 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2129 return TCGEN05_ST_OPCODE(32x32b, x32);
2130 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2131 return TCGEN05_ST_OPCODE(32x32b, x64);
2132 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2133 return TCGEN05_ST_OPCODE(32x32b, x128);
2134 }
2135 llvm_unreachable("unhandled tcgen05.st lowering");
2136}
2137
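// Select a tcgen05.st intrinsic: gather the tensor address, the optional
// immediate offset, and the value operands, then emit the opcode chosen by
// getTcgen05StOpcode.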
2138void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
2139 SDLoc DL(N);
2140 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2141
2142 SmallVector<SDValue, 128> Operands = {
2143 N->getOperand(2) // taddr
2144 };
2145
2146 if (hasOffset)
2147 Operands.push_back(CurDAG->getTargetConstant(
2148 cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL,
2149 MVT::i32)); // Offset
2150
2151 for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
2152 Operands.push_back(N->getOperand(I));
2153
2154 bool enableUnpack =
2155 cast<ConstantSDNode>(N->getOperand(N->getNumOperands() - 1))
2156 ->getZExtValue();
2157
2158 Operands.push_back(N->getOperand(0)); // Chain
2159 ReplaceNode(N, CurDAG->getMachineNode(getTcgen05StOpcode(IID, enableUnpack),
2160 DL, N->getVTList(), Operands));
2161}
2162
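// Custom-select side-effecting NVPTX intrinsics (bulk-tensor copies and
// reductions, tcgen05.st); returns false to fall back to table-generated
// selection for everything else.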
2163bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
2164 unsigned IID = N->getConstantOperandVal(1);
2165 using TMARedTy = llvm::nvvm::TMAReductionOp;
2166 auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
2167 switch (IID) {
2168 default:
2169 return false;
2170 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
2171 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
2172 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
2173 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
2174 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
2175 SelectCpAsyncBulkTensorG2SCommon(N);
2176 return true;
2177 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
2178 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
2179 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
2180 SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
2181 return true;
2182 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
2183 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
2184 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
2185 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
2186 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
2187 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
2188 return true;
2189 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
2190 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
2191 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
2192 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
2193 /*IsIm2Col=*/true);
2194 return true;
2195 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
2196 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
2197 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
2198 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
2199 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
2200 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
2201 return true;
2202 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
2203 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
2204 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
2205 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
2206 /*IsIm2Col=*/true);
2207 return true;
2208 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
2209 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
2210 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
2211 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
2212 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
2213 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
2214 return true;
2215 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
2216 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
2217 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
2218 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
2219 /*IsIm2Col=*/true);
2220 return true;
2221 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
2222 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
2223 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
2224 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
2225 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
2226 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
2227 return true;
2228 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
2229 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
2230 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
2231 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
2232 /*IsIm2Col=*/true);
2233 return true;
2234 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
2235 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
2236 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
2237 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
2238 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
2239 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
2240 return true;
2241 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
2242 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
2243 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
2244 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
2245 /*IsIm2Col=*/true);
2246 return true;
2247 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
2248 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
2249 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
2250 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
2251 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
2252 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
2253 return true;
2254 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
2255 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
2256 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
2257 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
2258 /*IsIm2Col=*/true);
2259 return true;
2260 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
2261 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
2262 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
2263 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
2264 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
2265 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
2266 return true;
2267 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
2268 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
2269 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
2270 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
2271 /*IsIm2Col=*/true);
2272 return true;
2273 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
2274 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
2275 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
2276 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
2277 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
2278 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
2279 return true;
2280 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
2281 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
2282 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
2283 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
2284 /*IsIm2Col=*/true);
2285 return true;
2286
2287 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2288 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2289 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2290 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2291 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2292 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2293 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2294 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2295 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2296 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2297 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2298 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2299 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2300 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2301 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2302 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2303 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2304 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2305 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2306 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2307 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2308 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2309 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2310 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2311 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2312 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2313 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2314 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2315 case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
2316 SelectTcgen05St(N);
2317 return true;
2318 }
2319
2320 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2321 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2322 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2323 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2324 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2325 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2326 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2327 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
2328 SelectTcgen05St(N, /* hasOffset */ true);
2329 return true;
2330 }
2331 }
2332}
2333
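// Select 128-bit atomic exchange / compare-and-swap: lower the address via
// selectADDR and append the memory-ordering, scope, and address-space
// immediates expected by ATOM_EXCH_B128 / ATOM_CAS_B128.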
2334void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
2335 MemSDNode *AN = cast<MemSDNode>(N);
2336 SDLoc dl(N);
2337
2338 const SDValue Chain = N->getOperand(0);
2339 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
2340 SmallVector<SDValue> Ops({Base, Offset});
2341 Ops.append(N->op_begin() + 2, N->op_end());
2342 Ops.append({
2343 getI32Imm(getMemOrder(AN), dl),
2344 getI32Imm(getAtomicScope(AN), dl),
2345 getI32Imm(getAddrSpace(AN), dl),
2346 Chain,
2347 });
2348
2349 assert(N->getOpcode() == NVPTXISD::ATOMIC_CMP_SWAP_B128 ||
2350 N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128);
2351 unsigned Opcode = N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128
2352 ? NVPTX::ATOM_EXCH_B128
2353 : NVPTX::ATOM_CAS_B128;
2354
2355 auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);
2356 CurDAG->setNodeMemRefs(ATOM, AN->getMemOperand());
2357
2358 ReplaceNode(N, ATOM);
2359}