
LLVM 22.0.0git
SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUTargetMachine.h"
19#include "GCNSubtarget.h"
22#include "SIRegisterInfo.h"
23#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/Statistic.h"
39#include "llvm/IR/IRBuilder.h"
41#include "llvm/IR/IntrinsicsAMDGPU.h"
42#include "llvm/IR/IntrinsicsR600.h"
43#include "llvm/IR/MDBuilder.h"
46#include "llvm/Support/ModRef.h"
48#include <optional>
49
50using namespace llvm;
51using namespace llvm::SDPatternMatch;
52
53#define DEBUG_TYPE "si-lower"
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57static cl::opt<bool>
58 DisableLoopAlignment("amdgpu-disable-loop-alignment",
59 cl::desc("Do not align and prefetch loops"),
60 cl::init(false));
61
63 "amdgpu-use-divergent-register-indexing", cl::Hidden,
64 cl::desc("Use indirect register addressing for divergent indexes"),
65 cl::init(false));
66
67// TODO: This option should be removed once we switch to always using PTRADD in
68// the SelectionDAG.
70 "amdgpu-use-sdag-ptradd", cl::Hidden,
71 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
72 "SelectionDAG ISel"),
73 cl::init(false));
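// Usage sketch (assuming the standard llc driver; the invocation below is
// illustrative, not taken from this file): cl::opt flags like these can be
// toggled on the command line for debugging, e.g.
//   llc -mtriple=amdgcn -amdgpu-use-sdag-ptradd foo.ll
// and otherwise fall back to their cl::init(...) defaults.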
74
77 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
78}
79
82 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
83}
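// Note (hedged, not from the source): DenormalMode::getPreserveSign() is the
// sign-preserving flush-to-zero mode, so these helpers answer whether f32
// (respectively f64/f16) denormals are flushed for the current function.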
84
85static unsigned findFirstFreeSGPR(CCState &CCInfo) {
86 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
87 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
88 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
89 return AMDGPU::SGPR0 + Reg;
90 }
91 }
92 llvm_unreachable("Cannot allocate sgpr");
93}
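// Illustrative use (a sketch, not code from this file): callers typically pair
// this with CCState::AllocateReg to claim the returned register, e.g.
//   unsigned Reg = findFirstFreeSGPR(CCInfo);
//   CCInfo.AllocateReg(Reg);
// after which CCInfo.isAllocated(Reg) returns true for later queries.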
94
95SITargetLowering::SITargetLowering(const TargetMachine &TM,
96 const GCNSubtarget &STI)
97 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
98 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
99 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
100
101 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
102 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
103
104 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
105
106 const SIRegisterInfo *TRI = STI.getRegisterInfo();
107 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
108
109 addRegisterClass(MVT::f64, V64RegClass);
110 addRegisterClass(MVT::v2f32, V64RegClass);
111 addRegisterClass(MVT::Untyped, V64RegClass);
112
113 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
114 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
115
116 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
117 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
118
119 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
120 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
121
122 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
123 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
124
125 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
126 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
127
128 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
129 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
130
131 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
132 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
133
134 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
135 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
136
137 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
138 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
139
140 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
141 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
142
143 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
144 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
148
149 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
151
152 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
153 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
154
155 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
156 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
157
158 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
159 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
160
161 if (Subtarget->has16BitInsts()) {
162 if (Subtarget->useRealTrue16Insts()) {
163 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
165 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
166 } else {
167 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
170 }
171
172 // Unless there are also VOP3P operations, no operations are really legal.
173 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
176 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
179 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
182 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
185 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
187 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
188 }
189
190 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
191 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
192
193 computeRegisterProperties(Subtarget->getRegisterInfo());
194
195 // The boolean content concept here is too inflexible. Compares only ever
196 // really produce a 1-bit result. Any copy/extend from these will turn into a
197 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
198 // it's what most targets use.
201
202 // We need to custom lower vector stores from local memory
203 setOperationAction(ISD::LOAD,
204 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
205 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
206 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
207 MVT::i1, MVT::v32i32},
208 Custom);
209
210 setOperationAction(ISD::STORE,
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
215 Custom);
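  // Background sketch (handler names assumed from common LLVM practice, not
  // necessarily the exact code later in this file): Custom routes these nodes
  // through SITargetLowering::LowerOperation during legalization, roughly
  //   switch (Op.getOpcode()) {
  //   case ISD::LOAD:  return LowerLOAD(Op, DAG);
  //   case ISD::STORE: return LowerSTORE(Op, DAG);
  //   ...
  //   }
  // whereas Expand falls back to target-independent expansion.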
216
217 if (isTypeLegal(MVT::bf16)) {
218 for (unsigned Opc :
220 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
221 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
222 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
223 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
224 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
225 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
226 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
227 ISD::SETCC}) {
228 // FIXME: The promoted-to type shouldn't need to be explicit
229 setOperationAction(Opc, MVT::bf16, Promote);
230 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
231 }
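    // Hedged illustration of Promote + AddPromotedToType(Opc, bf16, f32): a
    // node such as (fadd bf16 a, b) is legalized to
    //   (fp_round (fadd f32 (fp_extend a), (fp_extend b)))
    // i.e. the math is carried out in f32 and rounded back to bf16.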
232
234
236 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
237
238 setOperationAction(ISD::FABS, MVT::bf16, Legal);
239 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
241
242 // We only need to custom lower because we can't specify an action for bf16
243 // sources.
246 }
247
248 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
249 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
250 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
251 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
252 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
253 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
254 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
259 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
264
265 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
266 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
267 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
270 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
271 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
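  // Hedged note: Expand here means no single truncating vector store is formed
  // for these combinations; the truncation stays an explicit node (or the
  // store is split) ahead of a plain store of the narrower type.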
272
273 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
274
278 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
279
280 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
281
283 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
284
286 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
287 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
288
290 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
291 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
292 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
293 Expand);
295 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
296 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
297 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
298 Expand);
299
301 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
302 MVT::v3i16, MVT::v4i16, MVT::Other},
303 Custom);
304
305 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
306 setOperationAction(ISD::BR_CC,
307 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
308
310
312
314 Expand);
315
316#if 0
318#endif
319
320 // We only support LOAD/STORE and vector manipulation ops for vectors
321 // with > 4 elements.
322 for (MVT VT :
323 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
324 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
325 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
326 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
327 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
328 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
329 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
330 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
331 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
332 switch (Op) {
333 case ISD::LOAD:
334 case ISD::STORE:
336 case ISD::BITCAST:
337 case ISD::UNDEF:
341 case ISD::IS_FPCLASS:
342 break;
347 break;
348 default:
350 break;
351 }
352 }
353 }
354
355 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
356
357 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
358 // is expanded to avoid having two separate loops in case the index is a VGPR.
359
360 // Most operations are naturally 32-bit vector operations. We only support
361 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
362 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
364 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
365
367 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
368
370 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
371
373 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
374 }
375
376 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
378 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
379
381 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
382
384 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
385
387 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
388 }
389
390 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
392 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
393
395 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
396
398 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
399
401 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
402 }
403
404 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
406 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
407
409 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
410
412 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
413
415 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
416 }
417
418 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
420 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
421
423 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
424
426 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
427
429 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
430 }
431
433 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
434 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
435 Custom);
436
437 if (Subtarget->hasPkMovB32()) {
438 // TODO: 16-bit element vectors should be legal with even aligned elements.
439 // TODO: Can be legal with wider source types than the result with
440 // subregister extracts.
441 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
442 }
443
445 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
446 // instead lower to cndmask in SITargetLowering::LowerSELECT().
448 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
449 // alignbit.
450 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
451
452 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
453 Custom);
454
455 // Avoid stack access for these.
456 // TODO: Generalize to more vector types.
458 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
459 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
460 Custom);
461
462 // Deal with vec3 vector operations when widened to vec4.
464 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
465
466 // Deal with vec5/6/7 vector operations when widened to vec8.
468 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
469 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
470 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
471 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
472 Custom);
473
474 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
475 // and output demarshalling
476 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
477
478 // We can't return success/failure, only the old value,
479 // let LLVM add the comparison
480 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
481 Expand);
482
483 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
484
485 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
486
487 // FIXME: This should be narrowed to i32, but that only happens if i64 is
488 // illegal.
489 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
490 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
491
492 // This is s_memtime on SI and s_memrealtime on VI.
493 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
494
495 if (Subtarget->hasSMemRealTime() ||
496 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
497 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
498 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
499
500 if (Subtarget->has16BitInsts()) {
501 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
502 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
503 } else {
504 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
505 }
506
507 if (Subtarget->hasMadMacF32Insts())
509
510 if (!Subtarget->hasBFI())
511 // fcopysign can be done in a single instruction with BFI.
512 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
513
514 if (!Subtarget->hasBCNT(32))
516
517 if (!Subtarget->hasBCNT(64))
519
520 if (Subtarget->hasFFBH())
522
523 if (Subtarget->hasFFBL())
525
526 // We only really have 32-bit BFE instructions (and 16-bit on VI).
527 //
528 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
529 // effort to match them now. We want this to be false for i64 cases when the
530 // extraction isn't restricted to the upper or lower half. Ideally we would
531 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
532 // span the midpoint are probably relatively rare, so don't worry about them
533 // for now.
534 if (Subtarget->hasBFE())
536
537 // Clamp modifier on add/sub
538 if (Subtarget->hasIntClamp())
540
541 if (Subtarget->hasAddNoCarry())
542 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
543 Legal);
544
546 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
547 {MVT::f32, MVT::f64}, Custom);
548
549 // These are really only legal for ieee_mode functions. We should be avoiding
550 // them for functions that don't have ieee_mode enabled, so just say they are
551 // legal.
552 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
553 {MVT::f32, MVT::f64}, Legal);
554
555 if (Subtarget->haveRoundOpsF64())
556 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
557 Legal);
558 else
559 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
560 MVT::f64, Custom);
561
562 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
563 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
564 Legal);
565 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
566
567 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
569
570 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
571 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
572
573 // Custom lower these because we can't specify a rule based on an illegal
574 // source bf16.
575 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
576 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
577
578 if (Subtarget->has16BitInsts()) {
581 MVT::i16, Legal);
582
583 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
584
586 MVT::i16, Expand);
587
591 ISD::CTPOP},
592 MVT::i16, Promote);
593
594 setOperationAction(ISD::LOAD, MVT::i16, Custom);
595
596 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
597
598 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
599 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
600 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
601 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
602
606
608
609 // F16 - Constant Actions.
612
613 // F16 - Load/Store Actions.
614 setOperationAction(ISD::LOAD, MVT::f16, Promote);
615 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
616 setOperationAction(ISD::STORE, MVT::f16, Promote);
617 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
618
619 // BF16 - Load/Store Actions.
620 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
621 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
622 setOperationAction(ISD::STORE, MVT::bf16, Promote);
623 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
624
625 // F16 - VOP1 Actions.
627 ISD::FSIN, ISD::FROUND},
628 MVT::f16, Custom);
629
630 // BF16 - VOP1 Actions.
631 if (Subtarget->hasBF16TransInsts())
632 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
633
636
637 // F16 - VOP2 Actions.
638 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
639 Expand);
640 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
641 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
643
644 // F16 - VOP3 Actions.
646 if (STI.hasMadF16())
648
649 for (MVT VT :
650 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
651 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
652 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
653 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
654 switch (Op) {
655 case ISD::LOAD:
656 case ISD::STORE:
658 case ISD::BITCAST:
659 case ISD::UNDEF:
664 case ISD::IS_FPCLASS:
665 break;
669 break;
670 default:
672 break;
673 }
674 }
675 }
676
677 // v_perm_b32 can handle either of these.
678 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
680
681 // XXX - Do these do anything? Vector constants turn into build_vector.
682 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
683
684 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
685 Legal);
686
687 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
688 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
689 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
690 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
691
692 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
694 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
695 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
696
697 setOperationAction(ISD::AND, MVT::v2i16, Promote);
698 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
699 setOperationAction(ISD::OR, MVT::v2i16, Promote);
700 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
701 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
702 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
703
704 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
705 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
706 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
708 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
710
711 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
712 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
713 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
714 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
715 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
716 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
717
718 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
719 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
720 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
722 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
724
725 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
726 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
727 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
728 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
729
730 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
732 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
733 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
734 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
735 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
736
737 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
738 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
739 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
740 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
741 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
742 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
743
744 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
745 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
746 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
747 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
748 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
749 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
750
751 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
752 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
753 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
754 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
755 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
756 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
757
758 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
759 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
760 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
761 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
762 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
763 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
764
766 MVT::v2i32, Expand);
767 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
768
770 MVT::v4i32, Expand);
771
773 MVT::v8i32, Expand);
774
775 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
776 Subtarget->hasVOP3PInsts() ? Legal : Custom);
777
778 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
779 // This isn't really legal, but this avoids the legalizer unrolling it (and
780 // allows matching fneg (fabs x) patterns)
781 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
782
783 // Can do this in one BFI plus a constant materialize.
785 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
786 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
787 MVT::v32f16, MVT::v32bf16},
788 Custom);
789
791 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
792 MVT::f16, Custom);
793 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
794
795 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
796 ISD::FMAXIMUMNUM},
797 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
798 Custom);
799
800 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
801 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
802 Expand);
803
804 for (MVT Vec16 :
805 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
806 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
809 Vec16, Custom);
811 }
812 }
813
814 if (Subtarget->hasVOP3PInsts()) {
818 MVT::v2i16, Legal);
819
820 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
821 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
822 MVT::v2f16, Legal);
823
825 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
826
828 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
829 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
830 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
831 Custom);
832
833 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
834 // Split vector operations.
839 VT, Custom);
840
841 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
842 // Split vector operations.
844 VT, Custom);
845
847 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
848 {MVT::v2f16, MVT::v4f16}, Custom);
849
850 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
851 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
852 Custom);
853
854 if (Subtarget->hasPackedFP32Ops()) {
856 MVT::v2f32, Legal);
858 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
859 Custom);
860 }
861 }
862
863 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
864
865 if (Subtarget->has16BitInsts()) {
867 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
869 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
870 } else {
871 // Legalization hack.
872 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
873
874 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
875 }
876
878 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
879 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
880 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
881 MVT::v32f16, MVT::v32bf16},
882 Custom);
883
885
886 if (Subtarget->hasVectorMulU64())
888 else if (Subtarget->hasScalarSMulU64())
890
891 if (Subtarget->hasMad64_32())
893
894 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
895 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
896
897 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
898 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
899 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
900 } else {
901 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
902 if (Subtarget->hasMinimum3Maximum3F32())
903 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
904
905 if (Subtarget->hasMinimum3Maximum3PKF16()) {
906 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
907
908 // If only the vector form is available, we need to widen to a vector.
909 if (!Subtarget->hasMinimum3Maximum3F16())
910 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
911 }
912 }
913
914 if (Subtarget->hasVOP3PInsts()) {
915 // We want to break these into v2f16 pieces, not scalarize.
916 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
917 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
918 Custom);
919 }
920
921 if (Subtarget->hasIntMinMax64())
923 Legal);
924
926 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
927 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
928 MVT::i8},
929 Custom);
930
932 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
933 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
934 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
935 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
936 Custom);
937
939 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
940 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
941 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
942 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
943 Custom);
944
945 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
947 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
948 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
949 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
950
951 // TODO: Could move this to custom lowering, could benefit from combines on
952 // extract of relevant bits.
953 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
954
956
957 if (Subtarget->hasBF16ConversionInsts()) {
958 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
960 }
961
962 if (Subtarget->hasBF16PackedInsts()) {
964 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
965 MVT::v2bf16, Legal);
966 }
967
968 if (Subtarget->hasBF16TransInsts()) {
969 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
970 }
971
972 if (Subtarget->hasCvtPkF16F32Inst()) {
974 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
975 Custom);
976 }
977
979 ISD::PTRADD,
981 ISD::SUB,
983 ISD::MUL,
984 ISD::FADD,
985 ISD::FSUB,
986 ISD::FDIV,
987 ISD::FMUL,
988 ISD::FMINNUM,
989 ISD::FMAXNUM,
990 ISD::FMINNUM_IEEE,
991 ISD::FMAXNUM_IEEE,
992 ISD::FMINIMUM,
993 ISD::FMAXIMUM,
994 ISD::FMINIMUMNUM,
995 ISD::FMAXIMUMNUM,
996 ISD::FMA,
997 ISD::SMIN,
998 ISD::SMAX,
999 ISD::UMIN,
1000 ISD::UMAX,
1001 ISD::SETCC,
1003 ISD::SMIN,
1004 ISD::SMAX,
1005 ISD::UMIN,
1006 ISD::UMAX,
1007 ISD::AND,
1008 ISD::OR,
1009 ISD::XOR,
1010 ISD::SHL,
1011 ISD::SRL,
1012 ISD::SRA,
1013 ISD::FSHR,
1023
1024 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1026
1027 // All memory operations. Some folding on the pointer operand is done to help
1028 // matching the constant offsets in the addressing modes.
1029 setTargetDAGCombine({ISD::LOAD,
1030 ISD::STORE,
1031 ISD::ATOMIC_LOAD,
1032 ISD::ATOMIC_STORE,
1033 ISD::ATOMIC_CMP_SWAP,
1034 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1035 ISD::ATOMIC_SWAP,
1036 ISD::ATOMIC_LOAD_ADD,
1037 ISD::ATOMIC_LOAD_SUB,
1038 ISD::ATOMIC_LOAD_AND,
1039 ISD::ATOMIC_LOAD_OR,
1040 ISD::ATOMIC_LOAD_XOR,
1041 ISD::ATOMIC_LOAD_NAND,
1042 ISD::ATOMIC_LOAD_MIN,
1043 ISD::ATOMIC_LOAD_MAX,
1044 ISD::ATOMIC_LOAD_UMIN,
1045 ISD::ATOMIC_LOAD_UMAX,
1046 ISD::ATOMIC_LOAD_FADD,
1047 ISD::ATOMIC_LOAD_FMIN,
1048 ISD::ATOMIC_LOAD_FMAX,
1049 ISD::ATOMIC_LOAD_UINC_WRAP,
1050 ISD::ATOMIC_LOAD_UDEC_WRAP,
1053
1054 // FIXME: In other contexts we pretend this is a per-function property.
1056
1058}
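// Hedged note: the setTargetDAGCombine calls above only record which opcodes
// are of interest; the DAG combiner then invokes
// SITargetLowering::PerformDAGCombine on matching nodes.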
1059
1060const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1061
1063 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1064 return RCRegs;
1065}
1066
1067//===----------------------------------------------------------------------===//
1068// TargetLowering queries
1069//===----------------------------------------------------------------------===//
1070
1071// v_mad_mix* support a conversion from f16 to f32.
1072//
1073// There is only one special case, when denormals are enabled, that we don't
1074// currently handle where this would still be OK to use.
1075bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1076 EVT DestVT, EVT SrcVT) const {
1077 return DestVT.getScalarType() == MVT::f32 &&
1078 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1079 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1080 SrcVT.getScalarType() == MVT::f16) ||
1081 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1082 SrcVT.getScalarType() == MVT::bf16)) &&
1083 // TODO: This probably only requires no input flushing?
1085}
1086
1088 LLT DestTy, LLT SrcTy) const {
1089 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1090 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1091 DestTy.getScalarSizeInBits() == 32 &&
1092 SrcTy.getScalarSizeInBits() == 16 &&
1093 // TODO: This probably only requires no input flushing?
1094 denormalModeIsFlushAllF32(*MI.getMF());
1095}
1096
1098 // SI has some legal vector types, but no legal vector operations. Say no
1099 // shuffles are legal in order to prefer scalarizing some vector operations.
1100 return false;
1101}
1102
1104 CallingConv::ID CC,
1105 EVT VT) const {
1107 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1108
1109 if (VT.isVector()) {
1110 EVT ScalarVT = VT.getScalarType();
1111 unsigned Size = ScalarVT.getSizeInBits();
1112 if (Size == 16) {
1113 if (Subtarget->has16BitInsts()) {
1114 if (VT.isInteger())
1115 return MVT::v2i16;
1116 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1117 }
1118 return VT.isInteger() ? MVT::i32 : MVT::f32;
1119 }
1120
1121 if (Size < 16)
1122 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1123 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1124 }
1125
1126 if (VT.getSizeInBits() > 32)
1127 return MVT::i32;
1128
1129 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1130}
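// Worked examples for the vector cases above (assuming a subtarget with 16-bit
// instructions): v4f16 -> v2f16, v4bf16 -> i32, v3i8 -> i16, and a scalar
// i128 -> i32.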
1131
1133 CallingConv::ID CC,
1134 EVT VT) const {
1136 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1137
1138 if (VT.isVector()) {
1139 unsigned NumElts = VT.getVectorNumElements();
1140 EVT ScalarVT = VT.getScalarType();
1141 unsigned Size = ScalarVT.getSizeInBits();
1142
1143 // FIXME: Should probably promote 8-bit vectors to i16.
1144 if (Size == 16 && Subtarget->has16BitInsts())
1145 return (NumElts + 1) / 2;
1146
1147 if (Size <= 32)
1148 return NumElts;
1149
1150 if (Size > 32)
1151 return NumElts * ((Size + 31) / 32);
1152 } else if (VT.getSizeInBits() > 32)
1153 return (VT.getSizeInBits() + 31) / 32;
1154
1155 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1156}
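// Worked examples derived from the arithmetic above: with 16-bit instructions
// v3f16 needs (3 + 1) / 2 = 2 registers, v5i32 needs 5, v2i64 needs
// 2 * ((64 + 31) / 32) = 4, and a scalar i128 needs (128 + 31) / 32 = 4.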
1157
1159 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1160 unsigned &NumIntermediates, MVT &RegisterVT) const {
1161 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1162 unsigned NumElts = VT.getVectorNumElements();
1163 EVT ScalarVT = VT.getScalarType();
1164 unsigned Size = ScalarVT.getSizeInBits();
1165 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1166 // support, but unless we can properly handle 3-vectors, it will still be
1167 // inconsistent.
1168 if (Size == 16 && Subtarget->has16BitInsts()) {
1169 if (ScalarVT == MVT::bf16) {
1170 RegisterVT = MVT::i32;
1171 IntermediateVT = MVT::v2bf16;
1172 } else {
1173 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1174 IntermediateVT = RegisterVT;
1175 }
1176 NumIntermediates = (NumElts + 1) / 2;
1177 return NumIntermediates;
1178 }
1179
1180 if (Size == 32) {
1181 RegisterVT = ScalarVT.getSimpleVT();
1182 IntermediateVT = RegisterVT;
1183 NumIntermediates = NumElts;
1184 return NumIntermediates;
1185 }
1186
1187 if (Size < 16 && Subtarget->has16BitInsts()) {
1188 // FIXME: Should probably form v2i16 pieces
1189 RegisterVT = MVT::i16;
1190 IntermediateVT = ScalarVT;
1191 NumIntermediates = NumElts;
1192 return NumIntermediates;
1193 }
1194
1195 if (Size != 16 && Size <= 32) {
1196 RegisterVT = MVT::i32;
1197 IntermediateVT = ScalarVT;
1198 NumIntermediates = NumElts;
1199 return NumIntermediates;
1200 }
1201
1202 if (Size > 32) {
1203 RegisterVT = MVT::i32;
1204 IntermediateVT = RegisterVT;
1205 NumIntermediates = NumElts * ((Size + 31) / 32);
1206 return NumIntermediates;
1207 }
1208 }
1209
1211 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1212}
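// Worked examples derived from the cases above (non-kernel calling
// convention): v5f16 with 16-bit instructions gives IntermediateVT =
// RegisterVT = v2f16 and NumIntermediates = (5 + 1) / 2 = 3; v3i64 gives
// RegisterVT = IntermediateVT = i32 and NumIntermediates = 3 * 2 = 6.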
1213
1215 const DataLayout &DL, Type *Ty,
1216 unsigned MaxNumLanes) {
1217 assert(MaxNumLanes != 0);
1218
1219 LLVMContext &Ctx = Ty->getContext();
1220 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1221 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1222 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1223 NumElts);
1224 }
1225
1226 return TLI.getValueType(DL, Ty);
1227}
1228
1229// Peek through TFE struct returns to only use the data size.
1231 const DataLayout &DL, Type *Ty,
1232 unsigned MaxNumLanes) {
1233 auto *ST = dyn_cast<StructType>(Ty);
1234 if (!ST)
1235 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1236
1237 // TFE intrinsics return an aggregate type.
1238 assert(ST->getNumContainedTypes() == 2 &&
1239 ST->getContainedType(1)->isIntegerTy(32));
1240 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1241}
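// Hedged example: a TFE image load returning { <4 x float>, i32 } reports a
// memVT of v4f32 here; the trailing i32 status word is not counted, and
// MaxNumLanes can shrink the vector further based on the dmask.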
1242
1243/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1244/// in-memory representation. This return value is a custom type because there
1245/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1246/// could cause issues during codegen, these address space 7 pointers will be
1247/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1248/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1249/// for cost modeling, to work. (This also sets us up decently for doing the
1250/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1252 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1253 return MVT::amdgpuBufferFatPointer;
1255 DL.getPointerSizeInBits(AS) == 192)
1256 return MVT::amdgpuBufferStridedPointer;
1258}
1259/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1260/// v8i32 when padding is added.
1261/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1262/// also v8i32 with padding.
1264 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1265 DL.getPointerSizeInBits(AS) == 160) ||
1267 DL.getPointerSizeInBits(AS) == 192))
1268 return MVT::v8i32;
1270}
1271
1272static unsigned getIntrMemWidth(unsigned IntrID) {
1273 switch (IntrID) {
1274 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1276 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1277 return 8;
1278 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1280 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1281 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1282 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1283 return 32;
1284 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1286 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1287 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1288 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1289 return 64;
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1293 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1294 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1295 return 128;
1296 default:
1297 llvm_unreachable("Unknown width");
1298 }
1299}
1300
1301static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1303 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1304 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1305 switch (AtomicOrderingCABI(Ord)) {
1308 break;
1311 break;
1314 break;
1315 default:
1317 break;
1318 }
1319
1320 Info.flags =
1322 Info.flags |= MOCooperative;
1323
1324 MDNode *ScopeMD = cast<MDNode>(
1325 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1326 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1327 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1328}
1329
1331 const CallInst &CI,
1332 MachineFunction &MF,
1333 unsigned IntrID) const {
1334 Info.flags = MachineMemOperand::MONone;
1335 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1336 Info.flags |= MachineMemOperand::MOInvariant;
1337 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1339 Info.flags |= getTargetMMOFlags(CI);
1340
1341 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1343 AttributeSet Attr =
1345 MemoryEffects ME = Attr.getMemoryEffects();
1346 if (ME.doesNotAccessMemory())
1347 return false;
1348
1349 // TODO: Should images get their own address space?
1350 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1351
1352 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1353 if (RsrcIntr->IsImage) {
1354 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1356 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1357 Info.align.reset();
1358 }
1359
1360 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1361 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1362 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1363 // We conservatively set the memory operand of a buffer intrinsic to the
1364 // base resource pointer, so that we can access alias information about
1365 // those pointers. Cases like "this points at the same value
1366 // but with a different offset" are handled in
1367 // areMemAccessesTriviallyDisjoint.
1368 Info.ptrVal = RsrcArg;
1369 }
1370
1371 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1372 if (!IsSPrefetch) {
1373 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1374 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1375 Info.flags |= MachineMemOperand::MOVolatile;
1376 }
1377
1379 if (ME.onlyReadsMemory()) {
1380 if (RsrcIntr->IsImage) {
1381 unsigned MaxNumLanes = 4;
1382
1383 if (!BaseOpcode->Gather4) {
1384 // If this isn't a gather, we may have excess loaded elements in the
1385 // IR type. Check the dmask for the real number of elements loaded.
1386 unsigned DMask =
1387 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1388 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1389 }
1390
1391 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1392 CI.getType(), MaxNumLanes);
1393 } else {
1394 Info.memVT =
1396 std::numeric_limits<unsigned>::max());
1397 }
1398
1399 // FIXME: What does alignment mean for an image?
1400 Info.opc = ISD::INTRINSIC_W_CHAIN;
1401 Info.flags |= MachineMemOperand::MOLoad;
1402 } else if (ME.onlyWritesMemory()) {
1403 Info.opc = ISD::INTRINSIC_VOID;
1404
1405 Type *DataTy = CI.getArgOperand(0)->getType();
1406 if (RsrcIntr->IsImage) {
1407 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1408 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1409 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1410 DMaskLanes);
1411 } else
1412 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1413
1414 Info.flags |= MachineMemOperand::MOStore;
1415 } else {
1416 // Atomic, no-return sampler, or prefetch
1417 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1419 Info.flags |=
1421
1422 if (!IsSPrefetch)
1423 Info.flags |= MachineMemOperand::MOStore;
1424
1425 switch (IntrID) {
1426 default:
1427 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1428 // Fake memory access type for no return sampler intrinsics
1429 Info.memVT = MVT::i32;
1430 } else {
1431 // XXX - Should this be volatile without known ordering?
1432 Info.flags |= MachineMemOperand::MOVolatile;
1433 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1434 }
1435 break;
1436 case Intrinsic::amdgcn_raw_buffer_load_lds:
1437 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_buffer_load_lds:
1439 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1440 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1441 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1442 Info.ptrVal = CI.getArgOperand(1);
1443 return true;
1444 }
1445 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1446 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1448 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1449 Info.memVT =
1451 std::numeric_limits<unsigned>::max());
1452 Info.flags &= ~MachineMemOperand::MOStore;
1453 return true;
1454 }
1455 }
1456 }
1457 return true;
1458 }
1459
1460 switch (IntrID) {
1461 case Intrinsic::amdgcn_ds_ordered_add:
1462 case Intrinsic::amdgcn_ds_ordered_swap: {
1463 Info.opc = ISD::INTRINSIC_W_CHAIN;
1464 Info.memVT = MVT::getVT(CI.getType());
1465 Info.ptrVal = CI.getOperand(0);
1466 Info.align.reset();
1468
1469 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1470 if (!Vol->isZero())
1471 Info.flags |= MachineMemOperand::MOVolatile;
1472
1473 return true;
1474 }
1475 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1476 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1479 Info.ptrVal = nullptr;
1480 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1482 return true;
1483 }
1484 case Intrinsic::amdgcn_ds_append:
1485 case Intrinsic::amdgcn_ds_consume: {
1486 Info.opc = ISD::INTRINSIC_W_CHAIN;
1487 Info.memVT = MVT::getVT(CI.getType());
1488 Info.ptrVal = CI.getOperand(0);
1489 Info.align.reset();
1491
1492 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1493 if (!Vol->isZero())
1494 Info.flags |= MachineMemOperand::MOVolatile;
1495
1496 return true;
1497 }
1498 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1499 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1500 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1503 Info.memVT = MVT::getVT(CI.getType());
1504 Info.ptrVal = CI.getOperand(0);
1505 Info.memVT = MVT::i64;
1506 Info.size = 8;
1507 Info.align.reset();
1509 return true;
1510 }
1511 case Intrinsic::amdgcn_global_atomic_csub: {
1512 Info.opc = ISD::INTRINSIC_W_CHAIN;
1513 Info.memVT = MVT::getVT(CI.getType());
1514 Info.ptrVal = CI.getOperand(0);
1515 Info.align.reset();
1518 return true;
1519 }
1520 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1522 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1523 Info.opc = ISD::INTRINSIC_W_CHAIN;
1524 Info.memVT =
1525 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1526 ? CI.getType()
1528 ->getElementType(0)); // XXX: what is correct VT?
1529
1530 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1531 Info.align.reset();
1532 Info.flags |=
1534 return true;
1535 }
1536 case Intrinsic::amdgcn_global_atomic_fmin_num:
1537 case Intrinsic::amdgcn_global_atomic_fmax_num:
1538 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1539 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1540 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1541 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1542 Info.opc = ISD::INTRINSIC_W_CHAIN;
1543 Info.memVT = MVT::getVT(CI.getType());
1544 Info.ptrVal = CI.getOperand(0);
1545 Info.align.reset();
1549 return true;
1550 }
1551 case Intrinsic::amdgcn_flat_load_monitor_b32:
1552 case Intrinsic::amdgcn_flat_load_monitor_b64:
1553 case Intrinsic::amdgcn_flat_load_monitor_b128:
1554 case Intrinsic::amdgcn_global_load_monitor_b32:
1555 case Intrinsic::amdgcn_global_load_monitor_b64:
1556 case Intrinsic::amdgcn_global_load_monitor_b128:
1557 case Intrinsic::amdgcn_cluster_load_b32:
1558 case Intrinsic::amdgcn_cluster_load_b64:
1559 case Intrinsic::amdgcn_cluster_load_b128:
1560 case Intrinsic::amdgcn_ds_load_tr6_b96:
1561 case Intrinsic::amdgcn_ds_load_tr4_b64:
1562 case Intrinsic::amdgcn_ds_load_tr8_b64:
1563 case Intrinsic::amdgcn_ds_load_tr16_b128:
1564 case Intrinsic::amdgcn_global_load_tr6_b96:
1565 case Intrinsic::amdgcn_global_load_tr4_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b64:
1567 case Intrinsic::amdgcn_global_load_tr_b128:
1568 case Intrinsic::amdgcn_ds_read_tr4_b64:
1569 case Intrinsic::amdgcn_ds_read_tr6_b96:
1570 case Intrinsic::amdgcn_ds_read_tr8_b64:
1571 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1572 Info.opc = ISD::INTRINSIC_W_CHAIN;
1573 Info.memVT = MVT::getVT(CI.getType());
1574 Info.ptrVal = CI.getOperand(0);
1575 Info.align.reset();
1576 Info.flags |= MachineMemOperand::MOLoad;
1577 return true;
1578 }
1579 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1581 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1582 Info.opc = ISD::INTRINSIC_W_CHAIN;
1583 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1584 Info.ptrVal = CI.getOperand(0);
1585 Info.align.reset();
1586 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1587 return true;
1588 }
1589 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1591 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1592 Info.opc = ISD::INTRINSIC_VOID;
1593 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1594 Info.ptrVal = CI.getArgOperand(0);
1595 Info.align.reset();
1596 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1597 return true;
1598 }
1599 case Intrinsic::amdgcn_ds_gws_init:
1600 case Intrinsic::amdgcn_ds_gws_barrier:
1601 case Intrinsic::amdgcn_ds_gws_sema_v:
1602 case Intrinsic::amdgcn_ds_gws_sema_br:
1603 case Intrinsic::amdgcn_ds_gws_sema_p:
1604 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1605 Info.opc = ISD::INTRINSIC_VOID;
1606
1607 const GCNTargetMachine &TM =
1608 static_cast<const GCNTargetMachine &>(getTargetMachine());
1609
1611 Info.ptrVal = MFI->getGWSPSV(TM);
1612
1613 // This is an abstract access, but we need to specify a type and size.
1614 Info.memVT = MVT::i32;
1615 Info.size = 4;
1616 Info.align = Align(4);
1617
1618 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1619 Info.flags |= MachineMemOperand::MOLoad;
1620 else
1621 Info.flags |= MachineMemOperand::MOStore;
1622 return true;
1623 }
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1627 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1631 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1632 Info.opc = ISD::INTRINSIC_VOID;
1633 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1634 Info.ptrVal = CI.getArgOperand(1);
1636 return true;
1637 }
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1641 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1642 Info.opc = ISD::INTRINSIC_VOID;
1643 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1644 Info.ptrVal = CI.getArgOperand(0);
1646 return true;
1647 }
1648 case Intrinsic::amdgcn_load_to_lds:
1649 case Intrinsic::amdgcn_global_load_lds: {
1650 Info.opc = ISD::INTRINSIC_VOID;
1651 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1652 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1653 Info.ptrVal = CI.getArgOperand(1);
1655 return true;
1656 }
1657 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1660 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1661 Info.opc = ISD::INTRINSIC_W_CHAIN;
1662
1663 const GCNTargetMachine &TM =
1664 static_cast<const GCNTargetMachine &>(getTargetMachine());
1665
1667 Info.ptrVal = MFI->getGWSPSV(TM);
1668
1669 // This is an abstract access, but we need to specify a type and size.
1670 Info.memVT = MVT::i32;
1671 Info.size = 4;
1672 Info.align = Align(4);
1673
1675 return true;
1676 }
1677 case Intrinsic::amdgcn_s_prefetch_data:
1678 case Intrinsic::amdgcn_flat_prefetch:
1679 case Intrinsic::amdgcn_global_prefetch: {
1680 Info.opc = ISD::INTRINSIC_VOID;
1681 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1682 Info.ptrVal = CI.getArgOperand(0);
1683 Info.flags |= MachineMemOperand::MOLoad;
1684 return true;
1685 }
1686 default:
1687 return false;
1688 }
1689}
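// Hedged note on how the generic hook is consumed (behavior of common LLVM
// infrastructure, not code in this file): when this returns true,
// SelectionDAGBuilder uses Info.opc/memVT/ptrVal/flags/align to attach a
// MachineMemOperand to the intrinsic node, which is what lets alias analysis
// and the memory legalizer reason about these accesses.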
1690
1692 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1694 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1695 // The DAG's ValueType loses the addrspaces.
1696 // Add them as 2 extra Constant operands "from" and "to".
1697 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1698 unsigned DstAS = I.getType()->getPointerAddressSpace();
1699 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1700 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1701 break;
1702 }
1703 default:
1704 break;
1705 }
1706}
1707
1710 Type *&AccessTy) const {
1711 Value *Ptr = nullptr;
1712 switch (II->getIntrinsicID()) {
1713 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1714 case Intrinsic::amdgcn_cluster_load_b128:
1715 case Intrinsic::amdgcn_cluster_load_b64:
1716 case Intrinsic::amdgcn_cluster_load_b32:
1717 case Intrinsic::amdgcn_ds_append:
1718 case Intrinsic::amdgcn_ds_consume:
1719 case Intrinsic::amdgcn_ds_load_tr8_b64:
1720 case Intrinsic::amdgcn_ds_load_tr16_b128:
1721 case Intrinsic::amdgcn_ds_load_tr4_b64:
1722 case Intrinsic::amdgcn_ds_load_tr6_b96:
1723 case Intrinsic::amdgcn_ds_read_tr4_b64:
1724 case Intrinsic::amdgcn_ds_read_tr6_b96:
1725 case Intrinsic::amdgcn_ds_read_tr8_b64:
1726 case Intrinsic::amdgcn_ds_read_tr16_b64:
1727 case Intrinsic::amdgcn_ds_ordered_add:
1728 case Intrinsic::amdgcn_ds_ordered_swap:
1729 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1730 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1731 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1732 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1733 case Intrinsic::amdgcn_flat_load_monitor_b128:
1734 case Intrinsic::amdgcn_flat_load_monitor_b32:
1735 case Intrinsic::amdgcn_flat_load_monitor_b64:
1736 case Intrinsic::amdgcn_global_atomic_csub:
1737 case Intrinsic::amdgcn_global_atomic_fmax_num:
1738 case Intrinsic::amdgcn_global_atomic_fmin_num:
1739 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1740 case Intrinsic::amdgcn_global_load_monitor_b128:
1741 case Intrinsic::amdgcn_global_load_monitor_b32:
1742 case Intrinsic::amdgcn_global_load_monitor_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b64:
1744 case Intrinsic::amdgcn_global_load_tr_b128:
1745 case Intrinsic::amdgcn_global_load_tr4_b64:
1746 case Intrinsic::amdgcn_global_load_tr6_b96:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1750 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1751 Ptr = II->getArgOperand(0);
1752 break;
1753 case Intrinsic::amdgcn_load_to_lds:
1754 case Intrinsic::amdgcn_global_load_lds:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1758 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1762 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1763 Ptr = II->getArgOperand(1);
1764 break;
1765 default:
1766 return false;
1767 }
1768 AccessTy = II->getType();
1769 Ops.push_back(Ptr);
1770 return true;
1771}
1772
1774 unsigned AddrSpace) const {
1775 if (!Subtarget->hasFlatInstOffsets()) {
1776 // Flat instructions do not have offsets, and only have the register
1777 // address.
1778 return AM.BaseOffs == 0 && AM.Scale == 0;
1779 }
1780
1781 decltype(SIInstrFlags::FLAT) FlatVariant =
1782 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS    ? SIInstrFlags::FlatGlobal
1783 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1784 : SIInstrFlags::FLAT;
1785
1786 return AM.Scale == 0 &&
1787 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1788 AM.BaseOffs, AddrSpace, FlatVariant));
1789}
1790
1792 if (Subtarget->hasFlatGlobalInsts())
1794
1795 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1796 // Assume that we will use FLAT for all global memory accesses
1797 // on VI.
1798 // FIXME: This assumption is currently wrong. On VI we still use
1799 // MUBUF instructions for the r + i addressing mode. As currently
1800 // implemented, the MUBUF instructions only work on buffers < 4GB.
1801 // It may be possible to support > 4GB buffers with MUBUF instructions,
1802 // by setting the stride value in the resource descriptor which would
1803 // increase the size limit to (stride * 4GB). However, this is risky,
1804 // because it has never been validated.
1806 }
1807
1808 return isLegalMUBUFAddressingMode(AM);
1809}
1810
1811bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1812 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1813 // additionally can do r + r + i with addr64. 32-bit has more addressing
1814 // mode options. Depending on the resource constant, it can also do
1815 // (i64 r0) + (i32 r1) * (i14 i).
1816 //
1817 // Private arrays end up using a scratch buffer most of the time, so also
1818 // assume those use MUBUF instructions. Scratch loads / stores are currently
1819 // implemented as mubuf instructions with the offen bit set, so they are
1820 // slightly different from the normal addr64 form.
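// Illustrative examples (values chosen for this comment only): the mode
// {BaseReg = r1, Scale = 1, BaseOffs = 16} maps onto the addr64 form
// r + r + 16, while {Scale = 0, BaseOffs = 4095} relies purely on the
// 12-bit immediate offset that is checked below.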
1821 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1822 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1823 return false;
1824
1825 // FIXME: Since we can split immediate into soffset and immediate offset,
1826 // would it make sense to allow any immediate?
1827
1828 switch (AM.Scale) {
1829 case 0: // r + i or just i, depending on HasBaseReg.
1830 return true;
1831 case 1:
1832 return true; // We have r + r or r + i.
1833 case 2:
1834 if (AM.HasBaseReg) {
1835 // Reject 2 * r + r.
1836 return false;
1837 }
1838
1839 // Allow 2 * r as r + r,
1840 // or 2 * r + i as r + r + i.
1841 return true;
1842 default: // Don't allow n * r
1843 return false;
1844 }
1845}
1846
1848 const AddrMode &AM, Type *Ty,
1849 unsigned AS,
1850 Instruction *I) const {
1851 // No global is ever allowed as a base.
1852 if (AM.BaseGV)
1853 return false;
1854
1855 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1856 return isLegalGlobalAddressingMode(AM);
1857
1858 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1862 // If the offset isn't a multiple of 4, it probably isn't going to be
1863 // correctly aligned.
1864 // FIXME: Can we get the real alignment here?
1865 if (AM.BaseOffs % 4 != 0)
1866 return isLegalMUBUFAddressingMode(AM);
1867
1868 if (!Subtarget->hasScalarSubwordLoads()) {
1869 // There are no SMRD extloads, so if we have to do a small type access we
1870 // will use a MUBUF load.
1871 // FIXME?: We also need to do this if unaligned, but we don't know the
1872 // alignment here.
1873 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1874 return isLegalGlobalAddressingMode(AM);
1875 }
1876
1877 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1878 // SMRD instructions have an 8-bit, dword offset on SI.
1879 if (!isUInt<8>(AM.BaseOffs / 4))
1880 return false;
1881 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1882 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1883 // in 8-bits, it can use a smaller encoding.
1884 if (!isUInt<32>(AM.BaseOffs / 4))
1885 return false;
1886 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1887 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1888 if (!isUInt<20>(AM.BaseOffs))
1889 return false;
1890 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1891 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1892 // for S_BUFFER_* instructions).
1893 if (!isInt<21>(AM.BaseOffs))
1894 return false;
1895 } else {
1896 // On GFX12, all offsets are signed 24-bit in bytes.
1897 if (!isInt<24>(AM.BaseOffs))
1898 return false;
1899 }
1900
1901 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1903 AM.BaseOffs < 0) {
1904 // Scalar (non-buffer) loads can only use a negative offset if
1905 // soffset+offset is non-negative. Since the compiler can only prove that
1906 // in a few special cases, it is safer to claim that negative offsets are
1907 // not supported.
1908 return false;
1909 }
1910
1911 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1912 return true;
1913
1914 if (AM.Scale == 1 && AM.HasBaseReg)
1915 return true;
1916
1917 return false;
1918 }
1919
1920 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1921 return Subtarget->enableFlatScratch()
1923 : isLegalMUBUFAddressingMode(AM);
1924
1925 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1926 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1927 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1928 // field.
1929 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1930 // an 8-bit dword offset but we don't know the alignment here.
1931 if (!isUInt<16>(AM.BaseOffs))
1932 return false;
1933
1934 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1935 return true;
1936
1937 if (AM.Scale == 1 && AM.HasBaseReg)
1938 return true;
1939
1940 return false;
1941 }
1942
1944 // For an unknown address space, this usually means that this is for some
1945 // reason being used for pure arithmetic, and not based on some addressing
1946 // computation. We don't have instructions that compute pointers with any
1947 // addressing modes, so treat them as having no offset like flat
1948 // instructions.
1950 }
1951
1952 // Assume a user alias of global for unknown address spaces.
1953 return isLegalGlobalAddressingMode(AM);
1954}
1955
1957 const MachineFunction &MF) const {
1959 return (MemVT.getSizeInBits() <= 4 * 32);
1960 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1961 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1962 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1963 }
1965 return (MemVT.getSizeInBits() <= 2 * 32);
1966 return true;
1967}
1968
1970 unsigned Size, unsigned AddrSpace, Align Alignment,
1971 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1972 if (IsFast)
1973 *IsFast = 0;
1974
1975 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1976 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1977 // Check if alignment requirements for ds_read/write instructions are
1978 // disabled.
1979 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1980 return false;
1981
1982 Align RequiredAlignment(
1983 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
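// For example, Size = 96 bits gives divideCeil(96, 8) = 12 bytes and
// PowerOf2Ceil(12) = 16, i.e. a natural alignment of 16 bytes (numbers
// shown for illustration only).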
1984 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1985 Alignment < RequiredAlignment)
1986 return false;
1987
1988 // Either the alignment requirements are "enabled", or there is an
1989 // unaligned-LDS-access-related hardware bug even though the alignment
1990 // requirements are "disabled". In either case, we need to check for proper
1991 // alignment requirements.
1992 //
1993 switch (Size) {
1994 case 64:
1995 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1996 // address is negative, then the instruction is incorrectly treated as
1997 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1998 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1999 // load later in the SILoadStoreOptimizer.
2000 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2001 return false;
2002
2003 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2004 // can do a 4-byte aligned, 8-byte access in a single operation using
2005 // ds_read2/write2_b32 with adjacent offsets.
2006 RequiredAlignment = Align(4);
2007
2008 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2009 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2010 // ds_write2_b32 depending on the alignment. In either case with either
2011 // alignment there is no faster way of doing this.
2012
2013 // The numbers returned here and below are not additive; they form a
2014 // 'speed rank'. They are only meant to be compared to decide if one way
2015 // of lowering an operation is faster than another. For that purpose a
2016 // naturally aligned operation gets its bitsize to indicate that "it
2017 // operates with a speed comparable to an N-bit wide load". With full
2018 // alignment ds128 is slower than ds96, for example. If underaligned it
2019 // is comparable to the speed of a single dword access, which would then
2020 // mean 32 < 128 and it is faster to issue a wide load regardless.
2021 // 1 simply means "slow, don't do it": when comparing an aligned load to
2022 // a wider load that will no longer be aligned, the latter is slower.
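// As a concrete reading of the ternary below (illustration only): a 64-bit
// LDS access aligned to at least 4 bytes reports a rank of 64, while a
// less aligned one reports 32.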
2023 if (IsFast)
2024 *IsFast = (Alignment >= RequiredAlignment) ? 64
2025 : (Alignment < Align(4)) ? 32
2026 : 1;
2027 return true;
2028 }
2029
2030 break;
2031 case 96:
2032 if (!Subtarget->hasDS96AndDS128())
2033 return false;
2034
2035 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2036 // gfx8 and older.
2037
2038 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2039 // Naturally aligned access is fastest. However, also report it as Fast
2040 // if memory is aligned to less than a DWORD. A narrow load or store will
2041 // be just as slow as a single ds_read_b96/ds_write_b96, but there will
2042 // be more of them, so overall we pay less penalty by issuing a single
2043 // instruction.
2044
2045 // See comment on the values above.
2046 if (IsFast)
2047 *IsFast = (Alignment >= RequiredAlignment) ? 96
2048 : (Alignment < Align(4)) ? 32
2049 : 1;
2050 return true;
2051 }
2052
2053 break;
2054 case 128:
2055 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2056 return false;
2057
2058 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2059 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2060 // single operation using ds_read2/write2_b64.
2061 RequiredAlignment = Align(8);
2062
2063 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2064 // Naturally aligned access is fastest. However, also report it as Fast
2065 // if memory is aligned to less than a DWORD. A narrow load or store will
2066 // be just as slow as a single ds_read_b128/ds_write_b128, but there
2067 // will be more of them, so overall we pay less penalty by issuing a
2068 // single instruction.
2069
2070 // See comment on the values above.
2071 if (IsFast)
2072 *IsFast = (Alignment >= RequiredAlignment) ? 128
2073 : (Alignment < Align(4)) ? 32
2074 : 1;
2075 return true;
2076 }
2077
2078 break;
2079 default:
2080 if (Size > 32)
2081 return false;
2082
2083 break;
2084 }
2085
2086 // See comment on the values above.
2087 // Note that we have a single-dword or sub-dword access here, so if it is
2088 // underaligned it is the slowest possible access, hence the returned value is 0.
2089 if (IsFast)
2090 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2091
2092 return Alignment >= RequiredAlignment ||
2093 Subtarget->hasUnalignedDSAccessEnabled();
2094 }
2095
2096 // FIXME: We have to be conservative here and assume that flat operations
2097 // will access scratch. If we had access to the IR function, then we
2098 // could determine if any private memory was used in the function.
2099 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2100 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2101 bool AlignedBy4 = Alignment >= Align(4);
2102 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2103 if (IsFast)
2104 *IsFast = AlignedBy4 ? Size : 1;
2105 return true;
2106 }
2107
2108 if (IsFast)
2109 *IsFast = AlignedBy4;
2110
2111 return AlignedBy4;
2112 }
2113
2114 // So long as they are correct, wide global memory operations perform better
2115 // than multiple smaller memory ops -- even when misaligned
2116 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2117 if (IsFast)
2118 *IsFast = Size;
2119
2120 return Alignment >= Align(4) ||
2121 Subtarget->hasUnalignedBufferAccessEnabled();
2122 }
2123
2124 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2125 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2126 // out-of-bounds behavior, but in the edge case where an access starts
2127 // out-of-bounds and then enters in-bounds, the entire access would be treated
2128 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2129 // natural alignment of buffer accesses.
2130 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2131 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2132 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2133 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2134 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2135 return false;
2136 }
2137
2138 // Smaller-than-dword values must be aligned.
2139 if (Size < 32)
2140 return false;
2141
2142 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2143 // byte-address are ignored, thus forcing Dword alignment.
2144 // This applies to private, global, and constant memory.
2145 if (IsFast)
2146 *IsFast = 1;
2147
2148 return Size >= 32 && Alignment >= Align(4);
2149}
2150
2152 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2153 unsigned *IsFast) const {
2155 Alignment, Flags, IsFast);
2156}
2157
2159 LLVMContext &Context, const MemOp &Op,
2160 const AttributeList &FuncAttributes) const {
2161 // FIXME: Should account for address space here.
2162
2163 // The default fallback uses the private pointer size as a guess for a type to
2164 // use. Make sure we switch these to 64-bit accesses.
2165
2166 if (Op.size() >= 16 &&
2167 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2168 return MVT::v4i32;
2169
2170 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2171 return MVT::v2i32;
2172
2173 // Use the default.
2174 return MVT::Other;
2175}
2176
2178 const MemSDNode *MemNode = cast<MemSDNode>(N);
2179 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2180}
2181
2186
2188 unsigned DestAS) const {
2189 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2190 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2191 Subtarget->hasGloballyAddressableScratch()) {
2192 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2193 return false;
2194 }
2195
2196 // Flat -> private/local is a simple truncate.
2197 // Flat -> global is no-op
2198 return true;
2199 }
2200
2201 const GCNTargetMachine &TM =
2202 static_cast<const GCNTargetMachine &>(getTargetMachine());
2203 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2204}
2205
2213
2215 Type *Ty) const {
2216 // FIXME: Could be smarter if called for vector constants.
2217 return true;
2218}
2219
2221 unsigned Index) const {
2223 return false;
2224
2225 // TODO: Add more cases that are cheap.
2226 return Index == 0;
2227}
2228
2229bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2230 // TODO: This should be more aggressive, particular for 16-bit element
2231 // vectors. However there are some mixed improvements and regressions.
2232 EVT EltTy = VT.getVectorElementType();
2233 return EltTy.getSizeInBits() % 32 == 0;
2234}
2235
2237 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2238 switch (Op) {
2239 case ISD::LOAD:
2240 case ISD::STORE:
2241 return true;
2242 default:
2243 return false;
2244 }
2245 }
2246
2247 // SimplifySetCC uses this function to determine whether or not it should
2248 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2249 if (VT == MVT::i1 && Op == ISD::SETCC)
2250 return false;
2251
2253}
2254
2255SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2256 const SDLoc &SL,
2257 SDValue Chain,
2258 uint64_t Offset) const {
2259 const DataLayout &DL = DAG.getDataLayout();
2263
2264 auto [InputPtrReg, RC, ArgTy] =
2265 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2266
2267 // We may not have the kernarg segment argument if we have no kernel
2268 // arguments.
2269 if (!InputPtrReg)
2270 return DAG.getConstant(Offset, SL, PtrVT);
2271
2273 SDValue BasePtr = DAG.getCopyFromReg(
2274 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2275
2276 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2277}
2278
2279SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2280 const SDLoc &SL) const {
2283 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2284}
2285
2286SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2287 const SDLoc &SL) const {
2288
2290 std::optional<uint32_t> KnownSize =
2292 if (KnownSize.has_value())
2293 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2294 return SDValue();
2295}
2296
2297SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2298 const SDLoc &SL, SDValue Val,
2299 bool Signed,
2300 const ISD::InputArg *Arg) const {
2301 // First, if it is a widened vector, narrow it.
2302 if (VT.isVector() &&
2304 EVT NarrowedVT =
2307 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2308 DAG.getConstant(0, SL, MVT::i32));
2309 }
2310
2311 // Then convert the vector elements or scalar value.
2312 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2313 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2314 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2315 }
2316
2317 if (MemVT.isFloatingPoint())
2318 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2319 else if (Signed)
2320 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2321 else
2322 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2323
2324 return Val;
2325}
2326
2327SDValue SITargetLowering::lowerKernargMemParameter(
2328 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2329 uint64_t Offset, Align Alignment, bool Signed,
2330 const ISD::InputArg *Arg) const {
2331 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2332
2333 // Try to avoid using an extload by loading earlier than the argument address,
2334 // and extracting the relevant bits. The load should hopefully be merged
2335 // with the load for the previous argument.
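// For example (illustrative values): an i16 argument at byte offset 6 is
// loaded as the dword at offset 4, shifted right by (6 - 4) * 8 = 16 bits,
// and then truncated to 16 bits.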
2336 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2337 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2338 int64_t AlignDownOffset = alignDown(Offset, 4);
2339 int64_t OffsetDiff = Offset - AlignDownOffset;
2340
2341 EVT IntVT = MemVT.changeTypeToInteger();
2342
2343 // TODO: If we passed in the base kernel offset we could have a better
2344 // alignment than 4, but we don't really need it.
2345 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2346 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2349
2350 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2351 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2352
2353 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2354 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2355 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2356
2357 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2358 }
2359
2360 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2361 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2364
2365 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2366 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2367}
2368
2369/// Coerce an argument which was passed in a different ABI type to the original
2370/// expected value type.
2371SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2372 SDValue Val,
2373 CCValAssign &VA,
2374 const SDLoc &SL) const {
2375 EVT ValVT = VA.getValVT();
2376
2377 // If this is an 8 or 16-bit value, it is really passed promoted
2378 // to 32 bits. Insert an assert[sz]ext to capture this, then
2379 // truncate to the right size.
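// For example (illustrative): an i16 argument that was sign-extended to i32
// by the ABI comes back as AssertSext(i32 Val, i16) followed by a truncate
// to i16, matching the SExt case below.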
2380 switch (VA.getLocInfo()) {
2381 case CCValAssign::Full:
2382 return Val;
2383 case CCValAssign::BCvt:
2384 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2385 case CCValAssign::SExt:
2386 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2387 DAG.getValueType(ValVT));
2388 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2389 case CCValAssign::ZExt:
2390 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2391 DAG.getValueType(ValVT));
2392 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2393 case CCValAssign::AExt:
2394 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2395 default:
2396 llvm_unreachable("Unknown loc info!");
2397 }
2398}
2399
2400SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2401 CCValAssign &VA, const SDLoc &SL,
2402 SDValue Chain,
2403 const ISD::InputArg &Arg) const {
2404 MachineFunction &MF = DAG.getMachineFunction();
2405 MachineFrameInfo &MFI = MF.getFrameInfo();
2406
2407 if (Arg.Flags.isByVal()) {
2408 unsigned Size = Arg.Flags.getByValSize();
2409 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2410 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2411 }
2412
2413 unsigned ArgOffset = VA.getLocMemOffset();
2414 unsigned ArgSize = VA.getValVT().getStoreSize();
2415
2416 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2417
2418 // Create load nodes to retrieve arguments from the stack.
2419 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2420
2421 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2422 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2423 MVT MemVT = VA.getValVT();
2424
2425 switch (VA.getLocInfo()) {
2426 default:
2427 break;
2428 case CCValAssign::BCvt:
2429 MemVT = VA.getLocVT();
2430 break;
2431 case CCValAssign::SExt:
2432 ExtType = ISD::SEXTLOAD;
2433 break;
2434 case CCValAssign::ZExt:
2435 ExtType = ISD::ZEXTLOAD;
2436 break;
2437 case CCValAssign::AExt:
2438 ExtType = ISD::EXTLOAD;
2439 break;
2440 }
2441
2442 SDValue ArgValue = DAG.getExtLoad(
2443 ExtType, SL, VA.getLocVT(), Chain, FIN,
2445
2446 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2447 if (ConvertedVal == ArgValue)
2448 return ConvertedVal;
2449
2450 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2451}
2452
2453SDValue SITargetLowering::lowerWorkGroupId(
2454 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2457 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2458 if (!Subtarget->hasClusters())
2459 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2460
2461 // Clusters are supported. Return the global position in the grid. If clusters
2462 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2463
2464 // WorkGroupIdXYZ = ClusterId == 0 ?
2465 // ClusterIdXYZ :
2466 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
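// Worked example with made-up values: ClusterMaxIdX = 3 (4 workgroups per
// cluster in X), ClusterIdX = 2 and ClusterWorkGroupIdX = 1 give
// WorkGroupIdX = 2 * (3 + 1) + 1 = 9 when clusters are in use.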
2467 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2468 SDLoc SL(ClusterIdXYZ);
2469 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2470 SDValue One = DAG.getConstant(1, SL, VT);
2471 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2472 SDValue ClusterWorkGroupIdXYZ =
2473 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2474 SDValue GlobalIdXYZ =
2475 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2476 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2477
2478 switch (MFI.getClusterDims().getKind()) {
2481 return GlobalIdXYZ;
2483 return ClusterIdXYZ;
2485 using namespace AMDGPU::Hwreg;
2486 SDValue ClusterIdField =
2487 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2488 SDNode *GetReg =
2489 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2490 SDValue ClusterId(GetReg, 0);
2491 SDValue Zero = DAG.getConstant(0, SL, VT);
2492 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2493 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2494 }
2495 }
2496
2497 llvm_unreachable("nothing should reach here");
2498}
2499
2500SDValue SITargetLowering::getPreloadedValue(
2501 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2503 const ArgDescriptor *Reg = nullptr;
2504 const TargetRegisterClass *RC;
2505 LLT Ty;
2506
2508 const ArgDescriptor WorkGroupIDX =
2509 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2510 // If GridZ is not programmed in an entry function then the hardware will set
2511 // it to all zeros, so there is no need to mask the GridY value in the low
2512 // order bits.
2513 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2514 AMDGPU::TTMP7,
2515 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2516 const ArgDescriptor WorkGroupIDZ =
2517 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2518 const ArgDescriptor ClusterWorkGroupIDX =
2519 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2520 const ArgDescriptor ClusterWorkGroupIDY =
2521 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2522 const ArgDescriptor ClusterWorkGroupIDZ =
2523 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2524 const ArgDescriptor ClusterWorkGroupMaxIDX =
2525 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2526 const ArgDescriptor ClusterWorkGroupMaxIDY =
2527 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2528 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2529 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2530 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2531 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2532
2533 auto LoadConstant = [&](unsigned N) {
2534 return DAG.getConstant(N, SDLoc(), VT);
2535 };
2536
2537 if (Subtarget->hasArchitectedSGPRs() &&
2539 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2540 bool HasFixedDims = ClusterDims.isFixedDims();
2541
2542 switch (PVID) {
2544 Reg = &WorkGroupIDX;
2545 RC = &AMDGPU::SReg_32RegClass;
2546 Ty = LLT::scalar(32);
2547 break;
2549 Reg = &WorkGroupIDY;
2550 RC = &AMDGPU::SReg_32RegClass;
2551 Ty = LLT::scalar(32);
2552 break;
2554 Reg = &WorkGroupIDZ;
2555 RC = &AMDGPU::SReg_32RegClass;
2556 Ty = LLT::scalar(32);
2557 break;
2559 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2560 return LoadConstant(0);
2561 Reg = &ClusterWorkGroupIDX;
2562 RC = &AMDGPU::SReg_32RegClass;
2563 Ty = LLT::scalar(32);
2564 break;
2566 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2567 return LoadConstant(0);
2568 Reg = &ClusterWorkGroupIDY;
2569 RC = &AMDGPU::SReg_32RegClass;
2570 Ty = LLT::scalar(32);
2571 break;
2573 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2574 return LoadConstant(0);
2575 Reg = &ClusterWorkGroupIDZ;
2576 RC = &AMDGPU::SReg_32RegClass;
2577 Ty = LLT::scalar(32);
2578 break;
2580 if (HasFixedDims)
2581 return LoadConstant(ClusterDims.getDims()[0] - 1);
2582 Reg = &ClusterWorkGroupMaxIDX;
2583 RC = &AMDGPU::SReg_32RegClass;
2584 Ty = LLT::scalar(32);
2585 break;
2587 if (HasFixedDims)
2588 return LoadConstant(ClusterDims.getDims()[1] - 1);
2589 Reg = &ClusterWorkGroupMaxIDY;
2590 RC = &AMDGPU::SReg_32RegClass;
2591 Ty = LLT::scalar(32);
2592 break;
2594 if (HasFixedDims)
2595 return LoadConstant(ClusterDims.getDims()[2] - 1);
2596 Reg = &ClusterWorkGroupMaxIDZ;
2597 RC = &AMDGPU::SReg_32RegClass;
2598 Ty = LLT::scalar(32);
2599 break;
2601 Reg = &ClusterWorkGroupMaxFlatID;
2602 RC = &AMDGPU::SReg_32RegClass;
2603 Ty = LLT::scalar(32);
2604 break;
2605 default:
2606 break;
2607 }
2608 }
2609
2610 if (!Reg)
2611 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2612 if (!Reg) {
2614 // It's possible for a kernarg intrinsic call to appear in a kernel with
2615 // no allocated segment, in which case we do not add the user sgpr
2616 // argument, so just return null.
2617 return DAG.getConstant(0, SDLoc(), VT);
2618 }
2619
2620 // It's undefined behavior if a function marked with the amdgpu-no-*
2621 // attributes uses the corresponding intrinsic.
2622 return DAG.getPOISON(VT);
2623 }
2624
2625 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2626}
2627
2629 CallingConv::ID CallConv,
2630 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2631 FunctionType *FType,
2633 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2634 const ISD::InputArg *Arg = &Ins[I];
2635
2636 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2637 "vector type argument should have been split");
2638
2639 // First check if it's a PS input addr.
2640 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2641 PSInputNum <= 15) {
2642 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2643
2644 // Inconveniently only the first part of the split is marked as isSplit,
2645 // so skip to the end. We only want to increment PSInputNum once for the
2646 // entire split argument.
2647 if (Arg->Flags.isSplit()) {
2648 while (!Arg->Flags.isSplitEnd()) {
2649 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2650 "unexpected vector split in ps argument type");
2651 if (!SkipArg)
2652 Splits.push_back(*Arg);
2653 Arg = &Ins[++I];
2654 }
2655 }
2656
2657 if (SkipArg) {
2658 // We can safely skip PS inputs.
2659 Skipped.set(Arg->getOrigArgIndex());
2660 ++PSInputNum;
2661 continue;
2662 }
2663
2664 Info->markPSInputAllocated(PSInputNum);
2665 if (Arg->Used)
2666 Info->markPSInputEnabled(PSInputNum);
2667
2668 ++PSInputNum;
2669 }
2670
2671 Splits.push_back(*Arg);
2672 }
2673}
2674
2675// Allocate special inputs passed in VGPRs.
2677 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2678 SIMachineFunctionInfo &Info) const {
2679 const LLT S32 = LLT::scalar(32);
2681
2682 if (Info.hasWorkItemIDX()) {
2683 Register Reg = AMDGPU::VGPR0;
2684 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2685
2686 CCInfo.AllocateReg(Reg);
2687 unsigned Mask =
2688 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2689 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2690 }
2691
2692 if (Info.hasWorkItemIDY()) {
2693 assert(Info.hasWorkItemIDX());
2694 if (Subtarget->hasPackedTID()) {
2695 Info.setWorkItemIDY(
2696 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2697 } else {
2698 unsigned Reg = AMDGPU::VGPR1;
2699 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2700
2701 CCInfo.AllocateReg(Reg);
2702 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2703 }
2704 }
2705
2706 if (Info.hasWorkItemIDZ()) {
2707 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2708 if (Subtarget->hasPackedTID()) {
2709 Info.setWorkItemIDZ(
2710 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2711 } else {
2712 unsigned Reg = AMDGPU::VGPR2;
2713 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2714
2715 CCInfo.AllocateReg(Reg);
2716 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2717 }
2718 }
2719}
2720
2721 // Try to allocate a VGPR at the end of the argument list, or if no argument
2722 // VGPRs are left, allocate a stack slot instead.
2723 // If \p Mask is given, it indicates the bitfield position in the register.
2724 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
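// For example (illustrative): with packed workitem IDs, the descriptor
// already allocated for ID X can be passed back as \p Arg together with
// Mask = 0x3ff << 10, so ID Y reuses the same VGPR with a shifted bitfield
// instead of taking another register.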
2725static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2726 ArgDescriptor Arg = ArgDescriptor()) {
2727 if (Arg.isSet())
2728 return ArgDescriptor::createArg(Arg, Mask);
2729
2730 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2731 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2732 if (RegIdx == ArgVGPRs.size()) {
2733 // Spill to stack required.
2734 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2735
2736 return ArgDescriptor::createStack(Offset, Mask);
2737 }
2738
2739 unsigned Reg = ArgVGPRs[RegIdx];
2740 Reg = CCInfo.AllocateReg(Reg);
2741 assert(Reg != AMDGPU::NoRegister);
2742
2743 MachineFunction &MF = CCInfo.getMachineFunction();
2744 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2745 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2746 return ArgDescriptor::createRegister(Reg, Mask);
2747}
2748
2750 const TargetRegisterClass *RC,
2751 unsigned NumArgRegs) {
2752 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2753 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2754 if (RegIdx == ArgSGPRs.size())
2755 report_fatal_error("ran out of SGPRs for arguments");
2756
2757 unsigned Reg = ArgSGPRs[RegIdx];
2758 Reg = CCInfo.AllocateReg(Reg);
2759 assert(Reg != AMDGPU::NoRegister);
2760
2761 MachineFunction &MF = CCInfo.getMachineFunction();
2762 MF.addLiveIn(Reg, RC);
2764}
2765
2766// If this has a fixed position, we still should allocate the register in the
2767// CCInfo state. Technically we could get away with this for values passed
2768// outside of the normal argument range.
2770 const TargetRegisterClass *RC,
2771 MCRegister Reg) {
2772 Reg = CCInfo.AllocateReg(Reg);
2773 assert(Reg != AMDGPU::NoRegister);
2774 MachineFunction &MF = CCInfo.getMachineFunction();
2775 MF.addLiveIn(Reg, RC);
2776}
2777
2778static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2779 if (Arg) {
2780 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2781 Arg.getRegister());
2782 } else
2783 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2784}
2785
2786static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2787 if (Arg) {
2788 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2789 Arg.getRegister());
2790 } else
2791 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2792}
2793
2794/// Allocate implicit function VGPR arguments at the end of allocated user
2795/// arguments.
2797 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2798 SIMachineFunctionInfo &Info) const {
2799 const unsigned Mask = 0x3ff;
2800 ArgDescriptor Arg;
2801
2802 if (Info.hasWorkItemIDX()) {
2803 Arg = allocateVGPR32Input(CCInfo, Mask);
2804 Info.setWorkItemIDX(Arg);
2805 }
2806
2807 if (Info.hasWorkItemIDY()) {
2808 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2809 Info.setWorkItemIDY(Arg);
2810 }
2811
2812 if (Info.hasWorkItemIDZ())
2813 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2814}
2815
2816/// Allocate implicit function VGPR arguments in fixed registers.
2818 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2819 SIMachineFunctionInfo &Info) const {
2820 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2821 if (!Reg)
2822 report_fatal_error("failed to allocate VGPR for implicit arguments");
2823
2824 const unsigned Mask = 0x3ff;
2825 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2826 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2827 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2828}
2829
2831 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2832 SIMachineFunctionInfo &Info) const {
2833 auto &ArgInfo = Info.getArgInfo();
2834 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2835
2836 // TODO: Unify handling with private memory pointers.
2837 if (UserSGPRInfo.hasDispatchPtr())
2838 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2839
2840 if (UserSGPRInfo.hasQueuePtr())
2841 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2842
2843 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2844 // constant offset from the kernarg segment.
2845 if (Info.hasImplicitArgPtr())
2846 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2847
2848 if (UserSGPRInfo.hasDispatchID())
2849 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2850
2851 // flat_scratch_init is not applicable for non-kernel functions.
2852
2853 if (Info.hasWorkGroupIDX())
2854 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2855
2856 if (Info.hasWorkGroupIDY())
2857 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2858
2859 if (Info.hasWorkGroupIDZ())
2860 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2861
2862 if (Info.hasLDSKernelId())
2863 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2864}
2865
2866// Allocate special inputs passed in user SGPRs.
2868 MachineFunction &MF,
2869 const SIRegisterInfo &TRI,
2870 SIMachineFunctionInfo &Info) const {
2871 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2872 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2873 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2874 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2875 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2876 }
2877
2878 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2879 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2880 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2881 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2882 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2883 }
2884
2885 if (UserSGPRInfo.hasDispatchPtr()) {
2886 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2887 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2888 CCInfo.AllocateReg(DispatchPtrReg);
2889 }
2890
2891 if (UserSGPRInfo.hasQueuePtr()) {
2892 Register QueuePtrReg = Info.addQueuePtr(TRI);
2893 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2894 CCInfo.AllocateReg(QueuePtrReg);
2895 }
2896
2897 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2899 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2900 CCInfo.AllocateReg(InputPtrReg);
2901
2902 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2903 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2904 }
2905
2906 if (UserSGPRInfo.hasDispatchID()) {
2907 Register DispatchIDReg = Info.addDispatchID(TRI);
2908 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2909 CCInfo.AllocateReg(DispatchIDReg);
2910 }
2911
2912 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2913 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2914 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2915 CCInfo.AllocateReg(FlatScratchInitReg);
2916 }
2917
2918 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2919 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2920 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2921 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2922 }
2923
2924 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2925 // these from the dispatch pointer.
2926}
2927
2928 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2929 // sequential, starting from the first argument.
2931 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2933 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2934 Function &F = MF.getFunction();
2935 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2936 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2937 bool InPreloadSequence = true;
2938 unsigned InIdx = 0;
2939 bool AlignedForImplictArgs = false;
2940 unsigned ImplicitArgOffset = 0;
2941 for (auto &Arg : F.args()) {
2942 if (!InPreloadSequence || !Arg.hasInRegAttr())
2943 break;
2944
2945 unsigned ArgIdx = Arg.getArgNo();
2946 // Don't preload non-original args or parts not in the current preload
2947 // sequence.
2948 if (InIdx < Ins.size() &&
2949 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2950 break;
2951
2952 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2953 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2954 InIdx++) {
2955 assert(ArgLocs[ArgIdx].isMemLoc());
2956 auto &ArgLoc = ArgLocs[InIdx];
2957 const Align KernelArgBaseAlign = Align(16);
2958 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2959 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2960 unsigned NumAllocSGPRs =
2961 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
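// For example (illustrative): a 64-bit argument at byte offset 8 gets
// commonAlignment(16, 8) = Align(8) and needs alignTo(64, 32) / 32 = 2
// SGPRs.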
2962
2963 // Fix alignment for hidden arguments.
2964 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2965 if (!AlignedForImplictArgs) {
2966 ImplicitArgOffset =
2967 alignTo(LastExplicitArgOffset,
2968 Subtarget->getAlignmentForImplicitArgPtr()) -
2969 LastExplicitArgOffset;
2970 AlignedForImplictArgs = true;
2971 }
2972 ArgOffset += ImplicitArgOffset;
2973 }
2974
2975 // Arg is preloaded into the previous SGPR.
2976 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2977 assert(InIdx >= 1 && "No previous SGPR");
2978 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2979 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2980 continue;
2981 }
2982
2983 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2984 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
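// For example (illustrative): a 6-byte gap before this argument costs
// alignTo(6, 4) / 4 = 2 padding SGPRs.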
2985 // Check for free user SGPRs for preloading.
2986 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2987 InPreloadSequence = false;
2988 break;
2989 }
2990
2991 // Preload this argument.
2992 const TargetRegisterClass *RC =
2993 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2994 SmallVectorImpl<MCRegister> *PreloadRegs =
2995 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2996
2997 if (PreloadRegs->size() > 1)
2998 RC = &AMDGPU::SGPR_32RegClass;
2999 for (auto &Reg : *PreloadRegs) {
3000 assert(Reg);
3001 MF.addLiveIn(Reg, RC);
3002 CCInfo.AllocateReg(Reg);
3003 }
3004
3005 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3006 }
3007 }
3008}
3009
3011 const SIRegisterInfo &TRI,
3012 SIMachineFunctionInfo &Info) const {
3013 // Always allocate this last since it is a synthetic preload.
3014 if (Info.hasLDSKernelId()) {
3015 Register Reg = Info.addLDSKernelId();
3016 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3017 CCInfo.AllocateReg(Reg);
3018 }
3019}
3020
3021// Allocate special input registers that are initialized per-wave.
3024 CallingConv::ID CallConv,
3025 bool IsShader) const {
3026 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3027 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3028 // Note: user SGPRs are handled by the front-end for graphics shaders.
3029 // Pad up the used user SGPRs with dead inputs.
3030
3031 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3032 // before enabling architected SGPRs for workgroup IDs.
3033 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3034
3035 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3036 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3037 // rely on it to reach 16 since if we end up having no stack usage, it will
3038 // not really be added.
3039 unsigned NumRequiredSystemSGPRs =
3040 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3041 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3042 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3043 Register Reg = Info.addReservedUserSGPR();
3044 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3045 CCInfo.AllocateReg(Reg);
3046 }
3047 }
3048
3049 if (!HasArchitectedSGPRs) {
3050 if (Info.hasWorkGroupIDX()) {
3051 Register Reg = Info.addWorkGroupIDX();
3052 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3053 CCInfo.AllocateReg(Reg);
3054 }
3055
3056 if (Info.hasWorkGroupIDY()) {
3057 Register Reg = Info.addWorkGroupIDY();
3058 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3059 CCInfo.AllocateReg(Reg);
3060 }
3061
3062 if (Info.hasWorkGroupIDZ()) {
3063 Register Reg = Info.addWorkGroupIDZ();
3064 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3065 CCInfo.AllocateReg(Reg);
3066 }
3067 }
3068
3069 if (Info.hasWorkGroupInfo()) {
3070 Register Reg = Info.addWorkGroupInfo();
3071 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3072 CCInfo.AllocateReg(Reg);
3073 }
3074
3075 if (Info.hasPrivateSegmentWaveByteOffset()) {
3076 // Scratch wave offset passed in system SGPR.
3077 unsigned PrivateSegmentWaveByteOffsetReg;
3078
3079 if (IsShader) {
3080 PrivateSegmentWaveByteOffsetReg =
3081 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3082
3083 // This is true if the scratch wave byte offset doesn't have a fixed
3084 // location.
3085 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3086 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3087 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3088 }
3089 } else
3090 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3091
3092 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3093 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3094 }
3095
3096 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3097 Info.getNumPreloadedSGPRs() >= 16);
3098}
3099
3101 MachineFunction &MF,
3102 const SIRegisterInfo &TRI,
3104 // Now that we've figured out where the scratch register inputs are, see if
3105 // we should reserve the arguments and use them directly.
3106 MachineFrameInfo &MFI = MF.getFrameInfo();
3107 bool HasStackObjects = MFI.hasStackObjects();
3108 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3109
3110 // Record that we know we have non-spill stack objects so we don't need to
3111 // check all stack objects later.
3112 if (HasStackObjects)
3113 Info.setHasNonSpillStackObjects(true);
3114
3115 // Everything live out of a block is spilled with fast regalloc, so it's
3116 // almost certain that spilling will be required.
3117 if (TM.getOptLevel() == CodeGenOptLevel::None)
3118 HasStackObjects = true;
3119
3120 // For now assume stack access is needed in any callee functions, so we
3121 // need to pass in the scratch registers.
3122 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3123
3124 if (!ST.enableFlatScratch()) {
3125 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3126 // If we have stack objects, we unquestionably need the private buffer
3127 // resource. For the Code Object V2 ABI, this will be the first 4 user
3128 // SGPR inputs. We can reserve those and use them directly.
3129
3130 Register PrivateSegmentBufferReg =
3132 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3133 } else {
3134 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3135 // We tentatively reserve the last registers (skipping those which may
3136 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
3137 // we'll replace these with the ones immediately after those which were
3138 // really allocated. In the prologue copies will be inserted from the
3139 // argument to these reserved registers.
3140
3141 // Without HSA, relocations are used for the scratch pointer and the
3142 // buffer resource setup is always inserted in the prologue. Scratch wave
3143 // offset is still in an input SGPR.
3144 Info.setScratchRSrcReg(ReservedBufferReg);
3145 }
3146 }
3147
3149
3150 // For entry functions we have to set up the stack pointer if we use it,
3151 // whereas non-entry functions get this "for free". This means there is no
3152 // intrinsic advantage to using S32 over S34 in cases where we do not have
3153 // calls but do need a frame pointer (i.e. if we are requested to have one
3154 // because frame pointer elimination is disabled). To keep things simple we
3155 // only ever use S32 as the call ABI stack pointer, and so using it does not
3156 // imply we need a separate frame pointer.
3157 //
3158 // Try to use s32 as the SP, but move it if it would interfere with input
3159 // arguments. This won't work with calls though.
3160 //
3161 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3162 // registers.
3163 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3164 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3165 } else {
3167
3168 if (MFI.hasCalls())
3169 report_fatal_error("call in graphics shader with too many input SGPRs");
3170
3171 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3172 if (!MRI.isLiveIn(Reg)) {
3173 Info.setStackPtrOffsetReg(Reg);
3174 break;
3175 }
3176 }
3177
3178 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3179 report_fatal_error("failed to find register for SP");
3180 }
3181
3182 // hasFP should be accurate for entry functions even before the frame is
3183 // finalized, because it does not rely on the known stack size, only
3184 // properties like whether variable sized objects are present.
3185 if (ST.getFrameLowering()->hasFP(MF)) {
3186 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3187 }
3188}
3189
3192 return !Info->isEntryFunction();
3193}
3194
3196
3198 MachineBasicBlock *Entry,
3199 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3201
3202 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3203 if (!IStart)
3204 return;
3205
3206 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3207 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3208 MachineBasicBlock::iterator MBBI = Entry->begin();
3209 for (const MCPhysReg *I = IStart; *I; ++I) {
3210 const TargetRegisterClass *RC = nullptr;
3211 if (AMDGPU::SReg_64RegClass.contains(*I))
3212 RC = &AMDGPU::SGPR_64RegClass;
3213 else if (AMDGPU::SReg_32RegClass.contains(*I))
3214 RC = &AMDGPU::SGPR_32RegClass;
3215 else
3216 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3217
3218 Register NewVR = MRI->createVirtualRegister(RC);
3219 // Create copy from CSR to a virtual register.
3220 Entry->addLiveIn(*I);
3221 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3222 .addReg(*I);
3223
3224 // Insert the copy-back instructions right before the terminator.
3225 for (auto *Exit : Exits)
3226 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3227 TII->get(TargetOpcode::COPY), *I)
3228 .addReg(NewVR);
3229 }
3230}
3231
3233 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3234 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3235 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3237
3239 const Function &Fn = MF.getFunction();
3242 bool IsError = false;
3243
3244 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3246 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3247 IsError = true;
3248 }
3249
3252 BitVector Skipped(Ins.size());
3253 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3254 *DAG.getContext());
3255
3256 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3257 bool IsKernel = AMDGPU::isKernel(CallConv);
3258 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3259
3260 if (IsGraphics) {
3261 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3262 assert(!UserSGPRInfo.hasDispatchPtr() &&
3263 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3264 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3265 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3266 (void)UserSGPRInfo;
3267 if (!Subtarget->enableFlatScratch())
3268 assert(!UserSGPRInfo.hasFlatScratchInit());
3269 if ((CallConv != CallingConv::AMDGPU_CS &&
3270 CallConv != CallingConv::AMDGPU_Gfx &&
3271 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3272 !Subtarget->hasArchitectedSGPRs())
3273 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3274 !Info->hasWorkGroupIDZ());
3275 }
3276
3277 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3278
3279 if (CallConv == CallingConv::AMDGPU_PS) {
3280 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3281
3282 // At least one interpolation mode must be enabled or else the GPU will
3283 // hang.
3284 //
3285 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3286 // set PSInputAddr, the user wants to enable some bits after the compilation
3287 // based on run-time states. Since we can't know what the final PSInputEna
3288 // will look like, so we shouldn't do anything here and the user should take
3289 // responsibility for the correct programming.
3290 //
3291 // Otherwise, the following restrictions apply:
3292 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3293 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3294 // enabled too.
3295 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3296 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3297 CCInfo.AllocateReg(AMDGPU::VGPR0);
3298 CCInfo.AllocateReg(AMDGPU::VGPR1);
3299 Info->markPSInputAllocated(0);
3300 Info->markPSInputEnabled(0);
3301 }
3302 if (Subtarget->isAmdPalOS()) {
3303 // For isAmdPalOS, the user does not enable some bits after compilation
3304 // based on run-time states; the register values being generated here are
3305 // the final ones set in hardware. Therefore we need to apply the
3306 // workaround to PSInputAddr and PSInputEnable together. (The case where
3307 // a bit is set in PSInputAddr but not PSInputEnable is where the
3308 // frontend set up an input arg for a particular interpolation mode, but
3309 // nothing uses that input arg. Really we should have an earlier pass
3310 // that removes such an arg.)
3311 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3312 if ((PsInputBits & 0x7F) == 0 ||
3313 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3314 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3315 }
3316 } else if (IsKernel) {
3317 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3318 } else {
3319 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3320 Ins.end());
3321 }
3322
3323 if (IsKernel)
3324 analyzeFormalArgumentsCompute(CCInfo, Ins);
3325
3326 if (IsEntryFunc) {
3327 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3328 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3329 if (IsKernel && Subtarget->hasKernargPreload())
3330 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3331
3332 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3333 } else if (!IsGraphics) {
3334 // For the fixed ABI, pass workitem IDs in the last argument register.
3335 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3336
3337 // FIXME: Sink this into allocateSpecialInputSGPRs
3338 if (!Subtarget->enableFlatScratch())
3339 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3340
3341 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3342 }
3343
3344 if (!IsKernel) {
3345 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3346 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3347
3348 // This assumes the registers are allocated by CCInfo in ascending order
3349 // with no gaps.
3350 Info->setNumWaveDispatchSGPRs(
3351 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3352 Info->setNumWaveDispatchVGPRs(
3353 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3354 } else if (Info->getNumKernargPreloadedSGPRs()) {
3355 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3356 }
3357
3359
3360 if (IsWholeWaveFunc) {
3362 {MVT::i1, MVT::Other}, Chain);
3363 InVals.push_back(Setup.getValue(0));
3364 Chains.push_back(Setup.getValue(1));
3365 }
3366
3367 // FIXME: This is the minimum kernel argument alignment. We should improve
3368 // this to the maximum alignment of the arguments.
3369 //
3370 // FIXME: Alignment of explicit arguments is totally broken with a non-0
3371 // explicit kern arg offset.
3372 const Align KernelArgBaseAlign = Align(16);
3373
3374 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3375 ++i) {
3376 const ISD::InputArg &Arg = Ins[i];
3377 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3378 InVals.push_back(DAG.getPOISON(Arg.VT));
3379 continue;
3380 }
3381
3382 CCValAssign &VA = ArgLocs[ArgIdx++];
3383 MVT VT = VA.getLocVT();
3384
3385 if (IsEntryFunc && VA.isMemLoc()) {
3386 VT = Ins[i].VT;
3387 EVT MemVT = VA.getLocVT();
3388
3389 const uint64_t Offset = VA.getLocMemOffset();
3390 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3391
3392 if (Arg.Flags.isByRef()) {
3393 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3394
3395 const GCNTargetMachine &TM =
3396 static_cast<const GCNTargetMachine &>(getTargetMachine());
3397 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3398 Arg.Flags.getPointerAddrSpace())) {
3399 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3400 Arg.Flags.getPointerAddrSpace());
3401 }
3402
3403 InVals.push_back(Ptr);
3404 continue;
3405 }
3406
3407 SDValue NewArg;
3408 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3409 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3410 // In this case the argument is packed into the previous preload SGPR.
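// For example, an i16 argument at kernarg offset 6 shares a preload SGPR with
// the bytes at offset 4: AlignDownOffset = 4, OffsetDiff = 2, and the value
// is recovered below as trunc(SGPR >> 16) before being bitcast back to MemVT.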
3411 int64_t AlignDownOffset = alignDown(Offset, 4);
3412 int64_t OffsetDiff = Offset - AlignDownOffset;
3413 EVT IntVT = MemVT.changeTypeToInteger();
3414
3415 const SIMachineFunctionInfo *Info =
3416 MF.getInfo<SIMachineFunctionInfo>();
3418 Register Reg =
3419 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3420
3421 assert(Reg);
3422 Register VReg = MRI.getLiveInVirtReg(Reg);
3423 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3424
3425 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3426 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3427
3428 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3429 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3430 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3431 Ins[i].Flags.isSExt(), &Ins[i]);
3432
3433 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3434 } else {
3435 const SIMachineFunctionInfo *Info =
3436 MF.getInfo<SIMachineFunctionInfo>();
3438 const SmallVectorImpl<MCRegister> &PreloadRegs =
3439 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3440
3441 SDValue Copy;
3442 if (PreloadRegs.size() == 1) {
3443 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3444 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3445 NewArg = DAG.getCopyFromReg(
3446 Chain, DL, VReg,
3447 EVT::getIntegerVT(*DAG.getContext(),
3448 TRI->getRegSizeInBits(*RC)));
3449
3450 } else {
3451 // If the kernarg alignment does not match the alignment of the SGPR
3452 // tuple RC that can accommodate this argument, it will be built up
3453 // via copies from the individual SGPRs that the argument was
3454 // preloaded to.
3455 SmallVector<SDValue, 4> Elts;
3456 for (auto Reg : PreloadRegs) {
3457 Register VReg = MRI.getLiveInVirtReg(Reg);
3458 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3459 Elts.push_back(Copy);
3460 }
3461 NewArg =
3462 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3463 PreloadRegs.size()),
3464 DL, Elts);
3465 }
3466
3467 // If the argument was preloaded to multiple consecutive 32-bit
3468 // registers because of misalignment between addressable SGPR tuples
3469 // and the argument size, we can still assume that, because of kernarg
3470 // segment alignment restrictions, NewArg's size is the same as
3471 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3472 // truncate since we cannot preload to less than a single SGPR and the
3473 // MemVT may be smaller.
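// For example, an i16 argument with 4-byte alignment still occupies a full
// SGPR: NewArg starts out as a 32-bit copy, is truncated to the 16-bit
// MemVTInt, and is then bitcast to the declared MemVT.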
3474 EVT MemVTInt =
3475 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3476 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3477 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3478
3479 NewArg = DAG.getBitcast(MemVT, NewArg);
3480 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3481 Ins[i].Flags.isSExt(), &Ins[i]);
3482 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3483 }
3484 } else {
3485 // Hidden arguments that are in the kernel signature must be preloaded
3486 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3487 // the argument list and is not preloaded.
3488 if (Arg.isOrigArg()) {
3489 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3490 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3491 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3492 *OrigArg->getParent(),
3493 "hidden argument in kernel signature was not preloaded",
3494 DL.getDebugLoc()));
3495 }
3496 }
3497
3498 NewArg =
3499 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3500 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3501 }
3502 Chains.push_back(NewArg.getValue(1));
3503
3504 auto *ParamTy =
3505 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3506 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3507 ParamTy &&
3508 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3509 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3510 // On SI local pointers are just offsets into LDS, so they always fit in
3511 // 16 bits. On CI and newer they could potentially be
3512 // real pointers, so we can't guarantee their size.
3513 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3514 DAG.getValueType(MVT::i16));
3515 }
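// For example, a kernel taking 'ptr addrspace(3) %lds' on SI therefore has
// its pointer argument tagged with AssertZext i16, since an LDS offset always
// fits in 16 bits on that generation.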
3516
3517 InVals.push_back(NewArg);
3518 continue;
3519 }
3520 if (!IsEntryFunc && VA.isMemLoc()) {
3521 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3522 InVals.push_back(Val);
3523 if (!Arg.Flags.isByVal())
3524 Chains.push_back(Val.getValue(1));
3525 continue;
3526 }
3527
3528 assert(VA.isRegLoc() && "Parameter must be in a register!");
3529
3530 Register Reg = VA.getLocReg();
3531 const TargetRegisterClass *RC = nullptr;
3532 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3533 RC = &AMDGPU::VGPR_32RegClass;
3534 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3535 RC = &AMDGPU::SGPR_32RegClass;
3536 else
3537 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3538
3539 Reg = MF.addLiveIn(Reg, RC);
3540 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3541
3542 if (Arg.Flags.isSRet()) {
3543 // The return object should be reasonably addressable.
3544
3545 // FIXME: This helps when the return is a real sret. If it is an
3546 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3547 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3548 unsigned NumBits =
3549 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3550 Val = DAG.getNode(
3551 ISD::AssertZext, DL, VT, Val,
3552 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3553 }
3554
3555 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3556 InVals.push_back(Val);
3557 }
3558
3559 // Start adding system SGPRs.
3560 if (IsEntryFunc)
3561 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3562
3563 // DAG.getPass() returns nullptr when using new pass manager.
3564 // TODO: Use DAG.getMFAM() to access analysis result.
3565 if (DAG.getPass()) {
3566 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3567 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3568 }
3569
3570 unsigned StackArgSize = CCInfo.getStackSize();
3571 Info->setBytesInStackArgArea(StackArgSize);
3572
3573 return Chains.empty() ? Chain
3574 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3575}
3576
3577// TODO: If return values can't fit in registers, we should return as many as
3578// possible in registers before passing on stack.
3579 bool SITargetLowering::CanLowerReturn(
3580 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3581 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3582 const Type *RetTy) const {
3583 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3584 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3585 // for shaders. Vector types should be explicitly handled by CC.
3586 if (AMDGPU::isEntryFunctionCC(CallConv))
3587 return true;
3588
3589 SmallVector<CCValAssign, 16> RVLocs;
3590 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3591 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3592 return false;
3593
3594 // We must use the stack if return would require unavailable registers.
3595 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3596 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3597 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3598 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3599 return false;
3600
3601 return true;
3602}
3603
3604SDValue
3605 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3606 bool isVarArg,
3607 const SmallVectorImpl<ISD::OutputArg> &Outs,
3608 const SmallVectorImpl<SDValue> &OutVals,
3609 const SDLoc &DL, SelectionDAG &DAG) const {
3610 MachineFunction &MF = DAG.getMachineFunction();
3611 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3612 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3613
3614 if (AMDGPU::isKernel(CallConv)) {
3615 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3616 OutVals, DL, DAG);
3617 }
3618
3619 bool IsShader = AMDGPU::isShader(CallConv);
3620
3621 Info->setIfReturnsVoid(Outs.empty());
3622 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3623
3624 // CCValAssign - represent the assignment of the return value to a location.
3625 SmallVector<CCValAssign, 48> RVLocs;
3626
3627 // CCState - Info about the registers and stack slots.
3628 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3629 *DAG.getContext());
3630
3631 // Analyze outgoing return values.
3632 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3633
3634 SDValue Glue;
3635 SmallVector<SDValue, 48> RetOps;
3636 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3637
3638 SDValue ReadFirstLane =
3639 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3640 // Copy the result values into the output registers.
3641 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3642 ++I, ++RealRVLocIdx) {
3643 CCValAssign &VA = RVLocs[I];
3644 assert(VA.isRegLoc() && "Can only return in registers!");
3645 // TODO: Partially return in registers if return values don't fit.
3646 SDValue Arg = OutVals[RealRVLocIdx];
3647
3648 // Copied from other backends.
3649 switch (VA.getLocInfo()) {
3650 case CCValAssign::Full:
3651 break;
3652 case CCValAssign::BCvt:
3653 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3654 break;
3655 case CCValAssign::SExt:
3656 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3657 break;
3658 case CCValAssign::ZExt:
3659 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3660 break;
3661 case CCValAssign::AExt:
3662 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3663 break;
3664 default:
3665 llvm_unreachable("Unknown loc info!");
3666 }
3667 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3668 Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VA.getLocVT(),
3669 ReadFirstLane, Arg);
3670 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3671 Glue = Chain.getValue(1);
3672 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3673 }
3674
3675 // FIXME: Does sret work properly?
3676 if (!Info->isEntryFunction()) {
3677 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3678 const MCPhysReg *I =
3679 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3680 if (I) {
3681 for (; *I; ++I) {
3682 if (AMDGPU::SReg_64RegClass.contains(*I))
3683 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3684 else if (AMDGPU::SReg_32RegClass.contains(*I))
3685 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3686 else
3687 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3688 }
3689 }
3690 }
3691
3692 // Update chain and glue.
3693 RetOps[0] = Chain;
3694 if (Glue.getNode())
3695 RetOps.push_back(Glue);
3696
3697 unsigned Opc = AMDGPUISD::ENDPGM;
3698 if (!IsWaveEnd)
3699 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3700 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3701 : AMDGPUISD::RET_GLUE;
3702 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3703}
3704
3705 SDValue SITargetLowering::LowerCallResult(
3706 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3707 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3708 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3709 SDValue ThisVal) const {
3710 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3711
3712 // Assign locations to each value returned by this call.
3713 SmallVector<CCValAssign, 16> RVLocs;
3714 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3715 *DAG.getContext());
3716 CCInfo.AnalyzeCallResult(Ins, RetCC);
3717
3718 // Copy all of the result registers out of their specified physreg.
3719 for (CCValAssign VA : RVLocs) {
3720 SDValue Val;
3721
3722 if (VA.isRegLoc()) {
3723 Val =
3724 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3725 Chain = Val.getValue(1);
3726 InGlue = Val.getValue(2);
3727 } else if (VA.isMemLoc()) {
3728 report_fatal_error("TODO: return values in memory");
3729 } else
3730 llvm_unreachable("unknown argument location type");
3731
3732 switch (VA.getLocInfo()) {
3733 case CCValAssign::Full:
3734 break;
3735 case CCValAssign::BCvt:
3736 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3737 break;
3738 case CCValAssign::ZExt:
3739 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3740 DAG.getValueType(VA.getValVT()));
3741 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3742 break;
3743 case CCValAssign::SExt:
3744 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3745 DAG.getValueType(VA.getValVT()));
3746 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3747 break;
3748 case CCValAssign::AExt:
3749 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3750 break;
3751 default:
3752 llvm_unreachable("Unknown loc info!");
3753 }
3754
3755 InVals.push_back(Val);
3756 }
3757
3758 return Chain;
3759}
3760
3761// Add code to pass special inputs required depending on used features separate
3762// from the explicit user arguments present in the IR.
3763 void SITargetLowering::passSpecialInputs(
3764 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3765 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3766 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3767 // If we don't have a call site, this was a call inserted by
3768 // legalization. These can never use special inputs.
3769 if (!CLI.CB)
3770 return;
3771
3772 SelectionDAG &DAG = CLI.DAG;
3773 const SDLoc &DL = CLI.DL;
3774 const Function &F = DAG.getMachineFunction().getFunction();
3775
3776 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3777 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3778
3779 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3780 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3781 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3782 // DAG.getPass() returns nullptr when using new pass manager.
3783 // TODO: Use DAG.getMFAM() to access analysis result.
3784 if (DAG.getPass()) {
3785 auto &ArgUsageInfo =
3786 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3787 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3788 }
3789 }
3790
3791 // TODO: Unify with private memory register handling. This is complicated by
3792 // the fact that at least in kernels, the input argument is not necessarily
3793 // in the same location as the input.
3794 // clang-format off
3795 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3796 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3797 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3798 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3799 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3800 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3801 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3802 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3803 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3804 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3805 };
3806 // clang-format on
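// For example, a call site or callee carrying the attributes
// "amdgpu-no-dispatch-ptr" and "amdgpu-no-workgroup-id-y" lets the loop below
// skip forwarding the dispatch pointer and the workgroup ID Y input entirely.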
3807
3808 for (auto [InputID, Attrs] : ImplicitAttrs) {
3809 // If the callee does not use the attribute value, skip copying the value.
3810 if (all_of(Attrs, [&](StringRef Attr) {
3811 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3812 }))
3813 continue;
3814
3815 const auto [OutgoingArg, ArgRC, ArgTy] =
3816 CalleeArgInfo->getPreloadedValue(InputID);
3817 if (!OutgoingArg)
3818 continue;
3819
3820 const auto [IncomingArg, IncomingArgRC, Ty] =
3821 CallerArgInfo.getPreloadedValue(InputID);
3822 assert(IncomingArgRC == ArgRC);
3823
3824 // All special arguments are ints for now.
3825 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3826 SDValue InputReg;
3827
3828 if (IncomingArg) {
3829 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3830 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3831 // The implicit arg ptr is special because it doesn't have a corresponding
3832 // input for kernels, and is computed from the kernarg segment pointer.
3833 InputReg = getImplicitArgPtr(DAG, DL);
3834 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3835 std::optional<uint32_t> Id =
3836 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3837 if (Id.has_value()) {
3838 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3839 } else {
3840 InputReg = DAG.getPOISON(ArgVT);
3841 }
3842 } else {
3843 // We may have proven the input wasn't needed, although the ABI is
3844 // requiring it. We just need to allocate the register appropriately.
3845 InputReg = DAG.getPOISON(ArgVT);
3846 }
3847
3848 if (OutgoingArg->isRegister()) {
3849 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3850 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3851 report_fatal_error("failed to allocate implicit input argument");
3852 } else {
3853 unsigned SpecialArgOffset =
3854 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3855 SDValue ArgStore =
3856 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3857 MemOpChains.push_back(ArgStore);
3858 }
3859 }
3860
3861 // Pack workitem IDs into a single register, or pass them as-is if already
3862 // packed.
3863
3864 auto [OutgoingArg, ArgRC, Ty] =
3865 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3866 if (!OutgoingArg)
3867 std::tie(OutgoingArg, ArgRC, Ty) =
3868 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3869 if (!OutgoingArg)
3870 std::tie(OutgoingArg, ArgRC, Ty) =
3871 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3872 if (!OutgoingArg)
3873 return;
3874
3875 const ArgDescriptor *IncomingArgX = std::get<0>(
3876 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3877 const ArgDescriptor *IncomingArgY = std::get<0>(
3878 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3879 const ArgDescriptor *IncomingArgZ = std::get<0>(
3880 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3881
3882 SDValue InputReg;
3883 SDLoc SL;
3884
3885 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3886 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3887 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3888
3889 // If incoming ids are not packed we need to pack them.
3890 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3891 NeedWorkItemIDX) {
3892 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3893 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3894 } else {
3895 InputReg = DAG.getConstant(0, DL, MVT::i32);
3896 }
3897 }
3898
3899 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3900 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3901 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3902 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3903 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3904 InputReg = InputReg.getNode()
3905 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3906 : Y;
3907 }
3908
3909 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3910 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3911 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3912 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3913 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3914 InputReg = InputReg.getNode()
3915 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3916 : Z;
3917 }
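// The packed value holds X in bits [9:0], Y in bits [19:10] and Z in bits
// [29:20]; e.g. X=5, Y=3, Z=1 packs to 5 | (3 << 10) | (1 << 20) = 0x100C05.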
3918
3919 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3920 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3921 // We're in a situation where the outgoing function requires the workitem
3922 // ID, but the calling function does not have it (e.g a graphics function
3923 // calling a C calling convention function). This is illegal, but we need
3924 // to produce something.
3925 InputReg = DAG.getPOISON(MVT::i32);
3926 } else {
3927 // Workitem IDs are already packed; any of the present incoming arguments
3928 // will carry all required fields.
3929 ArgDescriptor IncomingArg =
3930 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3931 : IncomingArgY ? *IncomingArgY
3932 : *IncomingArgZ,
3933 ~0u);
3934 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3935 }
3936 }
3937
3938 if (OutgoingArg->isRegister()) {
3939 if (InputReg)
3940 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3941
3942 CCInfo.AllocateReg(OutgoingArg->getRegister());
3943 } else {
3944 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3945 if (InputReg) {
3946 SDValue ArgStore =
3947 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3948 MemOpChains.push_back(ArgStore);
3949 }
3950 }
3951}
3952
3953 bool SITargetLowering::isEligibleForTailCallOptimization(
3954 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3955 const SmallVectorImpl<ISD::OutputArg> &Outs,
3956 const SmallVectorImpl<SDValue> &OutVals,
3957 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3958 if (AMDGPU::isChainCC(CalleeCC))
3959 return true;
3960
3961 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3962 return false;
3963
3964 // For a divergent call target, we need to do a waterfall loop over the
3965 // possible callees which precludes us from using a simple jump.
3966 if (Callee->isDivergent())
3967 return false;
3968
3969 MachineFunction &MF = DAG.getMachineFunction();
3970 const Function &CallerF = MF.getFunction();
3971 CallingConv::ID CallerCC = CallerF.getCallingConv();
3972 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3973 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3974
3975 // Kernels aren't callable, and don't have a live in return address so it
3976 // doesn't make sense to do a tail call with entry functions.
3977 if (!CallerPreserved)
3978 return false;
3979
3980 bool CCMatch = CallerCC == CalleeCC;
3981
3982 if (MF.getTarget().Options.GuaranteedTailCallOpt) {
3983 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3984 return true;
3985 return false;
3986 }
3987
3988 // TODO: Can we handle var args?
3989 if (IsVarArg)
3990 return false;
3991
3992 for (const Argument &Arg : CallerF.args()) {
3993 if (Arg.hasByValAttr())
3994 return false;
3995 }
3996
3997 LLVMContext &Ctx = *DAG.getContext();
3998
3999 // Check that the call results are passed in the same way.
4000 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4001 CCAssignFnForCall(CalleeCC, IsVarArg),
4002 CCAssignFnForCall(CallerCC, IsVarArg)))
4003 return false;
4004
4005 // The callee has to preserve all registers the caller needs to preserve.
4006 if (!CCMatch) {
4007 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4008 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4009 return false;
4010 }
4011
4012 // Nothing more to check if the callee is taking no arguments.
4013 if (Outs.empty())
4014 return true;
4015
4016 SmallVector<CCValAssign, 16> ArgLocs;
4017 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4018
4019 // FIXME: We are not allocating special input registers, so we will be
4020 // deciding based on incorrect register assignments.
4021 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4022
4023 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4024 // If the stack arguments for this call do not fit into our own save area
4025 // then the call cannot be made a tail call.
4026 // TODO: Is this really necessary?
4027 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4028 return false;
4029
4030 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4031 // FIXME: What about inreg arguments that end up passed in memory?
4032 if (!CCVA.isRegLoc())
4033 continue;
4034
4035 // If we are passing an argument in an SGPR, and the value is divergent,
4036 // this call requires a waterfall loop.
4037 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4038 LLVM_DEBUG(
4039 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4040 << printReg(CCVA.getLocReg(), TRI) << '\n');
4041 return false;
4042 }
4043 }
4044
4045 const MachineRegisterInfo &MRI = MF.getRegInfo();
4046 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4047}
4048
4049 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
4050 if (!CI->isTailCall())
4051 return false;
4052
4053 const Function *ParentFn = CI->getParent()->getParent();
4054 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
4055 return false;
4056 return true;
4057}
4058
4059namespace {
4060// Chain calls have special arguments that we need to handle. These are
4061// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4062// arguments (index 0 and 1 respectively).
4063enum ChainCallArgIdx {
4064 Exec = 2,
4065 Flags,
4066 NumVGPRs,
4067 FallbackExec,
4068 FallbackCallee
4069};
4070} // anonymous namespace
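// For example, with flags == 1 (the dynamic-VGPR bit) the argument list seen
// here is: [0] the inreg SGPR arguments, [1] the VGPR arguments, [2] the EXEC
// mask, [3] the flags, [4] the requested VGPR count, [5] the fallback EXEC
// mask and [6] the fallback callee, matching ChainCallArgIdx above.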
4071
4072// The wave scratch offset register is used as the global base pointer.
4073 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
4074 SmallVectorImpl<SDValue> &InVals) const {
4075 CallingConv::ID CallConv = CLI.CallConv;
4076 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4077
4078 SelectionDAG &DAG = CLI.DAG;
4079
4080 const SDLoc &DL = CLI.DL;
4081 SDValue Chain = CLI.Chain;
4082 SDValue Callee = CLI.Callee;
4083
4084 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4085 bool UsesDynamicVGPRs = false;
4086 if (IsChainCallConv) {
4087 // The last arguments should be the value that we need to put in EXEC,
4088 // followed by the flags and any other arguments with special meanings.
4089 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4090 // we don't treat them like the "real" arguments.
4091 auto RequestedExecIt =
4092 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4093 return Arg.OrigArgIndex == 2;
4094 });
4095 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4096
4097 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4098 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4099 CLI.OutVals.end());
4100 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4101
4102 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4103 "Haven't popped all the special args");
4104
4105 TargetLowering::ArgListEntry RequestedExecArg =
4106 CLI.Args[ChainCallArgIdx::Exec];
4107 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4108 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4109
4110 // Convert constants into TargetConstants, so they become immediate operands
4111 // instead of being selected into S_MOV.
4112 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4113 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4114 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4115 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4116 } else
4117 ChainCallSpecialArgs.push_back(Arg.Node);
4118 };
4119
4120 PushNodeOrTargetConstant(RequestedExecArg);
4121
4122 // Process any other special arguments depending on the value of the flags.
4123 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4124
4125 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4126 if (FlagsValue.isZero()) {
4127 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4128 return lowerUnhandledCall(CLI, InVals,
4129 "no additional args allowed if flags == 0");
4130 } else if (FlagsValue.isOneBitSet(0)) {
4131 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4132 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4133 }
4134
4135 if (!Subtarget->isWave32()) {
4136 return lowerUnhandledCall(
4137 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4138 }
4139
4140 UsesDynamicVGPRs = true;
4141 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4142 CLI.Args.end(), PushNodeOrTargetConstant);
4143 }
4144 }
4145
4146 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4147 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4148 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4149 bool &IsTailCall = CLI.IsTailCall;
4150 bool IsVarArg = CLI.IsVarArg;
4151 bool IsSibCall = false;
4152 MachineFunction &MF = DAG.getMachineFunction();
4153
4154 if (Callee.isUndef() || isNullConstant(Callee)) {
4155 if (!CLI.IsTailCall) {
4156 for (ISD::InputArg &Arg : CLI.Ins)
4157 InVals.push_back(DAG.getPOISON(Arg.VT));
4158 }
4159
4160 return Chain;
4161 }
4162
4163 if (IsVarArg) {
4164 return lowerUnhandledCall(CLI, InVals,
4165 "unsupported call to variadic function ");
4166 }
4167
4168 if (!CLI.CB)
4169 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4170
4171 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4172 return lowerUnhandledCall(CLI, InVals,
4173 "unsupported required tail call to function ");
4174 }
4175
4176 if (IsTailCall) {
4177 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4178 Outs, OutVals, Ins, DAG);
4179 if (!IsTailCall &&
4180 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4181 report_fatal_error("failed to perform tail call elimination on a call "
4182 "site marked musttail or on llvm.amdgcn.cs.chain");
4183 }
4184
4185 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4186
4187 // A sibling call is one where we're under the usual C ABI and not planning
4188 // to change that but can still do a tail call:
4189 if (!TailCallOpt && IsTailCall)
4190 IsSibCall = true;
4191
4192 if (IsTailCall)
4193 ++NumTailCalls;
4194 }
4195
4196 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4197 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
4198 SmallVector<SDValue, 8> MemOpChains;
4199
4200 // Analyze operands of the call, assigning locations to each operand.
4201 SmallVector<CCValAssign, 16> ArgLocs;
4202 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4203 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4204
4205 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4207 // With a fixed ABI, allocate fixed registers before user arguments.
4208 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4209 }
4210
4211 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4212
4213 // Get a count of how many bytes are to be pushed on the stack.
4214 unsigned NumBytes = CCInfo.getStackSize();
4215
4216 if (IsSibCall) {
4217 // Since we're not changing the ABI to make this a tail call, the memory
4218 // operands are already available in the caller's incoming argument space.
4219 NumBytes = 0;
4220 }
4221
4222 // FPDiff is the byte offset of the call's argument area from the callee's.
4223 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4224 // by this amount for a tail call. In a sibling call it must be 0 because the
4225 // caller will deallocate the entire stack and the callee still expects its
4226 // arguments to begin at SP+0. Completely unused for non-tail calls.
4227 int32_t FPDiff = 0;
4228 MachineFrameInfo &MFI = MF.getFrameInfo();
4229 auto *TRI = Subtarget->getRegisterInfo();
4230
4231 // Adjust the stack pointer for the new arguments...
4232 // These operations are automatically eliminated by the prolog/epilog pass
4233 if (!IsSibCall)
4234 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4235
4236 if (!IsSibCall || IsChainCallConv) {
4237 if (!Subtarget->enableFlatScratch()) {
4238 SmallVector<SDValue, 4> CopyFromChains;
4239
4240 // In the HSA case, this should be an identity copy.
4241 SDValue ScratchRSrcReg =
4242 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4243 RegsToPass.emplace_back(IsChainCallConv
4244 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4245 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4246 ScratchRSrcReg);
4247 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4248 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4249 }
4250 }
4251
4252 const unsigned NumSpecialInputs = RegsToPass.size();
4253
4254 MVT PtrVT = MVT::i32;
4255
4256 // Walk the register/memloc assignments, inserting copies/loads.
4257 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4258 CCValAssign &VA = ArgLocs[i];
4259 SDValue Arg = OutVals[i];
4260
4261 // Promote the value if needed.
4262 switch (VA.getLocInfo()) {
4263 case CCValAssign::Full:
4264 break;
4265 case CCValAssign::BCvt:
4266 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4267 break;
4268 case CCValAssign::ZExt:
4269 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4270 break;
4271 case CCValAssign::SExt:
4272 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4273 break;
4274 case CCValAssign::AExt:
4275 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4276 break;
4277 case CCValAssign::FPExt:
4278 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4279 break;
4280 default:
4281 llvm_unreachable("Unknown loc info!");
4282 }
4283
4284 if (VA.isRegLoc()) {
4285 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4286 } else {
4287 assert(VA.isMemLoc());
4288
4289 SDValue DstAddr;
4290 MachinePointerInfo DstInfo;
4291
4292 unsigned LocMemOffset = VA.getLocMemOffset();
4293 int32_t Offset = LocMemOffset;
4294
4295 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4296 MaybeAlign Alignment;
4297
4298 if (IsTailCall) {
4299 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4300 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4301 : VA.getValVT().getStoreSize();
4302
4303 // FIXME: We can have better than the minimum byval required alignment.
4304 Alignment =
4305 Flags.isByVal()
4306 ? Flags.getNonZeroByValAlign()
4307 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4308
4309 Offset = Offset + FPDiff;
4310 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4311
4312 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4313 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4314
4315 // Make sure any stack arguments overlapping with where we're storing
4316 // are loaded before this eventual operation. Otherwise they'll be
4317 // clobbered.
4318
4319 // FIXME: Why is this really necessary? This seems to just result in a
4320 // lot of code to copy the stack and write them back to the same
4321 // locations, which are supposed to be immutable?
4322 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4323 } else {
4324 // Stores to the argument stack area are relative to the stack pointer.
4325 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4326 MVT::i32);
4327 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4328 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4329 Alignment =
4330 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4331 }
4332
4333 if (Outs[i].Flags.isByVal()) {
4334 SDValue SizeNode =
4335 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4336 SDValue Cpy =
4337 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4338 Outs[i].Flags.getNonZeroByValAlign(),
4339 /*isVol = */ false, /*AlwaysInline = */ true,
4340 /*CI=*/nullptr, std::nullopt, DstInfo,
4342
4343 MemOpChains.push_back(Cpy);
4344 } else {
4345 SDValue Store =
4346 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4347 MemOpChains.push_back(Store);
4348 }
4349 }
4350 }
4351
4352 if (!MemOpChains.empty())
4353 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4354
4355 SDValue ReadFirstLaneID =
4356 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4357
4358 SDValue TokenGlue;
4359 if (CLI.ConvergenceControlToken) {
4360 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4362 }
4363
4364 // Build a sequence of copy-to-reg nodes chained together with token chain
4365 // and flag operands which copy the outgoing args into the appropriate regs.
4366 SDValue InGlue;
4367
4368 unsigned ArgIdx = 0;
4369 for (auto [Reg, Val] : RegsToPass) {
4370 if (ArgIdx++ >= NumSpecialInputs &&
4371 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4372 // For chain calls, the inreg arguments are required to be
4373 // uniform. Speculatively Insert a readfirstlane in case we cannot prove
4374 // they are uniform.
4375 //
4376 // For other calls, if an inreg arguments is known to be uniform,
4377 // speculatively insert a readfirstlane in case it is in a VGPR.
4378 //
4379 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4380 // value, so let that continue to produce invalid code.
4381
4382 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4383 if (TokenGlue)
4384 ReadfirstlaneArgs.push_back(TokenGlue);
4386 ReadfirstlaneArgs);
4387 }
4388
4389 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4390 InGlue = Chain.getValue(1);
4391 }
4392
4393 // We don't usually want to end the call-sequence here because we would tidy
4394 // the frame up *after* the call, however in the ABI-changing tail-call case
4395 // we've carefully laid out the parameters so that when sp is reset they'll be
4396 // in the correct location.
4397 if (IsTailCall && !IsSibCall) {
4398 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4399 InGlue = Chain.getValue(1);
4400 }
4401
4402 std::vector<SDValue> Ops({Chain});
4403
4404 // Add a redundant copy of the callee global which will not be legalized, as
4405 // we need direct access to the callee later.
4407 const GlobalValue *GV = GSD->getGlobal();
4408 Ops.push_back(Callee);
4409 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4410 } else {
4411 if (IsTailCall) {
4412 // isEligibleForTailCallOptimization considered whether the call target is
4413 // divergent, but we may still end up with a uniform value in a VGPR.
4414 // Insert a readfirstlane just in case.
4415 SDValue ReadFirstLaneID =
4416 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4417
4418 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4419 if (TokenGlue)
4420 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4421 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4422 ReadfirstlaneArgs);
4423 }
4424
4425 Ops.push_back(Callee);
4426 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4427 }
4428
4429 if (IsTailCall) {
4430 // Each tail call may have to adjust the stack by a different amount, so
4431 // this information must travel along with the operation for eventual
4432 // consumption by emitEpilogue.
4433 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4434 }
4435
4436 if (IsChainCallConv)
4437 llvm::append_range(Ops, ChainCallSpecialArgs);
4438
4439 // Add argument registers to the end of the list so that they are known live
4440 // into the call.
4441 for (auto &[Reg, Val] : RegsToPass)
4442 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4443
4444 // Add a register mask operand representing the call-preserved registers.
4445 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4446 assert(Mask && "Missing call preserved mask for calling convention");
4447 Ops.push_back(DAG.getRegisterMask(Mask));
4448
4449 if (SDValue Token = CLI.ConvergenceControlToken) {
4450 SmallVector<SDValue, 2> GlueOps;
4451 GlueOps.push_back(Token);
4452 if (InGlue)
4453 GlueOps.push_back(InGlue);
4454
4455 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4456 MVT::Glue, GlueOps),
4457 0);
4458 }
4459
4460 if (InGlue)
4461 Ops.push_back(InGlue);
4462
4463 // If we're doing a tail call, use a TC_RETURN here rather than an
4464 // actual call instruction.
4465 if (IsTailCall) {
4466 MFI.setHasTailCall();
4467 unsigned OPC = AMDGPUISD::TC_RETURN;
4468 switch (CallConv) {
4469 case CallingConv::AMDGPU_Gfx:
4470 OPC = AMDGPUISD::TC_RETURN_GFX;
4471 break;
4472 case CallingConv::AMDGPU_CS_Chain:
4473 case CallingConv::AMDGPU_CS_ChainPreserve:
4474 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4475 : AMDGPUISD::TC_RETURN_CHAIN;
4476 break;
4477 }
4478
4479 // If the caller is a whole wave function, we need to use a special opcode
4480 // so we can patch up EXEC.
4481 if (Info->isWholeWaveFunction())
4483
4484 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4485 }
4486
4487 // Returns a chain and a flag for retval copy to use.
4488 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4489 Chain = Call.getValue(0);
4490 InGlue = Call.getValue(1);
4491
4492 uint64_t CalleePopBytes = NumBytes;
4493 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4494 if (!Ins.empty())
4495 InGlue = Chain.getValue(1);
4496
4497 // Handle result values, copying them out of physregs into vregs that we
4498 // return.
4499 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4500 InVals, /*IsThisReturn=*/false, SDValue());
4501}
4502
4503// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4504// except for:
4505 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4506 // 2. Scale size, where scale = wave-reduction(alloca-size) * wave-size
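// For example, a constant 16-byte alloca in a wave64 function advances the
// stack pointer by 16 << 6 = 1024 bytes of scratch, giving each of the 64
// lanes its own 16-byte slot.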
4507 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4508 SelectionDAG &DAG) const {
4509 const MachineFunction &MF = DAG.getMachineFunction();
4510 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4511
4512 SDLoc dl(Op);
4513 EVT VT = Op.getValueType();
4514 SDValue Chain = Op.getOperand(0);
4515 Register SPReg = Info->getStackPtrOffsetReg();
4516
4517 // Chain the dynamic stack allocation so that it doesn't modify the stack
4518 // pointer when other instructions are using the stack.
4519 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4520
4521 SDValue Size = Op.getOperand(1);
4522 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4523 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4524
4525 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4526 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4527 "Stack grows upwards for AMDGPU");
4528
4529 Chain = BaseAddr.getValue(1);
4530 Align StackAlign = TFL->getStackAlign();
4531 if (Alignment > StackAlign) {
4532 uint64_t ScaledAlignment = Alignment.value()
4533 << Subtarget->getWavefrontSizeLog2();
4534 uint64_t StackAlignMask = ScaledAlignment - 1;
4535 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4536 DAG.getConstant(StackAlignMask, dl, VT));
4537 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4538 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4539 }
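// For example, a requested alignment of 16 in a wave64 function rounds
// BaseAddr up to a multiple of 16 << 6 = 1024, i.e. 16 bytes per lane in the
// swizzled scratch layout.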
4540
4541 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4542 SDValue NewSP;
4543 if (isa<ConstantSDNode>(Size)) {
4544 // For constant sized alloca, scale alloca size by wave-size
4545 SDValue ScaledSize = DAG.getNode(
4546 ISD::SHL, dl, VT, Size,
4547 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4548 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4549 } else {
4550 // For dynamic sized alloca, perform wave-wide reduction to get max of
4551 // alloca size (divergent) and then scale it by wave-size
4552 SDValue WaveReduction =
4553 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4554 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4555 Size, DAG.getConstant(0, dl, MVT::i32));
4556 SDValue ScaledSize = DAG.getNode(
4557 ISD::SHL, dl, VT, Size,
4558 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4559 NewSP =
4560 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4561 SDValue ReadFirstLaneID =
4562 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4563 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4564 NewSP);
4565 }
4566
4567 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4568 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4569
4570 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4571}
4572
4573 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4574 if (Op.getValueType() != MVT::i32)
4575 return Op; // Defer to cannot select error.
4576
4577 Register SP = getStackPointerRegisterToSaveRestore();
4578 SDLoc SL(Op);
4579
4580 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4581
4582 // Convert from wave uniform to swizzled vector address. This should protect
4583 // from any edge cases where the stacksave result isn't directly used with
4584 // stackrestore.
4585 SDValue VectorAddress =
4586 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4587 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4588}
4589
4590 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4591 SelectionDAG &DAG) const {
4592 SDLoc SL(Op);
4593 assert(Op.getValueType() == MVT::i32);
4594
4595 uint32_t BothRoundHwReg =
4596 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4597 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4598
4599 SDValue IntrinID =
4600 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4601 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4602 Op.getOperand(0), IntrinID, GetRoundBothImm);
4603
4604 // There are two rounding modes, one for f32 and one for f64/f16. We only
4605 // report in the standard value range if both are the same.
4606 //
4607 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4608 // ties away from zero is not supported, and the other values are rotated by
4609 // 1.
4610 //
4611 // If the two rounding modes are not the same, report a target defined value.
4612
4613 // Mode register rounding mode fields:
4614 //
4615 // [1:0] Single-precision round mode.
4616 // [3:2] Double/Half-precision round mode.
4617 //
4618 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4619 //
4620 // Hardware Spec
4621 // Toward-0 3 0
4622 // Nearest Even 0 1
4623 // +Inf 1 2
4624 // -Inf 2 3
4625 // NearestAway0 N/A 4
4626 //
4627 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4628 // table we can index by the raw hardware mode.
4629 //
4630 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
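// For example, if both fields are round-to-nearest-even the raw value of
// MODE.fp_round is 0, the table is shifted by 0 * 4 bits, and the extracted
// nibble is 1, the standard FLT_ROUNDS value for nearest-even.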
4631
4632 SDValue BitTable =
4633 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4634
4635 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4636 SDValue RoundModeTimesNumBits =
4637 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4638
4639 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4640 // knew only one mode was demanded.
4641 SDValue TableValue =
4642 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4643 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4644
4645 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4646 SDValue TableEntry =
4647 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4648
4649 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4650 // if it's an extended value.
4651 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4652 SDValue IsStandardValue =
4653 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4654 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4655 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4656 TableEntry, EnumOffset);
4657
4658 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4659}
4660
4662 SelectionDAG &DAG) const {
4663 SDLoc SL(Op);
4664
4665 SDValue NewMode = Op.getOperand(1);
4666 assert(NewMode.getValueType() == MVT::i32);
4667
4668 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4669 // hardware MODE.fp_round values.
4670 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4671 uint32_t ClampedVal = std::min(
4672 static_cast<uint32_t>(ConstMode->getZExtValue()),
4674 NewMode = DAG.getConstant(
4675 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4676 } else {
4677 // If we know the input can only be one of the supported standard modes in
4678 // the range 0-3, we can use a simplified mapping to hardware values.
4679 KnownBits KB = DAG.computeKnownBits(NewMode);
4680 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4681 // The supported standard values are 0-3. The extended values start at 8. We
4682 // need to offset by 4 if the value is in the extended range.
4683
4684 if (UseReducedTable) {
4685 // Truncate to the low 32-bits.
4686 SDValue BitTable = DAG.getConstant(
4687 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4688
4689 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4690 SDValue RoundModeTimesNumBits =
4691 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4692
4693 NewMode =
4694 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4695
4696 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4697 // the table extracted bits into inline immediates.
4698 } else {
4699 // table_index = umin(value, value - 4)
4700 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
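// For example, a standard value 2 (+infinity) gives umin(2, 2 - 4) =
// umin(2, 0xFFFFFFFE) = 2, while an extended value 9 gives umin(9, 5) = 5, so
// the gap between the standard range 0-3 and the extended range starting at 8
// is folded out of the table index.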
4701 SDValue BitTable =
4702 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4703
4704 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4705 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4706 SDValue IndexVal =
4707 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4708
4709 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4710 SDValue RoundModeTimesNumBits =
4711 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4712
4713 SDValue TableValue =
4714 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4715 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4716
4717 // No need to mask out the high bits since the setreg will ignore them
4718 // anyway.
4719 NewMode = TruncTable;
4720 }
4721
4722 // Insert a readfirstlane in case the value is a VGPR. We could do this
4723 // earlier and keep more operations scalar, but that interferes with
4724 // combining the source.
4725 SDValue ReadFirstLaneID =
4726 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4727 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4728 ReadFirstLaneID, NewMode);
4729 }
4730
4731 // N.B. The setreg will be later folded into s_round_mode on supported
4732 // targets.
4733 SDValue IntrinID =
4734 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4735 uint32_t BothRoundHwReg =
4736 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4737 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4738
4739 SDValue SetReg =
4740 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4741 IntrinID, RoundBothImm, NewMode);
4742
4743 return SetReg;
4744}
4745
4746 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4747 if (Op->isDivergent() &&
4748 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4749 // Cannot do I$ prefetch with divergent pointer.
4750 return SDValue();
4751
4752 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4756 break;
4758 if (Subtarget->hasSafeSmemPrefetch())
4759 break;
4760 [[fallthrough]];
4761 default:
4762 return SDValue();
4763 }
4764
4765 // I$ prefetch
4766 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4767 return SDValue();
4768
4769 return Op;
4770}
4771
4772// Work around DAG legality rules only based on the result type.
4773 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4774 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4775 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4776 EVT SrcVT = Src.getValueType();
4777
4778 if (SrcVT.getScalarType() != MVT::bf16)
4779 return Op;
4780
4781 SDLoc SL(Op);
4782 SDValue BitCast =
4783 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4784
4785 EVT DstVT = Op.getValueType();
4786 if (IsStrict)
4787 llvm_unreachable("Need STRICT_BF16_TO_FP");
4788
4789 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4790}
4791
4792 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4793 SDLoc SL(Op);
4794 if (Op.getValueType() != MVT::i64)
4795 return Op;
4796
4797 uint32_t ModeHwReg =
4799 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4800 uint32_t TrapHwReg =
4802 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4803
4804 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4805 SDValue IntrinID =
4806 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4807 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4808 Op.getOperand(0), IntrinID, ModeHwRegImm);
4809 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4810 Op.getOperand(0), IntrinID, TrapHwRegImm);
4811 SDValue TokenReg =
4812 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4813 GetTrapReg.getValue(1));
4814
4815 SDValue CvtPtr =
4816 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4817 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4818
4819 return DAG.getMergeValues({Result, TokenReg}, SL);
4820}
4821
4822 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4823 SDLoc SL(Op);
4824 if (Op.getOperand(1).getValueType() != MVT::i64)
4825 return Op;
4826
4827 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4828 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4829 DAG.getConstant(0, SL, MVT::i32));
4830 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4831 DAG.getConstant(1, SL, MVT::i32));
4832
4833 SDValue ReadFirstLaneID =
4834 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4835 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4836 ReadFirstLaneID, NewModeReg);
4837 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4838 ReadFirstLaneID, NewTrapReg);
4839
4840 unsigned ModeHwReg =
4842 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4843 unsigned TrapHwReg =
4845 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4846
4847 SDValue IntrinID =
4848 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4849 SDValue SetModeReg =
4850 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4851 IntrinID, ModeHwRegImm, NewModeReg);
4852 SDValue SetTrapReg =
4853 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4854 IntrinID, TrapHwRegImm, NewTrapReg);
4855 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4856}
4857
4858 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4859 const MachineFunction &MF) const {
4860 const Function &Fn = MF.getFunction();
4861
4862 Register Reg = StringSwitch<Register>(RegName)
4863 .Case("m0", AMDGPU::M0)
4864 .Case("exec", AMDGPU::EXEC)
4865 .Case("exec_lo", AMDGPU::EXEC_LO)
4866 .Case("exec_hi", AMDGPU::EXEC_HI)
4867 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4868 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4869 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4870 .Default(Register());
4871 if (!Reg)
4872 return Reg;
4873
4874 if (!Subtarget->hasFlatScrRegister() &&
4875 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4876 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4877 "\" for subtarget."));
4878 }
4879
4880 switch (Reg) {
4881 case AMDGPU::M0:
4882 case AMDGPU::EXEC_LO:
4883 case AMDGPU::EXEC_HI:
4884 case AMDGPU::FLAT_SCR_LO:
4885 case AMDGPU::FLAT_SCR_HI:
4886 if (VT.getSizeInBits() == 32)
4887 return Reg;
4888 break;
4889 case AMDGPU::EXEC:
4890 case AMDGPU::FLAT_SCR:
4891 if (VT.getSizeInBits() == 64)
4892 return Reg;
4893 break;
4894 default:
4895 llvm_unreachable("missing register type checking");
4896 }
4897
4898 report_fatal_error(
4899 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4900}
4901
4902// If kill is not the last instruction, split the block so kill is always a
4903// proper terminator.
4904 MachineBasicBlock *
4905 SITargetLowering::splitKillBlock(MachineInstr &MI,
4906 MachineBasicBlock *BB) const {
4907 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4908 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4909 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4910 return SplitBB;
4911}
4912
4913// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4914// \p MI will be the only instruction in the loop body block. Otherwise, it will
4915// be the first instruction in the remainder block.
4916//
4917/// \returns { LoopBody, Remainder }
4918static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4919 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4920 MachineFunction *MF = MBB.getParent();
4921 MachineBasicBlock::iterator I(&MI);
4922
4923 // To insert the loop we need to split the block. Move everything after this
4924 // point to a new block, and insert a new empty block between the two.
4925 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4926 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4927 MachineFunction::iterator MBBI(MBB);
4928 ++MBBI;
4929
4930 MF->insert(MBBI, LoopBB);
4931 MF->insert(MBBI, RemainderBB);
4932
4933 LoopBB->addSuccessor(LoopBB);
4934 LoopBB->addSuccessor(RemainderBB);
4935
4936 // Move the rest of the block into a new block.
4937 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4938
4939 if (InstInLoop) {
4940 auto Next = std::next(I);
4941
4942 // Move instruction to loop body.
4943 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4944
4945 // Move the rest of the block.
4946 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4947 } else {
4948 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4949 }
4950
4951 MBB.addSuccessor(LoopBB);
4952
4953 return std::pair(LoopBB, RemainderBB);
4954}
4955
4956/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4958 MachineBasicBlock *MBB = MI.getParent();
4960 auto I = MI.getIterator();
4961 auto E = std::next(I);
4962
4963 // clang-format off
4964 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4965 .addImm(0);
4966 // clang-format on
4967
4968 MIBundleBuilder Bundler(*MBB, I, E);
4969 finalizeBundle(*MBB, Bundler.begin());
4970}
4971
4972 MachineBasicBlock *
4973 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4974 MachineBasicBlock *BB) const {
4975 const DebugLoc &DL = MI.getDebugLoc();
4976
4978
4980
4981 // Apparently kill flags are only valid if the def is in the same block?
4982 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4983 Src->setIsKill(false);
4984
4985 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4986
4987 MachineBasicBlock::iterator I = LoopBB->end();
4988
4989 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4990 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
4991
4992 // Clear TRAP_STS.MEM_VIOL
4993 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4994 .addImm(0)
4995 .addImm(EncodedReg);
4996
4997 bundleInstWithWaitcnt(MI);
4998
4999 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5000
5001 // Load and check TRAP_STS.MEM_VIOL
5002 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5003 .addImm(EncodedReg);
5004
5005 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5006 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5007 .addReg(Reg, RegState::Kill)
5008 .addImm(0);
5009 // clang-format off
5010 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5011 .addMBB(LoopBB);
5012 // clang-format on
5013
5014 return RemainderBB;
5015}
5016
5017// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5018// wavefront. If the value is uniform and just happens to be in a VGPR, this
5019// will only do one iteration. In the worst case, this will loop 64 times.
5020//
5021// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
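// A rough sketch of the emitted loop body (wave64 mnemonics assumed,
// Offset == 0, !UseGPRIdxMode; the PHIs and the caller-inserted indexed
// access are elided, and register names are placeholders):
//
//   loop:
//     v_readfirstlane_b32 s_cur, v_idx
//     v_cmp_eq_u32_e64    s_cond, s_cur, v_idx
//     s_and_saveexec_b64  s_save, s_cond       ; s_save = previous EXEC
//     s_mov_b32           m0, s_cur
//     ;; ... indexed access using m0 is inserted here by the caller ...
//     s_xor_b64           exec, exec, s_save   ; retire the lanes just handled
//     s_cbranch_execnz    loop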
5024 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5025 const DebugLoc &DL, const MachineOperand &Idx,
5026 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5027 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5028 Register &SGPRIdxReg) {
5029
5030 MachineFunction *MF = OrigBB.getParent();
5031 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5032 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5035
5036 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5037 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5038 Register NewExec = MRI.createVirtualRegister(BoolRC);
5039 Register CurrentIdxReg =
5040 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5041 Register CondReg = MRI.createVirtualRegister(BoolRC);
5042
5043 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5044 .addReg(InitReg)
5045 .addMBB(&OrigBB)
5046 .addReg(ResultReg)
5047 .addMBB(&LoopBB);
5048
5049 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5050 .addReg(InitSaveExecReg)
5051 .addMBB(&OrigBB)
5052 .addReg(NewExec)
5053 .addMBB(&LoopBB);
5054
5055 // Read the next variant <- also loop target.
5056 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5057 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5058
5059 // Compare the just read M0 value to all possible Idx values.
5060 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5061 .addReg(CurrentIdxReg)
5062 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5063
 5064 // Update EXEC, saving the original EXEC value into NewExec.
5065 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5066 .addReg(CondReg, RegState::Kill);
5067
5068 MRI.setSimpleHint(NewExec, CondReg);
5069
5070 if (UseGPRIdxMode) {
5071 if (Offset == 0) {
5072 SGPRIdxReg = CurrentIdxReg;
5073 } else {
5074 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5075 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5076 .addReg(CurrentIdxReg, RegState::Kill)
5077 .addImm(Offset);
5078 }
5079 } else {
 5080 // Move the index into M0.
5081 if (Offset == 0) {
5082 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5083 .addReg(CurrentIdxReg, RegState::Kill);
5084 } else {
5085 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5086 .addReg(CurrentIdxReg, RegState::Kill)
5087 .addImm(Offset);
5088 }
5089 }
5090
5091 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5092 MachineInstr *InsertPt =
5093 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5094 .addReg(LMC.ExecReg)
5095 .addReg(NewExec);
5096
5097 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5098 // s_cbranch_scc0?
5099
5100 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5101 // clang-format off
5102 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5103 .addMBB(&LoopBB);
5104 // clang-format on
5105
5106 return InsertPt->getIterator();
5107}
5108
 5109 // This has slightly sub-optimal regalloc when the source vector is killed by
 5110 // the read. The register allocator does not understand that the kill is
 5111 // per-workitem, so the vector is kept alive for the whole loop and we end up
 5112 // not re-using a subregister from it, using 1 more VGPR than necessary. That
 5113 // VGPR was saved when this was expanded after register allocation.
5116 unsigned InitResultReg, unsigned PhiReg, int Offset,
5117 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5118 MachineFunction *MF = MBB.getParent();
5119 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5120 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5122 const DebugLoc &DL = MI.getDebugLoc();
5124
5125 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5126 Register DstReg = MI.getOperand(0).getReg();
5127 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5128 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5130
5131 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5132
5133 // Save the EXEC mask
5134 // clang-format off
5135 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5136 .addReg(LMC.ExecReg);
5137 // clang-format on
5138
5139 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5140
5141 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5142
5143 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5144 InitResultReg, DstReg, PhiReg, TmpExec,
5145 Offset, UseGPRIdxMode, SGPRIdxReg);
5146
5147 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5149 ++MBBI;
5150 MF->insert(MBBI, LandingPad);
5151 LoopBB->removeSuccessor(RemainderBB);
5152 LandingPad->addSuccessor(RemainderBB);
5153 LoopBB->addSuccessor(LandingPad);
5154 MachineBasicBlock::iterator First = LandingPad->begin();
5155 // clang-format off
5156 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5157 .addReg(SaveExec);
5158 // clang-format on
5159
5160 return InsPt;
5161}
5162
5163// Returns subreg index, offset
5164static std::pair<unsigned, int>
5166 const TargetRegisterClass *SuperRC, unsigned VecReg,
5167 int Offset) {
5168 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5169
5170 // Skip out of bounds offsets, or else we would end up using an undefined
5171 // register.
5172 if (Offset >= NumElts || Offset < 0)
5173 return std::pair(AMDGPU::sub0, Offset);
5174
5175 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5176}
5177
5180 int Offset) {
5181 MachineBasicBlock *MBB = MI.getParent();
5182 const DebugLoc &DL = MI.getDebugLoc();
5184
5185 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5186
5187 assert(Idx->getReg() != AMDGPU::NoRegister);
5188
5189 if (Offset == 0) {
5190 // clang-format off
5191 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5192 .add(*Idx);
5193 // clang-format on
5194 } else {
5195 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5196 .add(*Idx)
5197 .addImm(Offset);
5198 }
5199}
5200
5203 int Offset) {
5204 MachineBasicBlock *MBB = MI.getParent();
5205 const DebugLoc &DL = MI.getDebugLoc();
5207
5208 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5209
5210 if (Offset == 0)
5211 return Idx->getReg();
5212
5213 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5214 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5215 .add(*Idx)
5216 .addImm(Offset);
5217 return Tmp;
5218}
5219
5222 const GCNSubtarget &ST) {
5223 const SIInstrInfo *TII = ST.getInstrInfo();
5224 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5225 MachineFunction *MF = MBB.getParent();
5227
5228 Register Dst = MI.getOperand(0).getReg();
5229 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5230 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5231 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5232
5233 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5234 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5235
5236 unsigned SubReg;
5237 std::tie(SubReg, Offset) =
5238 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5239
5240 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5241
5242 // Check for a SGPR index.
5243 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5245 const DebugLoc &DL = MI.getDebugLoc();
5246
5247 if (UseGPRIdxMode) {
5248 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5249 // to avoid interfering with other uses, so probably requires a new
5250 // optimization pass.
5252
5253 const MCInstrDesc &GPRIDXDesc =
5254 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5255 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5256 .addReg(SrcReg)
5257 .addReg(Idx)
5258 .addImm(SubReg);
5259 } else {
5261
5262 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5263 .addReg(SrcReg, 0, SubReg)
5264 .addReg(SrcReg, RegState::Implicit);
5265 }
5266
5267 MI.eraseFromParent();
5268
5269 return &MBB;
5270 }
5271
5272 // Control flow needs to be inserted if indexing with a VGPR.
5273 const DebugLoc &DL = MI.getDebugLoc();
5275
5276 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5277 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5278
5279 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5280
5281 Register SGPRIdxReg;
5282 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5283 UseGPRIdxMode, SGPRIdxReg);
5284
5285 MachineBasicBlock *LoopBB = InsPt->getParent();
5286
5287 if (UseGPRIdxMode) {
5288 const MCInstrDesc &GPRIDXDesc =
5289 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5290
5291 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5292 .addReg(SrcReg)
5293 .addReg(SGPRIdxReg)
5294 .addImm(SubReg);
5295 } else {
5296 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5297 .addReg(SrcReg, 0, SubReg)
5298 .addReg(SrcReg, RegState::Implicit);
5299 }
5300
5301 MI.eraseFromParent();
5302
5303 return LoopBB;
5304}
5305
5308 const GCNSubtarget &ST) {
5309 const SIInstrInfo *TII = ST.getInstrInfo();
5310 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5311 MachineFunction *MF = MBB.getParent();
5313
5314 Register Dst = MI.getOperand(0).getReg();
5315 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5316 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5317 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5318 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5319 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5320 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5321
5322 // This can be an immediate, but will be folded later.
5323 assert(Val->getReg());
5324
5325 unsigned SubReg;
5326 std::tie(SubReg, Offset) =
5327 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5328 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5329
5330 if (Idx->getReg() == AMDGPU::NoRegister) {
5332 const DebugLoc &DL = MI.getDebugLoc();
5333
5334 assert(Offset == 0);
5335
5336 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5337 .add(*SrcVec)
5338 .add(*Val)
5339 .addImm(SubReg);
5340
5341 MI.eraseFromParent();
5342 return &MBB;
5343 }
5344
5345 // Check for a SGPR index.
5346 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5348 const DebugLoc &DL = MI.getDebugLoc();
5349
5350 if (UseGPRIdxMode) {
5352
5353 const MCInstrDesc &GPRIDXDesc =
5354 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5355 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5356 .addReg(SrcVec->getReg())
5357 .add(*Val)
5358 .addReg(Idx)
5359 .addImm(SubReg);
5360 } else {
5362
5363 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5364 TRI.getRegSizeInBits(*VecRC), 32, false);
5365 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5366 .addReg(SrcVec->getReg())
5367 .add(*Val)
5368 .addImm(SubReg);
5369 }
5370 MI.eraseFromParent();
5371 return &MBB;
5372 }
5373
5374 // Control flow needs to be inserted if indexing with a VGPR.
5375 if (Val->isReg())
5376 MRI.clearKillFlags(Val->getReg());
5377
5378 const DebugLoc &DL = MI.getDebugLoc();
5379
5380 Register PhiReg = MRI.createVirtualRegister(VecRC);
5381
5382 Register SGPRIdxReg;
5383 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5384 UseGPRIdxMode, SGPRIdxReg);
5385 MachineBasicBlock *LoopBB = InsPt->getParent();
5386
5387 if (UseGPRIdxMode) {
5388 const MCInstrDesc &GPRIDXDesc =
5389 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5390
5391 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5392 .addReg(PhiReg)
5393 .add(*Val)
5394 .addReg(SGPRIdxReg)
5395 .addImm(SubReg);
5396 } else {
5397 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5398 TRI.getRegSizeInBits(*VecRC), 32, false);
5399 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5400 .addReg(PhiReg)
5401 .add(*Val)
5402 .addImm(SubReg);
5403 }
5404
5405 MI.eraseFromParent();
5406 return LoopBB;
5407}
5408
5410 MachineBasicBlock *BB) {
5411 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5412 // For GFX12, we emit s_add_u64 and s_sub_u64.
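// A rough sketch of the pre-GFX12 expansion of S_ADD_U64_PSEUDO (register
// names are placeholders):
//   s_add_u32  dst.lo, src0.lo, src1.lo      ; sets SCC to the carry-out
//   s_addc_u32 dst.hi, src0.hi, src1.hi      ; consumes SCC as the carry-in
//   dst = REG_SEQUENCE dst.lo, sub0, dst.hi, sub1
// S_SUB_U64_PSEUDO uses s_sub_u32 / s_subb_u32 instead.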
5413 MachineFunction *MF = BB->getParent();
5414 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5415 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5417 const DebugLoc &DL = MI.getDebugLoc();
5418 MachineOperand &Dest = MI.getOperand(0);
5419 MachineOperand &Src0 = MI.getOperand(1);
5420 MachineOperand &Src1 = MI.getOperand(2);
5421 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5422 if (ST.hasScalarAddSub64()) {
5423 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5424 // clang-format off
5425 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5426 .add(Src0)
5427 .add(Src1);
5428 // clang-format on
5429 } else {
5430 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5431 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5432
5433 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5434 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5435
5436 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5437 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5438 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5439 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5440
5441 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5442 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5443 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5444 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5445
5446 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5447 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5448 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5449 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5450 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5451 .addReg(DestSub0)
5452 .addImm(AMDGPU::sub0)
5453 .addReg(DestSub1)
5454 .addImm(AMDGPU::sub1);
5455 }
5456 MI.eraseFromParent();
5457 return BB;
5458}
5459
5461 switch (Opc) {
5462 case AMDGPU::S_MIN_U32:
5463 return std::numeric_limits<uint32_t>::max();
5464 case AMDGPU::S_MIN_I32:
5465 return std::numeric_limits<int32_t>::max();
5466 case AMDGPU::S_MAX_U32:
5467 return std::numeric_limits<uint32_t>::min();
5468 case AMDGPU::S_MAX_I32:
5469 return std::numeric_limits<int32_t>::min();
5470 case AMDGPU::S_ADD_I32:
5471 case AMDGPU::S_SUB_I32:
5472 case AMDGPU::S_OR_B32:
5473 case AMDGPU::S_XOR_B32:
5474 return std::numeric_limits<uint32_t>::min();
5475 case AMDGPU::S_AND_B32:
5476 return std::numeric_limits<uint32_t>::max();
5477 default:
5479 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5480 }
5481}
5482
5484 switch (Opc) {
5485 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5486 return std::numeric_limits<uint64_t>::max();
5487 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5488 return std::numeric_limits<int64_t>::max();
5489 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5490 return std::numeric_limits<uint64_t>::min();
5491 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5492 return std::numeric_limits<int64_t>::min();
5493 case AMDGPU::S_ADD_U64_PSEUDO:
5494 case AMDGPU::S_SUB_U64_PSEUDO:
5495 case AMDGPU::S_OR_B64:
5496 case AMDGPU::S_XOR_B64:
5497 return std::numeric_limits<uint64_t>::min();
5498 case AMDGPU::S_AND_B64:
5499 return std::numeric_limits<uint64_t>::max();
5500 default:
5502 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5503 }
5504}
5505
5506static bool is32bitWaveReduceOperation(unsigned Opc) {
5507 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5508 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5509 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5510 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5511 Opc == AMDGPU::S_XOR_B32;
5512}
5513
5516 const GCNSubtarget &ST,
5517 unsigned Opc) {
5519 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5520 const DebugLoc &DL = MI.getDebugLoc();
5521 const SIInstrInfo *TII = ST.getInstrInfo();
5522
5523 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5524 Register SrcReg = MI.getOperand(1).getReg();
5525 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5526 Register DstReg = MI.getOperand(0).getReg();
5527 MachineBasicBlock *RetBB = nullptr;
5528 if (isSGPR) {
5529 switch (Opc) {
5530 case AMDGPU::S_MIN_U32:
5531 case AMDGPU::S_MIN_I32:
5532 case AMDGPU::S_MAX_U32:
5533 case AMDGPU::S_MAX_I32:
5534 case AMDGPU::S_AND_B32:
5535 case AMDGPU::S_OR_B32: {
5536 // Idempotent operations.
5537 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5538 RetBB = &BB;
5539 break;
5540 }
5541 case AMDGPU::V_CMP_LT_U64_e64: // umin
5542 case AMDGPU::V_CMP_LT_I64_e64: // min
5543 case AMDGPU::V_CMP_GT_U64_e64: // umax
5544 case AMDGPU::V_CMP_GT_I64_e64: // max
5545 case AMDGPU::S_AND_B64:
5546 case AMDGPU::S_OR_B64: {
5547 // Idempotent operations.
5548 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5549 RetBB = &BB;
5550 break;
5551 }
5552 case AMDGPU::S_XOR_B32:
5553 case AMDGPU::S_XOR_B64:
5554 case AMDGPU::S_ADD_I32:
5555 case AMDGPU::S_ADD_U64_PSEUDO:
5556 case AMDGPU::S_SUB_I32:
5557 case AMDGPU::S_SUB_U64_PSEUDO: {
5558 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5559 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5560 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5561 Register NumActiveLanes =
5562 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5563
5564 bool IsWave32 = ST.isWave32();
5565 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5566 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5567 unsigned BitCountOpc =
5568 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5569
5570 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5571
5572 auto NewAccumulator =
5573 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5574 .addReg(ExecMask);
5575
5576 switch (Opc) {
5577 case AMDGPU::S_XOR_B32:
5578 case AMDGPU::S_XOR_B64: {
 5579 // Performing an XOR operation on a uniform value
 5580 // depends on the parity of the number of active lanes.
 5581 // For even parity the result will be 0; for odd
 5582 // parity it will be the same as the input value.
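// Roughly, for the 32-bit case (names are placeholders):
//   s_and_b32 parity, num_active_lanes, 1
//   s_mul_i32 dst, src, parity               ; dst = (parity != 0) ? src : 0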
5583 Register ParityRegister =
5584 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5585
5586 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5587 .addReg(NewAccumulator->getOperand(0).getReg())
5588 .addImm(1)
5589 .setOperandDead(3); // Dead scc
5590 if (Opc == AMDGPU::S_XOR_B32) {
5591 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5592 .addReg(SrcReg)
5593 .addReg(ParityRegister);
5594 } else {
5595 Register DestSub0 =
5596 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5597 Register DestSub1 =
5598 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5599
5600 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5601 const TargetRegisterClass *SrcSubRC =
5602 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5603
5604 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5605 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5606 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5607 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5608
5609 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5610 .add(Op1L)
5611 .addReg(ParityRegister);
5612
5613 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5614 .add(Op1H)
5615 .addReg(ParityRegister);
5616
5617 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5618 .addReg(DestSub0)
5619 .addImm(AMDGPU::sub0)
5620 .addReg(DestSub1)
5621 .addImm(AMDGPU::sub1);
5622 }
5623 break;
5624 }
5625 case AMDGPU::S_SUB_I32: {
5626 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5627
5628 // Take the negation of the source operand.
5629 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5630 .addImm(0)
5631 .addReg(SrcReg);
5632 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5633 .addReg(NegatedVal)
5634 .addReg(NewAccumulator->getOperand(0).getReg());
5635 break;
5636 }
5637 case AMDGPU::S_ADD_I32: {
5638 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5639 .addReg(SrcReg)
5640 .addReg(NewAccumulator->getOperand(0).getReg());
5641 break;
5642 }
5643 case AMDGPU::S_ADD_U64_PSEUDO:
5644 case AMDGPU::S_SUB_U64_PSEUDO: {
5645 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5646 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5647 Register Op1H_Op0L_Reg =
5648 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5649 Register Op1L_Op0H_Reg =
5650 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5651 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5652 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5653 Register NegatedValLo =
5654 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5655 Register NegatedValHi =
5656 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5657
5658 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5659 const TargetRegisterClass *Src1SubRC =
5660 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5661
5662 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5663 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5664 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5665 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5666
5667 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5668 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5669 .addImm(0)
5670 .addReg(NewAccumulator->getOperand(0).getReg())
5671 .setOperandDead(3); // Dead scc
5672 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5673 .addReg(NegatedValLo)
5674 .addImm(31)
5675 .setOperandDead(3); // Dead scc
5676 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5677 .add(Op1L)
5678 .addReg(NegatedValHi);
5679 }
5680 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5681 ? NegatedValLo
5682 : NewAccumulator->getOperand(0).getReg();
5683 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5684 .add(Op1L)
5685 .addReg(LowOpcode);
5686 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5687 .add(Op1L)
5688 .addReg(LowOpcode);
5689 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5690 .add(Op1H)
5691 .addReg(LowOpcode);
5692
5693 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5694 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5695 .addReg(CarryReg)
5696 .addReg(Op1H_Op0L_Reg)
5697 .setOperandDead(3); // Dead scc
5698
5699 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5700 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5701 .addReg(HiVal)
5702 .addReg(Op1L_Op0H_Reg)
5703 .setOperandDead(3); // Dead scc
5704 }
5705 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5706 .addReg(DestSub0)
5707 .addImm(AMDGPU::sub0)
5708 .addReg(DestSub1)
5709 .addImm(AMDGPU::sub1);
5710 break;
5711 }
5712 }
5713 RetBB = &BB;
5714 }
5715 }
5716 } else {
 5717 // TODO: Implement the DPP strategy and switch based on the immediate strategy
 5718 // operand. For now, for all cases (default, Iterative and DPP) we use the
 5719 // iterative approach.
5720
 5721 // To reduce the VGPR using the iterative approach, we need to iterate
 5722 // over all the active lanes. The lowering consists of a ComputeLoop,
 5723 // which iterates over only the active lanes. We use a copy of the EXEC
 5724 // register as the induction variable; each iteration clears the current
 5725 // lane's bit with bitset0 so that the next iteration picks the next active lane.
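// Rough pseudocode of the emitted loop (not the literal MIR):
//   ActiveBits  = EXEC;
//   Accumulator = identity value for Opc;
//   do {
//     Lane        = find_first_set(ActiveBits);       // s_ff1
//     Accumulator = Op(Accumulator, readlane(Src, Lane));
//     ActiveBits  = clear_bit(ActiveBits, Lane);      // s_bitset0
//   } while (ActiveBits != 0);                        // s_cmp_lg + s_cbranch_scc1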
5727 Register SrcReg = MI.getOperand(1).getReg();
5728 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5729
 5730 // Create the control flow for the loop by splitting MI's machine basic
 5731 // block into the loop block (ComputeLoop) and the remainder (ComputeEnd).
5732 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5733
5734 // Create virtual registers required for lowering.
5735 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5736 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5737 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5738 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5739 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5740 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5741 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5742 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5743 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5744
5745 bool IsWave32 = ST.isWave32();
5746 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5747 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5748
 5749 // Initialize the induction variable (from EXEC) and the accumulator, and
 5750 // insert a branch to the newly created ComputeLoop block.
5751 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5752 if (is32BitOpc) {
5754 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5755 .addImm(IdentityValue);
5756 } else {
5758 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5759 .addImm(IdentityValue);
5760 }
5761 // clang-format off
5762 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5763 .addMBB(ComputeLoop);
5764 // clang-format on
5765
5766 // Start constructing ComputeLoop
5767 I = ComputeLoop->begin();
5768 auto Accumulator =
5769 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5770 .addReg(IdentityValReg)
5771 .addMBB(&BB);
5772 auto ActiveBits =
5773 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5774 .addReg(LoopIterator)
5775 .addMBB(&BB);
5776
5777 I = ComputeLoop->end();
5778 MachineInstr *NewAccumulator;
5779 // Perform the computations
5780 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5781 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5782 .addReg(ActiveBitsReg);
5783 if (is32BitOpc) {
5784 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5785 LaneValueReg)
5786 .addReg(SrcReg)
5787 .addReg(FF1Reg);
5788 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5789 .addReg(Accumulator->getOperand(0).getReg())
5790 .addReg(LaneValueReg);
5791 } else {
5792 Register LaneValueLoReg =
5793 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5794 Register LaneValueHiReg =
5795 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5796 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5797 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5798 const TargetRegisterClass *SrcSubRC =
5799 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5800 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5801 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5802 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5803 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
 5804 // The lane value input should be in an SGPR.
5805 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5806 LaneValueLoReg)
5807 .add(Op1L)
5808 .addReg(FF1Reg);
5809 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5810 LaneValueHiReg)
5811 .add(Op1H)
5812 .addReg(FF1Reg);
5813 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5814 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5815 .addReg(LaneValueLoReg)
5816 .addImm(AMDGPU::sub0)
5817 .addReg(LaneValueHiReg)
5818 .addImm(AMDGPU::sub1);
5819 switch (Opc) {
5820 case AMDGPU::S_OR_B64:
5821 case AMDGPU::S_AND_B64:
5822 case AMDGPU::S_XOR_B64: {
5823 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5824 .addReg(Accumulator->getOperand(0).getReg())
5825 .addReg(LaneValue->getOperand(0).getReg())
5826 .setOperandDead(3); // Dead scc
5827 break;
5828 }
5829 case AMDGPU::V_CMP_GT_I64_e64:
5830 case AMDGPU::V_CMP_GT_U64_e64:
5831 case AMDGPU::V_CMP_LT_I64_e64:
5832 case AMDGPU::V_CMP_LT_U64_e64: {
5833 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5834 Register ComparisonResultReg =
5835 MRI.createVirtualRegister(WaveMaskRegClass);
5836 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5837 const TargetRegisterClass *VSubRegClass =
5838 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5839 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5840 MachineOperand SrcReg0Sub0 =
5841 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5842 VregClass, AMDGPU::sub0, VSubRegClass);
5843 MachineOperand SrcReg0Sub1 =
5844 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5845 VregClass, AMDGPU::sub1, VSubRegClass);
5846 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5847 AccumulatorVReg)
5848 .add(SrcReg0Sub0)
5849 .addImm(AMDGPU::sub0)
5850 .add(SrcReg0Sub1)
5851 .addImm(AMDGPU::sub1);
5852 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5853 .addReg(LaneValue->getOperand(0).getReg())
5854 .addReg(AccumulatorVReg);
5855
5856 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5857 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5858 .addReg(LaneMaskReg)
5859 .addReg(ActiveBitsReg);
5860
5861 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5862 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5863 .addReg(LaneValue->getOperand(0).getReg())
5864 .addReg(Accumulator->getOperand(0).getReg());
5865 break;
5866 }
5867 case AMDGPU::S_ADD_U64_PSEUDO:
5868 case AMDGPU::S_SUB_U64_PSEUDO: {
5869 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5870 .addReg(Accumulator->getOperand(0).getReg())
5871 .addReg(LaneValue->getOperand(0).getReg());
5872 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5873 break;
5874 }
5875 }
5876 }
5877 // Manipulate the iterator to get the next active lane
5878 unsigned BITSETOpc =
5879 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5880 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5881 .addReg(FF1Reg)
5882 .addReg(ActiveBitsReg);
5883
5884 // Add phi nodes
5885 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5886 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5887
 5888 // Create the loop-exit comparison and the back-edge branch.
5889 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5890 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5891 .addReg(NewActiveBitsReg)
5892 .addImm(0);
5893 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5894 .addMBB(ComputeLoop);
5895
5896 RetBB = ComputeEnd;
5897 }
5898 MI.eraseFromParent();
5899 return RetBB;
5900}
5901
5904 MachineBasicBlock *BB) const {
5905 MachineFunction *MF = BB->getParent();
5907 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5909 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
5910
5911 switch (MI.getOpcode()) {
5912 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5913 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5914 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5915 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5916 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5917 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5918 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5919 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5920 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5921 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5922 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5923 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5924 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5925 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5926 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5927 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5928 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5929 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5930 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5931 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5932 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5933 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5934 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5935 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5936 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5937 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5938 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5939 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
5940 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5941 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5942 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5943 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
5944 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5945 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5946 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5947 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
5948 case AMDGPU::S_UADDO_PSEUDO:
5949 case AMDGPU::S_USUBO_PSEUDO: {
5950 const DebugLoc &DL = MI.getDebugLoc();
5951 MachineOperand &Dest0 = MI.getOperand(0);
5952 MachineOperand &Dest1 = MI.getOperand(1);
5953 MachineOperand &Src0 = MI.getOperand(2);
5954 MachineOperand &Src1 = MI.getOperand(3);
5955
5956 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5957 ? AMDGPU::S_ADD_U32
5958 : AMDGPU::S_SUB_U32;
5959 // clang-format off
5960 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5961 .add(Src0)
5962 .add(Src1);
5963 // clang-format on
5964
5965 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5966 .addImm(1)
5967 .addImm(0);
5968
5969 MI.eraseFromParent();
5970 return BB;
5971 }
5972 case AMDGPU::S_ADD_U64_PSEUDO:
5973 case AMDGPU::S_SUB_U64_PSEUDO: {
5974 return Expand64BitScalarArithmetic(MI, BB);
5975 }
5976 case AMDGPU::V_ADD_U64_PSEUDO:
5977 case AMDGPU::V_SUB_U64_PSEUDO: {
5979 const DebugLoc &DL = MI.getDebugLoc();
5980
5981 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5982
5983 MachineOperand &Dest = MI.getOperand(0);
5984 MachineOperand &Src0 = MI.getOperand(1);
5985 MachineOperand &Src1 = MI.getOperand(2);
5986
5987 if (ST.hasAddSubU64Insts()) {
5988 auto I = BuildMI(*BB, MI, DL,
5989 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5990 : AMDGPU::V_SUB_U64_e64),
5991 Dest.getReg())
5992 .add(Src0)
5993 .add(Src1)
5994 .addImm(0); // clamp
5995 TII->legalizeOperands(*I);
5996 MI.eraseFromParent();
5997 return BB;
5998 }
5999
6000 if (IsAdd && ST.hasLshlAddU64Inst()) {
6001 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6002 Dest.getReg())
6003 .add(Src0)
6004 .addImm(0)
6005 .add(Src1);
6006 TII->legalizeOperands(*Add);
6007 MI.eraseFromParent();
6008 return BB;
6009 }
6010
6011 const auto *CarryRC = TRI->getWaveMaskRegClass();
6012
6013 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6014 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6015
6016 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6017 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6018
6019 const TargetRegisterClass *Src0RC = Src0.isReg()
6020 ? MRI.getRegClass(Src0.getReg())
6021 : &AMDGPU::VReg_64RegClass;
6022 const TargetRegisterClass *Src1RC = Src1.isReg()
6023 ? MRI.getRegClass(Src1.getReg())
6024 : &AMDGPU::VReg_64RegClass;
6025
6026 const TargetRegisterClass *Src0SubRC =
6027 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6028 const TargetRegisterClass *Src1SubRC =
6029 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6030
6031 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6032 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6033 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6034 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6035
6036 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6037 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6038 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6039 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6040
6041 unsigned LoOpc =
6042 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6043 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6044 .addReg(CarryReg, RegState::Define)
6045 .add(SrcReg0Sub0)
6046 .add(SrcReg1Sub0)
6047 .addImm(0); // clamp bit
6048
6049 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6050 MachineInstr *HiHalf =
6051 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6052 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6053 .add(SrcReg0Sub1)
6054 .add(SrcReg1Sub1)
6055 .addReg(CarryReg, RegState::Kill)
6056 .addImm(0); // clamp bit
6057
6058 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6059 .addReg(DestSub0)
6060 .addImm(AMDGPU::sub0)
6061 .addReg(DestSub1)
6062 .addImm(AMDGPU::sub1);
6063 TII->legalizeOperands(*LoHalf);
6064 TII->legalizeOperands(*HiHalf);
6065 MI.eraseFromParent();
6066 return BB;
6067 }
6068 case AMDGPU::S_ADD_CO_PSEUDO:
6069 case AMDGPU::S_SUB_CO_PSEUDO: {
 6070 // This pseudo has a chance to be selected
 6071 // only from a uniform add/subcarry node. All the VGPR operands are
 6072 // therefore assumed to be splat vectors.
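// A rough sketch of the wave64 expansion of S_ADD_CO_PSEUDO (register names
// are placeholders; the readfirstlane copies are only emitted for VGPR
// operands):
//   v_readfirstlane_b32 s0, v_src0
//   v_readfirstlane_b32 s1, v_src1
//   s_cmp_lg_u64        s_carry_in, 0        ; materialize carry-in into SCC
//   s_addc_u32          s_dst, s0, s1
//   s_cselect_b64       s_carry_out, -1, 0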
6075 const DebugLoc &DL = MI.getDebugLoc();
6076 MachineOperand &Dest = MI.getOperand(0);
6077 MachineOperand &CarryDest = MI.getOperand(1);
6078 MachineOperand &Src0 = MI.getOperand(2);
6079 MachineOperand &Src1 = MI.getOperand(3);
6080 MachineOperand &Src2 = MI.getOperand(4);
6081 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6082 ? AMDGPU::S_ADDC_U32
6083 : AMDGPU::S_SUBB_U32;
6084 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6085 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6086 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6087 .addReg(Src0.getReg());
6088 Src0.setReg(RegOp0);
6089 }
6090 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6091 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6092 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6093 .addReg(Src1.getReg());
6094 Src1.setReg(RegOp1);
6095 }
6096 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6097 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6098 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6099 .addReg(Src2.getReg());
6100 Src2.setReg(RegOp2);
6101 }
6102
6103 if (ST.isWave64()) {
6104 if (ST.hasScalarCompareEq64()) {
6105 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6106 .addReg(Src2.getReg())
6107 .addImm(0);
6108 } else {
6109 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6110 const TargetRegisterClass *SubRC =
6111 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6112 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6113 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6114 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6115 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6116 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6117
6118 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6119 .add(Src2Sub0)
6120 .add(Src2Sub1);
6121
6122 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6123 .addReg(Src2_32, RegState::Kill)
6124 .addImm(0);
6125 }
6126 } else {
6127 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6128 .addReg(Src2.getReg())
6129 .addImm(0);
6130 }
6131
6132 // clang-format off
6133 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
6134 .add(Src0)
6135 .add(Src1);
6136 // clang-format on
6137
6138 unsigned SelOpc =
6139 (ST.isWave64()) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6140
6141 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6142 .addImm(-1)
6143 .addImm(0);
6144
6145 MI.eraseFromParent();
6146 return BB;
6147 }
6148 case AMDGPU::SI_INIT_M0: {
6149 MachineOperand &M0Init = MI.getOperand(0);
6150 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6151 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6152 AMDGPU::M0)
6153 .add(M0Init);
6154 MI.eraseFromParent();
6155 return BB;
6156 }
6157 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6158 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6159 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6160 TII->get(AMDGPU::S_CMP_EQ_U32))
6161 .addImm(0)
6162 .addImm(0);
6163 return BB;
6164 }
6165 case AMDGPU::GET_GROUPSTATICSIZE: {
6166 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6167 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6168 DebugLoc DL = MI.getDebugLoc();
6169 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6170 .add(MI.getOperand(0))
6171 .addImm(MFI->getLDSSize());
6172 MI.eraseFromParent();
6173 return BB;
6174 }
6175 case AMDGPU::GET_SHADERCYCLESHILO: {
6178 const DebugLoc &DL = MI.getDebugLoc();
6179 // The algorithm is:
6180 //
6181 // hi1 = getreg(SHADER_CYCLES_HI)
6182 // lo1 = getreg(SHADER_CYCLES_LO)
6183 // hi2 = getreg(SHADER_CYCLES_HI)
6184 //
6185 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6186 // Otherwise there was overflow and the result is hi2:0. In both cases the
6187 // result should represent the actual time at some point during the sequence
6188 // of three getregs.
6189 using namespace AMDGPU::Hwreg;
6190 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6191 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6192 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6193 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6194 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6195 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6196 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6197 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6198 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6199 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6200 .addReg(RegHi1)
6201 .addReg(RegHi2);
6202 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6203 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6204 .addReg(RegLo1)
6205 .addImm(0);
6206 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6207 .add(MI.getOperand(0))
6208 .addReg(RegLo)
6209 .addImm(AMDGPU::sub0)
6210 .addReg(RegHi2)
6211 .addImm(AMDGPU::sub1);
6212 MI.eraseFromParent();
6213 return BB;
6214 }
6215 case AMDGPU::SI_INDIRECT_SRC_V1:
6216 case AMDGPU::SI_INDIRECT_SRC_V2:
6217 case AMDGPU::SI_INDIRECT_SRC_V4:
6218 case AMDGPU::SI_INDIRECT_SRC_V8:
6219 case AMDGPU::SI_INDIRECT_SRC_V9:
6220 case AMDGPU::SI_INDIRECT_SRC_V10:
6221 case AMDGPU::SI_INDIRECT_SRC_V11:
6222 case AMDGPU::SI_INDIRECT_SRC_V12:
6223 case AMDGPU::SI_INDIRECT_SRC_V16:
6224 case AMDGPU::SI_INDIRECT_SRC_V32:
6225 return emitIndirectSrc(MI, *BB, *getSubtarget());
6226 case AMDGPU::SI_INDIRECT_DST_V1:
6227 case AMDGPU::SI_INDIRECT_DST_V2:
6228 case AMDGPU::SI_INDIRECT_DST_V4:
6229 case AMDGPU::SI_INDIRECT_DST_V8:
6230 case AMDGPU::SI_INDIRECT_DST_V9:
6231 case AMDGPU::SI_INDIRECT_DST_V10:
6232 case AMDGPU::SI_INDIRECT_DST_V11:
6233 case AMDGPU::SI_INDIRECT_DST_V12:
6234 case AMDGPU::SI_INDIRECT_DST_V16:
6235 case AMDGPU::SI_INDIRECT_DST_V32:
6236 return emitIndirectDst(MI, *BB, *getSubtarget());
6237 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6238 case AMDGPU::SI_KILL_I1_PSEUDO:
6239 return splitKillBlock(MI, BB);
6240 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6242
6243 Register Dst = MI.getOperand(0).getReg();
6244 const MachineOperand &Src0 = MI.getOperand(1);
6245 const MachineOperand &Src1 = MI.getOperand(2);
6246 const DebugLoc &DL = MI.getDebugLoc();
6247 Register SrcCond = MI.getOperand(3).getReg();
6248
6249 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6250 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6251 const auto *CondRC = TRI->getWaveMaskRegClass();
6252 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6253
6254 const TargetRegisterClass *Src0RC = Src0.isReg()
6255 ? MRI.getRegClass(Src0.getReg())
6256 : &AMDGPU::VReg_64RegClass;
6257 const TargetRegisterClass *Src1RC = Src1.isReg()
6258 ? MRI.getRegClass(Src1.getReg())
6259 : &AMDGPU::VReg_64RegClass;
6260
6261 const TargetRegisterClass *Src0SubRC =
6262 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6263 const TargetRegisterClass *Src1SubRC =
6264 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6265
6266 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6267 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6268 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6269 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6270
6271 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6272 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6273 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6274 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6275
6276 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6277 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6278 .addImm(0)
6279 .add(Src0Sub0)
6280 .addImm(0)
6281 .add(Src1Sub0)
6282 .addReg(SrcCondCopy);
6283 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6284 .addImm(0)
6285 .add(Src0Sub1)
6286 .addImm(0)
6287 .add(Src1Sub1)
6288 .addReg(SrcCondCopy);
6289
6290 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6291 .addReg(DstLo)
6292 .addImm(AMDGPU::sub0)
6293 .addReg(DstHi)
6294 .addImm(AMDGPU::sub1);
6295 MI.eraseFromParent();
6296 return BB;
6297 }
6298 case AMDGPU::SI_BR_UNDEF: {
6299 const DebugLoc &DL = MI.getDebugLoc();
6300 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6301 .add(MI.getOperand(0));
6302 Br->getOperand(1).setIsUndef(); // read undef SCC
6303 MI.eraseFromParent();
6304 return BB;
6305 }
6306 case AMDGPU::ADJCALLSTACKUP:
6307 case AMDGPU::ADJCALLSTACKDOWN: {
6309 MachineInstrBuilder MIB(*MF, &MI);
6310 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6311 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6312 return BB;
6313 }
6314 case AMDGPU::SI_CALL_ISEL: {
6315 const DebugLoc &DL = MI.getDebugLoc();
6316
6317 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6318
6320 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6321
6322 for (const MachineOperand &MO : MI.operands())
6323 MIB.add(MO);
6324
6325 MIB.cloneMemRefs(MI);
6326 MI.eraseFromParent();
6327 return BB;
6328 }
6329 case AMDGPU::V_ADD_CO_U32_e32:
6330 case AMDGPU::V_SUB_CO_U32_e32:
6331 case AMDGPU::V_SUBREV_CO_U32_e32: {
6332 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6333 const DebugLoc &DL = MI.getDebugLoc();
6334 unsigned Opc = MI.getOpcode();
6335
6336 bool NeedClampOperand = false;
6337 if (TII->pseudoToMCOpcode(Opc) == -1) {
6339 NeedClampOperand = true;
6340 }
6341
6342 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6343 if (TII->isVOP3(*I)) {
6344 I.addReg(TRI->getVCC(), RegState::Define);
6345 }
6346 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6347 if (NeedClampOperand)
6348 I.addImm(0); // clamp bit for e64 encoding
6349
6350 TII->legalizeOperands(*I);
6351
6352 MI.eraseFromParent();
6353 return BB;
6354 }
6355 case AMDGPU::V_ADDC_U32_e32:
6356 case AMDGPU::V_SUBB_U32_e32:
6357 case AMDGPU::V_SUBBREV_U32_e32:
6358 // These instructions have an implicit use of vcc which counts towards the
6359 // constant bus limit.
6360 TII->legalizeOperands(MI);
6361 return BB;
6362 case AMDGPU::DS_GWS_INIT:
6363 case AMDGPU::DS_GWS_SEMA_BR:
6364 case AMDGPU::DS_GWS_BARRIER:
6365 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6366 [[fallthrough]];
6367 case AMDGPU::DS_GWS_SEMA_V:
6368 case AMDGPU::DS_GWS_SEMA_P:
6369 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
 6370 // An s_waitcnt 0 is required to be the instruction immediately following.
6371 if (getSubtarget()->hasGWSAutoReplay()) {
6373 return BB;
6374 }
6375
6376 return emitGWSMemViolTestLoop(MI, BB);
6377 case AMDGPU::S_SETREG_B32: {
6378 // Try to optimize cases that only set the denormal mode or rounding mode.
6379 //
6380 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6381 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6382 // instead.
6383 //
 6384 // FIXME: This could be predicates on the immediate, but tablegen doesn't
 6385 // allow you to have a no-side-effect instruction in the output of a
 6386 // side-effecting pattern.
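// A rough example of the intended rewrite (assembly syntax is assumed here,
// not taken from a specific test):
//   s_mov_b32    s0, 0x3
//   s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0   ; writes only the FP round bits
// becomes
//   s_round_mode 0x3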
6387 auto [ID, Offset, Width] =
6388 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6390 return BB;
6391
6392 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6393 const unsigned SetMask = WidthMask << Offset;
6394
6395 if (getSubtarget()->hasDenormModeInst()) {
6396 unsigned SetDenormOp = 0;
6397 unsigned SetRoundOp = 0;
6398
6399 // The dedicated instructions can only set the whole denorm or round mode
6400 // at once, not a subset of bits in either.
6401 if (SetMask ==
6403 // If this fully sets both the round and denorm mode, emit the two
6404 // dedicated instructions for these.
6405 SetRoundOp = AMDGPU::S_ROUND_MODE;
6406 SetDenormOp = AMDGPU::S_DENORM_MODE;
6407 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6408 SetRoundOp = AMDGPU::S_ROUND_MODE;
6409 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6410 SetDenormOp = AMDGPU::S_DENORM_MODE;
6411 }
6412
6413 if (SetRoundOp || SetDenormOp) {
6415 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6416 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6417 unsigned ImmVal = Def->getOperand(1).getImm();
6418 if (SetRoundOp) {
6419 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6420 .addImm(ImmVal & 0xf);
6421
6422 // If we also have the denorm mode, get just the denorm mode bits.
6423 ImmVal >>= 4;
6424 }
6425
6426 if (SetDenormOp) {
6427 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6428 .addImm(ImmVal & 0xf);
6429 }
6430
6431 MI.eraseFromParent();
6432 return BB;
6433 }
6434 }
6435 }
6436
 6437 // If only FP bits are touched, use the no-side-effects pseudo.
6438 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6439 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6440 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6441
6442 return BB;
6443 }
6444 case AMDGPU::S_INVERSE_BALLOT_U32:
6445 case AMDGPU::S_INVERSE_BALLOT_U64:
6446 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6447 // necessary. After that they are equivalent to a COPY.
6448 MI.setDesc(TII->get(AMDGPU::COPY));
6449 return BB;
6450 case AMDGPU::ENDPGM_TRAP: {
6451 const DebugLoc &DL = MI.getDebugLoc();
6452 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6453 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6454 MI.addOperand(MachineOperand::CreateImm(0));
6455 return BB;
6456 }
6457
6458 // We need a block split to make the real endpgm a terminator. We also don't
6459 // want to break phis in successor blocks, so we can't just delete to the
6460 // end of the block.
6461
6462 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6464 MF->push_back(TrapBB);
6465 // clang-format off
6466 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6467 .addImm(0);
6468 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6469 .addMBB(TrapBB);
6470 // clang-format on
6471
6472 BB->addSuccessor(TrapBB);
6473 MI.eraseFromParent();
6474 return SplitBB;
6475 }
6476 case AMDGPU::SIMULATED_TRAP: {
6477 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6479 MachineBasicBlock *SplitBB =
6480 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6481 MI.eraseFromParent();
6482 return SplitBB;
6483 }
6484 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6485 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6487
6488 // During ISel, it's difficult to propagate the original EXEC mask to use as
6489 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6490 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6491 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6492 Register OriginalExec = Setup->getOperand(0).getReg();
6493 MF->getRegInfo().clearKillFlags(OriginalExec);
6494 MI.getOperand(0).setReg(OriginalExec);
6495 return BB;
6496 }
6497 default:
6498 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6499 if (!MI.mayStore())
6501 return BB;
6502 }
6504 }
6505}
6506
6508 // This currently forces unfolding various combinations of fsub into fma with
6509 // free fneg'd operands. As long as we have fast FMA (controlled by
6510 // isFMAFasterThanFMulAndFAdd), we should perform these.
6511
6512 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6513 // most of these combines appear to be cycle neutral but save on instruction
6514 // count / code size.
6515 return true;
6516}
6517
6519
6521 EVT VT) const {
6522 if (!VT.isVector()) {
6523 return MVT::i1;
6524 }
6525 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6526}
6527
6529 // TODO: Should i16 be used always if legal? For now it would force VALU
6530 // shifts.
6531 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6532}
6533
6535 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6536 ? Ty.changeElementSize(16)
6537 : Ty.changeElementSize(32);
6538}
6539
 6540 // Answering this is somewhat tricky and depends on the specific device, since
 6541 // different devices have different rates for fma and for f64 operations in general.
6542//
6543// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6544// regardless of which device (although the number of cycles differs between
6545// devices), so it is always profitable for f64.
6546//
6547// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6548// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6549// which we can always do even without fused FP ops since it returns the same
6550// result as the separate operations and since it is always full
6551// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6552// however does not support denormals, so we do report fma as faster if we have
6553// a fast fma device and require denormals.
6554//
6556 EVT VT) const {
6557 VT = VT.getScalarType();
6558
6559 switch (VT.getSimpleVT().SimpleTy) {
6560 case MVT::f32: {
6561 // If mad is not available this depends only on if f32 fma is full rate.
6562 if (!Subtarget->hasMadMacF32Insts())
6563 return Subtarget->hasFastFMAF32();
6564
6565 // Otherwise f32 mad is always full rate and returns the same result as
 6566 // the separate operations, so it should be preferred over fma.
 6567 // However, it does not support denormals.
6569 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6570
6571 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6572 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6573 }
6574 case MVT::f64:
6575 return true;
6576 case MVT::f16:
6577 case MVT::bf16:
6578 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6579 default:
6580 break;
6581 }
6582
6583 return false;
6584}
6585
6587 LLT Ty) const {
6588 switch (Ty.getScalarSizeInBits()) {
6589 case 16:
6590 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6591 case 32:
6592 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6593 case 64:
6594 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6595 default:
6596 break;
6597 }
6598
6599 return false;
6600}
6601
6603 if (!Ty.isScalar())
6604 return false;
6605
6606 if (Ty.getScalarSizeInBits() == 16)
6607 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6608 if (Ty.getScalarSizeInBits() == 32)
6609 return Subtarget->hasMadMacF32Insts() &&
6610 denormalModeIsFlushAllF32(*MI.getMF());
6611
6612 return false;
6613}
6614
6615bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6616 const SDNode *N) const {
6617 // TODO: Check future ftz flag
6618 // v_mad_f32/v_mac_f32 do not support denormals.
6619 EVT VT = N->getValueType(0);
6620 if (VT == MVT::f32)
6621 return Subtarget->hasMadMacF32Insts() &&
6622 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6623 if (VT == MVT::f16) {
6624 return Subtarget->hasMadF16() &&
6625 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6626 }
6627
6628 return false;
6629}
6630
6631//===----------------------------------------------------------------------===//
6632// Custom DAG Lowering Operations
6633//===----------------------------------------------------------------------===//
6634
6635// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6636// wider vector type is legal.
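// For example, a v8f16 fabs is split here into two v4f16 fabs nodes that are
// recombined with CONCAT_VECTORS, rather than being scalarized into eight f16
// operations by the generic legalizer.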
6637SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6638 SelectionDAG &DAG) const {
6639 unsigned Opc = Op.getOpcode();
6640 EVT VT = Op.getValueType();
6641 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6642 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6643 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6644 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6645
6646 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6647
6648 SDLoc SL(Op);
6649 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6650 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6651
6652 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6653}
6654
6655// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6656// regression whereby extra unnecessary instructions were added to codegen
6657// for rotr operations, caused by legalising v2i32 or. This resulted in extra
6658// instructions to extract the result from the vector.
6659SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
6660 [[maybe_unused]] EVT VT = Op.getValueType();
6661
6662 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6663 VT == MVT::v16i32) &&
6664 "Unexpected ValueType.");
6665
6666 return DAG.UnrollVectorOp(Op.getNode());
6667}
6668
6669// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6670// wider vector type is legal.
6671SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6672 SelectionDAG &DAG) const {
6673 unsigned Opc = Op.getOpcode();
6674 EVT VT = Op.getValueType();
6675 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6676 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6677 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6678 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6679 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6680 VT == MVT::v32bf16);
6681
6682 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6683 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6684
6685 SDLoc SL(Op);
6686
6687 SDValue OpLo =
6688 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6689 SDValue OpHi =
6690 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6691
6692 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6693}
6694
6695SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6696 SelectionDAG &DAG) const {
6697 unsigned Opc = Op.getOpcode();
6698 EVT VT = Op.getValueType();
6699 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6700 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6701 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6702 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6703 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6704 VT == MVT::v32bf16);
6705
6706 SDValue Op0 = Op.getOperand(0);
6707 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6708 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6709 : std::pair(Op0, Op0);
6710
6711 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6712 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6713
6714 SDLoc SL(Op);
6715 auto ResVT = DAG.GetSplitDestVTs(VT);
6716
6717 SDValue OpLo =
6718 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6719 SDValue OpHi =
6720 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6721
6722 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6723}
6724
6725SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6726 switch (Op.getOpcode()) {
6727 default:
6728 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6729 case ISD::BRCOND:
6730 return LowerBRCOND(Op, DAG);
6731 case ISD::RETURNADDR:
6732 return LowerRETURNADDR(Op, DAG);
6733 case ISD::LOAD: {
6734 SDValue Result = LowerLOAD(Op, DAG);
6735 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6736 "Load should return a value and a chain");
6737 return Result;
6738 }
6739 case ISD::FSQRT: {
6740 EVT VT = Op.getValueType();
6741 if (VT == MVT::f32)
6742 return lowerFSQRTF32(Op, DAG);
6743 if (VT == MVT::f64)
6744 return lowerFSQRTF64(Op, DAG);
6745 return SDValue();
6746 }
6747 case ISD::FSIN:
6748 case ISD::FCOS:
6749 return LowerTrig(Op, DAG);
6750 case ISD::SELECT:
6751 return LowerSELECT(Op, DAG);
6752 case ISD::FDIV:
6753 return LowerFDIV(Op, DAG);
6754 case ISD::FFREXP:
6755 return LowerFFREXP(Op, DAG);
6756 case ISD::ATOMIC_CMP_SWAP:
6757 return LowerATOMIC_CMP_SWAP(Op, DAG);
6758 case ISD::STORE:
6759 return LowerSTORE(Op, DAG);
6760 case ISD::GlobalAddress: {
6761 MachineFunction &MF = DAG.getMachineFunction();
6762 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6763 return LowerGlobalAddress(MFI, Op, DAG);
6764 }
6765 case ISD::INTRINSIC_WO_CHAIN:
6766 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6767 case ISD::INTRINSIC_W_CHAIN:
6768 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6769 case ISD::INTRINSIC_VOID:
6770 return LowerINTRINSIC_VOID(Op, DAG);
6771 case ISD::ADDRSPACECAST:
6772 return lowerADDRSPACECAST(Op, DAG);
6773 case ISD::INSERT_SUBVECTOR:
6774 return lowerINSERT_SUBVECTOR(Op, DAG);
6775 case ISD::INSERT_VECTOR_ELT:
6776 return lowerINSERT_VECTOR_ELT(Op, DAG);
6777 case ISD::EXTRACT_VECTOR_ELT:
6778 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6779 case ISD::VECTOR_SHUFFLE:
6780 return lowerVECTOR_SHUFFLE(Op, DAG);
6781 case ISD::SCALAR_TO_VECTOR:
6782 return lowerSCALAR_TO_VECTOR(Op, DAG);
6783 case ISD::BUILD_VECTOR:
6784 return lowerBUILD_VECTOR(Op, DAG);
6785 case ISD::FP_ROUND:
6786 case ISD::STRICT_FP_ROUND:
6787 return lowerFP_ROUND(Op, DAG);
6788 case ISD::TRAP:
6789 return lowerTRAP(Op, DAG);
6790 case ISD::DEBUGTRAP:
6791 return lowerDEBUGTRAP(Op, DAG);
6792 case ISD::ABS:
6793 case ISD::FABS:
6794 case ISD::FNEG:
6795 case ISD::FCANONICALIZE:
6796 case ISD::BSWAP:
6797 return splitUnaryVectorOp(Op, DAG);
6798 case ISD::FMINNUM:
6799 case ISD::FMAXNUM:
6800 return lowerFMINNUM_FMAXNUM(Op, DAG);
6801 case ISD::FMINIMUMNUM:
6802 case ISD::FMAXIMUMNUM:
6803 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6804 case ISD::FMINIMUM:
6805 case ISD::FMAXIMUM:
6806 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6807 case ISD::FLDEXP:
6808 case ISD::STRICT_FLDEXP:
6809 return lowerFLDEXP(Op, DAG);
6810 case ISD::FMA:
6811 return splitTernaryVectorOp(Op, DAG);
6812 case ISD::FP_TO_SINT:
6813 case ISD::FP_TO_UINT:
6814 return LowerFP_TO_INT(Op, DAG);
6815 case ISD::SHL:
6816 case ISD::SRA:
6817 case ISD::SRL:
6818 case ISD::ADD:
6819 case ISD::SUB:
6820 case ISD::SMIN:
6821 case ISD::SMAX:
6822 case ISD::UMIN:
6823 case ISD::UMAX:
6824 case ISD::FADD:
6825 case ISD::FMUL:
6826 case ISD::FMINNUM_IEEE:
6827 case ISD::FMAXNUM_IEEE:
6828 case ISD::UADDSAT:
6829 case ISD::USUBSAT:
6830 case ISD::SADDSAT:
6831 case ISD::SSUBSAT:
6832 return splitBinaryVectorOp(Op, DAG);
6833 case ISD::FCOPYSIGN:
6834 return lowerFCOPYSIGN(Op, DAG);
6835 case ISD::MUL:
6836 return lowerMUL(Op, DAG);
6837 case ISD::SMULO:
6838 case ISD::UMULO:
6839 return lowerXMULO(Op, DAG);
6840 case ISD::SMUL_LOHI:
6841 case ISD::UMUL_LOHI:
6842 return lowerXMUL_LOHI(Op, DAG);
6843 case ISD::DYNAMIC_STACKALLOC:
6844 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6845 case ISD::STACKSAVE:
6846 return LowerSTACKSAVE(Op, DAG);
6847 case ISD::GET_ROUNDING:
6848 return lowerGET_ROUNDING(Op, DAG);
6849 case ISD::SET_ROUNDING:
6850 return lowerSET_ROUNDING(Op, DAG);
6851 case ISD::PREFETCH:
6852 return lowerPREFETCH(Op, DAG);
6853 case ISD::FP_EXTEND:
6854 case ISD::STRICT_FP_EXTEND:
6855 return lowerFP_EXTEND(Op, DAG);
6856 case ISD::GET_FPENV:
6857 return lowerGET_FPENV(Op, DAG);
6858 case ISD::SET_FPENV:
6859 return lowerSET_FPENV(Op, DAG);
6860 case ISD::ROTR:
6861 return lowerROTR(Op, DAG);
6862 }
6863 return SDValue();
6864}
6865
6866// Used for D16: Casts the result of an instruction into the right vector,
6867// packs values if loads return unpacked values.
6869 const SDLoc &DL, SelectionDAG &DAG,
6870 bool Unpacked) {
6871 if (!LoadVT.isVector())
6872 return Result;
6873
6874 // Cast back to the original packed type or to a larger type that is a
6875// multiple of 32 bits for D16. Widening the return type is required for
6876 // legalization.
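// For example, an unpacked D16 load of v3f16 comes back as v3i32; below it is
// truncated element-wise to i16, padded to v4i16, and bitcast to v4f16.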
6877 EVT FittingLoadVT = LoadVT;
6878 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6879 FittingLoadVT =
6880 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6881 LoadVT.getVectorNumElements() + 1);
6882 }
6883
6884 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6885 // Truncate to v2i16/v4i16.
6886 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6887
6888 // Workaround legalizer not scalarizing truncate after vector op
6889 // legalization but not creating intermediate vector trunc.
6890 SmallVector<SDValue, 4> Elts;
6891 DAG.ExtractVectorElements(Result, Elts);
6892 for (SDValue &Elt : Elts)
6893 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6894
6895 // Pad illegal v1i16/v3f16 to v4i16
6896 if ((LoadVT.getVectorNumElements() % 2) == 1)
6897 Elts.push_back(DAG.getPOISON(MVT::i16));
6898
6899 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6900
6901 // Bitcast to original type (v2f16/v4f16).
6902 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6903 }
6904
6905 // Cast back to the original packed type.
6906 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6907}
6908
6909SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6910 SelectionDAG &DAG,
6912 bool IsIntrinsic) const {
6913 SDLoc DL(M);
6914
6915 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6916 EVT LoadVT = M->getValueType(0);
6917
6918 EVT EquivLoadVT = LoadVT;
6919 if (LoadVT.isVector()) {
6920 if (Unpacked) {
6921 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6922 LoadVT.getVectorNumElements());
6923 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6924 // Widen v3f16 to legal type
6925 EquivLoadVT =
6926 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6927 LoadVT.getVectorNumElements() + 1);
6928 }
6929 }
6930
6931 // Change from v4f16/v2f16 to EquivLoadVT.
6932 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6933
6934 SDValue Load = DAG.getMemIntrinsicNode(
6935 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6936 M->getMemoryVT(), M->getMemOperand());
6937
6938 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6939
6940 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6941}
6942
6943SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6944 SelectionDAG &DAG,
6945 ArrayRef<SDValue> Ops) const {
6946 SDLoc DL(M);
6947 EVT LoadVT = M->getValueType(0);
6948 EVT EltType = LoadVT.getScalarType();
6949 EVT IntVT = LoadVT.changeTypeToInteger();
6950
6951 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6952
6953 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6954 bool IsTFE = M->getNumValues() == 3;
6955
6956 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6957 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6958 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6959 : AMDGPUISD::BUFFER_LOAD;
6960
6961 if (IsD16) {
6962 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6963 }
6964
6965 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6966 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6967 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6968 IsTFE);
6969
6970 if (isTypeLegal(LoadVT)) {
6971 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6972 M->getMemOperand(), DAG);
6973 }
6974
6975 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6976 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6977 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6978 M->getMemOperand(), DAG);
6979 return DAG.getMergeValues(
6980 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6981 DL);
6982}
6983
6985 SelectionDAG &DAG) {
6986 EVT VT = N->getValueType(0);
6987 unsigned CondCode = N->getConstantOperandVal(3);
6988 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6989 return DAG.getPOISON(VT);
6990
6991 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6992
6993 SDValue LHS = N->getOperand(1);
6994 SDValue RHS = N->getOperand(2);
6995
6996 SDLoc DL(N);
6997
6998 EVT CmpVT = LHS.getValueType();
6999 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7000 unsigned PromoteOp =
7001 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7002 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7003 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7004 }
7005
7006 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7007
7008 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7009 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7010
7011 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7012 DAG.getCondCode(CCOpcode));
7013 if (VT.bitsEq(CCVT))
7014 return SetCC;
7015 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7016}
7017
7019 SelectionDAG &DAG) {
7020 EVT VT = N->getValueType(0);
7021
7022 unsigned CondCode = N->getConstantOperandVal(3);
7023 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7024 return DAG.getPOISON(VT);
7025
7026 SDValue Src0 = N->getOperand(1);
7027 SDValue Src1 = N->getOperand(2);
7028 EVT CmpVT = Src0.getValueType();
7029 SDLoc SL(N);
7030
7031 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7032 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7033 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7034 }
7035
7036 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7037 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7038 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7039 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7040 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7041 DAG.getCondCode(CCOpcode));
7042 if (VT.bitsEq(CCVT))
7043 return SetCC;
7044 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7045}
7046
7048 SelectionDAG &DAG) {
7049 EVT VT = N->getValueType(0);
7050 SDValue Src = N->getOperand(1);
7051 SDLoc SL(N);
7052
7053 if (Src.getOpcode() == ISD::SETCC) {
7054 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7055 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
7056 Src.getOperand(1), Src.getOperand(2));
7057 }
7058 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7059 // (ballot 0) -> 0
7060 if (Arg->isZero())
7061 return DAG.getConstant(0, SL, VT);
7062
7063 // (ballot 1) -> EXEC/EXEC_LO
7064 if (Arg->isOne()) {
7065 Register Exec;
7066 if (VT.getScalarSizeInBits() == 32)
7067 Exec = AMDGPU::EXEC_LO;
7068 else if (VT.getScalarSizeInBits() == 64)
7069 Exec = AMDGPU::EXEC;
7070 else
7071 return SDValue();
7072
7073 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7074 }
7075 }
7076
7077 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7078 // ISD::SETNE)
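// That is, a generic i1 ballot is lowered as a wave-wide compare of
// (zext i1 to i32) against zero; the resulting lane mask is the ballot value.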
7079 return DAG.getNode(
7080 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7081 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7082}
7083
7085 SelectionDAG &DAG) {
7086 EVT VT = N->getValueType(0);
7087 unsigned ValSize = VT.getSizeInBits();
7088 unsigned IID = N->getConstantOperandVal(0);
7089 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7090 IID == Intrinsic::amdgcn_permlanex16;
7091 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7092 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7093 SDLoc SL(N);
7094 MVT IntVT = MVT::getIntegerVT(ValSize);
7095 const GCNSubtarget *ST = TLI.getSubtarget();
7096 unsigned SplitSize = 32;
7097 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7098 ST->hasDPALU_DPP() &&
7099 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7100 SplitSize = 64;
7101
7102 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7103 SDValue Src2, MVT ValT) -> SDValue {
7104 SmallVector<SDValue, 6> Operands;
7105 switch (IID) {
7106 case Intrinsic::amdgcn_permlane16:
7107 case Intrinsic::amdgcn_permlanex16:
7108 case Intrinsic::amdgcn_update_dpp:
7109 Operands.push_back(N->getOperand(6));
7110 Operands.push_back(N->getOperand(5));
7111 Operands.push_back(N->getOperand(4));
7112 [[fallthrough]];
7113 case Intrinsic::amdgcn_writelane:
7114 Operands.push_back(Src2);
7115 [[fallthrough]];
7116 case Intrinsic::amdgcn_readlane:
7117 case Intrinsic::amdgcn_set_inactive:
7118 case Intrinsic::amdgcn_set_inactive_chain_arg:
7119 case Intrinsic::amdgcn_mov_dpp8:
7120 Operands.push_back(Src1);
7121 [[fallthrough]];
7122 case Intrinsic::amdgcn_readfirstlane:
7123 case Intrinsic::amdgcn_permlane64:
7124 Operands.push_back(Src0);
7125 break;
7126 default:
7127 llvm_unreachable("unhandled lane op");
7128 }
7129
7130 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7131 std::reverse(Operands.begin(), Operands.end());
7132
7133 if (SDNode *GL = N->getGluedNode()) {
7134 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7135 GL = GL->getOperand(0).getNode();
7136 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7137 SDValue(GL, 0)));
7138 }
7139
7140 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7141 };
7142
7143 SDValue Src0 = N->getOperand(1);
7144 SDValue Src1, Src2;
7145 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7146 IID == Intrinsic::amdgcn_mov_dpp8 ||
7147 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7148 Src1 = N->getOperand(2);
7149 if (IID == Intrinsic::amdgcn_writelane ||
7150 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7151 Src2 = N->getOperand(3);
7152 }
7153
7154 if (ValSize == SplitSize) {
7155 // Already legal
7156 return SDValue();
7157 }
7158
7159 if (ValSize < 32) {
7160 bool IsFloat = VT.isFloatingPoint();
7161 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7162 SL, MVT::i32);
7163
7164 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7165 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7166 SL, MVT::i32);
7167 }
7168
7169 if (IID == Intrinsic::amdgcn_writelane) {
7170 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7171 SL, MVT::i32);
7172 }
7173
7174 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7175 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7176 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7177 }
7178
7179 if (ValSize % SplitSize != 0)
7180 return SDValue();
7181
7182 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7183 EVT VT = N->getValueType(0);
7184 unsigned NE = VT.getVectorNumElements();
7185 EVT EltVT = VT.getVectorElementType();
7187 unsigned NumOperands = N->getNumOperands();
7188 SmallVector<SDValue, 4> Operands(NumOperands);
7189 SDNode *GL = N->getGluedNode();
7190
7191 // only handle convergencectrl_glue
7192 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7193
7194 for (unsigned i = 0; i != NE; ++i) {
7195 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7196 ++j) {
7197 SDValue Operand = N->getOperand(j);
7198 EVT OperandVT = Operand.getValueType();
7199 if (OperandVT.isVector()) {
7200 // A vector operand; extract a single element.
7201 EVT OperandEltVT = OperandVT.getVectorElementType();
7202 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7203 Operand, DAG.getVectorIdxConstant(i, SL));
7204 } else {
7205 // A scalar operand; just use it as is.
7206 Operands[j] = Operand;
7207 }
7208 }
7209
7210 if (GL)
7211 Operands[NumOperands - 1] =
7212 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7213 SDValue(GL->getOperand(0).getNode(), 0));
7214
7215 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7216 }
7217
7218 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7219 return DAG.getBuildVector(VecVT, SL, Scalars);
7220 };
7221
7222 if (VT.isVector()) {
7223 switch (MVT::SimpleValueType EltTy =
7224 VT.getVectorElementType().getSimpleVT().SimpleTy) {
7225 case MVT::i32:
7226 case MVT::f32:
7227 if (SplitSize == 32) {
7228 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7229 return unrollLaneOp(LaneOp.getNode());
7230 }
7231 [[fallthrough]];
7232 case MVT::i16:
7233 case MVT::f16:
7234 case MVT::bf16: {
7235 unsigned SubVecNumElt =
7236 SplitSize / VT.getVectorElementType().getSizeInBits();
7237 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7239 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7240 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7241 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7242 DAG.getConstant(EltIdx, SL, MVT::i32));
7243
7244 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7245 IsPermLane16)
7246 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7247 DAG.getConstant(EltIdx, SL, MVT::i32));
7248
7249 if (IID == Intrinsic::amdgcn_writelane)
7250 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7251 DAG.getConstant(EltIdx, SL, MVT::i32));
7252
7253 Pieces.push_back(
7254 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7255 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7256 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7257 EltIdx += SubVecNumElt;
7258 }
7259 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7260 }
7261 default:
7262 // Handle all other cases by bitcasting to i32 vectors
7263 break;
7264 }
7265 }
7266
7267 MVT VecVT =
7268 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7269 Src0 = DAG.getBitcast(VecVT, Src0);
7270
7271 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7272 Src1 = DAG.getBitcast(VecVT, Src1);
7273
7274 if (IID == Intrinsic::amdgcn_writelane)
7275 Src2 = DAG.getBitcast(VecVT, Src2);
7276
7277 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7278 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7279 return DAG.getBitcast(VT, UnrolledLaneOp);
7280}
7281
7282void SITargetLowering::ReplaceNodeResults(SDNode *N,
7283 SmallVectorImpl<SDValue> &Results,
7284 SelectionDAG &DAG) const {
7285 switch (N->getOpcode()) {
7286 case ISD::INSERT_VECTOR_ELT: {
7287 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7288 Results.push_back(Res);
7289 return;
7290 }
7291 case ISD::EXTRACT_VECTOR_ELT: {
7292 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7293 Results.push_back(Res);
7294 return;
7295 }
7296 case ISD::INTRINSIC_WO_CHAIN: {
7297 unsigned IID = N->getConstantOperandVal(0);
7298 switch (IID) {
7299 case Intrinsic::amdgcn_make_buffer_rsrc:
7300 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7301 return;
7302 case Intrinsic::amdgcn_cvt_pkrtz: {
7303 SDValue Src0 = N->getOperand(1);
7304 SDValue Src1 = N->getOperand(2);
7305 SDLoc SL(N);
7306 SDValue Cvt =
7307 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7308 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7309 return;
7310 }
7311 case Intrinsic::amdgcn_cvt_pknorm_i16:
7312 case Intrinsic::amdgcn_cvt_pknorm_u16:
7313 case Intrinsic::amdgcn_cvt_pk_i16:
7314 case Intrinsic::amdgcn_cvt_pk_u16: {
7315 SDValue Src0 = N->getOperand(1);
7316 SDValue Src1 = N->getOperand(2);
7317 SDLoc SL(N);
7318 unsigned Opcode;
7319
7320 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7321 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7322 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7323 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7324 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7325 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7326 else
7327 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7328
7329 EVT VT = N->getValueType(0);
7330 if (isTypeLegal(VT))
7331 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7332 else {
7333 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7334 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7335 }
7336 return;
7337 }
7338 case Intrinsic::amdgcn_s_buffer_load: {
7339 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7340 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7341 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7342 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7343 // s_buffer_load_i8.
7344 if (!Subtarget->hasScalarSubwordLoads())
7345 return;
7346 SDValue Op = SDValue(N, 0);
7347 SDValue Rsrc = Op.getOperand(1);
7348 SDValue Offset = Op.getOperand(2);
7349 SDValue CachePolicy = Op.getOperand(3);
7350 EVT VT = Op.getValueType();
7351 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7352 SDLoc DL(Op);
7354 const DataLayout &DataLayout = DAG.getDataLayout();
7355 Align Alignment =
7361 VT.getStoreSize(), Alignment);
7362 SDValue LoadVal;
7363 if (!Offset->isDivergent()) {
7364 SDValue Ops[] = {Rsrc, // source register
7365 Offset, CachePolicy};
7366 SDValue BufferLoad =
7368 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7369 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7370 } else {
7371 SDValue Ops[] = {
7372 DAG.getEntryNode(), // Chain
7373 Rsrc, // rsrc
7374 DAG.getConstant(0, DL, MVT::i32), // vindex
7375 {}, // voffset
7376 {}, // soffset
7377 {}, // offset
7378 CachePolicy, // cachepolicy
7379 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7380 };
7381 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7382 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7383 }
7384 Results.push_back(LoadVal);
7385 return;
7386 }
7387 case Intrinsic::amdgcn_dead: {
7388 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7389 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7390 return;
7391 }
7392 }
7393 break;
7394 }
7395 case ISD::INTRINSIC_W_CHAIN: {
7396 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7397 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7398 // FIXME: Hacky
7399 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7400 Results.push_back(Res.getOperand(I));
7401 }
7402 } else {
7403 Results.push_back(Res);
7404 Results.push_back(Res.getValue(1));
7405 }
7406 return;
7407 }
7408
7409 break;
7410 }
7411 case ISD::SELECT: {
7412 SDLoc SL(N);
7413 EVT VT = N->getValueType(0);
7414 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7415 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7416 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7417
7418 EVT SelectVT = NewVT;
7419 if (NewVT.bitsLT(MVT::i32)) {
7420 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7421 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7422 SelectVT = MVT::i32;
7423 }
7424
7425 SDValue NewSelect =
7426 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7427
7428 if (NewVT != SelectVT)
7429 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7430 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7431 return;
7432 }
7433 case ISD::FNEG: {
7434 if (N->getValueType(0) != MVT::v2f16)
7435 break;
7436
7437 SDLoc SL(N);
7438 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7439
7440 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7441 DAG.getConstant(0x80008000, SL, MVT::i32));
7442 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7443 return;
7444 }
7445 case ISD::FABS: {
7446 if (N->getValueType(0) != MVT::v2f16)
7447 break;
7448
7449 SDLoc SL(N);
7450 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7451
7452 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7453 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7454 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7455 return;
7456 }
7457 case ISD::FSQRT: {
7458 if (N->getValueType(0) != MVT::f16)
7459 break;
7460 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7461 break;
7462 }
7463 default:
7464 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7465 break;
7466 }
7467}
7468
7469/// Helper function for LowerBRCOND
7470static SDNode *findUser(SDValue Value, unsigned Opcode) {
7471
7472 for (SDUse &U : Value->uses()) {
7473 if (U.get() != Value)
7474 continue;
7475
7476 if (U.getUser()->getOpcode() == Opcode)
7477 return U.getUser();
7478 }
7479 return nullptr;
7480}
7481
7482unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7483 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7484 switch (Intr->getConstantOperandVal(1)) {
7485 case Intrinsic::amdgcn_if:
7486 return AMDGPUISD::IF;
7487 case Intrinsic::amdgcn_else:
7488 return AMDGPUISD::ELSE;
7489 case Intrinsic::amdgcn_loop:
7490 return AMDGPUISD::LOOP;
7491 case Intrinsic::amdgcn_end_cf:
7492 llvm_unreachable("should not occur");
7493 default:
7494 return 0;
7495 }
7496 }
7497
7498 // break, if_break, else_break are all only used as inputs to loop, not
7499 // directly as branch conditions.
7500 return 0;
7501}
7502
7509
7510bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7511 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7512 return false;
7513
7514 // FIXME: Either avoid relying on address space here or change the default
7515 // address space for functions to avoid the explicit check.
7516 return (GV->getValueType()->isFunctionTy() ||
7519}
7520
7521bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7522 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7523}
7524
7525bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7526 if (!GV->hasExternalLinkage())
7527 return true;
7528
7529 const auto OS = getTargetMachine().getTargetTriple().getOS();
7530 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7531}
7532
7533/// This transforms the control flow intrinsics to get the branch destination as
7534/// the last parameter; it also switches the branch target with BR if needed.
7535SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7536 SDLoc DL(BRCOND);
7537
7538 SDNode *Intr = BRCOND.getOperand(1).getNode();
7539 SDValue Target = BRCOND.getOperand(2);
7540 SDNode *BR = nullptr;
7541 SDNode *SetCC = nullptr;
7542
7543 switch (Intr->getOpcode()) {
7544 case ISD::SETCC: {
7545 // As long as we negate the condition everything is fine
7546 SetCC = Intr;
7547 Intr = SetCC->getOperand(0).getNode();
7548 break;
7549 }
7550 case ISD::XOR: {
7551 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7552 SDValue LHS = Intr->getOperand(0);
7553 SDValue RHS = Intr->getOperand(1);
7554 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7555 Intr = LHS.getNode();
7556 break;
7557 }
7558 [[fallthrough]];
7559 }
7560 default: {
7561 // Get the target from BR if we don't negate the condition
7562 BR = findUser(BRCOND, ISD::BR);
7563 assert(BR && "brcond missing unconditional branch user");
7564 Target = BR->getOperand(1);
7565 }
7566 }
7567
7568 unsigned CFNode = isCFIntrinsic(Intr);
7569 if (CFNode == 0) {
7570 // This is a uniform branch so we don't need to legalize.
7571 return BRCOND;
7572 }
7573
7574 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7576
7577 assert(!SetCC ||
7578 (SetCC->getConstantOperandVal(1) == 1 &&
7579 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7580 ISD::SETNE));
7581
7582 // operands of the new intrinsic call
7583 SmallVector<SDValue, 8> Ops;
7584 if (HaveChain)
7585 Ops.push_back(BRCOND.getOperand(0));
7586
7587 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7588 Ops.push_back(Target);
7589
7590 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7591
7592 // build the new intrinsic call
7593 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7594
7595 if (!HaveChain) {
7596 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7597
7598 Result = DAG.getMergeValues(Ops, DL).getNode();
7599 }
7600
7601 if (BR) {
7602 // Give the branch instruction our target
7603 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7604 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7605 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7606 }
7607
7608 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7609
7610 // Copy the intrinsic results to registers
7611 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7612 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7613 if (!CopyToReg)
7614 continue;
7615
7616 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7617 SDValue(Result, i - 1), SDValue());
7618
7619 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7620 }
7621
7622 // Remove the old intrinsic from the chain
7623 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7624 Intr->getOperand(0));
7625
7626 return Chain;
7627}
7628
7629SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7630 MVT VT = Op.getSimpleValueType();
7631 SDLoc DL(Op);
7632 // Checking the depth
7633 if (Op.getConstantOperandVal(0) != 0)
7634 return DAG.getConstant(0, DL, VT);
7635
7636 MachineFunction &MF = DAG.getMachineFunction();
7637 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7638 // Check for kernel and shader functions
7639 if (Info->isEntryFunction())
7640 return DAG.getConstant(0, DL, VT);
7641
7642 MachineFrameInfo &MFI = MF.getFrameInfo();
7643 // There is a call to @llvm.returnaddress in this function
7644 MFI.setReturnAddressIsTaken(true);
7645
7646 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7647 // Get the return address reg and mark it as an implicit live-in
7648 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7649 getRegClassFor(VT, Op.getNode()->isDivergent()));
7650
7651 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7652}
7653
7654SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7655 const SDLoc &DL, EVT VT) const {
7656 return Op.getValueType().bitsLE(VT)
7657 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7658 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7659 DAG.getTargetConstant(0, DL, MVT::i32));
7660}
7661
7662SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7663 SelectionDAG &DAG) const {
7664 EVT DstVT = Op.getValueType();
7665 unsigned NumElts = DstVT.getVectorNumElements();
7666 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7667
7668 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7669
7670 SDLoc DL(Op);
7671 unsigned Opc = Op.getOpcode();
7672 SDValue Flags = Op.getOperand(1);
7673 EVT HalfDstVT =
7674 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7675 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7676 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7677
7678 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7679}
7680
7681SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7682 SDValue Src = Op.getOperand(0);
7683 EVT SrcVT = Src.getValueType();
7684 EVT DstVT = Op.getValueType();
7685
7686 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7687 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7688 if (SrcVT.getScalarType() != MVT::f32)
7689 return SDValue();
7690 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7691 }
7692
7693 if (SrcVT.getScalarType() != MVT::f64)
7694 return Op;
7695
7696 SDLoc DL(Op);
7697 if (DstVT == MVT::f16) {
7698 // TODO: Handle strictfp
7699 if (Op.getOpcode() != ISD::FP_ROUND)
7700 return Op;
7701
7702 if (!Subtarget->has16BitInsts()) {
7703 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7704 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7705 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7706 }
7707 if (Op->getFlags().hasApproximateFuncs()) {
7708 SDValue Flags = Op.getOperand(1);
7709 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7710 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7711 }
7712 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7713 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7714 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7715 }
7716
7717 assert(DstVT.getScalarType() == MVT::bf16 &&
7718 "custom lower FP_ROUND for f16 or bf16");
7719 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7720
7721 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7722 // hardware f32 -> bf16 instruction.
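// Rounding f64->f32 with round-to-odd keeps the "sticky" information in the
// low mantissa bit, so the subsequent f32->bf16 round-to-nearest-even gives
// the same result as a single correctly rounded f64->bf16 conversion
// (no double-rounding error).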
7723 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7724 MVT::f32;
7725 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7726 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7727 DAG.getTargetConstant(0, DL, MVT::i32));
7728}
7729
7730SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7731 SelectionDAG &DAG) const {
7732 EVT VT = Op.getValueType();
7733 const MachineFunction &MF = DAG.getMachineFunction();
7734 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7735 bool IsIEEEMode = Info->getMode().IEEE;
7736
7737 // FIXME: Assert during selection that this is only selected for
7738 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7739 // mode functions, but this happens to be OK since it's only done in cases
7740 // where there is known no sNaN.
7741 if (IsIEEEMode)
7742 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7743
7744 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7745 VT == MVT::v16bf16)
7746 return splitBinaryVectorOp(Op, DAG);
7747 return Op;
7748}
7749
7750SDValue
7751SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7752 SelectionDAG &DAG) const {
7753 EVT VT = Op.getValueType();
7754 const MachineFunction &MF = DAG.getMachineFunction();
7755 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7756 bool IsIEEEMode = Info->getMode().IEEE;
7757
7758 if (IsIEEEMode)
7759 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7760
7761 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7762 VT == MVT::v16bf16)
7763 return splitBinaryVectorOp(Op, DAG);
7764 return Op;
7765}
7766
7767SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7768 SelectionDAG &DAG) const {
7769 EVT VT = Op.getValueType();
7770 if (VT.isVector())
7771 return splitBinaryVectorOp(Op, DAG);
7772
7773 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7774 !Subtarget->hasMinimum3Maximum3F16() &&
7775 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7776 "should not need to widen f16 minimum/maximum to v2f16");
7777
7778 // Widen f16 operation to v2f16
7779
7780 // fminimum f16:x, f16:y ->
7781 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7782 // (v2f16 (scalar_to_vector y))), 0
7783 SDLoc SL(Op);
7784 SDValue WideSrc0 =
7785 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7786 SDValue WideSrc1 =
7787 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7788
7789 SDValue Widened =
7790 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7791
7792 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7793 DAG.getConstant(0, SL, MVT::i32));
7794}
7795
7796SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7797 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7798 EVT VT = Op.getValueType();
7799 assert(VT == MVT::f16);
7800
7801 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7802 EVT ExpVT = Exp.getValueType();
7803 if (ExpVT == MVT::i16)
7804 return Op;
7805
7806 SDLoc DL(Op);
7807
7808 // Correct the exponent type for f16 to i16.
7809 // Clamp the range of the exponent to the instruction's range.
7810
7811 // TODO: This should be a generic narrowing legalization, and can easily be
7812 // for GlobalISel.
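// (Informally: clamping first is what makes the truncation safe. Any exponent
// outside the i16 range already drives the f16 result to 0 or infinity,
// whereas a bare truncation could wrap a large exponent into one with the
// opposite sign.)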
7813
7814 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7815 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7816
7817 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7818 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7819
7820 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7821
7822 if (IsStrict) {
7823 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7824 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7825 }
7826
7827 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7828}
7829
7830static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7831 switch (Op->getOpcode()) {
7832 case ISD::SRA:
7833 case ISD::SMIN:
7834 case ISD::SMAX:
7835 return ISD::SIGN_EXTEND;
7836 case ISD::SRL:
7837 case ISD::UMIN:
7838 case ISD::UMAX:
7839 return ISD::ZERO_EXTEND;
7840 case ISD::ADD:
7841 case ISD::SUB:
7842 case ISD::AND:
7843 case ISD::OR:
7844 case ISD::XOR:
7845 case ISD::SHL:
7846 case ISD::SELECT:
7847 case ISD::MUL:
7848 // operation result won't be influenced by garbage high bits.
7849 // TODO: are all of those cases correct, and are there more?
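// E.g. the low 16 bits of a 32-bit add depend only on the low 16 bits of its
// operands, so (i16 add a, b) == trunc (i32 add (anyext a), (anyext b)).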
7850 return ISD::ANY_EXTEND;
7851 case ISD::SETCC: {
7852 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7853 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7854 }
7855 default:
7856 llvm_unreachable("unexpected opcode!");
7857 }
7858}
7859
7860SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7861 DAGCombinerInfo &DCI) const {
7862 const unsigned Opc = Op.getOpcode();
7863 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7864 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7865 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7866 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7867 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7868
7869 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7870 : Op->getOperand(0).getValueType();
7871 auto ExtTy = OpTy.changeElementType(MVT::i32);
7872
7873 if (DCI.isBeforeLegalizeOps() ||
7874 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7875 return SDValue();
7876
7877 auto &DAG = DCI.DAG;
7878
7879 SDLoc DL(Op);
7880 SDValue LHS;
7881 SDValue RHS;
7882 if (Opc == ISD::SELECT) {
7883 LHS = Op->getOperand(1);
7884 RHS = Op->getOperand(2);
7885 } else {
7886 LHS = Op->getOperand(0);
7887 RHS = Op->getOperand(1);
7888 }
7889
7890 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7891 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7892
7893 // Special case: for shifts, the RHS always needs a zext.
7894 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7895 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7896 else
7897 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7898
7899 // setcc always returns i1 (or a vector of i1), so there is no need to
7900 // truncate after it.
7900 if (Opc == ISD::SETCC) {
7901 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7902 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7903 }
7904
7905 // For other ops, we extend the operation's return type as well so we need to
7906 // truncate back to the original type.
7907 SDValue NewVal;
7908 if (Opc == ISD::SELECT)
7909 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7910 else
7911 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7912
7913 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7914}
7915
7916SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7917 SDValue Mag = Op.getOperand(0);
7918 EVT MagVT = Mag.getValueType();
7919
7920 if (MagVT.getVectorNumElements() > 2)
7921 return splitBinaryVectorOp(Op, DAG);
7922
7923 SDValue Sign = Op.getOperand(1);
7924 EVT SignVT = Sign.getValueType();
7925
7926 if (MagVT == SignVT)
7927 return Op;
7928
7929 // fcopysign v2f16:mag, v2f32:sign ->
7930 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7931
7932 SDLoc SL(Op);
7933 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7934 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7935
7936 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7937
7938 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7939}
7940
7941// Custom lowering for vector multiplications and s_mul_u64.
7942SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7943 EVT VT = Op.getValueType();
7944
7945 // Split vector operands.
7946 if (VT.isVector())
7947 return splitBinaryVectorOp(Op, DAG);
7948
7949 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7950
7951 // There are four ways to lower s_mul_u64:
7952 //
7953 // 1. If all the operands are uniform, then we lower it as it is.
7954 //
7955 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
7956 // multiplications because there is not a vector equivalent of s_mul_u64.
7957 //
7958 // 3. If the cost model decides that it is more efficient to use vector
7959 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
7960 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7961 //
7962 // 4. If the cost model decides to use vector registers and both of the
7963 // operands are zero-extended/sign-extended from 32-bits, then we split the
7964 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
7965 // possible to check if the operands are zero-extended or sign-extended in
7966 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7967 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7968 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7969 // If the cost model decides that we have to use vector registers, then
7970 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
7971 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7972 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7973 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7974 // SIInstrInfo.cpp .
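// For reference, the 32-bit expansion mentioned above is the usual schoolbook
// form (illustrative only; the actual splitting lives in SIInstrInfo.cpp):
//   lo = mul_lo_u32(a.lo, b.lo)
//   hi = mul_hi_u32(a.lo, b.lo) + a.lo * b.hi + a.hi * b.lo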
7975
7976 if (Op->isDivergent())
7977 return SDValue();
7978
7979 SDValue Op0 = Op.getOperand(0);
7980 SDValue Op1 = Op.getOperand(1);
7981 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
7982 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7983 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7984 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7985 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7986 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7987 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7988 SDLoc SL(Op);
7989 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7990 return SDValue(
7991 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7992 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7993 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7994 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7995 return SDValue(
7996 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7997 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7998 return Op;
7999}
8000
8001SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8002 EVT VT = Op.getValueType();
8003 SDLoc SL(Op);
8004 SDValue LHS = Op.getOperand(0);
8005 SDValue RHS = Op.getOperand(1);
8006 bool isSigned = Op.getOpcode() == ISD::SMULO;
8007
8008 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8009 const APInt &C = RHSC->getAPIntValue();
8010 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
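// (Illustrative: for umulo(x, 4) this yields { x << 2, (x << 2) >> 2 != x },
// which flags overflow exactly when either of the top two bits of x is set.)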
8011 if (C.isPowerOf2()) {
8012 // smulo(x, signed_min) is same as umulo(x, signed_min).
8013 bool UseArithShift = isSigned && !C.isMinSignedValue();
8014 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8015 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8016 SDValue Overflow =
8017 DAG.getSetCC(SL, MVT::i1,
8018 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8019 Result, ShiftAmt),
8020 LHS, ISD::SETNE);
8021 return DAG.getMergeValues({Result, Overflow}, SL);
8022 }
8023 }
8024
8025 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8026 SDValue Top =
8027 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8028
8029 SDValue Sign = isSigned
8030 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8031 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8032 SL, MVT::i32))
8033 : DAG.getConstant(0, SL, VT);
8034 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8035
8036 return DAG.getMergeValues({Result, Overflow}, SL);
8037}
8038
8039SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8040 if (Op->isDivergent()) {
8041 // Select to V_MAD_[IU]64_[IU]32.
8042 return Op;
8043 }
8044 if (Subtarget->hasSMulHi()) {
8045 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8046 return SDValue();
8047 }
8048 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8049 // calculate the high part, so we might as well do the whole thing with
8050 // V_MAD_[IU]64_[IU]32.
8051 return Op;
8052}
8053
8054SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8055 if (!Subtarget->isTrapHandlerEnabled() ||
8056 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8057 return lowerTrapEndpgm(Op, DAG);
8058
8059 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8060 : lowerTrapHsaQueuePtr(Op, DAG);
8061}
8062
8063SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8064 SDLoc SL(Op);
8065 SDValue Chain = Op.getOperand(0);
8066 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8067}
8068
8069SDValue
8070SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8071 const SDLoc &DL, Align Alignment,
8072 ImplicitParameter Param) const {
8073 MachineFunction &MF = DAG.getMachineFunction();
8074 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8075 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8076 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8077 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
8080}
8081
8082SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8083 SelectionDAG &DAG) const {
8084 SDLoc SL(Op);
8085 SDValue Chain = Op.getOperand(0);
8086
8087 SDValue QueuePtr;
8088 // For code object version 5, QueuePtr is passed through implicit kernarg.
8089 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8090 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8091 QueuePtr =
8092 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8093 } else {
8094 MachineFunction &MF = DAG.getMachineFunction();
8095 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8096 Register UserSGPR = Info->getQueuePtrUserSGPR();
8097
8098 if (UserSGPR == AMDGPU::NoRegister) {
8099 // We probably are in a function incorrectly marked with
8100 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8101 // trap, so just use a null pointer.
8102 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8103 } else {
8104 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8105 MVT::i64);
8106 }
8107 }
8108
8109 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8110 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8111
8112 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8113 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8114 ToReg.getValue(1)};
8115 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8116}
8117
8118SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8119 SDLoc SL(Op);
8120 SDValue Chain = Op.getOperand(0);
8121
8122 // We need to simulate the 's_trap 2' instruction on targets that run in
8123 // PRIV=1 (where it is treated as a nop).
8124 if (Subtarget->hasPrivEnabledTrap2NopBug())
8125 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8126
8127 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8128 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8129 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8130}
8131
8132SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8133 SDLoc SL(Op);
8134 SDValue Chain = Op.getOperand(0);
8135 MachineFunction &MF = DAG.getMachineFunction();
8136
8137 if (!Subtarget->isTrapHandlerEnabled() ||
8138 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8139 LLVMContext &Ctx = MF.getFunction().getContext();
8140 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8141 "debugtrap handler not supported",
8142 Op.getDebugLoc(), DS_Warning));
8143 return Chain;
8144 }
8145
8146 uint64_t TrapID =
8147 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8148 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8149 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8150}
8151
8152SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8153 SelectionDAG &DAG) const {
8154 if (Subtarget->hasApertureRegs()) {
8155 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8156 ? AMDGPU::SRC_SHARED_BASE
8157 : AMDGPU::SRC_PRIVATE_BASE;
8158 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8159 !Subtarget->hasGloballyAddressableScratch()) &&
8160 "Cannot use src_private_base with globally addressable scratch!");
8161 // Note: this feature (register) is broken. When used as a 32-bit operand,
8162 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8163 // bits.
8164 //
8165 // To work around the issue, emit a 64 bit copy from this register
8166 // then extract the high bits. Note that this shouldn't even result in a
8167 // shift being emitted and simply become a pair of registers (e.g.):
8168 // s_mov_b64 s[6:7], src_shared_base
8169 // v_mov_b32_e32 v1, s7
8170 SDValue Copy =
8171 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8172 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8173 }
8174
8175 // For code object version 5, private_base and shared_base are passed through
8176 // implicit kernargs.
8177 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8178 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8179 ImplicitParameter Param =
8180 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8181 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8182 }
8183
8184 MachineFunction &MF = DAG.getMachineFunction();
8185 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8186 Register UserSGPR = Info->getQueuePtrUserSGPR();
8187 if (UserSGPR == AMDGPU::NoRegister) {
8188 // We probably are in a function incorrectly marked with
8189 // amdgpu-no-queue-ptr. This is undefined.
8190 return DAG.getPOISON(MVT::i32);
8191 }
8192
8193 SDValue QueuePtr =
8194 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8195
8196 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8197 // private_segment_aperture_base_hi.
8198 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8199
8200 SDValue Ptr =
8201 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8202
8203 // TODO: Use custom target PseudoSourceValue.
8204 // TODO: We should use the value from the IR intrinsic call, but it might not
8205 // be available and how do we get it?
8206 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8207 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8208 commonAlignment(Align(64), StructOffset),
8211}
8212
8213/// Return true if the value is a known valid address, such that a null check is
8214/// not necessary.
8215static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8216 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8217 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8218 return true;
8219
8220 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8221 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8222
8223 // TODO: Search through arithmetic, handle arguments and loads
8224 // marked nonnull.
8225 return false;
8226}
8227
8228SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8229 SelectionDAG &DAG) const {
8230 SDLoc SL(Op);
8231
8232 const AMDGPUTargetMachine &TM =
8233 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8234
8235 unsigned DestAS, SrcAS;
8236 SDValue Src;
8237 bool IsNonNull = false;
8238 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8239 SrcAS = ASC->getSrcAddressSpace();
8240 Src = ASC->getOperand(0);
8241 DestAS = ASC->getDestAddressSpace();
8242 } else {
8243 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8244 Op.getConstantOperandVal(0) ==
8245 Intrinsic::amdgcn_addrspacecast_nonnull);
8246 Src = Op->getOperand(1);
8247 SrcAS = Op->getConstantOperandVal(2);
8248 DestAS = Op->getConstantOperandVal(3);
8249 IsNonNull = true;
8250 }
8251
8252 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8253
8254 // flat -> local/private
8255 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8256 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8257 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8258 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8259
8260 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8261 Subtarget->hasGloballyAddressableScratch()) {
8262 // flat -> private with globally addressable scratch: subtract
8263 // src_flat_scratch_base_lo.
8264 SDValue FlatScratchBaseLo(
8265 DAG.getMachineNode(
8266 AMDGPU::S_MOV_B32, SL, MVT::i32,
8267 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8268 0);
8269 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8270 }
8271
8272 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8273 return Ptr;
8274
8275 unsigned NullVal = TM.getNullPointerValue(DestAS);
8276 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8277 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8278
8279 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8280 SegmentNullPtr);
8281 }
8282 }
8283
8284 // local/private -> flat
8285 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8286 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8287 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8288 SDValue CvtPtr;
8289 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8290 Subtarget->hasGloballyAddressableScratch()) {
8291 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8292 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8293 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8294 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8295 ThreadID = DAG.getNode(
8296 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8297 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8298 AllOnes, ThreadID);
8299 if (Subtarget->isWave64())
8300 ThreadID = DAG.getNode(
8301 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8302 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8303 AllOnes, ThreadID);
8304 SDValue ShAmt = DAG.getShiftAmountConstant(
8305 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8306 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
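        // For example (following the formula above): on wave32,
        // getWavefrontSizeLog2() == 5, so ShAmt = 57 - 32 - 5 = 20 and the
        // thread ID lands at bit 20 + 32 = 52 of the 64-bit address; on
        // wave64, ShAmt = 57 - 32 - 6 = 19, i.e. bit 51, matching the
        // comments at the start of this block.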
8307 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8308 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8309 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8310 // 64-bit hi:lo value.
8311 SDValue FlatScratchBase = {
8312 DAG.getMachineNode(
8313 AMDGPU::S_MOV_B64, SL, MVT::i64,
8314 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8315 0};
8316 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8317 } else {
8318 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8319 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8320 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8321 }
8322
8323 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8324 return CvtPtr;
8325
8326 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8327 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8328
8329 SDValue NonNull =
8330 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8331
8332 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8333 FlatNullPtr);
8334 }
8335 }
8336
8337 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8338 Op.getValueType() == MVT::i64) {
8339 const SIMachineFunctionInfo *Info =
8340 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8341 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8342 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8343 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8344 }
8345
8346 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8347 Src.getValueType() == MVT::i64)
8348 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8349
8350 // global <-> flat are no-ops and never emitted.
8351
8352 // Invalid casts are poison.
8353 return DAG.getPOISON(Op->getValueType(0));
8354}
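// As a rough illustration of the flat -> local path above: without a
// known-nonnull source, the cast becomes a truncate plus a compare/select
// against the segment null value from TM.getNullPointerValue(DestAS), i.e.
// roughly
//   %lo  = truncate i64 %flat to i32
//   %ok  = setcc ne i64 %flat, 0
//   %res = select i1 %ok, i32 %lo, i32 <segment null>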
8355
8356// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8357// the small vector and inserting them into the big vector. That is better than
8358// the default expansion of doing it via a stack slot. Even though the use of
8359// the stack slot would be optimized away afterwards, the stack slot itself
8360// remains.
8361SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8362 SelectionDAG &DAG) const {
8363 SDValue Vec = Op.getOperand(0);
8364 SDValue Ins = Op.getOperand(1);
8365 SDValue Idx = Op.getOperand(2);
8366 EVT VecVT = Vec.getValueType();
8367 EVT InsVT = Ins.getValueType();
8368 EVT EltVT = VecVT.getVectorElementType();
8369 unsigned InsNumElts = InsVT.getVectorNumElements();
8370 unsigned IdxVal = Idx->getAsZExtVal();
8371 SDLoc SL(Op);
8372
8373 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8374 // Insert 32-bit registers at a time.
8375 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8376
8377 unsigned VecNumElts = VecVT.getVectorNumElements();
8378 EVT NewVecVT =
8379 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
 8379     EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                         : EVT::getVectorVT(*DAG.getContext(),
 8382                                          MVT::i32, InsNumElts / 2);
8383
8384 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8385 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8386
8387 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8388 SDValue Elt;
8389 if (InsNumElts == 2) {
8390 Elt = Ins;
8391 } else {
8392 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8393 DAG.getConstant(I, SL, MVT::i32));
8394 }
8395 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8396 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8397 }
8398
8399 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8400 }
8401
8402 for (unsigned I = 0; I != InsNumElts; ++I) {
8403 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8404 DAG.getConstant(I, SL, MVT::i32));
8405 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8406 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8407 }
8408 return Vec;
8409}
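// For illustration, inserting a v2i16 subvector into v4i16 at index 2 takes
// the 16-bit path above and becomes a single 32-bit element insert, roughly:
//   vec = bitcast v4i16 to v2i32
//   ins = bitcast v2i16 to i32
//   vec = insert_vector_elt v2i32 vec, i32 ins, 1   ; IdxVal / 2 == 1
//   res = bitcast v2i32 vec to v4i16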
8410
8411SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8412 SelectionDAG &DAG) const {
8413 SDValue Vec = Op.getOperand(0);
8414 SDValue InsVal = Op.getOperand(1);
8415 SDValue Idx = Op.getOperand(2);
8416 EVT VecVT = Vec.getValueType();
8417 EVT EltVT = VecVT.getVectorElementType();
8418 unsigned VecSize = VecVT.getSizeInBits();
8419 unsigned EltSize = EltVT.getSizeInBits();
8420 SDLoc SL(Op);
8421
8422 // Specially handle the case of v4i16 with static indexing.
8423 unsigned NumElts = VecVT.getVectorNumElements();
8424 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8425 if (NumElts == 4 && EltSize == 16 && KIdx) {
8426 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8427
8428 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8429 DAG.getConstant(0, SL, MVT::i32));
8430 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8431 DAG.getConstant(1, SL, MVT::i32));
8432
8433 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8434 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8435
8436 unsigned Idx = KIdx->getZExtValue();
8437 bool InsertLo = Idx < 2;
8438 SDValue InsHalf = DAG.getNode(
8439 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8440 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8441 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8442
8443 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8444
8445 SDValue Concat =
8446 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8447 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8448
8449 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8450 }
8451
8452 // Static indexing does not lower to stack access, and hence there is no need
8453 // for special custom lowering to avoid stack access.
8454 if (isa<ConstantSDNode>(Idx))
8455 return SDValue();
8456
8457 // Avoid stack access for dynamic indexing by custom lowering to
8458 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8459
8460 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8461
8462 MVT IntVT = MVT::getIntegerVT(VecSize);
8463
8464 // Convert vector index to bit-index and get the required bit mask.
8465 assert(isPowerOf2_32(EltSize));
8466 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8467 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8468 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8469 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8470 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8471
8472 // 1. Create a congruent vector with the target value in each element.
8473 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8474 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8475
8476 // 2. Mask off all other indices except the required index within (1).
8477 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8478
8479 // 3. Mask off the required index within the target vector.
8480 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8481 SDValue RHS =
8482 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8483
8484 // 4. Get (2) and (3) ORed into the target vector.
8485 SDValue BFI =
8486 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8487
8488 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8489}
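// A worked example of the dynamic-index path above for v2i16 (EltSize == 16,
// EltMask == 0xffff), expressed in terms of the nodes built there:
//   ScaledIdx = shl idx, 4                    ; log2(16)
//   BFM       = shl i32 0xffff, ScaledIdx     ; mask selecting the target slot
//   LHS       = and BFM, splat(val)           ; value placed in the slot
//   RHS       = and ~BFM, bitcast vec         ; all other elements preserved
//   result    = bitcast (or disjoint LHS, RHS) to v2i16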
8490
8491SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8492 SelectionDAG &DAG) const {
8493 SDLoc SL(Op);
8494
8495 EVT ResultVT = Op.getValueType();
8496 SDValue Vec = Op.getOperand(0);
8497 SDValue Idx = Op.getOperand(1);
8498 EVT VecVT = Vec.getValueType();
8499 unsigned VecSize = VecVT.getSizeInBits();
8500 EVT EltVT = VecVT.getVectorElementType();
8501
8502 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8503
8504 // Make sure we do any optimizations that will make it easier to fold
8505 // source modifiers before obscuring it with bit operations.
8506
8507 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8508 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8509 return Combined;
8510
8511 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8512 SDValue Lo, Hi;
8513 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8514
8515 if (VecSize == 128) {
8516 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8517 Lo = DAG.getBitcast(LoVT,
8518 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8519 DAG.getConstant(0, SL, MVT::i32)));
8520 Hi = DAG.getBitcast(HiVT,
8521 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8522 DAG.getConstant(1, SL, MVT::i32)));
8523 } else if (VecSize == 256) {
8524 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8525 SDValue Parts[4];
8526 for (unsigned P = 0; P < 4; ++P) {
8527 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8528 DAG.getConstant(P, SL, MVT::i32));
8529 }
8530
8531 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8532 Parts[0], Parts[1]));
8533 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8534 Parts[2], Parts[3]));
8535 } else {
8536 assert(VecSize == 512);
8537
8538 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8539 SDValue Parts[8];
8540 for (unsigned P = 0; P < 8; ++P) {
8541 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8542 DAG.getConstant(P, SL, MVT::i32));
8543 }
8544
8545 Lo = DAG.getBitcast(LoVT,
8546 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8547 Parts[0], Parts[1], Parts[2], Parts[3]));
8548 Hi = DAG.getBitcast(HiVT,
8549 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8550 Parts[4], Parts[5], Parts[6], Parts[7]));
8551 }
8552
8553 EVT IdxVT = Idx.getValueType();
8554 unsigned NElem = VecVT.getVectorNumElements();
8555 assert(isPowerOf2_32(NElem));
8556 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8557 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8558 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8559 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8560 }
8561
8562 assert(VecSize <= 64);
8563
8564 MVT IntVT = MVT::getIntegerVT(VecSize);
8565
8566 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8567 SDValue VecBC = peekThroughBitcasts(Vec);
8568 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8569 SDValue Src = VecBC.getOperand(0);
8570 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8571 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8572 }
8573
8574 unsigned EltSize = EltVT.getSizeInBits();
8575 assert(isPowerOf2_32(EltSize));
8576
8577 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8578
8579 // Convert vector index to bit-index (* EltSize)
8580 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8581
8582 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8583 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8584
8585 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8586 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8587 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8588 }
8589
8590 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8591}
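// For the small-vector tail above, a dynamic extract from e.g. v4i16 reduces
// to shifting the bit-packed value, roughly:
//   bits = bitcast v4i16 to i64
//   elt  = srl i64 bits, (shl idx, 4)         ; idx * EltSize
//   res  = truncate / any_extend elt to the result type
//          (with an extra i16 bitcast for f16/bf16 results)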
8592
8593static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8594 assert(Elt % 2 == 0);
8595 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8596}
8597
8598static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8599 assert(Elt % 2 == 0);
8600 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8601 !(Mask[Elt + 1] & 1);
8602}
8603
8604SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8605 SelectionDAG &DAG) const {
8606 SDLoc SL(Op);
8607 EVT ResultVT = Op.getValueType();
8608 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8609 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8610 const int NewSrcNumElts = 2;
8611 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8612 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8613
 8614   // Break up the shuffle into register-sized pieces.
8615 //
8616 // We're trying to form sub-shuffles that the register allocation pipeline
8617 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8618 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8619 // pair of copies into a consecutive register copy, so use the ordinary
8620 // extract_vector_elt lowering unless we can use the shuffle.
8621 //
 8622   // TODO: This is a bit of a hack, and we should probably always use
 8623   // extract_subvector for the largest possible subvector we can (or at least
 8624   // use it for PackVT-aligned pieces). However, we have worse support for
 8625   // combines on them and don't directly treat extract_subvector /
 8626   // insert_subvector as legal. The DAG scheduler also ends up doing a worse
 8627   // job with the extract_subvectors.
8628 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8629
8630 // vector_shuffle <0,1,6,7> lhs, rhs
8631 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8632 //
8633 // vector_shuffle <6,7,2,3> lhs, rhs
8634 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8635 //
8636 // vector_shuffle <6,7,0,1> lhs, rhs
8637 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8638
8639 // Avoid scalarizing when both halves are reading from consecutive elements.
8640
8641 // If we're treating 2 element shuffles as legal, also create odd-to-even
8642 // shuffles of neighboring pairs.
8643 //
8644 // vector_shuffle <3,2,7,6> lhs, rhs
8645 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8646 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8647
8649 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
 8650     if (ShouldUseConsecutiveExtract &&
 8651         elementPairIsContiguous(SVN->getMask(), I)) {
8652 const int Idx = SVN->getMaskElt(I);
8653 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8654 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8655 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8656 SVN->getOperand(VecIdx),
8657 DAG.getConstant(EltIdx, SL, MVT::i32));
8658 Pieces.push_back(SubVec);
 8659     } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
 8660                isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8661 int Idx0 = SVN->getMaskElt(I);
8662 int Idx1 = SVN->getMaskElt(I + 1);
8663
8664 SDValue SrcOp0 = SVN->getOperand(0);
8665 SDValue SrcOp1 = SrcOp0;
8666 if (Idx0 >= SrcNumElts) {
8667 SrcOp0 = SVN->getOperand(1);
8668 Idx0 -= SrcNumElts;
8669 }
8670
8671 if (Idx1 >= SrcNumElts) {
8672 SrcOp1 = SVN->getOperand(1);
8673 Idx1 -= SrcNumElts;
8674 }
8675
8676 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8677 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8678
8679 // Extract nearest even aligned piece.
8680 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8681 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8682 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8683 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8684
8685 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8686 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8687
8688 SDValue Result0 = SubVec0;
8689 SDValue Result1 = SubVec0;
8690
8691 if (SubVec0 != SubVec1) {
8692 NewMaskIdx1 += NewSrcNumElts;
8693 Result1 = SubVec1;
8694 } else {
8695 Result1 = DAG.getPOISON(PackVT);
8696 }
8697
8698 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8699 {NewMaskIdx0, NewMaskIdx1});
8700 Pieces.push_back(Shuf);
8701 } else {
8702 const int Idx0 = SVN->getMaskElt(I);
8703 const int Idx1 = SVN->getMaskElt(I + 1);
8704 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8705 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8706 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8707 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8708
8709 SDValue Vec0 = SVN->getOperand(VecIdx0);
8710 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8711 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8712
8713 SDValue Vec1 = SVN->getOperand(VecIdx1);
8714 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8715 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8716 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8717 }
8718 }
8719
8720 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8721}
8722
8723SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8724 SelectionDAG &DAG) const {
8725 SDValue SVal = Op.getOperand(0);
8726 EVT ResultVT = Op.getValueType();
8727 EVT SValVT = SVal.getValueType();
8728 SDValue UndefVal = DAG.getPOISON(SValVT);
8729 SDLoc SL(Op);
8730
8732 VElts.push_back(SVal);
8733 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8734 VElts.push_back(UndefVal);
8735
8736 return DAG.getBuildVector(ResultVT, SL, VElts);
8737}
8738
8739SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8740 SelectionDAG &DAG) const {
8741 SDLoc SL(Op);
8742 EVT VT = Op.getValueType();
8743
8744 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8745 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8746
8747 SDValue Lo = Op.getOperand(0);
8748 SDValue Hi = Op.getOperand(1);
8749
8750 // Avoid adding defined bits with the zero_extend.
8751 if (Hi.isUndef()) {
8752 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8753 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8754 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8755 }
8756
8757 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8758 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8759
8760 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8761 DAG.getConstant(16, SL, MVT::i32));
8762 if (Lo.isUndef())
8763 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8764
8765 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8766 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8767
8768 SDValue Or =
8769 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8770 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8771 }
8772
8773 // Split into 2-element chunks.
8774 const unsigned NumParts = VT.getVectorNumElements() / 2;
8775 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8776 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8777
8779 for (unsigned P = 0; P < NumParts; ++P) {
8780 SDValue Vec = DAG.getBuildVector(
8781 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8782 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8783 }
8784
8785 SDValue Blend =
8786 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8787 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8788}
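// For illustration of the paths above: a v2i16 build_vector with both operands
// defined becomes (zext(hi) << 16) | zext(lo) viewed as i32, while a larger
// vector such as v4f16 is split into two v2f16 halves, each bitcast to i32,
// and rebuilt as a v2i32 that is finally bitcast back to v4f16.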
8789
 8790 bool SITargetLowering::isOffsetFoldingLegal(
 8791     const GlobalAddressSDNode *GA) const {
8792 // OSes that use ELF REL relocations (instead of RELA) can only store a
8793 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8794 // which can create arbitrary 64-bit addends. (This is only a problem for
8795 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8796 // the high 32 bits of the addend.)
8797 //
8798 // This should be kept in sync with how HasRelocationAddend is initialized in
8799 // the constructor of ELFAMDGPUAsmBackend.
8800 if (!Subtarget->isAmdHsaOS())
8801 return false;
8802
8803 // We can fold offsets for anything that doesn't require a GOT relocation.
8804 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8808}
8809
8810static SDValue
8812 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8813 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8814 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8815 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8816 // lowered to the following code sequence:
8817 //
8818 // For constant address space:
8819 // s_getpc_b64 s[0:1]
8820 // s_add_u32 s0, s0, $symbol
8821 // s_addc_u32 s1, s1, 0
8822 //
8823 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8824 // a fixup or relocation is emitted to replace $symbol with a literal
8825 // constant, which is a pc-relative offset from the encoding of the $symbol
8826 // operand to the global variable.
8827 //
8828 // For global address space:
8829 // s_getpc_b64 s[0:1]
8830 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8831 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8832 //
8833 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8834 // fixups or relocations are emitted to replace $symbol@*@lo and
8835 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8836 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8837 // operand to the global variable.
8838 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8839 assert(GAFlags != SIInstrInfo::MO_NONE);
8840
8841 SDValue Ptr =
8842 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8843 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8844 }
8845
8846 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8847 SDValue PtrHi;
8848 if (GAFlags == SIInstrInfo::MO_NONE)
8849 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8850 else
8851 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8852 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8853}
8854
8855SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8856 SDValue Op,
8857 SelectionDAG &DAG) const {
8858 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8859 SDLoc DL(GSD);
8860 EVT PtrVT = Op.getValueType();
8861
8862 const GlobalValue *GV = GSD->getGlobal();
8868 GV->hasExternalLinkage()) {
8869 Type *Ty = GV->getValueType();
 8870     // HIP uses an unsized array `extern __shared__ T s[]` or a similar
 8871     // zero-sized type in other languages to declare dynamic shared
 8872     // memory whose size is not known at compile time. It is allocated
 8873     // by the runtime and placed directly after the statically allocated
 8874     // ones, so all such declarations share the same offset.
8875 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8876 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8877 // Adjust alignment for that dynamic shared memory array.
8880 MFI->setUsesDynamicLDS(true);
8881 return SDValue(
8882 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8883 }
8884 }
8886 }
8887
8889 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8891 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8892 }
8893
8894 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8895 if (Subtarget->has64BitLiterals()) {
8897 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8898 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8899 0);
8900 }
8901
8902 SDValue AddrLo = DAG.getTargetGlobalAddress(
8903 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8904 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8905
8906 SDValue AddrHi = DAG.getTargetGlobalAddress(
8907 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8908 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8909
8910 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8911 }
8912
8913 if (shouldEmitFixup(GV))
8914 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8915
8916 if (shouldEmitPCReloc(GV))
8917 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8919
8920 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8922 PointerType *PtrTy =
8924 const DataLayout &DataLayout = DAG.getDataLayout();
8925 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8926 MachinePointerInfo PtrInfo =
8928
8929 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8932}
8933
 8934 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
 8935                                    const SDLoc &DL, SDValue V) const {
8936 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8937 // the destination register.
8938 //
8939 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8940 // so we will end up with redundant moves to m0.
8941 //
8942 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8943
8944 // A Null SDValue creates a glue result.
8945 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8946 V, Chain);
8947 return SDValue(M0, 0);
8948}
8949
8950SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8951 MVT VT,
8952 unsigned Offset) const {
8953 SDLoc SL(Op);
8954 SDValue Param = lowerKernargMemParameter(
8955 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
 8956   // The local size values will have the high 16 bits as zero.
8957 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8958 DAG.getValueType(VT));
8959}
8960
 8961 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
 8962                                         EVT VT) {
8965 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8966 return DAG.getPOISON(VT);
8967}
8968
 8969 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
 8970                                          EVT VT) {
8973 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8974 return DAG.getPOISON(VT);
8975}
8976
 8977 static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
 8978                                     ArrayRef<SDValue> Elts) {
8979 assert(!Elts.empty());
8980 MVT Type;
8981 unsigned NumElts = Elts.size();
8982
8983 if (NumElts <= 12) {
8984 Type = MVT::getVectorVT(MVT::f32, NumElts);
8985 } else {
8986 assert(Elts.size() <= 16);
8987 Type = MVT::v16f32;
8988 NumElts = 16;
8989 }
8990
8991 SmallVector<SDValue, 16> VecElts(NumElts);
8992 for (unsigned i = 0; i < Elts.size(); ++i) {
8993 SDValue Elt = Elts[i];
8994 if (Elt.getValueType() != MVT::f32)
8995 Elt = DAG.getBitcast(MVT::f32, Elt);
8996 VecElts[i] = Elt;
8997 }
8998 for (unsigned i = Elts.size(); i < NumElts; ++i)
8999 VecElts[i] = DAG.getPOISON(MVT::f32);
9000
9001 if (NumElts == 1)
9002 return VecElts[0];
9003 return DAG.getBuildVector(Type, DL, VecElts);
9004}
9005
9006static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9007 SDValue Src, int ExtraElts) {
9008 EVT SrcVT = Src.getValueType();
9009
9011
9012 if (SrcVT.isVector())
9013 DAG.ExtractVectorElements(Src, Elts);
9014 else
9015 Elts.push_back(Src);
9016
9017 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9018 while (ExtraElts--)
9019 Elts.push_back(Undef);
9020
9021 return DAG.getBuildVector(CastVT, DL, Elts);
9022}
9023
 9024 // Reconstruct the required return value for an image load intrinsic.
 9025 // This is more complicated due to the optional use of TexFailCtrl, which means
 9026 // the required return type is an aggregate.
 9027 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
 9028                                  ArrayRef<EVT> ResultTypes, bool IsTexFail,
9029 bool Unpacked, bool IsD16, int DMaskPop,
9030 int NumVDataDwords, bool IsAtomicPacked16Bit,
9031 const SDLoc &DL) {
9032 // Determine the required return type. This is the same regardless of
9033 // IsTexFail flag
9034 EVT ReqRetVT = ResultTypes[0];
9035 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9036 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9037 ? (ReqRetNumElts + 1) / 2
9038 : ReqRetNumElts;
9039
9040 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9041
9042 MVT DataDwordVT =
9043 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9044
9045 MVT MaskPopVT =
9046 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9047
9048 SDValue Data(Result, 0);
9049 SDValue TexFail;
9050
9051 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9052 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9053 if (MaskPopVT.isVector()) {
9054 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9055 SDValue(Result, 0), ZeroIdx);
9056 } else {
9057 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9058 SDValue(Result, 0), ZeroIdx);
9059 }
9060 }
9061
9062 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9063 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9064 NumDataDwords - MaskPopDwords);
9065
9066 if (IsD16)
9067 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9068
9069 EVT LegalReqRetVT = ReqRetVT;
9070 if (!ReqRetVT.isVector()) {
9071 if (!Data.getValueType().isInteger())
9072 Data = DAG.getNode(ISD::BITCAST, DL,
9073 Data.getValueType().changeTypeToInteger(), Data);
9074 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9075 } else {
9076 // We need to widen the return vector to a legal type
9077 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9078 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9079 LegalReqRetVT =
9081 ReqRetVT.getVectorNumElements() + 1);
9082 }
9083 }
9084 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9085
9086 if (IsTexFail) {
9087 TexFail =
9088 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9089 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9090
9091 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9092 }
9093
9094 if (Result->getNumValues() == 1)
9095 return Data;
9096
9097 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9098}
9099
9100static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9101 SDValue *LWE, bool &IsTexFail) {
9102 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9103
9104 uint64_t Value = TexFailCtrlConst->getZExtValue();
9105 if (Value) {
9106 IsTexFail = true;
9107 }
9108
9109 SDLoc DL(TexFailCtrlConst);
9110 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9111 Value &= ~(uint64_t)0x1;
9112 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9113 Value &= ~(uint64_t)0x2;
9114
9115 return Value == 0;
9116}
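// For example, a TexFailCtrl value of 3 sets both TFE (bit 0) and LWE (bit 1)
// and marks IsTexFail; any bits beyond [1:0] make this helper return false, in
// which case lowerImage gives up and returns the original operation.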
9117
 9118 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
 9119                                       MVT PackVectorVT,
9120 SmallVectorImpl<SDValue> &PackedAddrs,
9121 unsigned DimIdx, unsigned EndIdx,
9122 unsigned NumGradients) {
9123 SDLoc DL(Op);
9124 for (unsigned I = DimIdx; I < EndIdx; I++) {
9125 SDValue Addr = Op.getOperand(I);
9126
9127 // Gradients are packed with undef for each coordinate.
9128 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9129 // 1D: undef,dx/dh; undef,dx/dv
9130 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9131 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9132 if (((I + 1) >= EndIdx) ||
9133 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9134 I == DimIdx + NumGradients - 1))) {
9135 if (Addr.getValueType() != MVT::i16)
9136 Addr = DAG.getBitcast(MVT::i16, Addr);
9137 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9138 } else {
9139 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9140 I++;
9141 }
9142 Addr = DAG.getBitcast(MVT::f32, Addr);
9143 PackedAddrs.push_back(Addr);
9144 }
9145}
9146
9147SDValue SITargetLowering::lowerImage(SDValue Op,
9149 SelectionDAG &DAG, bool WithChain) const {
9150 SDLoc DL(Op);
9151 MachineFunction &MF = DAG.getMachineFunction();
9152 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9153 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9155 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9156 unsigned IntrOpcode = Intr->BaseOpcode;
9157 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9158 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9159 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9160
9161 SmallVector<EVT, 3> ResultTypes(Op->values());
9162 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9163 bool IsD16 = false;
9164 bool IsG16 = false;
9165 bool IsA16 = false;
9166 SDValue VData;
9167 int NumVDataDwords = 0;
9168 bool AdjustRetType = false;
9169 bool IsAtomicPacked16Bit = false;
9170
9171 // Offset of intrinsic arguments
9172 const unsigned ArgOffset = WithChain ? 2 : 1;
9173
9174 unsigned DMask;
9175 unsigned DMaskLanes = 0;
9176
9177 if (BaseOpcode->Atomic) {
9178 VData = Op.getOperand(2);
9179
9180 IsAtomicPacked16Bit =
9181 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9182 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9183
9184 bool Is64Bit = VData.getValueSizeInBits() == 64;
9185 if (BaseOpcode->AtomicX2) {
9186 SDValue VData2 = Op.getOperand(3);
9187 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9188 {VData, VData2});
9189 if (Is64Bit)
9190 VData = DAG.getBitcast(MVT::v4i32, VData);
9191
9192 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9193 DMask = Is64Bit ? 0xf : 0x3;
9194 NumVDataDwords = Is64Bit ? 4 : 2;
9195 } else {
9196 DMask = Is64Bit ? 0x3 : 0x1;
9197 NumVDataDwords = Is64Bit ? 2 : 1;
9198 }
9199 } else {
9200 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9201 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9202
9203 if (BaseOpcode->Store) {
9204 VData = Op.getOperand(2);
9205
9206 MVT StoreVT = VData.getSimpleValueType();
9207 if (StoreVT.getScalarType() == MVT::f16) {
9208 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9209 return Op; // D16 is unsupported for this instruction
9210
9211 IsD16 = true;
9212 VData = handleD16VData(VData, DAG, true);
9213 }
9214
9215 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9216 } else if (!BaseOpcode->NoReturn) {
 9217       // Work out the number of dwords based on the dmask popcount, the
 9218       // underlying type, and whether packing is supported.
9219 MVT LoadVT = ResultTypes[0].getSimpleVT();
9220 if (LoadVT.getScalarType() == MVT::f16) {
9221 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9222 return Op; // D16 is unsupported for this instruction
9223
9224 IsD16 = true;
9225 }
9226
9227 // Confirm that the return type is large enough for the dmask specified
9228 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9229 (!LoadVT.isVector() && DMaskLanes > 1))
9230 return Op;
9231
 9232       // The SQ block of gfx8 and gfx9 does not estimate register use correctly
9233 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9234 // instructions.
9235 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9236 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9237 NumVDataDwords = (DMaskLanes + 1) / 2;
9238 else
9239 NumVDataDwords = DMaskLanes;
9240
9241 AdjustRetType = true;
9242 }
9243 }
9244
9245 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9247
 9248   // Check for 16-bit addresses or derivatives and pack them if present.
9249 MVT VAddrVT =
9250 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9251 MVT VAddrScalarVT = VAddrVT.getScalarType();
9252 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9253 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9254
9255 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9256 VAddrScalarVT = VAddrVT.getScalarType();
9257 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9258 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9259
9260 // Push back extra arguments.
9261 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9262 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9263 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9264 // Special handling of bias when A16 is on. Bias is of type half but
9265 // occupies full 32-bit.
9266 SDValue Bias = DAG.getBuildVector(
9267 MVT::v2f16, DL,
9268 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9269 VAddrs.push_back(Bias);
9270 } else {
9271 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9272 "Bias needs to be converted to 16 bit in A16 mode");
9273 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9274 }
9275 }
9276
9277 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
 9278     // 16-bit gradients are supported, but are tied to the A16 control,
 9279     // so both gradients and addresses must be 16 bit.
9280 LLVM_DEBUG(
9281 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9282 "require 16 bit args for both gradients and addresses");
9283 return Op;
9284 }
9285
9286 if (IsA16) {
9287 if (!ST->hasA16()) {
9288 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9289 "support 16 bit addresses\n");
9290 return Op;
9291 }
9292 }
9293
 9294   // We've dealt with incorrect input, so we know that if IsA16 or IsG16
 9295   // is set then we have to compress/pack operands (either addresses,
 9296   // gradients, or both).
 9297   // In the case where A16 and gradients are tied (no G16 support), we
 9298   // have already verified that both IsA16 and IsG16 are true.
9299 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9300 // Activate g16
9301 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9303 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9304 }
9305
9306 // Add gradients (packed or unpacked)
9307 if (IsG16) {
9308 // Pack the gradients
9309 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9310 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9311 ArgOffset + Intr->GradientStart,
9312 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9313 } else {
9314 for (unsigned I = ArgOffset + Intr->GradientStart;
9315 I < ArgOffset + Intr->CoordStart; I++)
9316 VAddrs.push_back(Op.getOperand(I));
9317 }
9318
9319 // Add addresses (packed or unpacked)
9320 if (IsA16) {
9321 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9322 ArgOffset + Intr->CoordStart, VAddrEnd,
9323 0 /* No gradients */);
9324 } else {
9325 // Add uncompressed address
9326 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9327 VAddrs.push_back(Op.getOperand(I));
9328 }
9329
9330 // If the register allocator cannot place the address registers contiguously
9331 // without introducing moves, then using the non-sequential address encoding
9332 // is always preferable, since it saves VALU instructions and is usually a
9333 // wash in terms of code size or even better.
9334 //
9335 // However, we currently have no way of hinting to the register allocator that
9336 // MIMG addresses should be placed contiguously when it is possible to do so,
9337 // so force non-NSA for the common 2-address case as a heuristic.
9338 //
9339 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9340 // allocation when possible.
9341 //
9342 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9343 // set of the remaining addresses.
9344 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9345 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9346 const bool UseNSA = ST->hasNSAEncoding() &&
9347 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9348 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9349 const bool UsePartialNSA =
9350 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9351
9352 SDValue VAddr;
9353 if (UsePartialNSA) {
9354 VAddr = getBuildDwordsVector(DAG, DL,
9355 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9356 } else if (!UseNSA) {
9357 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9358 }
9359
9360 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9361 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9362 SDValue Unorm;
9363 if (!BaseOpcode->Sampler) {
9364 Unorm = True;
9365 } else {
9366 uint64_t UnormConst =
9367 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9368
9369 Unorm = UnormConst ? True : False;
9370 }
9371
9372 SDValue TFE;
9373 SDValue LWE;
9374 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9375 bool IsTexFail = false;
9376 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9377 return Op;
9378
9379 if (IsTexFail) {
9380 if (!DMaskLanes) {
 9381       // Expecting to get an error flag since TFC is on and dmask is 0.
 9382       // Force dmask to be at least 1, otherwise the instruction will fail.
9383 DMask = 0x1;
9384 DMaskLanes = 1;
9385 NumVDataDwords = 1;
9386 }
9387 NumVDataDwords += 1;
9388 AdjustRetType = true;
9389 }
9390
 9391   // Something earlier tagged that the return type needs adjusting.
 9392   // This happens if the instruction is a load or has TexFailCtrl flags set.
9393 if (AdjustRetType) {
9394 // NumVDataDwords reflects the true number of dwords required in the return
9395 // type
9396 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9397 // This is a no-op load. This can be eliminated
9398 SDValue Undef = DAG.getPOISON(Op.getValueType());
9399 if (isa<MemSDNode>(Op))
9400 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9401 return Undef;
9402 }
9403
9404 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9405 MVT::i32, NumVDataDwords)
9406 : MVT::i32;
9407
9408 ResultTypes[0] = NewVT;
9409 if (ResultTypes.size() == 3) {
9410 // Original result was aggregate type used for TexFailCtrl results
9411 // The actual instruction returns as a vector type which has now been
9412 // created. Remove the aggregate result.
9413 ResultTypes.erase(&ResultTypes[1]);
9414 }
9415 }
9416
9417 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9418 if (BaseOpcode->Atomic)
9419 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9420 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9422 return Op;
9423
9425 if (BaseOpcode->Store || BaseOpcode->Atomic)
9426 Ops.push_back(VData); // vdata
9427 if (UsePartialNSA) {
9428 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9429 Ops.push_back(VAddr);
9430 } else if (UseNSA)
9431 append_range(Ops, VAddrs);
9432 else
9433 Ops.push_back(VAddr);
9434 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9435 EVT RsrcVT = Rsrc.getValueType();
9436 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9437 return Op;
9438 Ops.push_back(Rsrc);
9439 if (BaseOpcode->Sampler) {
9440 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9441 if (Samp.getValueType() != MVT::v4i32)
9442 return Op;
9443 Ops.push_back(Samp);
9444 }
9445 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9446 if (IsGFX10Plus)
9447 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9448 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9449 Ops.push_back(Unorm);
9450 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9451 Ops.push_back(IsA16 && // r128, a16 for gfx9
9452 ST->hasFeature(AMDGPU::FeatureR128A16)
9453 ? True
9454 : False);
9455 if (IsGFX10Plus)
9456 Ops.push_back(IsA16 ? True : False);
9457
9458 if (!Subtarget->hasGFX90AInsts())
9459 Ops.push_back(TFE); // tfe
9460 else if (TFE->getAsZExtVal()) {
9461 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9463 "TFE is not supported on this GPU", DL.getDebugLoc()));
9464 }
9465
9466 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9467 Ops.push_back(LWE); // lwe
9468 if (!IsGFX10Plus)
9469 Ops.push_back(DimInfo->DA ? True : False);
9470 if (BaseOpcode->HasD16)
9471 Ops.push_back(IsD16 ? True : False);
9472 if (isa<MemSDNode>(Op))
9473 Ops.push_back(Op.getOperand(0)); // chain
9474
9475 int NumVAddrDwords =
9476 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9477 int Opcode = -1;
9478
9479 if (IsGFX12Plus) {
9480 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9481 NumVDataDwords, NumVAddrDwords);
9482 } else if (IsGFX11Plus) {
9483 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9484 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9485 : AMDGPU::MIMGEncGfx11Default,
9486 NumVDataDwords, NumVAddrDwords);
9487 } else if (IsGFX10Plus) {
9488 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9489 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9490 : AMDGPU::MIMGEncGfx10Default,
9491 NumVDataDwords, NumVAddrDwords);
9492 } else {
9493 if (Subtarget->hasGFX90AInsts()) {
9494 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9495 NumVDataDwords, NumVAddrDwords);
9496 if (Opcode == -1) {
9497 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9499 "requested image instruction is not supported on this GPU",
9500 DL.getDebugLoc()));
9501
9502 unsigned Idx = 0;
9503 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9504 for (EVT VT : OrigResultTypes) {
9505 if (VT == MVT::Other)
9506 RetValues[Idx++] = Op.getOperand(0); // Chain
9507 else
9508 RetValues[Idx++] = DAG.getPOISON(VT);
9509 }
9510
9511 return DAG.getMergeValues(RetValues, DL);
9512 }
9513 }
9514 if (Opcode == -1 &&
9515 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9516 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9517 NumVDataDwords, NumVAddrDwords);
9518 if (Opcode == -1)
9519 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9520 NumVDataDwords, NumVAddrDwords);
9521 }
9522 if (Opcode == -1)
9523 return Op;
9524
9525 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9526 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9527 MachineMemOperand *MemRef = MemOp->getMemOperand();
9528 DAG.setNodeMemRefs(NewNode, {MemRef});
9529 }
9530
9531 if (BaseOpcode->AtomicX2) {
9533 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9534 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9535 }
9536 if (BaseOpcode->NoReturn)
9537 return SDValue(NewNode, 0);
9538 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9539 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9540 NumVDataDwords, IsAtomicPacked16Bit, DL);
9541}
9542
9543SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9544 SDValue Offset, SDValue CachePolicy,
9545 SelectionDAG &DAG) const {
9546 MachineFunction &MF = DAG.getMachineFunction();
9547
9548 const DataLayout &DataLayout = DAG.getDataLayout();
9549 Align Alignment =
9550 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9551
9552 MachineMemOperand *MMO = MF.getMachineMemOperand(
9553 MachinePointerInfo(),
9556 VT.getStoreSize(), Alignment);
9557
9558 if (!Offset->isDivergent()) {
9559 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9560
9561 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9562 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9563 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9564 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9565 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9566 SDValue BufferLoad =
9568 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9569 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9570 }
9571
9572 // Widen vec3 load to vec4.
9573 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9574 !Subtarget->hasScalarDwordx3Loads()) {
9575 EVT WidenedVT =
9577 auto WidenedOp = DAG.getMemIntrinsicNode(
9578 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9579 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9580 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9581 DAG.getVectorIdxConstant(0, DL));
9582 return Subvector;
9583 }
9584
9586 DAG.getVTList(VT), Ops, VT, MMO);
9587 }
9588
9589 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9590 // assume that the buffer is unswizzled.
9591 SDValue Ops[] = {
9592 DAG.getEntryNode(), // Chain
9593 Rsrc, // rsrc
9594 DAG.getConstant(0, DL, MVT::i32), // vindex
9595 {}, // voffset
9596 {}, // soffset
9597 {}, // offset
9598 CachePolicy, // cachepolicy
9599 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9600 };
9601 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9602 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9603 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9604 }
9605
9607 unsigned NumLoads = 1;
9608 MVT LoadVT = VT.getSimpleVT();
9609 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9610 assert((LoadVT.getScalarType() == MVT::i32 ||
9611 LoadVT.getScalarType() == MVT::f32));
9612
9613 if (NumElts == 8 || NumElts == 16) {
9614 NumLoads = NumElts / 4;
9615 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9616 }
9617
9618 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9619
9620 // Use the alignment to ensure that the required offsets will fit into the
9621 // immediate offsets.
9622 setBufferOffsets(Offset, DAG, &Ops[3],
9623 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9624
9625 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9626 for (unsigned i = 0; i < NumLoads; ++i) {
9627 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9628 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9629 LoadVT, MMO, DAG));
9630 }
9631
9632 if (NumElts == 8 || NumElts == 16)
9633 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9634
9635 return Loads[0];
9636}
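// As a sketch of the divergent-offset path above: an s.buffer.load of v8f32
// with a divergent offset becomes two BUFFER_LOAD nodes of v4f32, with the
// offsets aligned to Align(16 * NumLoads) so each 16-byte piece fits in the
// immediate offset field, and the pieces are then concatenated back into the
// original v8f32 result.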
9637
9638SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9639 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9640 if (!Subtarget->hasArchitectedSGPRs())
9641 return {};
9642 SDLoc SL(Op);
9643 MVT VT = MVT::i32;
9644 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9645 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9646 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9647}
9648
9649SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9650 AMDGPU::Hwreg::Id HwReg,
9651 unsigned LowBit,
9652 unsigned Width) const {
9653 SDLoc SL(Op);
9654 using namespace AMDGPU::Hwreg;
9655 return {DAG.getMachineNode(
9656 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9657 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9658 SL, MVT::i32)),
9659 0};
9660}
9661
9662SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9663 unsigned Dim,
9664 const ArgDescriptor &Arg) const {
9665 SDLoc SL(Op);
9666 MachineFunction &MF = DAG.getMachineFunction();
9667 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9668 if (MaxID == 0)
9669 return DAG.getConstant(0, SL, MVT::i32);
9670
9671 // It's undefined behavior if a function marked with the amdgpu-no-*
9672 // attributes uses the corresponding intrinsic.
9673 if (!Arg)
9674 return DAG.getPOISON(Op->getValueType(0));
9675
9676 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9677 SDLoc(DAG.getEntryNode()), Arg);
9678
9679 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9680 // masking operations anyway.
9681 //
9682 // TODO: We could assert the top bit is 0 for the source copy.
9683 if (Arg.isMasked())
9684 return Val;
9685
9686 // Preserve the known bits after expansion to a copy.
9687 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9688 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9689 DAG.getValueType(SmallVT));
9690}
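// For example, with a known maximum workitem ID of 63 in the requested
// dimension, llvm::bit_width(63) == 6, so the copied VGPR value is wrapped in
// an AssertZext asserting a 6-bit value, letting later combines assume the
// high 26 bits are zero.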
9691
9692SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9693 SelectionDAG &DAG) const {
9694 MachineFunction &MF = DAG.getMachineFunction();
9695 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9696
9697 EVT VT = Op.getValueType();
9698 SDLoc DL(Op);
9699 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9700
9701 // TODO: Should this propagate fast-math-flags?
9702
9703 switch (IntrinsicID) {
9704 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9705 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9706 return emitNonHSAIntrinsicError(DAG, DL, VT);
9707 return getPreloadedValue(DAG, *MFI, VT,
9709 }
9710 case Intrinsic::amdgcn_dispatch_ptr:
9711 case Intrinsic::amdgcn_queue_ptr: {
9712 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9713 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9714 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9715 DL.getDebugLoc()));
9716 return DAG.getPOISON(VT);
9717 }
9718
9719 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9722 return getPreloadedValue(DAG, *MFI, VT, RegID);
9723 }
9724 case Intrinsic::amdgcn_implicitarg_ptr: {
9725 if (MFI->isEntryFunction())
9726 return getImplicitArgPtr(DAG, DL);
9727 return getPreloadedValue(DAG, *MFI, VT,
9729 }
9730 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9732 // This only makes sense to call in a kernel, so just lower to null.
9733 return DAG.getConstant(0, DL, VT);
9734 }
9735
9736 return getPreloadedValue(DAG, *MFI, VT,
9738 }
9739 case Intrinsic::amdgcn_dispatch_id: {
9740 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9741 }
9742 case Intrinsic::amdgcn_rcp:
9743 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9744 case Intrinsic::amdgcn_rsq:
9745 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9746 case Intrinsic::amdgcn_rsq_legacy:
9747 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9748 return emitRemovedIntrinsicError(DAG, DL, VT);
9749 return SDValue();
9750 case Intrinsic::amdgcn_rcp_legacy:
9751 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9752 return emitRemovedIntrinsicError(DAG, DL, VT);
9753 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9754 case Intrinsic::amdgcn_rsq_clamp: {
9755 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9756 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9757
9758 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9759 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9760 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9761
9762 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9763 SDValue Tmp =
9764 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9765 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9766 DAG.getConstantFP(Min, DL, VT));
9767 }
9768 case Intrinsic::r600_read_ngroups_x:
9769 if (Subtarget->isAmdHsaOS())
9770 return emitNonHSAIntrinsicError(DAG, DL, VT);
9771
9772 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9774 false);
9775 case Intrinsic::r600_read_ngroups_y:
9776 if (Subtarget->isAmdHsaOS())
9777 return emitNonHSAIntrinsicError(DAG, DL, VT);
9778
9779 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9781 false);
9782 case Intrinsic::r600_read_ngroups_z:
9783 if (Subtarget->isAmdHsaOS())
9784 return emitNonHSAIntrinsicError(DAG, DL, VT);
9785
9786 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9788 false);
9789 case Intrinsic::r600_read_local_size_x:
9790 if (Subtarget->isAmdHsaOS())
9791 return emitNonHSAIntrinsicError(DAG, DL, VT);
9792
9793 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9795 case Intrinsic::r600_read_local_size_y:
9796 if (Subtarget->isAmdHsaOS())
9797 return emitNonHSAIntrinsicError(DAG, DL, VT);
9798
9799 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9801 case Intrinsic::r600_read_local_size_z:
9802 if (Subtarget->isAmdHsaOS())
9803 return emitNonHSAIntrinsicError(DAG, DL, VT);
9804
9805 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9807 case Intrinsic::amdgcn_workgroup_id_x:
9808 return lowerWorkGroupId(DAG, *MFI, VT,
9812 case Intrinsic::amdgcn_workgroup_id_y:
9813 return lowerWorkGroupId(DAG, *MFI, VT,
9817 case Intrinsic::amdgcn_workgroup_id_z:
9818 return lowerWorkGroupId(DAG, *MFI, VT,
9822 case Intrinsic::amdgcn_cluster_id_x:
9823 return Subtarget->hasClusters()
9824 ? getPreloadedValue(DAG, *MFI, VT,
9826 : DAG.getPOISON(VT);
9827 case Intrinsic::amdgcn_cluster_id_y:
9828 return Subtarget->hasClusters()
9829 ? getPreloadedValue(DAG, *MFI, VT,
9831 : DAG.getPOISON(VT);
9832 case Intrinsic::amdgcn_cluster_id_z:
9833 return Subtarget->hasClusters()
9834 ? getPreloadedValue(DAG, *MFI, VT,
9836 : DAG.getPOISON(VT);
9837 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9838 return Subtarget->hasClusters()
9839 ? getPreloadedValue(
9840 DAG, *MFI, VT,
9842 : DAG.getPOISON(VT);
9843 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9844 return Subtarget->hasClusters()
9845 ? getPreloadedValue(
9846 DAG, *MFI, VT,
9848 : DAG.getPOISON(VT);
9849 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9850 return Subtarget->hasClusters()
9851 ? getPreloadedValue(
9852 DAG, *MFI, VT,
9854 : DAG.getPOISON(VT);
9855 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9856 return Subtarget->hasClusters()
9857 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9858 : SDValue();
9859 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9860 return Subtarget->hasClusters()
9861 ? getPreloadedValue(
9862 DAG, *MFI, VT,
9864 : DAG.getPOISON(VT);
9865 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9866 return Subtarget->hasClusters()
9867 ? getPreloadedValue(
9868 DAG, *MFI, VT,
9870 : DAG.getPOISON(VT);
9871 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9872 return Subtarget->hasClusters()
9873 ? getPreloadedValue(
9874 DAG, *MFI, VT,
9876 : DAG.getPOISON(VT);
9877 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9878 return Subtarget->hasClusters()
9879 ? getPreloadedValue(
9880 DAG, *MFI, VT,
9882 : DAG.getPOISON(VT);
9883 case Intrinsic::amdgcn_wave_id:
9884 return lowerWaveID(DAG, Op);
9885 case Intrinsic::amdgcn_lds_kernel_id: {
9886 if (MFI->isEntryFunction())
9887 return getLDSKernelId(DAG, DL);
9888 return getPreloadedValue(DAG, *MFI, VT,
9889 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9890 }
9891 case Intrinsic::amdgcn_workitem_id_x:
9892 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9893 case Intrinsic::amdgcn_workitem_id_y:
9894 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9895 case Intrinsic::amdgcn_workitem_id_z:
9896 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9897 case Intrinsic::amdgcn_wavefrontsize:
9898 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9899 SDLoc(Op), MVT::i32);
9900 case Intrinsic::amdgcn_s_buffer_load: {
9901 unsigned CPol = Op.getConstantOperandVal(3);
9902 // s_buffer_load, because of how it's optimized, can't be volatile
9903 // so reject ones with the volatile bit set.
9904 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9905 ? AMDGPU::CPol::ALL
9906 : AMDGPU::CPol::ALL_pregfx12))
9907 return Op;
9908 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9909 Op.getOperand(3), DAG);
9910 }
9911 case Intrinsic::amdgcn_fdiv_fast:
9912 return lowerFDIV_FAST(Op, DAG);
9913 case Intrinsic::amdgcn_sin:
9914 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9915
9916 case Intrinsic::amdgcn_cos:
9917 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9918
9919 case Intrinsic::amdgcn_mul_u24:
9920 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9921 Op.getOperand(2));
9922 case Intrinsic::amdgcn_mul_i24:
9923 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9924 Op.getOperand(2));
9925
9926 case Intrinsic::amdgcn_log_clamp: {
9927 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9928 return SDValue();
9929
9930 return emitRemovedIntrinsicError(DAG, DL, VT);
9931 }
9932 case Intrinsic::amdgcn_fract:
9933 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9934
9935 case Intrinsic::amdgcn_class:
9936 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9937 Op.getOperand(2));
9938 case Intrinsic::amdgcn_div_fmas:
9939 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9940 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9941
9942 case Intrinsic::amdgcn_div_fixup:
9943 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9944 Op.getOperand(2), Op.getOperand(3));
9945
9946 case Intrinsic::amdgcn_div_scale: {
9947 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9948
9949 // Translate to the operands expected by the machine instruction. The
9950 // first operand must match whichever source the constant operand selects.
9951 SDValue Numerator = Op.getOperand(1);
9952 SDValue Denominator = Op.getOperand(2);
9953
9954 // Note this order is the opposite of the machine instruction's operand order,
9955 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9956 // intrinsic has the numerator as the first operand to match a normal
9957 // division operation.
9958
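// For example, a true (all-ones) third operand selects the numerator as Src0,
// while false selects the denominator.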
9959 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9960
9961 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9962 Denominator, Numerator);
9963 }
9964 case Intrinsic::amdgcn_icmp: {
9965 // There is a Pat that handles this variant, so return it as-is.
9966 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9967 Op.getConstantOperandVal(2) == 0 &&
9968 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9969 return Op;
9970 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9971 }
9972 case Intrinsic::amdgcn_fcmp: {
9973 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9974 }
9975 case Intrinsic::amdgcn_ballot:
9976 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9977 case Intrinsic::amdgcn_fmed3:
9978 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9979 Op.getOperand(2), Op.getOperand(3));
9980 case Intrinsic::amdgcn_fdot2:
9981 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9982 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9983 case Intrinsic::amdgcn_fmul_legacy:
9984 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9985 Op.getOperand(2));
9986 case Intrinsic::amdgcn_sffbh:
9987 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9988 case Intrinsic::amdgcn_sbfe:
9989 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9990 Op.getOperand(2), Op.getOperand(3));
9991 case Intrinsic::amdgcn_ubfe:
9992 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9993 Op.getOperand(2), Op.getOperand(3));
9994 case Intrinsic::amdgcn_cvt_pkrtz:
9995 case Intrinsic::amdgcn_cvt_pknorm_i16:
9996 case Intrinsic::amdgcn_cvt_pknorm_u16:
9997 case Intrinsic::amdgcn_cvt_pk_i16:
9998 case Intrinsic::amdgcn_cvt_pk_u16: {
9999 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10000 EVT VT = Op.getValueType();
10001 unsigned Opcode;
10002
10003 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10004 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10005 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10006 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10007 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10008 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10009 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10010 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10011 else
10012 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10013
10014 if (isTypeLegal(VT))
10015 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10016
10017 SDValue Node =
10018 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10019 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10020 }
10021 case Intrinsic::amdgcn_fmad_ftz:
10022 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10023 Op.getOperand(2), Op.getOperand(3));
10024
10025 case Intrinsic::amdgcn_if_break:
10026 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10027 Op->getOperand(1), Op->getOperand(2)),
10028 0);
10029
10030 case Intrinsic::amdgcn_groupstaticsize: {
10031 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10032 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10033 return Op;
10034
10035 const Module *M = MF.getFunction().getParent();
10036 const GlobalValue *GV =
10037 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10038 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10039 SIInstrInfo::MO_ABS32_LO);
10040 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10041 }
10042 case Intrinsic::amdgcn_is_shared:
10043 case Intrinsic::amdgcn_is_private: {
10044 SDLoc SL(Op);
10045 SDValue SrcVec =
10046 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10047 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10048 DAG.getConstant(1, SL, MVT::i32));
10049
10050 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10051 ? AMDGPUAS::LOCAL_ADDRESS
10052 : AMDGPUAS::PRIVATE_ADDRESS;
10053 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10054 Subtarget->hasGloballyAddressableScratch()) {
10055 SDValue FlatScratchBaseHi(
10056 DAG.getMachineNode(
10057 AMDGPU::S_MOV_B32, DL, MVT::i32,
10058 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10059 0);
10060 // Test bits 63..58 against the aperture address.
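// If those bits match SRC_FLAT_SCRATCH_BASE_HI, the xor leaves zeros in bits
// 31..26 of the high dword, so the result compares unsigned-less-than 1 << 26.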
10061 return DAG.getSetCC(
10062 SL, MVT::i1,
10063 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10064 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10065 }
10066
10067 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10068 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10069 }
10070 case Intrinsic::amdgcn_perm:
10071 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10072 Op.getOperand(2), Op.getOperand(3));
10073 case Intrinsic::amdgcn_reloc_constant: {
10074 Module *M = MF.getFunction().getParent();
10075 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10076 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10077 auto *RelocSymbol = cast<GlobalVariable>(
10078 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10079 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10080 SIInstrInfo::MO_ABS32_LO);
10081 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10082 }
10083 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10084 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10085 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10086 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10087 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10088 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10089 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10090 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
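// Re-emit the intrinsic with the index-key operand any-extended or truncated
// to i32 if it arrives as some other type, so selection only has to handle one
// key width.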
10091 if (Op.getOperand(4).getValueType() == MVT::i32)
10092 return SDValue();
10093
10094 SDLoc SL(Op);
10095 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10096 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10097 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10098 Op.getOperand(3), IndexKeyi32);
10099 }
10100 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10101 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10102 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10103 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10104 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10105 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10106 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10107 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10108 if (Op.getOperand(4).getValueType() == MVT::i64)
10109 return SDValue();
10110
10111 SDLoc SL(Op);
10112 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10113 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10114 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10115 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10116 Op.getOperand(6)});
10117 }
10118 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10119 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10120 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10121 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10122 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10123 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10124 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10125 ? MVT::i64
10126 : MVT::i32;
10127 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10128 return SDValue();
10129
10130 SDLoc SL(Op);
10131 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10132 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10133 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10134 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10135 IndexKey, Op.getOperand(7),
10136 Op.getOperand(8)}); // No clamp operand
10137 }
10138 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10139 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10140 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10141 if (Op.getOperand(6).getValueType() == MVT::i32)
10142 return SDValue();
10143
10144 SDLoc SL(Op);
10145 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10146 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10147 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10148 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10149 IndexKeyi32, Op.getOperand(7)});
10150 }
10151 case Intrinsic::amdgcn_addrspacecast_nonnull:
10152 return lowerADDRSPACECAST(Op, DAG);
10153 case Intrinsic::amdgcn_readlane:
10154 case Intrinsic::amdgcn_readfirstlane:
10155 case Intrinsic::amdgcn_writelane:
10156 case Intrinsic::amdgcn_permlane16:
10157 case Intrinsic::amdgcn_permlanex16:
10158 case Intrinsic::amdgcn_permlane64:
10159 case Intrinsic::amdgcn_set_inactive:
10160 case Intrinsic::amdgcn_set_inactive_chain_arg:
10161 case Intrinsic::amdgcn_mov_dpp8:
10162 case Intrinsic::amdgcn_update_dpp:
10163 return lowerLaneOp(*this, Op.getNode(), DAG);
10164 case Intrinsic::amdgcn_dead: {
10166 for (const EVT ValTy : Op.getNode()->values())
10167 Poisons.push_back(DAG.getPOISON(ValTy));
10168 return DAG.getMergeValues(Poisons, SDLoc(Op));
10169 }
10170 default:
10171 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10172 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10173 return lowerImage(Op, ImageDimIntr, DAG, false);
10174
10175 return Op;
10176 }
10177}
10178
10179 // On targets that do not support a constant in the soffset field, turn zero into
10180// SGPR_NULL to avoid generating an extra s_mov with zero.
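// For example, a raw buffer intrinsic invoked with soffset == 0 is then
// selected with the SGPR_NULL operand instead of materializing the zero into
// an SGPR.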
10181 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10182 const GCNSubtarget *Subtarget) {
10183 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10184 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10185 return SOffset;
10186}
10187
10188SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10189 SelectionDAG &DAG,
10190 unsigned NewOpcode) const {
10191 SDLoc DL(Op);
10192
10193 SDValue VData = Op.getOperand(2);
10194 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10195 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10196 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10197 SDValue Ops[] = {
10198 Op.getOperand(0), // Chain
10199 VData, // vdata
10200 Rsrc, // rsrc
10201 DAG.getConstant(0, DL, MVT::i32), // vindex
10202 VOffset, // voffset
10203 SOffset, // soffset
10204 Offset, // offset
10205 Op.getOperand(6), // cachepolicy
10206 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10207 };
10208
10209 auto *M = cast<MemSDNode>(Op);
10210
10211 EVT MemVT = VData.getValueType();
10212 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10213 M->getMemOperand());
10214}
10215
10216SDValue
10217SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10218 unsigned NewOpcode) const {
10219 SDLoc DL(Op);
10220
10221 SDValue VData = Op.getOperand(2);
10222 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10223 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10224 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10225 SDValue Ops[] = {
10226 Op.getOperand(0), // Chain
10227 VData, // vdata
10228 Rsrc, // rsrc
10229 Op.getOperand(4), // vindex
10230 VOffset, // voffset
10231 SOffset, // soffset
10232 Offset, // offset
10233 Op.getOperand(7), // cachepolicy
10234 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10235 };
10236
10237 auto *M = cast<MemSDNode>(Op);
10238
10239 EVT MemVT = VData.getValueType();
10240 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10241 M->getMemOperand());
10242}
10243
10244SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10245 SelectionDAG &DAG) const {
10246 unsigned IntrID = Op.getConstantOperandVal(1);
10247 SDLoc DL(Op);
10248
10249 switch (IntrID) {
10250 case Intrinsic::amdgcn_ds_ordered_add:
10251 case Intrinsic::amdgcn_ds_ordered_swap: {
10252 MemSDNode *M = cast<MemSDNode>(Op);
10253 SDValue Chain = M->getOperand(0);
10254 SDValue M0 = M->getOperand(2);
10255 SDValue Value = M->getOperand(3);
10256 unsigned IndexOperand = M->getConstantOperandVal(7);
10257 unsigned WaveRelease = M->getConstantOperandVal(8);
10258 unsigned WaveDone = M->getConstantOperandVal(9);
10259
10260 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10261 IndexOperand &= ~0x3f;
10262 unsigned CountDw = 0;
10263
10264 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10265 CountDw = (IndexOperand >> 24) & 0xf;
10266 IndexOperand &= ~(0xf << 24);
10267
10268 if (CountDw < 1 || CountDw > 4) {
10269 const Function &Fn = DAG.getMachineFunction().getFunction();
10270 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10271 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10272 DL.getDebugLoc()));
10273 CountDw = 1;
10274 }
10275 }
10276
10277 if (IndexOperand) {
10278 const Function &Fn = DAG.getMachineFunction().getFunction();
10279 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10280 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10281 }
10282
10283 if (WaveDone && !WaveRelease) {
10284 // TODO: Move this to IR verifier
10285 const Function &Fn = DAG.getMachineFunction().getFunction();
10286 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10287 Fn, "ds_ordered_count: wave_done requires wave_release",
10288 DL.getDebugLoc()));
10289 }
10290
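// Pack the ds_ordered_count immediate: the ordered-count index (scaled by 4)
// goes in bits [7:0]; wave_release, wave_done, the shader type (pre-GFX11),
// the add/swap selector and the dword count - 1 (GFX10+) are packed into
// bits [15:8] below.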
10291 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10292 unsigned ShaderType =
10293 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10294 unsigned Offset0 = OrderedCountIndex << 2;
10295 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10296
10297 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10298 Offset1 |= (CountDw - 1) << 6;
10299
10300 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10301 Offset1 |= ShaderType << 2;
10302
10303 unsigned Offset = Offset0 | (Offset1 << 8);
10304
10305 SDValue Ops[] = {
10306 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10307 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10308 };
10309 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10310 M->getVTList(), Ops, M->getMemoryVT(),
10311 M->getMemOperand());
10312 }
10313 case Intrinsic::amdgcn_raw_buffer_load:
10314 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10315 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10316 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10317 case Intrinsic::amdgcn_raw_buffer_load_format:
10318 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10319 const bool IsFormat =
10320 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10321 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10322
10323 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10324 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10325 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10326 SDValue Ops[] = {
10327 Op.getOperand(0), // Chain
10328 Rsrc, // rsrc
10329 DAG.getConstant(0, DL, MVT::i32), // vindex
10330 VOffset, // voffset
10331 SOffset, // soffset
10332 Offset, // offset
10333 Op.getOperand(5), // cachepolicy, swizzled buffer
10334 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10335 };
10336
10337 auto *M = cast<MemSDNode>(Op);
10338 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10339 }
10340 case Intrinsic::amdgcn_struct_buffer_load:
10341 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10342 case Intrinsic::amdgcn_struct_buffer_load_format:
10343 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10344 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10345 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10346 const bool IsFormat =
10347 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10348 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10349
10350 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10351 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10352 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10353 SDValue Ops[] = {
10354 Op.getOperand(0), // Chain
10355 Rsrc, // rsrc
10356 Op.getOperand(3), // vindex
10357 VOffset, // voffset
10358 SOffset, // soffset
10359 Offset, // offset
10360 Op.getOperand(6), // cachepolicy, swizzled buffer
10361 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10362 };
10363
10364 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10365 }
10366 case Intrinsic::amdgcn_raw_tbuffer_load:
10367 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10368 MemSDNode *M = cast<MemSDNode>(Op);
10369 EVT LoadVT = Op.getValueType();
10370 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10371 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10372 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10373
10374 SDValue Ops[] = {
10375 Op.getOperand(0), // Chain
10376 Rsrc, // rsrc
10377 DAG.getConstant(0, DL, MVT::i32), // vindex
10378 VOffset, // voffset
10379 SOffset, // soffset
10380 Offset, // offset
10381 Op.getOperand(5), // format
10382 Op.getOperand(6), // cachepolicy, swizzled buffer
10383 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10384 };
10385
10386 if (LoadVT.getScalarType() == MVT::f16)
10387 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10388 Ops);
10389 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10390 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10391 DAG);
10392 }
10393 case Intrinsic::amdgcn_struct_tbuffer_load:
10394 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10395 MemSDNode *M = cast<MemSDNode>(Op);
10396 EVT LoadVT = Op.getValueType();
10397 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10398 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10399 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10400
10401 SDValue Ops[] = {
10402 Op.getOperand(0), // Chain
10403 Rsrc, // rsrc
10404 Op.getOperand(3), // vindex
10405 VOffset, // voffset
10406 SOffset, // soffset
10407 Offset, // offset
10408 Op.getOperand(6), // format
10409 Op.getOperand(7), // cachepolicy, swizzled buffer
10410 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10411 };
10412
10413 if (LoadVT.getScalarType() == MVT::f16)
10414 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10415 Ops);
10416 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10417 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10418 DAG);
10419 }
10420 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10421 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10422 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10423 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10424 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10425 return lowerStructBufferAtomicIntrin(Op, DAG,
10426 AMDGPUISD::BUFFER_ATOMIC_FADD);
10427 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10428 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10429 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10430 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10431 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10432 return lowerStructBufferAtomicIntrin(Op, DAG,
10433 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10434 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10436 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10437 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10438 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10439 return lowerStructBufferAtomicIntrin(Op, DAG,
10440 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10441 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10442 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10443 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10444 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10445 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10446 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10447 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10448 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10449 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10450 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10451 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10452 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10453 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10454 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10455 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10456 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10457 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10458 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10459 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10460 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10461 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10462 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10463 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10464 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10465 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10466 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10467 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10468 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10469 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10470 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10471 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10472 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10473 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10474 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10475 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10476 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10477 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10478 return lowerRawBufferAtomicIntrin(Op, DAG,
10479 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10480 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10481 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10482 return lowerStructBufferAtomicIntrin(Op, DAG,
10483 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10484 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10485 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10486 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10487 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10488 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10489 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10490 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10491 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10492 return lowerStructBufferAtomicIntrin(Op, DAG,
10493 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10494 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10495 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10496 return lowerStructBufferAtomicIntrin(Op, DAG,
10497 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10498 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10500 return lowerStructBufferAtomicIntrin(Op, DAG,
10501 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10502 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10503 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10504 return lowerStructBufferAtomicIntrin(Op, DAG,
10505 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10506 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10507 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10508 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10509 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10510 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10511 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10512 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10513 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10514 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10515 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10516 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10517 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10518 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10519 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10520 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10521 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10522 return lowerStructBufferAtomicIntrin(Op, DAG,
10523 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10524
10525 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10526 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10527 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10528 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10529 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10530 SDValue Ops[] = {
10531 Op.getOperand(0), // Chain
10532 Op.getOperand(2), // src
10533 Op.getOperand(3), // cmp
10534 Rsrc, // rsrc
10535 DAG.getConstant(0, DL, MVT::i32), // vindex
10536 VOffset, // voffset
10537 SOffset, // soffset
10538 Offset, // offset
10539 Op.getOperand(7), // cachepolicy
10540 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10541 };
10542 EVT VT = Op.getValueType();
10543 auto *M = cast<MemSDNode>(Op);
10544
10545 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10546 Op->getVTList(), Ops, VT,
10547 M->getMemOperand());
10548 }
10549 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10550 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10551 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10552 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10553 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10554 SDValue Ops[] = {
10555 Op.getOperand(0), // Chain
10556 Op.getOperand(2), // src
10557 Op.getOperand(3), // cmp
10558 Rsrc, // rsrc
10559 Op.getOperand(5), // vindex
10560 VOffset, // voffset
10561 SOffset, // soffset
10562 Offset, // offset
10563 Op.getOperand(8), // cachepolicy
10564 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10565 };
10566 EVT VT = Op.getValueType();
10567 auto *M = cast<MemSDNode>(Op);
10568
10569 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10570 Op->getVTList(), Ops, VT,
10571 M->getMemOperand());
10572 }
10573 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10574 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10575 MemSDNode *M = cast<MemSDNode>(Op);
10576 SDValue NodePtr = M->getOperand(2);
10577 SDValue RayExtent = M->getOperand(3);
10578 SDValue InstanceMask = M->getOperand(4);
10579 SDValue RayOrigin = M->getOperand(5);
10580 SDValue RayDir = M->getOperand(6);
10581 SDValue Offsets = M->getOperand(7);
10582 SDValue TDescr = M->getOperand(8);
10583
10584 assert(NodePtr.getValueType() == MVT::i64);
10585 assert(RayDir.getValueType() == MVT::v3f32);
10586
10587 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10588 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10589 return SDValue();
10590 }
10591
10592 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10593 const unsigned NumVDataDwords = 10;
10594 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10595 int Opcode = AMDGPU::getMIMGOpcode(
10596 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10597 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10598 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10599 assert(Opcode != -1);
10600
10602 Ops.push_back(NodePtr);
10603 Ops.push_back(DAG.getBuildVector(
10604 MVT::v2i32, DL,
10605 {DAG.getBitcast(MVT::i32, RayExtent),
10606 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10607 Ops.push_back(RayOrigin);
10608 Ops.push_back(RayDir);
10609 Ops.push_back(Offsets);
10610 Ops.push_back(TDescr);
10611 Ops.push_back(M->getChain());
10612
10613 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10614 MachineMemOperand *MemRef = M->getMemOperand();
10615 DAG.setNodeMemRefs(NewNode, {MemRef});
10616 return SDValue(NewNode, 0);
10617 }
10618 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10619 MemSDNode *M = cast<MemSDNode>(Op);
10620 SDValue NodePtr = M->getOperand(2);
10621 SDValue RayExtent = M->getOperand(3);
10622 SDValue RayOrigin = M->getOperand(4);
10623 SDValue RayDir = M->getOperand(5);
10624 SDValue RayInvDir = M->getOperand(6);
10625 SDValue TDescr = M->getOperand(7);
10626
10627 assert(NodePtr.getValueType() == MVT::i32 ||
10628 NodePtr.getValueType() == MVT::i64);
10629 assert(RayDir.getValueType() == MVT::v3f16 ||
10630 RayDir.getValueType() == MVT::v3f32);
10631
10632 if (!Subtarget->hasGFX10_AEncoding()) {
10633 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10634 return SDValue();
10635 }
10636
10637 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10638 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10639 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10640 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10641 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10642 const unsigned NumVDataDwords = 4;
10643 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10644 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10645 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10646 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10647 IsGFX12Plus;
10648 const unsigned BaseOpcodes[2][2] = {
10649 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10650 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10651 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10652 int Opcode;
10653 if (UseNSA) {
10654 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10655 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10656 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10657 : AMDGPU::MIMGEncGfx10NSA,
10658 NumVDataDwords, NumVAddrDwords);
10659 } else {
10660 assert(!IsGFX12Plus);
10661 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10662 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10663 : AMDGPU::MIMGEncGfx10Default,
10664 NumVDataDwords, NumVAddrDwords);
10665 }
10666 assert(Opcode != -1);
10667
10669
10670 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10672 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10673 if (Lanes[0].getValueSizeInBits() == 32) {
10674 for (unsigned I = 0; I < 3; ++I)
10675 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10676 } else {
10677 if (IsAligned) {
10678 Ops.push_back(DAG.getBitcast(
10679 MVT::i32,
10680 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10681 Ops.push_back(Lanes[2]);
10682 } else {
10683 SDValue Elt0 = Ops.pop_back_val();
10684 Ops.push_back(DAG.getBitcast(
10685 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10686 Ops.push_back(DAG.getBitcast(
10687 MVT::i32,
10688 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10689 }
10690 }
10691 };
10692
10693 if (UseNSA && IsGFX11Plus) {
10694 Ops.push_back(NodePtr);
10695 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10696 Ops.push_back(RayOrigin);
10697 if (IsA16) {
10698 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10699 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10700 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10701 for (unsigned I = 0; I < 3; ++I) {
10702 MergedLanes.push_back(DAG.getBitcast(
10703 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10704 {DirLanes[I], InvDirLanes[I]})));
10705 }
10706 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10707 } else {
10708 Ops.push_back(RayDir);
10709 Ops.push_back(RayInvDir);
10710 }
10711 } else {
10712 if (Is64)
10713 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10714 2);
10715 else
10716 Ops.push_back(NodePtr);
10717
10718 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10719 packLanes(RayOrigin, true);
10720 packLanes(RayDir, true);
10721 packLanes(RayInvDir, false);
10722 }
10723
10724 if (!UseNSA) {
10725 // Build a single vector containing all the operands so far prepared.
10726 if (NumVAddrDwords > 12) {
10727 SDValue Undef = DAG.getPOISON(MVT::i32);
10728 Ops.append(16 - Ops.size(), Undef);
10729 }
10730 assert(Ops.size() >= 8 && Ops.size() <= 12);
10731 SDValue MergedOps =
10732 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10733 Ops.clear();
10734 Ops.push_back(MergedOps);
10735 }
10736
10737 Ops.push_back(TDescr);
10738 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10739 Ops.push_back(M->getChain());
10740
10741 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10742 MachineMemOperand *MemRef = M->getMemOperand();
10743 DAG.setNodeMemRefs(NewNode, {MemRef});
10744 return SDValue(NewNode, 0);
10745 }
10746 case Intrinsic::amdgcn_global_atomic_fmin_num:
10747 case Intrinsic::amdgcn_global_atomic_fmax_num:
10748 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10749 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10750 MemSDNode *M = cast<MemSDNode>(Op);
10751 SDValue Ops[] = {
10752 M->getOperand(0), // Chain
10753 M->getOperand(2), // Ptr
10754 M->getOperand(3) // Value
10755 };
10756 unsigned Opcode = 0;
10757 switch (IntrID) {
10758 case Intrinsic::amdgcn_global_atomic_fmin_num:
10759 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10760 Opcode = ISD::ATOMIC_LOAD_FMIN;
10761 break;
10762 }
10763 case Intrinsic::amdgcn_global_atomic_fmax_num:
10764 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10765 Opcode = ISD::ATOMIC_LOAD_FMAX;
10766 break;
10767 }
10768 default:
10769 llvm_unreachable("unhandled atomic opcode");
10770 }
10771 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10772 Ops, M->getMemOperand());
10773 }
10774 case Intrinsic::amdgcn_s_get_barrier_state:
10775 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10776 SDValue Chain = Op->getOperand(0);
10778 unsigned Opc;
10779
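// For named barriers the 6-bit barrier ID lives in bits [9:4] of the barrier
// operand; both the immediate and the M0 paths below shift and mask it out.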
10780 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10781 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10782 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10783 BarID = (BarID >> 4) & 0x3F;
10784 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10785 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10786 Ops.push_back(K);
10787 Ops.push_back(Chain);
10788 } else {
10789 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10790 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10791 SDValue M0Val;
10792 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10793 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10794 M0Val = SDValue(
10795 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10796 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10797 0);
10798 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10799 } else
10800 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10801 }
10802
10803 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10804 return SDValue(NewMI, 0);
10805 }
10806 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10807 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10808 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10809 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10810 SDValue Chain = Op->getOperand(0);
10811 SDValue Ptr = Op->getOperand(2);
10812 EVT VT = Op->getValueType(0);
10813 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10814 Chain, Ptr, MII->getMemOperand());
10815 }
10816 default:
10817
10818 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10819 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10820 return lowerImage(Op, ImageDimIntr, DAG, true);
10821
10822 return SDValue();
10823 }
10824}
10825
10826// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10827// dwordx4 if on SI and handle TFE loads.
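// When TFE is in use the operation also returns a status dword, so the value
// type is widened by one i32 here and the value/status halves are split back
// apart afterwards.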
10828SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10829 SDVTList VTList,
10830 ArrayRef<SDValue> Ops, EVT MemVT,
10831 MachineMemOperand *MMO,
10832 SelectionDAG &DAG) const {
10833 LLVMContext &C = *DAG.getContext();
10834 MachineFunction &MF = DAG.getMachineFunction();
10835 EVT VT = VTList.VTs[0];
10836
10837 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10838 bool IsTFE = VTList.NumVTs == 3;
10839 if (IsTFE) {
10840 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10841 unsigned NumOpDWords = NumValueDWords + 1;
10842 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10843 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10844 MachineMemOperand *OpDWordsMMO =
10845 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10846 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10847 OpDWordsVT, OpDWordsMMO, DAG);
10848 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10849 DAG.getVectorIdxConstant(NumValueDWords, DL));
10850 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10851 SDValue ValueDWords =
10852 NumValueDWords == 1
10853 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10854 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10855 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10856 ZeroIdx);
10857 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10858 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10859 }
10860
10861 if (!Subtarget->hasDwordx3LoadStores() &&
10862 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10863 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10864 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10865 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10866 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10867 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10868 WidenedMemVT, WidenedMMO);
10869 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10870 DAG.getVectorIdxConstant(0, DL));
10871 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10872 }
10873
10874 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10875}
10876
10877SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10878 bool ImageStore) const {
10879 EVT StoreVT = VData.getValueType();
10880
10881 // No change for f16 and legal vector D16 types.
10882 if (!StoreVT.isVector())
10883 return VData;
10884
10885 SDLoc DL(VData);
10886 unsigned NumElements = StoreVT.getVectorNumElements();
10887
10888 if (Subtarget->hasUnpackedD16VMem()) {
10889 // We need to unpack the packed data to store.
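// For example, a v2f16 store becomes a v2i32 store with each 16-bit element
// zero-extended to a full dword.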
10890 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10891 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10892
10893 EVT EquivStoreVT =
10894 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10895 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10896 return DAG.UnrollVectorOp(ZExt.getNode());
10897 }
10898
10899 // The sq block of gfx8.1 does not estimate register use correctly for d16
10900 // image store instructions. The data operand is computed as if it were not a
10901 // d16 image instruction.
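// Work around this by packing pairs of 16-bit elements into i32 words by hand
// and padding the packed vector with poison dwords up to the original element
// count.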
10902 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10903 // Bitcast to i16
10904 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10905 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10906
10907 // Decompose into scalars
10909 DAG.ExtractVectorElements(IntVData, Elts);
10910
10911 // Group pairs of i16 into v2i16 and bitcast to i32
10912 SmallVector<SDValue, 4> PackedElts;
10913 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10914 SDValue Pair =
10915 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10916 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10917 PackedElts.push_back(IntPair);
10918 }
10919 if ((NumElements % 2) == 1) {
10920 // Handle v3i16
10921 unsigned I = Elts.size() / 2;
10922 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10923 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10924 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10925 PackedElts.push_back(IntPair);
10926 }
10927
10928 // Pad with poison values
10929 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10930
10931 // Build final vector
10932 EVT VecVT =
10933 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10934 return DAG.getBuildVector(VecVT, DL, PackedElts);
10935 }
10936
10937 if (NumElements == 3) {
10938 EVT IntStoreVT =
10939 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10940 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10941
10942 EVT WidenedStoreVT = EVT::getVectorVT(
10943 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10944 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10945 WidenedStoreVT.getStoreSizeInBits());
10946 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10947 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10948 }
10949
10950 assert(isTypeLegal(StoreVT));
10951 return VData;
10952}
10953
10954SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10955 SelectionDAG &DAG) const {
10956 SDLoc DL(Op);
10957 SDValue Chain = Op.getOperand(0);
10958 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10959 MachineFunction &MF = DAG.getMachineFunction();
10960
10961 switch (IntrinsicID) {
10962 case Intrinsic::amdgcn_exp_compr: {
10963 if (!Subtarget->hasCompressedExport()) {
10964 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10965 MF.getFunction(),
10966 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10967 }
10968 SDValue Src0 = Op.getOperand(4);
10969 SDValue Src1 = Op.getOperand(5);
10970 // Hack around illegal type on SI by directly selecting it.
10971 if (isTypeLegal(Src0.getValueType()))
10972 return SDValue();
10973
10974 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10975 SDValue Undef = DAG.getPOISON(MVT::f32);
10976 const SDValue Ops[] = {
10977 Op.getOperand(2), // tgt
10978 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10979 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10980 Undef, // src2
10981 Undef, // src3
10982 Op.getOperand(7), // vm
10983 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10984 Op.getOperand(3), // en
10985 Op.getOperand(0) // Chain
10986 };
10987
10988 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10989 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10990 }
10991
10992 case Intrinsic::amdgcn_struct_tbuffer_store:
10993 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10994 SDValue VData = Op.getOperand(2);
10995 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10996 if (IsD16)
10997 VData = handleD16VData(VData, DAG);
10998 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10999 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11000 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11001 SDValue Ops[] = {
11002 Chain,
11003 VData, // vdata
11004 Rsrc, // rsrc
11005 Op.getOperand(4), // vindex
11006 VOffset, // voffset
11007 SOffset, // soffset
11008 Offset, // offset
11009 Op.getOperand(7), // format
11010 Op.getOperand(8), // cachepolicy, swizzled buffer
11011 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11012 };
11013 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11014 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11015 MemSDNode *M = cast<MemSDNode>(Op);
11016 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11017 M->getMemoryVT(), M->getMemOperand());
11018 }
11019
11020 case Intrinsic::amdgcn_raw_tbuffer_store:
11021 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11022 SDValue VData = Op.getOperand(2);
11023 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11024 if (IsD16)
11025 VData = handleD16VData(VData, DAG);
11026 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11027 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11028 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11029 SDValue Ops[] = {
11030 Chain,
11031 VData, // vdata
11032 Rsrc, // rsrc
11033 DAG.getConstant(0, DL, MVT::i32), // vindex
11034 VOffset, // voffset
11035 SOffset, // soffset
11036 Offset, // offset
11037 Op.getOperand(6), // format
11038 Op.getOperand(7), // cachepolicy, swizzled buffer
11039 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11040 };
11041 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11042 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11043 MemSDNode *M = cast<MemSDNode>(Op);
11044 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11045 M->getMemoryVT(), M->getMemOperand());
11046 }
11047
11048 case Intrinsic::amdgcn_raw_buffer_store:
11049 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11050 case Intrinsic::amdgcn_raw_buffer_store_format:
11051 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11052 const bool IsFormat =
11053 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11054 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11055
11056 SDValue VData = Op.getOperand(2);
11057 EVT VDataVT = VData.getValueType();
11058 EVT EltType = VDataVT.getScalarType();
11059 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11060 if (IsD16) {
11061 VData = handleD16VData(VData, DAG);
11062 VDataVT = VData.getValueType();
11063 }
11064
11065 if (!isTypeLegal(VDataVT)) {
11066 VData =
11067 DAG.getNode(ISD::BITCAST, DL,
11068 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11069 }
11070
11071 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11072 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11073 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11074 SDValue Ops[] = {
11075 Chain,
11076 VData,
11077 Rsrc,
11078 DAG.getConstant(0, DL, MVT::i32), // vindex
11079 VOffset, // voffset
11080 SOffset, // soffset
11081 Offset, // offset
11082 Op.getOperand(6), // cachepolicy, swizzled buffer
11083 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11084 };
11085 unsigned Opc =
11086 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11087 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11088 MemSDNode *M = cast<MemSDNode>(Op);
11089
11090 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11091 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11092 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11093
11094 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11095 M->getMemoryVT(), M->getMemOperand());
11096 }
11097
11098 case Intrinsic::amdgcn_struct_buffer_store:
11099 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11100 case Intrinsic::amdgcn_struct_buffer_store_format:
11101 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11102 const bool IsFormat =
11103 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11104 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11105
11106 SDValue VData = Op.getOperand(2);
11107 EVT VDataVT = VData.getValueType();
11108 EVT EltType = VDataVT.getScalarType();
11109 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11110
11111 if (IsD16) {
11112 VData = handleD16VData(VData, DAG);
11113 VDataVT = VData.getValueType();
11114 }
11115
11116 if (!isTypeLegal(VDataVT)) {
11117 VData =
11118 DAG.getNode(ISD::BITCAST, DL,
11119 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11120 }
11121
11122 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11123 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11124 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11125 SDValue Ops[] = {
11126 Chain,
11127 VData,
11128 Rsrc,
11129 Op.getOperand(4), // vindex
11130 VOffset, // voffset
11131 SOffset, // soffset
11132 Offset, // offset
11133 Op.getOperand(7), // cachepolicy, swizzled buffer
11134 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11135 };
11136 unsigned Opc =
11139 MemSDNode *M = cast<MemSDNode>(Op);
11140
11141 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11142 EVT VDataType = VData.getValueType().getScalarType();
11143 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11144 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11145
11146 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11147 M->getMemoryVT(), M->getMemOperand());
11148 }
11149 case Intrinsic::amdgcn_raw_buffer_load_lds:
11150 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11151 case Intrinsic::amdgcn_struct_buffer_load_lds:
11152 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11153 if (!Subtarget->hasVMemToLDSLoad())
11154 return SDValue();
11155 unsigned Opc;
11156 bool HasVIndex =
11157 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11158 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11159 unsigned OpOffset = HasVIndex ? 1 : 0;
11160 SDValue VOffset = Op.getOperand(5 + OpOffset);
11161 bool HasVOffset = !isNullConstant(VOffset);
11162 unsigned Size = Op->getConstantOperandVal(4);
11163
11164 switch (Size) {
11165 default:
11166 return SDValue();
11167 case 1:
11168 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11169 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11170 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11171 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11172 break;
11173 case 2:
11174 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11175 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11176 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11177 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11178 break;
11179 case 4:
11180 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11181 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11182 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11183 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11184 break;
11185 case 12:
11186 if (!Subtarget->hasLDSLoadB96_B128())
11187 return SDValue();
11188 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11189 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11190 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11191 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11192 break;
11193 case 16:
11194 if (!Subtarget->hasLDSLoadB96_B128())
11195 return SDValue();
11196 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11197 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11198 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11199 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11200 break;
11201 }
11202
11203 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11204
11206
11207 if (HasVIndex && HasVOffset)
11208 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11209 {Op.getOperand(5), // VIndex
11210 VOffset}));
11211 else if (HasVIndex)
11212 Ops.push_back(Op.getOperand(5));
11213 else if (HasVOffset)
11214 Ops.push_back(VOffset);
11215
11216 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11217 Ops.push_back(Rsrc);
11218 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11219 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11220 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11221 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
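// The aux operand carries both the cache-policy bits and the swizzle flag;
// split it into separate cpol and swz operands, masked to the bits valid for
// this generation.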
11222 Ops.push_back(DAG.getTargetConstant(
11223 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11224 DL, MVT::i8)); // cpol
11225 Ops.push_back(DAG.getTargetConstant(
11226 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11227 ? 1
11228 : 0,
11229 DL, MVT::i8)); // swz
11230 Ops.push_back(M0Val.getValue(0)); // Chain
11231 Ops.push_back(M0Val.getValue(1)); // Glue
11232
11233 auto *M = cast<MemSDNode>(Op);
11234 MachineMemOperand *LoadMMO = M->getMemOperand();
11235 // Don't set the offset value here because the pointer points to the base of
11236 // the buffer.
11237 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11238
11239 MachinePointerInfo StorePtrI = LoadPtrI;
11240 LoadPtrI.V = PoisonValue::get(
11244
11245 auto F = LoadMMO->getFlags() &
11246 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11247 LoadMMO =
11248 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11249 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11250
11251 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11252 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11253 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11254
11255 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11256 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11257
11258 return SDValue(Load, 0);
11259 }
11260 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11261 // for "trust me" that the remaining cases are global pointers until
11262 // such time as we can put two mem operands on an intrinsic.
11263 case Intrinsic::amdgcn_load_to_lds:
11264 case Intrinsic::amdgcn_global_load_lds: {
11265 if (!Subtarget->hasVMemToLDSLoad())
11266 return SDValue();
11267
11268 unsigned Opc;
11269 unsigned Size = Op->getConstantOperandVal(4);
11270 switch (Size) {
11271 default:
11272 return SDValue();
11273 case 1:
11274 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11275 break;
11276 case 2:
11277 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11278 break;
11279 case 4:
11280 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11281 break;
11282 case 12:
11283 if (!Subtarget->hasLDSLoadB96_B128())
11284 return SDValue();
11285 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11286 break;
11287 case 16:
11288 if (!Subtarget->hasLDSLoadB96_B128())
11289 return SDValue();
11290 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11291 break;
11292 }
11293
11294 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11295
11297
11298 SDValue Addr = Op.getOperand(2); // Global ptr
11299 SDValue VOffset;
11300 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11301 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11302 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11303 SDValue LHS = Addr.getOperand(0);
11304 SDValue RHS = Addr.getOperand(1);
11305
11306 if (LHS->isDivergent())
11307 std::swap(LHS, RHS);
11308
11309 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11310 RHS.getOperand(0).getValueType() == MVT::i32) {
11311 // add (i64 sgpr), (zero_extend (i32 vgpr))
11312 Addr = LHS;
11313 VOffset = RHS.getOperand(0);
11314 }
11315 }
11316
11317 Ops.push_back(Addr);
11318 if (!Addr->isDivergent()) {
11320 if (!VOffset)
11321 VOffset =
11322 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11323 DAG.getTargetConstant(0, DL, MVT::i32)),
11324 0);
11325 Ops.push_back(VOffset);
11326 }
11327
11328 Ops.push_back(Op.getOperand(5)); // Offset
11329 Ops.push_back(Op.getOperand(6)); // CPol
11330 Ops.push_back(M0Val.getValue(0)); // Chain
11331 Ops.push_back(M0Val.getValue(1)); // Glue
11332
11333 auto *M = cast<MemSDNode>(Op);
11334 MachineMemOperand *LoadMMO = M->getMemOperand();
11335 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11336 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11337 MachinePointerInfo StorePtrI = LoadPtrI;
11338 LoadPtrI.V = PoisonValue::get(
11342 auto F = LoadMMO->getFlags() &
11344 LoadMMO =
11346 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11347 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11348 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11349 LoadMMO->getAAInfo());
11350
11351 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11352 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11353
11354 return SDValue(Load, 0);
11355 }
11356 case Intrinsic::amdgcn_end_cf:
11357 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11358 Op->getOperand(2), Chain),
11359 0);
11360 case Intrinsic::amdgcn_s_barrier_init:
11361 case Intrinsic::amdgcn_s_barrier_signal_var: {
11362 // These two intrinsics take two operands: the barrier pointer and the member count.
11363 SDValue Chain = Op->getOperand(0);
11365 SDValue BarOp = Op->getOperand(2);
11366 SDValue CntOp = Op->getOperand(3);
11367 SDValue M0Val;
11368 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11369 ? AMDGPU::S_BARRIER_INIT_M0
11370 : AMDGPU::S_BARRIER_SIGNAL_M0;
11371 // extract the BarrierID from bits 4-9 of BarOp
11372 SDValue BarID;
11373 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11374 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11375 BarID =
11376 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11377 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11378 0);
11379 // The member count should be put into M0[ShAmt+5:ShAmt] (6 bits).
11380 // The barrier ID should be put into M0[5:0].
11381 M0Val =
11382 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11383 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11384 0);
11385 constexpr unsigned ShAmt = 16;
11386 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11387 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11388
11389 M0Val = SDValue(
11390 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11391
11392 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11393
11394 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11395 return SDValue(NewMI, 0);
11396 }
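// Worked example of the M0 packing above (the input values are illustrative,
// not taken from real IR): with BarOp == 0x1A0 and CntOp == 12, the barrier
// ID is (0x1A0 >> 4) & 0x3F == 26 and the member count lands at bit 16, so
// M0 == (12 << 16) | 26 == 0x000C001A.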
11397 case Intrinsic::amdgcn_s_barrier_join: {
11398 // This intrinsic takes one operand: the barrier pointer.
11399 SDValue Chain = Op->getOperand(0);
11401 SDValue BarOp = Op->getOperand(2);
11402 unsigned Opc;
11403
11404 if (isa<ConstantSDNode>(BarOp)) {
11405 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11406 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11407
11408 // extract the BarrierID from bits 4-9 of the immediate
11409 unsigned BarID = (BarVal >> 4) & 0x3F;
11410 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11411 Ops.push_back(K);
11412 Ops.push_back(Chain);
11413 } else {
11414 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11415
11416 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11417 SDValue M0Val;
11418 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11419 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11420 M0Val =
11421 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11422 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11423 0);
11424 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11425 }
11426
11427 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11428 return SDValue(NewMI, 0);
11429 }
11430 case Intrinsic::amdgcn_s_prefetch_data: {
11431 // For a non-global address space, preserve the chain and remove the call.
11433 return Op.getOperand(0);
11434 return Op;
11435 }
11436 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11437 SDValue Ops[] = {
11438 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11439 Op.getOperand(3), // offset
11440 Op.getOperand(4), // length
11441 };
11442
11443 MemSDNode *M = cast<MemSDNode>(Op);
11445 Op->getVTList(), Ops, M->getMemoryVT(),
11446 M->getMemOperand());
11447 }
11448 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11449 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11450 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11451 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11452 SDValue Chain = Op->getOperand(0);
11453 SDValue Ptr = Op->getOperand(2);
11454 SDValue Val = Op->getOperand(3);
11455 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11456 Ptr, MII->getMemOperand());
11457 }
11458 default: {
11459 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11461 return lowerImage(Op, ImageDimIntr, DAG, true);
11462
11463 return Op;
11464 }
11465 }
11466}
11467
11468// Return whether the operation has NoUnsignedWrap property.
11469static bool isNoUnsignedWrap(SDValue Addr) {
11470 return (Addr.getOpcode() == ISD::ADD &&
11471 Addr->getFlags().hasNoUnsignedWrap()) ||
11472 Addr->getOpcode() == ISD::OR;
11473}
11474
11476 EVT PtrVT) const {
11477 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
11478}
11479
11481 EVT PtrVT) const {
11482 return true;
11483}
11484
11485// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11486// offset (the offset that is included in bounds checking and swizzling, to be
11487// split between the instruction's voffset and immoffset fields) and soffset
11488// (the offset that is excluded from bounds checking and swizzling, to go in
11489// the instruction's soffset field). This function takes the first kind of
11490// offset and figures out how to split it between voffset and immoffset.
11491std::pair<SDValue, SDValue>
11492SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11493 SDLoc DL(Offset);
11494 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11495 SDValue N0 = Offset;
11496 ConstantSDNode *C1 = nullptr;
11497
11498 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11499 N0 = SDValue();
11500 else if (DAG.isBaseWithConstantOffset(N0)) {
11501 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11502 // being added, so we can only safely match a 32-bit addition with no
11503 // unsigned overflow.
11504 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11505 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11506 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11507 N0 = N0.getOperand(0);
11508 }
11509 }
11510
11511 if (C1) {
11512 unsigned ImmOffset = C1->getZExtValue();
11513 // If the immediate value is too big for the immoffset field, put only bits
11514 // that would normally fit in the immoffset field. The remaining value that
11515 // is copied/added for the voffset field is a large power of 2, and it
11516 // stands more chance of being CSEd with the copy/add for another similar
11517 // load/store.
11518 // However, do not do that rounding down if the remaining value would be
11519 // negative, as it appears to be illegal to have a negative offset in the
11520 // VGPR, even if adding the immediate offset makes it positive.
11521 unsigned Overflow = ImmOffset & ~MaxImm;
11522 ImmOffset -= Overflow;
11523 if ((int32_t)Overflow < 0) {
11524 Overflow += ImmOffset;
11525 ImmOffset = 0;
11526 }
11527 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11528 if (Overflow) {
11529 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11530 if (!N0)
11531 N0 = OverflowVal;
11532 else {
11533 SDValue Ops[] = {N0, OverflowVal};
11534 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11535 }
11536 }
11537 }
11538 if (!N0)
11539 N0 = DAG.getConstant(0, DL, MVT::i32);
11540 if (!C1)
11541 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11542 return {N0, SDValue(C1, 0)};
11543}
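// Worked example of the split above (the numbers are illustrative; MaxImm
// depends on the subtarget): with MaxImm == 4095 and a combined offset of
// 8200, Overflow == 8200 & ~4095 == 8192 and ImmOffset == 8, so 8192 goes
// into the voffset add (a power of two that CSEs well with similar accesses)
// and 8 goes into the immoffset field.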
11544
11545// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11546// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11547// pointed to by Offsets.
11548void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11549 SelectionDAG &DAG, SDValue *Offsets,
11550 Align Alignment) const {
11551 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11552 SDLoc DL(CombinedOffset);
11553 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11554 uint32_t Imm = C->getZExtValue();
11555 uint32_t SOffset, ImmOffset;
11556 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11557 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11558 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11559 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11560 return;
11561 }
11562 }
11563 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11564 SDValue N0 = CombinedOffset.getOperand(0);
11565 SDValue N1 = CombinedOffset.getOperand(1);
11566 uint32_t SOffset, ImmOffset;
11567 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11568 if (Offset >= 0 &&
11569 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11570 Offsets[0] = N0;
11571 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11572 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11573 return;
11574 }
11575 }
11576
11577 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11578 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11579 : DAG.getConstant(0, DL, MVT::i32);
11580
11581 Offsets[0] = CombinedOffset;
11582 Offsets[1] = SOffsetZero;
11583 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11584}
11585
11586SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11587 SelectionDAG &DAG) const {
11588 if (!MaybePointer.getValueType().isScalarInteger())
11589 return MaybePointer;
11590
11591 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11592 return Rsrc;
11593}
11594
11595// Wrap a global or flat pointer into a buffer intrinsic using the flags
11596// specified in the intrinsic.
11597SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11598 SelectionDAG &DAG) const {
11599 SDLoc Loc(Op);
11600
11601 SDValue Pointer = Op->getOperand(1);
11602 SDValue Stride = Op->getOperand(2);
11603 SDValue NumRecords = Op->getOperand(3);
11604 SDValue Flags = Op->getOperand(4);
11605
11606 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11607 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11608 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11609 std::optional<uint32_t> ConstStride = std::nullopt;
11610 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11611 ConstStride = ConstNode->getZExtValue();
11612
11613 SDValue NewHighHalf = Masked;
11614 if (!ConstStride || *ConstStride != 0) {
11615 SDValue ShiftedStride;
11616 if (ConstStride) {
11617 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11618 } else {
11619 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11620 ShiftedStride =
11621 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11622 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11623 }
11624 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11625 }
11626
11627 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11628 NewHighHalf, NumRecords, Flags);
11629 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11630 return RsrcPtr;
11631}
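// Sketch of the descriptor words assembled above, as implied by the masking
// and shifting in this function (not an authoritative V# layout reference):
//   word0 = pointer[31:0]
//   word1 = (stride << 16) | pointer[47:32]
//   word2 = NumRecords
//   word3 = Flags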
11632
11633 // Handle 8-bit and 16-bit buffer loads.
11634SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11635 EVT LoadVT, SDLoc DL,
11637 MachineMemOperand *MMO,
11638 bool IsTFE) const {
11639 EVT IntVT = LoadVT.changeTypeToInteger();
11640
11641 if (IsTFE) {
11642 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11645 MachineFunction &MF = DAG.getMachineFunction();
11646 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11647 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11648 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11649 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11650 DAG.getConstant(1, DL, MVT::i32));
11651 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11652 DAG.getConstant(0, DL, MVT::i32));
11653 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11654 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11655 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11656 }
11657
11658 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11661
11662 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11663 SDValue BufferLoad =
11664 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11665 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11666 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11667
11668 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11669}
11670
11671 // Handle 8-bit and 16-bit buffer stores.
11672SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11673 EVT VDataType, SDLoc DL,
11674 SDValue Ops[],
11675 MemSDNode *M) const {
11676 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11677 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11678
11679 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11680 Ops[1] = BufferStoreExt;
11681 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11682 : AMDGPUISD::BUFFER_STORE_SHORT;
11683 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11684 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11685 M->getMemOperand());
11686}
11687
11689 SDValue Op, const SDLoc &SL, EVT VT) {
11690 if (VT.bitsLT(Op.getValueType()))
11691 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11692
11693 switch (ExtType) {
11694 case ISD::SEXTLOAD:
11695 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11696 case ISD::ZEXTLOAD:
11697 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11698 case ISD::EXTLOAD:
11699 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11700 case ISD::NON_EXTLOAD:
11701 return Op;
11702 }
11703
11704 llvm_unreachable("invalid ext type");
11705}
11706
11707 // Try to turn 8-bit and 16-bit scalar loads into SMEM-eligible 32-bit loads.
11708 // TODO: Skip this on GFX12, which does have scalar sub-dword loads.
11709SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11710 DAGCombinerInfo &DCI) const {
11711 SelectionDAG &DAG = DCI.DAG;
11712 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11713 return SDValue();
11714
11715 // FIXME: Constant loads should all be marked invariant.
11716 unsigned AS = Ld->getAddressSpace();
11717 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11719 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11720 return SDValue();
11721
11722 // Don't do this early, since it may interfere with adjacent load merging for
11723 // illegal types. We can avoid losing alignment information for exotic types
11724 // pre-legalize.
11725 EVT MemVT = Ld->getMemoryVT();
11726 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11727 MemVT.getSizeInBits() >= 32)
11728 return SDValue();
11729
11730 SDLoc SL(Ld);
11731
11732 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11733 "unexpected vector extload");
11734
11735 // TODO: Drop only high part of range.
11736 SDValue Ptr = Ld->getBasePtr();
11737 SDValue NewLoad = DAG.getLoad(
11738 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11739 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11740 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11741 nullptr); // Drop ranges
11742
11743 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11744 if (MemVT.isFloatingPoint()) {
11746 "unexpected fp extload");
11747 TruncVT = MemVT.changeTypeToInteger();
11748 }
11749
11750 SDValue Cvt = NewLoad;
11751 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11752 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11753 DAG.getValueType(TruncVT));
11754 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11756 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11757 } else {
11759 }
11760
11761 EVT VT = Ld->getValueType(0);
11762 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11763
11764 DCI.AddToWorklist(Cvt.getNode());
11765
11766 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11767 // the appropriate extension from the 32-bit load.
11768 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11769 DCI.AddToWorklist(Cvt.getNode());
11770
11771 // Handle conversion back to floating point if necessary.
11772 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11773
11774 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11775}
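// Illustrative before/after for the widening above (node shapes abbreviated):
//   (sextload i8, align 4, uniform constant address)
//     -> (sign_extend_inreg (load i32), i8)
// i.e. the sub-dword uniform load becomes a 32-bit load that can select to
// SMEM, followed by an in-register extend that preserves the original
// extension semantics.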
11776
11778 const SIMachineFunctionInfo &Info) {
11779 // TODO: Should check if the address can definitely not access stack.
11780 if (Info.isEntryFunction())
11781 return Info.getUserSGPRInfo().hasFlatScratchInit();
11782 return true;
11783}
11784
11785SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11786 SDLoc DL(Op);
11787 LoadSDNode *Load = cast<LoadSDNode>(Op);
11788 ISD::LoadExtType ExtType = Load->getExtensionType();
11789 EVT MemVT = Load->getMemoryVT();
11790 MachineMemOperand *MMO = Load->getMemOperand();
11791
11792 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11793 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11794 return SDValue();
11795
11796 // FIXME: Copied from PPC
11797 // First, load into 32 bits, then truncate to 1 bit.
11798
11799 SDValue Chain = Load->getChain();
11800 SDValue BasePtr = Load->getBasePtr();
11801
11802 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11803
11804 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11805 RealMemVT, MMO);
11806
11807 if (!MemVT.isVector()) {
11808 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11809 NewLD.getValue(1)};
11810
11811 return DAG.getMergeValues(Ops, DL);
11812 }
11813
11815 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11816 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11817 DAG.getConstant(I, DL, MVT::i32));
11818
11819 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11820 }
11821
11822 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11823
11824 return DAG.getMergeValues(Ops, DL);
11825 }
11826
11827 if (!MemVT.isVector())
11828 return SDValue();
11829
11830 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11831 "Custom lowering for non-i32 vectors hasn't been implemented.");
11832
11833 Align Alignment = Load->getAlign();
11834 unsigned AS = Load->getAddressSpace();
11835 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11836 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11837 return SplitVectorLoad(Op, DAG);
11838 }
11839
11840 MachineFunction &MF = DAG.getMachineFunction();
11841 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11842 // If there is a possibility that a flat instruction accesses scratch memory
11843 // then we need to use the same legalization rules we use for private.
11844 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11845 !Subtarget->hasMultiDwordFlatScratchAddressing())
11846 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11849
11850 unsigned NumElements = MemVT.getVectorNumElements();
11851
11852 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11854 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11855 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11857 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11858 Alignment >= Align(4) && NumElements < 32) {
11859 if (MemVT.isPow2VectorType() ||
11860 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11861 return SDValue();
11862 return WidenOrSplitVectorLoad(Op, DAG);
11863 }
11864 // Non-uniform loads will be selected to MUBUF instructions, so they
11865 // have the same legalization requirements as global and private
11866 // loads.
11867 //
11868 }
11869 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11872 if (NumElements > 4)
11873 return SplitVectorLoad(Op, DAG);
11874 // v3 loads not supported on SI.
11875 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11876 return WidenOrSplitVectorLoad(Op, DAG);
11877
11878 // v3 and v4 loads are supported for private and global memory.
11879 return SDValue();
11880 }
11881 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11882 // Depending on the setting of the private_element_size field in the
11883 // resource descriptor, we can only make private accesses up to a certain
11884 // size.
11885 switch (Subtarget->getMaxPrivateElementSize()) {
11886 case 4: {
11887 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11888 return DAG.getMergeValues({Op0, Op1}, DL);
11889 }
11890 case 8:
11891 if (NumElements > 2)
11892 return SplitVectorLoad(Op, DAG);
11893 return SDValue();
11894 case 16:
11895 // Same as global/flat
11896 if (NumElements > 4)
11897 return SplitVectorLoad(Op, DAG);
11898 // v3 loads not supported on SI.
11899 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11900 return WidenOrSplitVectorLoad(Op, DAG);
11901
11902 return SDValue();
11903 default:
11904 llvm_unreachable("unsupported private_element_size");
11905 }
11906 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11907 unsigned Fast = 0;
11908 auto Flags = Load->getMemOperand()->getFlags();
11910 Load->getAlign(), Flags, &Fast) &&
11911 Fast > 1)
11912 return SDValue();
11913
11914 if (MemVT.isVector())
11915 return SplitVectorLoad(Op, DAG);
11916 }
11917
11919 MemVT, *Load->getMemOperand())) {
11920 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11921 return DAG.getMergeValues({Op0, Op1}, DL);
11922 }
11923
11924 return SDValue();
11925}
11926
11927SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11928 EVT VT = Op.getValueType();
11929 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11930 VT.getSizeInBits() == 512)
11931 return splitTernaryVectorOp(Op, DAG);
11932
11933 assert(VT.getSizeInBits() == 64);
11934
11935 SDLoc DL(Op);
11936 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11937
11938 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11939 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11940
11941 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11942 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11943
11944 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11945 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11946
11947 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11948
11949 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11950 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11951
11952 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11953
11954 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11955 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11956}
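// The 64-bit select above is decomposed per 32-bit half, roughly:
//   select c, a, b  ->  lo = select c, a.lo, b.lo
//                       hi = select c, a.hi, b.hi
// with the halves extracted and rebuilt through v2i32 bitcasts, and the
// condition frozen so both uses observe the same value.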
11957
11958// Catch division cases where we can use shortcuts with rcp and rsq
11959// instructions.
11960SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11961 SelectionDAG &DAG) const {
11962 SDLoc SL(Op);
11963 SDValue LHS = Op.getOperand(0);
11964 SDValue RHS = Op.getOperand(1);
11965 EVT VT = Op.getValueType();
11966 const SDNodeFlags Flags = Op->getFlags();
11967
11968 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11969
11970 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11971 // Without !fpmath accuracy information, we can't do more because we don't
11972 // know exactly whether rcp is accurate enough to meet the !fpmath requirement.
11973 // f16 is always accurate enough.
11974 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11975 return SDValue();
11976
11977 if (CLHS->isExactlyValue(1.0)) {
11978 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11979 // the CI documentation they have a worst-case error of 1 ulp.
11980 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11981 // use it as long as we aren't trying to use denormals.
11982 //
11983 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11984
11985 // 1.0 / sqrt(x) -> rsq(x)
11986
11987 // XXX - Is afn sufficient to do this for f64? The maximum ULP
11988 // error seems really high at 2^29 ULP.
11989 // 1.0 / x -> rcp(x)
11990 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11991 }
11992
11993 // Same as for 1.0, but expand the sign out of the constant.
11994 if (CLHS->isExactlyValue(-1.0)) {
11995 // -1.0 / x -> rcp (fneg x)
11996 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11997 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
11998 }
11999 }
12000
12001 // For f16 and bf16 require afn or arcp.
12002 // For f32 require afn.
12003 if (!AllowInaccurateRcp &&
12004 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12005 return SDValue();
12006
12007 // Turn into multiply by the reciprocal.
12008 // x / y -> x * (1.0 / y)
12009 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12010 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12011}
12012
12013SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12014 SelectionDAG &DAG) const {
12015 SDLoc SL(Op);
12016 SDValue X = Op.getOperand(0);
12017 SDValue Y = Op.getOperand(1);
12018 EVT VT = Op.getValueType();
12019 const SDNodeFlags Flags = Op->getFlags();
12020
12021 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12022 if (!AllowInaccurateDiv)
12023 return SDValue();
12024
12025 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12026 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12027
12028 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12029 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12030
12031 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12032 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12033 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12034 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12035 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12036 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12037}
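// The sequence above is an approximate expansion that is only used with afn:
// r = rcp(y) is refined by two Newton-Raphson steps of the form
//   r = fma(fma(-y, r, 1.0), r, r)          // r += r * (1 - y * r)
// and the product q = x * r is then corrected once against its residual:
//   result = fma(fma(-y, q, x), r, q)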
12038
12039static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12040 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12041 SDNodeFlags Flags) {
12042 if (GlueChain->getNumValues() <= 1) {
12043 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12044 }
12045
12046 assert(GlueChain->getNumValues() == 3);
12047
12048 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12049 switch (Opcode) {
12050 default:
12051 llvm_unreachable("no chain equivalent for opcode");
12052 case ISD::FMUL:
12053 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12054 break;
12055 }
12056
12057 return DAG.getNode(Opcode, SL, VTList,
12058 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12059 Flags);
12060}
12061
12062static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12063 EVT VT, SDValue A, SDValue B, SDValue C,
12064 SDValue GlueChain, SDNodeFlags Flags) {
12065 if (GlueChain->getNumValues() <= 1) {
12066 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12067 }
12068
12069 assert(GlueChain->getNumValues() == 3);
12070
12071 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12072 switch (Opcode) {
12073 default:
12074 llvm_unreachable("no chain equivalent for opcode");
12075 case ISD::FMA:
12076 Opcode = AMDGPUISD::FMA_W_CHAIN;
12077 break;
12078 }
12079
12080 return DAG.getNode(Opcode, SL, VTList,
12081 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12082 Flags);
12083}
12084
12085SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12086 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12087 return FastLowered;
12088
12089 SDLoc SL(Op);
12090 EVT VT = Op.getValueType();
12091 SDValue LHS = Op.getOperand(0);
12092 SDValue RHS = Op.getOperand(1);
12093
12094 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12095 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12096
12097 if (VT == MVT::bf16) {
12098 SDValue ExtDiv =
12099 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12100 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12101 DAG.getTargetConstant(0, SL, MVT::i32));
12102 }
12103
12104 assert(VT == MVT::f16);
12105
12106 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12107 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12108 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12109 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12110 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12111 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12112 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12113 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12114 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12115 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12116 // q16.u = opx(V_CVT_F16_F32, q32.u);
12117 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12118
12119 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12120 unsigned FMADOpCode =
12122 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12123 SDValue Rcp =
12124 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12125 SDValue Quot =
12126 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12127 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12128 Op->getFlags());
12129 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12130 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12131 Op->getFlags());
12132 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12133 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12134 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12135 DAG.getConstant(0xff800000, SL, MVT::i32));
12136 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12137 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12138 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12139 DAG.getTargetConstant(0, SL, MVT::i32));
12140 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12141 Op->getFlags());
12142}
12143
12144// Faster 2.5 ULP division that does not support denormals.
12145SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12146 SDNodeFlags Flags = Op->getFlags();
12147 SDLoc SL(Op);
12148 SDValue LHS = Op.getOperand(1);
12149 SDValue RHS = Op.getOperand(2);
12150
12151 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12152
12153 const APFloat K0Val(0x1p+96f);
12154 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12155
12156 const APFloat K1Val(0x1p-32f);
12157 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12158
12159 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12160
12161 EVT SetCCVT =
12162 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12163
12164 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12165
12166 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12167
12168 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12169
12170 // rcp does not support denormals.
12171 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12172
12173 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12174
12175 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12176}
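// Rationale for the scaling above, as implied by the constants: if |rhs| is
// larger than 0x1p+96 its reciprocal could fall into the flushed denormal
// range, so the denominator is pre-multiplied by 0x1p-32 and the same factor
// scales the final product, leaving the quotient unchanged:
//   r3 * (lhs * rcp(rhs * r3)) ~= lhs / rhs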
12177
12178// Returns immediate value for setting the F32 denorm mode when using the
12179// S_DENORM_MODE instruction.
12182 const GCNSubtarget *ST) {
12183 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12184 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12185 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12186 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12187}
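// Minimal sketch of the value built above, assuming the 2+2 bit denorm-mode
// field layout used by S_DENORM_MODE:
//   Mode[1:0] = requested FP32 denormal mode (SPDenormMode)
//   Mode[3:2] = existing FP64/FP16 denormal mode, carried over unchanged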
12188
12189SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12190 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12191 return FastLowered;
12192
12193 // The selection matcher assumes anything with a chain selects to a
12194 // mayRaiseFPException machine instruction. Since we're introducing a chain
12195 // here, we need to explicitly report nofpexcept for the regular fdiv
12196 // lowering.
12197 SDNodeFlags Flags = Op->getFlags();
12198 Flags.setNoFPExcept(true);
12199
12200 SDLoc SL(Op);
12201 SDValue LHS = Op.getOperand(0);
12202 SDValue RHS = Op.getOperand(1);
12203
12204 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12205
12206 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12207
12208 SDValue DenominatorScaled =
12209 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12210 SDValue NumeratorScaled =
12211 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12212
12213 // Denominator is scaled to not be denormal, so using rcp is ok.
12214 SDValue ApproxRcp =
12215 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12216 SDValue NegDivScale0 =
12217 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12218
12219 using namespace AMDGPU::Hwreg;
12220 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12221 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12222
12223 const MachineFunction &MF = DAG.getMachineFunction();
12224 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12225 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12226
12227 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12228 const bool HasDynamicDenormals =
12229 (DenormMode.Input == DenormalMode::Dynamic) ||
12230 (DenormMode.Output == DenormalMode::Dynamic);
12231
12232 SDValue SavedDenormMode;
12233
12234 if (!PreservesDenormals) {
12235 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12236 // lowering. The chain dependence is insufficient, and we need glue. We do
12237 // not need the glue variants in a strictfp function.
12238
12239 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12240
12241 SDValue Glue = DAG.getEntryNode();
12242 if (HasDynamicDenormals) {
12243 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12244 DAG.getVTList(MVT::i32, MVT::Glue),
12245 {BitField, Glue});
12246 SavedDenormMode = SDValue(GetReg, 0);
12247
12248 Glue = DAG.getMergeValues(
12249 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12250 }
12251
12252 SDNode *EnableDenorm;
12253 if (Subtarget->hasDenormModeInst()) {
12254 const SDValue EnableDenormValue =
12256
12257 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12258 EnableDenormValue)
12259 .getNode();
12260 } else {
12261 const SDValue EnableDenormValue =
12262 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12263 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12264 {EnableDenormValue, BitField, Glue});
12265 }
12266
12267 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12268 SDValue(EnableDenorm, 1)};
12269
12270 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12271 }
12272
12273 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12274 ApproxRcp, One, NegDivScale0, Flags);
12275
12276 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12277 ApproxRcp, Fma0, Flags);
12278
12279 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12280 Fma1, Flags);
12281
12282 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12283 NumeratorScaled, Mul, Flags);
12284
12285 SDValue Fma3 =
12286 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12287
12288 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12289 NumeratorScaled, Fma3, Flags);
12290
12291 if (!PreservesDenormals) {
12292 SDNode *DisableDenorm;
12293 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12294 const SDValue DisableDenormValue = getSPDenormModeValue(
12295 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12296
12297 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12298 DisableDenorm =
12299 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12300 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12301 .getNode();
12302 } else {
12303 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12304 const SDValue DisableDenormValue =
12305 HasDynamicDenormals
12306 ? SavedDenormMode
12307 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12308
12309 DisableDenorm = DAG.getMachineNode(
12310 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12311 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12312 }
12313
12314 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12315 SDValue(DisableDenorm, 0), DAG.getRoot());
12316 DAG.setRoot(OutputChain);
12317 }
12318
12319 SDValue Scale = NumeratorScaled.getValue(1);
12320 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12321 {Fma4, Fma1, Fma3, Scale}, Flags);
12322
12323 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12324}
12325
12326SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12327 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12328 return FastLowered;
12329
12330 SDLoc SL(Op);
12331 SDValue X = Op.getOperand(0);
12332 SDValue Y = Op.getOperand(1);
12333
12334 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12335
12336 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12337
12338 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12339
12340 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12341
12342 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12343
12344 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12345
12346 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12347
12348 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12349
12350 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12351
12352 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12353 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12354
12355 SDValue Fma4 =
12356 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12357
12358 SDValue Scale;
12359
12360 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12361 // Work around a hardware bug on SI where the condition output from div_scale
12362 // is not usable.
12363
12364 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12365
12366 // Figure out which scale to use for div_fmas.
12367 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12368 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12369 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12370 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12371
12372 SDValue NumHi =
12373 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12374 SDValue DenHi =
12375 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12376
12377 SDValue Scale0Hi =
12378 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12379 SDValue Scale1Hi =
12380 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12381
12382 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12383 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12384 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12385 } else {
12386 Scale = DivScale1.getValue(1);
12387 }
12388
12389 SDValue Fmas =
12390 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12391
12392 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12393}
12394
12395SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12396 EVT VT = Op.getValueType();
12397
12398 if (VT == MVT::f32)
12399 return LowerFDIV32(Op, DAG);
12400
12401 if (VT == MVT::f64)
12402 return LowerFDIV64(Op, DAG);
12403
12404 if (VT == MVT::f16 || VT == MVT::bf16)
12405 return LowerFDIV16(Op, DAG);
12406
12407 llvm_unreachable("Unexpected type for fdiv");
12408}
12409
12410SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12411 SDLoc dl(Op);
12412 SDValue Val = Op.getOperand(0);
12413 EVT VT = Val.getValueType();
12414 EVT ResultExpVT = Op->getValueType(1);
12415 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12416
12417 SDValue Mant = DAG.getNode(
12419 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12420
12421 SDValue Exp = DAG.getNode(
12422 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12423 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12424
12425 if (Subtarget->hasFractBug()) {
12426 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12427 SDValue Inf =
12429
12430 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12431 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12432 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12433 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12434 }
12435
12436 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12437 return DAG.getMergeValues({Mant, CastExp}, dl);
12438}
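// For reference, frexp splits a value as x == mant * 2^exp with |mant| in
// [0.5, 1.0). On subtargets with the fract bug, the selects above return
// (x, 0) for infinities and NaNs instead of whatever the raw intrinsics
// would produce for non-finite inputs.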
12439
12440SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12441 SDLoc DL(Op);
12442 StoreSDNode *Store = cast<StoreSDNode>(Op);
12443 EVT VT = Store->getMemoryVT();
12444
12445 if (VT == MVT::i1) {
12446 return DAG.getTruncStore(
12447 Store->getChain(), DL,
12448 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12449 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12450 }
12451
12452 assert(VT.isVector() &&
12453 Store->getValue().getValueType().getScalarType() == MVT::i32);
12454
12455 unsigned AS = Store->getAddressSpace();
12456 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12457 Store->getAlign().value() < VT.getStoreSize() &&
12458 VT.getSizeInBits() > 32) {
12459 return SplitVectorStore(Op, DAG);
12460 }
12461
12462 MachineFunction &MF = DAG.getMachineFunction();
12463 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12464 // If there is a possibility that a flat instruction accesses scratch memory
12465 // then we need to use the same legalization rules we use for private.
12466 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12467 !Subtarget->hasMultiDwordFlatScratchAddressing())
12468 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12471
12472 unsigned NumElements = VT.getVectorNumElements();
12474 if (NumElements > 4)
12475 return SplitVectorStore(Op, DAG);
12476 // v3 stores not supported on SI.
12477 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12478 return SplitVectorStore(Op, DAG);
12479
12481 VT, *Store->getMemOperand()))
12482 return expandUnalignedStore(Store, DAG);
12483
12484 return SDValue();
12485 }
12486 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12487 switch (Subtarget->getMaxPrivateElementSize()) {
12488 case 4:
12489 return scalarizeVectorStore(Store, DAG);
12490 case 8:
12491 if (NumElements > 2)
12492 return SplitVectorStore(Op, DAG);
12493 return SDValue();
12494 case 16:
12495 if (NumElements > 4 ||
12496 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12497 return SplitVectorStore(Op, DAG);
12498 return SDValue();
12499 default:
12500 llvm_unreachable("unsupported private_element_size");
12501 }
12502 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12503 unsigned Fast = 0;
12504 auto Flags = Store->getMemOperand()->getFlags();
12506 Store->getAlign(), Flags, &Fast) &&
12507 Fast > 1)
12508 return SDValue();
12509
12510 if (VT.isVector())
12511 return SplitVectorStore(Op, DAG);
12512
12513 return expandUnalignedStore(Store, DAG);
12514 }
12515
12516 // Probably an invalid store. If so we'll end up emitting a selection error.
12517 return SDValue();
12518}
12519
12520// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12521SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12522 SDLoc SL(Op);
12523 assert(!Subtarget->has16BitInsts());
12524 SDNodeFlags Flags = Op->getFlags();
12525 SDValue Ext =
12526 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12527
12528 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12529 SDValue Sqrt =
12530 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12531
12532 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12533 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12534}
12535
12536SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12537 SDLoc DL(Op);
12538 SDNodeFlags Flags = Op->getFlags();
12539 MVT VT = Op.getValueType().getSimpleVT();
12540 const SDValue X = Op.getOperand(0);
12541
12542 if (allowApproxFunc(DAG, Flags)) {
12543 // Instruction is 1ulp but ignores denormals.
12544 return DAG.getNode(
12546 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12547 }
12548
12549 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12550 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12551
12552 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12553
12554 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12555
12556 SDValue SqrtX =
12557 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12558
12559 SDValue SqrtS;
12560 if (needsDenormHandlingF32(DAG, X, Flags)) {
12561 SDValue SqrtID =
12562 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12563 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12564
12565 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12566 SDValue SqrtSNextDownInt =
12567 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12568 DAG.getAllOnesConstant(DL, MVT::i32));
12569 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12570
12571 SDValue NegSqrtSNextDown =
12572 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12573
12574 SDValue SqrtVP =
12575 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12576
12577 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12578 DAG.getConstant(1, DL, MVT::i32));
12579 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12580
12581 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12582 SDValue SqrtVS =
12583 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12584
12585 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12586 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12587
12588 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12589 Flags);
12590
12591 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12592 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12593 Flags);
12594 } else {
12595 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12596
12597 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12598
12599 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12600 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12601 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12602
12603 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12604 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12605 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12606
12607 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12608 SDValue SqrtD =
12609 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12610 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12611 }
12612
12613 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12614
12615 SDValue ScaledDown =
12616 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12617
12618 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12619 SDValue IsZeroOrInf =
12620 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12621 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12622
12623 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12624}
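// Note on the denormal-preserving path above: the 1-ulp hardware sqrt result
// s is compared against its two neighbouring values (formed by +/-1 on the
// bit pattern); the residuals fma(-nextDown(s), s, x) and fma(-nextUp(s), s, x)
// decide whether s should step down or up, tightening the result. The
// RSQ-based refinement below is only used when denormal handling is not
// needed.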
12625
12626SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12627 // For the double type, the SQRT and RSQ instructions don't have the required
12628 // precision, so we apply Goldschmidt's algorithm to improve the result:
12629 //
12630 // y0 = rsq(x)
12631 // g0 = x * y0
12632 // h0 = 0.5 * y0
12633 //
12634 // r0 = 0.5 - h0 * g0
12635 // g1 = g0 * r0 + g0
12636 // h1 = h0 * r0 + h0
12637 //
12638 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12639 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12640 // h2 = h1 * r1 + h1
12641 //
12642 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12643 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12644 //
12645 // sqrt(x) = g3
12646
12647 SDNodeFlags Flags = Op->getFlags();
12648
12649 SDLoc DL(Op);
12650
12651 SDValue X = Op.getOperand(0);
12652 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12653
12654 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12655
12656 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12657
12658 // Scale up input if it is too small.
12659 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12660 SDValue ScaleUp =
12661 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12662 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12663
12664 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12665
12666 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12667
12668 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12669 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12670
12671 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12672 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12673
12674 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12675
12676 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12677
12678 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12679 SDValue SqrtD0 =
12680 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12681
12682 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12683
12684 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12685 SDValue SqrtD1 =
12686 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12687
12688 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12689
12690 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12691 SDValue ScaleDown =
12692 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12693 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12694
12695 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12696 // with finite only or nsz because rsq(+/-0) = +/-inf
12697
12698 // TODO: Check for DAZ and expand to subnormals
12699 SDValue IsZeroOrInf =
12700 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12701 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12702
12703 // If x is +INF, +0, or -0, use its original value
12704 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12705 Flags);
12706}
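// Scaling rationale for the code above: inputs below 0x1.0p-767 are scaled
// up by 2^256 via ldexp so the Goldschmidt iteration runs on a
// well-conditioned value; since sqrt halves the exponent, the result is
// rescaled by 2^-128, i.e. sqrt(x * 2^256) * 2^-128 == sqrt(x).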
12707
12708SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12709 SDLoc DL(Op);
12710 EVT VT = Op.getValueType();
12711 SDValue Arg = Op.getOperand(0);
12712 SDValue TrigVal;
12713
12714 // Propagate fast-math flags so that the multiply we introduce can be folded
12715 // if Arg is already the result of a multiply by constant.
12716 auto Flags = Op->getFlags();
12717
12718 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12719
12720 if (Subtarget->hasTrigReducedRange()) {
12721 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12722 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12723 } else {
12724 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12725 }
12726
12727 switch (Op.getOpcode()) {
12728 case ISD::FCOS:
12729 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12730 case ISD::FSIN:
12731 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12732 default:
12733 llvm_unreachable("Wrong trig opcode");
12734 }
12735}
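// For reference: the hardware SIN/COS take their input in fractions of a
// full revolution rather than radians, hence the multiply by 1/(2*pi) above;
// an argument of pi becomes 0.5. On subtargets with a reduced valid input
// range, FRACT first wraps that product into [0, 1).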
12736
12737SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12738 SelectionDAG &DAG) const {
12739 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12740 assert(AtomicNode->isCompareAndSwap());
12741 unsigned AS = AtomicNode->getAddressSpace();
12742
12743 // No custom lowering required for local address space
12745 return Op;
12746
12747 // A non-local address space requires custom lowering for atomic compare and
12748 // swap; the cmp and swap values go in a v2i32 (or v2i64 for the _X2 variants).
12749 SDLoc DL(Op);
12750 SDValue ChainIn = Op.getOperand(0);
12751 SDValue Addr = Op.getOperand(1);
12752 SDValue Old = Op.getOperand(2);
12753 SDValue New = Op.getOperand(3);
12754 EVT VT = Op.getValueType();
12755 MVT SimpleVT = VT.getSimpleVT();
12756 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12757
12758 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12759 SDValue Ops[] = {ChainIn, Addr, NewOld};
12760
12762 Op->getVTList(), Ops, VT,
12763 AtomicNode->getMemOperand());
12764}
12765
12766//===----------------------------------------------------------------------===//
12767// Custom DAG optimizations
12768//===----------------------------------------------------------------------===//
12769
12770SDValue
12771SITargetLowering::performUCharToFloatCombine(SDNode *N,
12772 DAGCombinerInfo &DCI) const {
12773 EVT VT = N->getValueType(0);
12774 EVT ScalarVT = VT.getScalarType();
12775 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12776 return SDValue();
12777
12778 SelectionDAG &DAG = DCI.DAG;
12779 SDLoc DL(N);
12780
12781 SDValue Src = N->getOperand(0);
12782 EVT SrcVT = Src.getValueType();
12783
12784 // TODO: We could try to match extracting the higher bytes, which would be
12785 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12786 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12787 // about in practice.
12788 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12789 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12790 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12791 DCI.AddToWorklist(Cvt.getNode());
12792
12793 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12794 if (ScalarVT != MVT::f32) {
12795 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12796 DAG.getTargetConstant(0, DL, MVT::i32));
12797 }
12798 return Cvt;
12799 }
12800 }
12801
12802 return SDValue();
12803}
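// Example of the combine above (illustrative): when the converted source is
// something like (and i32:x, 255), its top 24 bits are known zero, so the
// conversion is replaced with (CVT_F32_UBYTE0 x), which reads the low byte
// directly; for an f16 result a trailing fp_round is appended.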
12804
12805SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12806 DAGCombinerInfo &DCI) const {
12807 SDValue MagnitudeOp = N->getOperand(0);
12808 SDValue SignOp = N->getOperand(1);
12809
12810 // The generic combine for fcopysign + fp cast is too conservative with
12811 // vectors, and also gets confused by the splitting we will perform here, so
12812 // peek through FP casts.
12813 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12814 SignOp.getOpcode() == ISD::FP_ROUND)
12815 SignOp = SignOp.getOperand(0);
12816
12817 SelectionDAG &DAG = DCI.DAG;
12818 SDLoc DL(N);
12819 EVT SignVT = SignOp.getValueType();
12820
12821 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12822 // lower half with a copy.
12823 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12824 EVT MagVT = MagnitudeOp.getValueType();
12825
12826 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12827
12828 if (MagVT.getScalarType() == MVT::f64) {
12829 EVT F32VT = MagVT.isVector()
12830 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12831 : MVT::v2f32;
12832
12833 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12834
12835 SmallVector<SDValue, 8> NewElts;
12836 for (unsigned I = 0; I != NumElts; ++I) {
12837 SDValue MagLo =
12838 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12839 DAG.getConstant(2 * I, DL, MVT::i32));
12840 SDValue MagHi =
12841 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12842 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12843
12844 SDValue SignOpElt =
12845 MagVT.isVector()
12846 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12847 SignOp, DAG.getConstant(I, DL, MVT::i32))
12848 : SignOp;
12849
12850 SDValue HiOp =
12851 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12852
12853 SDValue Vector =
12854 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12855
12856 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12857 NewElts.push_back(NewElt);
12858 }
12859
12860 if (NewElts.size() == 1)
12861 return NewElts[0];
12862
12863 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12864 }
12865
12866 if (SignVT.getScalarType() != MVT::f64)
12867 return SDValue();
12868
12869 // Reduce width of sign operand, we only need the highest bit.
12870 //
12871 // fcopysign f64:x, f64:y ->
12872 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12873 // TODO: In some cases it might make sense to go all the way to f16.
12874
12875 EVT F32VT = MagVT.isVector()
12876 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12877 : MVT::v2f32;
12878
12879 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12880
12881 SmallVector<SDValue, 8> F32Signs;
12882 for (unsigned I = 0; I != NumElts; ++I) {
12883 // Take sign from odd elements of cast vector
12884 SDValue SignAsF32 =
12885 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12886 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12887 F32Signs.push_back(SignAsF32);
12888 }
12889
12890 SDValue NewSign =
12891 NumElts == 1
12892 ? F32Signs.back()
12893 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12894 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12895 F32Signs);
12896
12897 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12898 NewSign);
12899}
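
// Illustrative sketch (not part of the lowering): why the combine above can
// treat an f64 copysign as an f32 copysign on the high 32-bit word only. The
// helper name is invented for illustration; assumes <cstdint> and <cstring>.
static double copysignF64ViaHi32(double Mag, float Sign) {
  uint64_t MagBits;
  std::memcpy(&MagBits, &Mag, sizeof(MagBits));
  uint32_t Hi = uint32_t(MagBits >> 32);
  uint32_t SignBits;
  std::memcpy(&SignBits, &Sign, sizeof(SignBits));
  // f32-style copysign on the high word; the low word is copied unchanged.
  Hi = (Hi & 0x7fffffffu) | (SignBits & 0x80000000u);
  MagBits = (MagBits & 0xffffffffull) | (uint64_t(Hi) << 32);
  double Res;
  std::memcpy(&Res, &MagBits, sizeof(Res));
  return Res; // bitwise equal to std::copysign(Mag, double(Sign))
}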
12900
12901// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12902// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12903// bits
12904
12905// This is a variant of
12906// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12907//
12908// The normal DAG combiner will do this, but only if the add has one use since
12909// that would increase the number of instructions.
12910//
12911// This prevents us from seeing a constant offset that can be folded into a
12912// memory instruction's addressing mode. If we know the resulting add offset of
12913// a pointer can be folded into an addressing offset, we can replace the pointer
12914// operand with the add of new constant offset. This eliminates one of the uses,
12915// and may allow the remaining use to also be simplified.
12916//
12917SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12918 EVT MemVT,
12919 DAGCombinerInfo &DCI) const {
12920 SDValue N0 = N->getOperand(0);
12921 SDValue N1 = N->getOperand(1);
12922
12923 // We only do this to handle cases where it's profitable when there are
12924 // multiple uses of the add, so defer to the standard combine.
12925 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
12926 return SDValue();
12927
12928 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12929 if (!CN1)
12930 return SDValue();
12931
12932 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12933 if (!CAdd)
12934 return SDValue();
12935
12936 SelectionDAG &DAG = DCI.DAG;
12937
12938 if (N0->getOpcode() == ISD::OR &&
12939 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12940 return SDValue();
12941
12942 // If the resulting offset is too large, we can't fold it into the
12943 // addressing mode offset.
12944 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12945 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12946
12947 AddrMode AM;
12948 AM.HasBaseReg = true;
12949 AM.BaseOffs = Offset.getSExtValue();
12950 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12951 return SDValue();
12952
12953 SDLoc SL(N);
12954 EVT VT = N->getValueType(0);
12955
12956 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12957 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12958
12959 SDNodeFlags Flags;
12960 Flags.setNoUnsignedWrap(
12961 N->getFlags().hasNoUnsignedWrap() &&
12962 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12963
12964 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
12965 // be sure that the new left operand is a proper base pointer.
12966 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12967}
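
// Illustrative sketch: the rewrite above relies on shl distributing over add
// in modular arithmetic, (x + c1) << c2 == (x << c2) + (c1 << c2), so the
// shifted constant can become an addressing-mode offset.
static_assert(((123ull + 16) << 3) == ((123ull << 3) + (16ull << 3)),
              "shl distributes over add");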
12968
12969/// MemSDNode::getBasePtr() does not work for intrinsics, which need to be
12970/// offset by the chain and intrinsic ID. Theoretically we would also need to
12971/// check the specific intrinsic, but they all place the pointer operand first.
12972static unsigned getBasePtrIndex(const MemSDNode *N) {
12973 switch (N->getOpcode()) {
12974 case ISD::STORE:
12975 case ISD::INTRINSIC_W_CHAIN:
12976 case ISD::INTRINSIC_VOID:
12977 return 2;
12978 default:
12979 return 1;
12980 }
12981}
12982
12983SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12984 DAGCombinerInfo &DCI) const {
12985 SelectionDAG &DAG = DCI.DAG;
12986
12987 unsigned PtrIdx = getBasePtrIndex(N);
12988 SDValue Ptr = N->getOperand(PtrIdx);
12989
12990 // TODO: We could also do this for multiplies.
12991 if (Ptr.getOpcode() == ISD::SHL) {
12992 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12993 N->getMemoryVT(), DCI);
12994 if (NewPtr) {
12995 SmallVector<SDValue, 8> NewOps(N->ops());
12996
12997 NewOps[PtrIdx] = NewPtr;
12998 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12999 }
13000 }
13001
13002 return SDValue();
13003}
13004
13005static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13006 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13007 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13008 (Opc == ISD::XOR && Val == 0);
13009}
13010
13011// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13012// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13013// integer combine opportunities since most 64-bit operations are decomposed
13014// this way. TODO: We won't want this for SALU especially if it is an inline
13015// immediate.
13016SDValue SITargetLowering::splitBinaryBitConstantOp(
13017 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13018 const ConstantSDNode *CRHS) const {
13019 uint64_t Val = CRHS->getZExtValue();
13020 uint32_t ValLo = Lo_32(Val);
13021 uint32_t ValHi = Hi_32(Val);
13022 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13023
13024 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13025 bitOpWithConstantIsReducible(Opc, ValHi)) ||
13026 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13027 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13028 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13029 !CRHS->user_begin()->isDivergent())
13030 return SDValue();
13031
13032 // If we need to materialize a 64-bit immediate, it will be split up later
13033 // anyway. Avoid creating the harder to understand 64-bit immediate
13034 // materialization.
13035 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13036 }
13037
13038 return SDValue();
13039}
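
// Illustrative sketch: a 64-bit AND with a constant decomposes into two
// independent 32-bit ANDs on the Lo_32/Hi_32 halves, which is the split the
// helper above produces. Helper name is invented; assumes <cstdint>.
static inline uint64_t modelAnd64ViaHalves(uint64_t X, uint64_t C) {
  uint32_t Lo = uint32_t(X) & uint32_t(C);             // 32-bit op on Lo_32
  uint32_t Hi = uint32_t(X >> 32) & uint32_t(C >> 32); // 32-bit op on Hi_32
  return (uint64_t(Hi) << 32) | Lo;                    // equals X & C
}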
13040
13041bool llvm::isBoolSGPR(SDValue V) {
13042 if (V.getValueType() != MVT::i1)
13043 return false;
13044 switch (V.getOpcode()) {
13045 default:
13046 break;
13047 case ISD::SETCC:
13048 case ISD::IS_FPCLASS:
13049 case AMDGPUISD::FP_CLASS:
13050 return true;
13051 case ISD::AND:
13052 case ISD::OR:
13053 case ISD::XOR:
13054 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13055 case ISD::SADDO:
13056 case ISD::UADDO:
13057 case ISD::SSUBO:
13058 case ISD::USUBO:
13059 case ISD::SMULO:
13060 case ISD::UMULO:
13061 return V.getResNo() == 1;
13062 case ISD::INTRINSIC_WO_CHAIN: {
13063 unsigned IntrinsicID = V.getConstantOperandVal(0);
13064 switch (IntrinsicID) {
13065 case Intrinsic::amdgcn_is_shared:
13066 case Intrinsic::amdgcn_is_private:
13067 return true;
13068 default:
13069 return false;
13070 }
13071
13072 return false;
13073 }
13074 }
13075 return false;
13076}
13077
13078// If a constant has all zeroes or all ones within each byte return it.
13079// Otherwise return 0.
13080static uint32_t getConstantPermuteMask(uint32_t C) {
13081 // 0xff for any zero byte in the mask
13082 uint32_t ZeroByteMask = 0;
13083 if (!(C & 0x000000ff))
13084 ZeroByteMask |= 0x000000ff;
13085 if (!(C & 0x0000ff00))
13086 ZeroByteMask |= 0x0000ff00;
13087 if (!(C & 0x00ff0000))
13088 ZeroByteMask |= 0x00ff0000;
13089 if (!(C & 0xff000000))
13090 ZeroByteMask |= 0xff000000;
13091 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13092 if ((NonZeroByteMask & C) != NonZeroByteMask)
13093 return 0; // Partial bytes selected.
13094 return C;
13095}
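
// Illustrative sketch: a constexpr restatement of the byte test above, used
// to spell out two sample results. Assumes <cstdint>.
static constexpr bool eachByteAllOnesOrAllZeros(uint32_t C) {
  for (int I = 0; I < 4; ++I) {
    uint32_t Byte = (C >> (8 * I)) & 0xff;
    if (Byte != 0 && Byte != 0xff)
      return false;
  }
  return true;
}
static_assert(eachByteAllOnesOrAllZeros(0x00ff00ffu),
              "whole bytes selected -> constant is returned as-is");
static_assert(!eachByteAllOnesOrAllZeros(0x00f000ffu),
              "a partially selected byte -> 0 is returned");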
13096
13097// Check if a node selects whole bytes from its operand 0 starting at a byte
13098// boundary while masking the rest. Returns select mask as in the v_perm_b32
13099// or -1 if it did not succeed.
13100// Note byte select encoding:
13101// value 0-3 selects corresponding source byte;
13102// value 0xc selects zero;
13103// value 0xff selects 0xff.
13104static uint32_t getPermuteMask(SDValue V) {
13105 assert(V.getValueSizeInBits() == 32);
13106
13107 if (V.getNumOperands() != 2)
13108 return ~0;
13109
13110 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13111 if (!N1)
13112 return ~0;
13113
13114 uint32_t C = N1->getZExtValue();
13115
13116 switch (V.getOpcode()) {
13117 default:
13118 break;
13119 case ISD::AND:
13120 if (uint32_t ConstMask = getConstantPermuteMask(C))
13121 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13122 break;
13123
13124 case ISD::OR:
13125 if (uint32_t ConstMask = getConstantPermuteMask(C))
13126 return (0x03020100 & ~ConstMask) | ConstMask;
13127 break;
13128
13129 case ISD::SHL:
13130 if (C % 8)
13131 return ~0;
13132
13133 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13134
13135 case ISD::SRL:
13136 if (C % 8)
13137 return ~0;
13138
13139 return uint32_t(0x0c0c0c0c03020100ull >> C);
13140 }
13141
13142 return ~0;
13143}
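
// Illustrative worked example of the encoding above: (and x, 0x0000ffff)
// keeps the two low bytes, so the resulting v_perm selector takes byte lanes
// 1:0 from the source and zeroes lanes 3:2 (0x0c per the note above).
static_assert(((0x03020100u & 0x0000ffffu) | (0x0c0c0c0cu & ~0x0000ffffu)) ==
                  0x0c0c0100u,
              "AND with 0x0000ffff maps to v_perm selector 0x0c0c0100");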
13144
13145SDValue SITargetLowering::performAndCombine(SDNode *N,
13146 DAGCombinerInfo &DCI) const {
13147 if (DCI.isBeforeLegalize())
13148 return SDValue();
13149
13150 SelectionDAG &DAG = DCI.DAG;
13151 EVT VT = N->getValueType(0);
13152 SDValue LHS = N->getOperand(0);
13153 SDValue RHS = N->getOperand(1);
13154
13155 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13156 if (VT == MVT::i64 && CRHS) {
13157 if (SDValue Split =
13158 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13159 return Split;
13160 }
13161
13162 if (CRHS && VT == MVT::i32) {
13163 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13164 // nb = number of trailing zeroes in mask
13165 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13166 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
13167 uint64_t Mask = CRHS->getZExtValue();
13168 unsigned Bits = llvm::popcount(Mask);
13169 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13170 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13171 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13172 unsigned Shift = CShift->getZExtValue();
13173 unsigned NB = CRHS->getAPIntValue().countr_zero();
13174 unsigned Offset = NB + Shift;
13175 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13176 SDLoc SL(N);
13177 SDValue BFE =
13178 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13179 DAG.getConstant(Offset, SL, MVT::i32),
13180 DAG.getConstant(Bits, SL, MVT::i32));
13181 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13182 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13183 DAG.getValueType(NarrowVT));
13184 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13185 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13186 return Shl;
13187 }
13188 }
13189 }
13190
13191 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13192 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13193 isa<ConstantSDNode>(LHS.getOperand(2))) {
13194 uint32_t Sel = getConstantPermuteMask(Mask);
13195 if (!Sel)
13196 return SDValue();
13197
13198 // Select 0xc for all zero bytes
13199 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13200 SDLoc DL(N);
13201 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13202 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13203 }
13204 }
13205
13206 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13207 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13208 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13209 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13210 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13211
13212 SDValue X = LHS.getOperand(0);
13213 SDValue Y = RHS.getOperand(0);
13214 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13215 !isTypeLegal(X.getValueType()))
13216 return SDValue();
13217
13218 if (LCC == ISD::SETO) {
13219 if (X != LHS.getOperand(1))
13220 return SDValue();
13221
13222 if (RCC == ISD::SETUNE) {
13223 const ConstantFPSDNode *C1 =
13224 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13225 if (!C1 || !C1->isInfinity() || C1->isNegative())
13226 return SDValue();
13227
13228 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13229 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
13230 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
13231 SIInstrFlags::P_NORMAL;
13232
13233 static_assert(
13234 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13235 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13236 0x3ff) == Mask,
13237 "mask not equal");
13238
13239 SDLoc DL(N);
13240 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13241 DAG.getConstant(Mask, DL, MVT::i32));
13242 }
13243 }
13244 }
13245
13246 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13247 std::swap(LHS, RHS);
13248
13249 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13250 RHS.hasOneUse()) {
13251 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13252 // and (fcmp seto), (fp_class x, mask)
13253 // -> fp_class x, mask & ~(p_nan | n_nan)
13254 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
13255 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13256 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13257 (RHS.getOperand(0) == LHS.getOperand(0) &&
13258 LHS.getOperand(0) == LHS.getOperand(1))) {
13259 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13260 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13261 : Mask->getZExtValue() & OrdMask;
13262
13263 SDLoc DL(N);
13264 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13265 DAG.getConstant(NewMask, DL, MVT::i32));
13266 }
13267 }
13268
13269 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13270 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13271 // and x, (sext cc from i1) => select cc, x, 0
13272 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13273 std::swap(LHS, RHS);
13274 if (isBoolSGPR(RHS.getOperand(0)))
13275 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13276 DAG.getConstant(0, SDLoc(N), MVT::i32));
13277 }
13278
13279 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13280 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13281 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13282 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13283 uint32_t LHSMask = getPermuteMask(LHS);
13284 uint32_t RHSMask = getPermuteMask(RHS);
13285 if (LHSMask != ~0u && RHSMask != ~0u) {
13286 // Canonicalize the expression in an attempt to have fewer unique masks
13287 // and therefore fewer registers used to hold the masks.
13288 if (LHSMask > RHSMask) {
13289 std::swap(LHSMask, RHSMask);
13290 std::swap(LHS, RHS);
13291 }
13292
13293 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13294 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13295 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13296 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13297
13298 // Check if we need to combine values from two sources within a byte.
13299 if (!(LHSUsedLanes & RHSUsedLanes) &&
13300 // If we select high and lower word keep it for SDWA.
13301 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13302 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13303 // Each byte in each mask is either selector mask 0-3, or has higher
13304 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
13305 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13306 // mask which is not 0xff wins. By anding both masks we have a correct
13307 // result except that 0x0c shall be corrected to give 0x0c only.
13308 uint32_t Mask = LHSMask & RHSMask;
13309 for (unsigned I = 0; I < 32; I += 8) {
13310 uint32_t ByteSel = 0xff << I;
13311 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13312 Mask &= (0x0c << I) & 0xffffffff;
13313 }
13314
13315 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13316 // or 0x0c.
13317 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13318 SDLoc DL(N);
13319
13320 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13321 RHS.getOperand(0),
13322 DAG.getConstant(Sel, DL, MVT::i32));
13323 }
13324 }
13325 }
13326
13327 return SDValue();
13328}
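
// Illustrative sketch of the BFE rewrite in performAndCombine above for one
// sample: (and (srl x, 5), 0x7f8), i.e. an 8-bit field with 3 trailing zeroes
// in the mask. Helper names are invented; assumes <cstdint>.
static inline uint32_t modelBfeU32(uint32_t X, unsigned Off, unsigned Width) {
  return (X >> Off) & ((1u << Width) - 1); // models AMDGPUISD::BFE_U32
}
static inline bool modelBfeRewriteHolds(uint32_t X) {
  const unsigned Shift = 5, NB = 3, Bits = 8;
  uint32_t Original = (X >> Shift) & 0x7f8u; // and (srl x, 5), 0x7f8
  uint32_t Rewritten = modelBfeU32(X, NB + Shift, Bits) << NB; // shl (bfe x, 8, 8), 3
  return Original == Rewritten; // holds for all X
}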
13329
13330// A key component of v_perm is a mapping between byte position of the src
13331// operands, and the byte position of the dest. To provide such, we need: 1. the
13332// node that provides x byte of the dest of the OR, and 2. the byte of the node
13333// used to provide that x byte. calculateByteProvider finds which node provides
13334// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13335// and finds an ultimate src and byte position. For example: the supported
13336// LoadCombine pattern for vector loads is as follows
13337// t1
13338// or
13339// / \
13340// t2 t3
13341// zext shl
13342// | | \
13343// t4 t5 16
13344// or anyext
13345// / \ |
13346// t6 t7 t8
13347// srl shl or
13348// / | / \ / \
13349// t9 t10 t11 t12 t13 t14
13350// trunc* 8 trunc* 8 and and
13351// | | / | | \
13352// t15 t16 t17 t18 t19 t20
13353// trunc* 255 srl -256
13354// | / \
13355// t15 t15 16
13356//
13357// *In this example, the truncs are from i32->i16
13358//
13359// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13360// respectively. calculateSrcByte would find (given node) -> ultimate src &
13361// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13362// After finding the mapping, we can combine the tree into vperm t15, t16,
13363// 0x05000407
13364
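// Illustrative sketch (not part of the combine): a simplified software model
// of how a v_perm_b32 selector such as 0x05000407 picks bytes from the
// {Src0, Src1} operand pair, with selector bytes 0-3 taking Src1 bytes,
// 4-7 taking Src0 bytes, 0x0c producing zero and 0xff producing 0xff.
// Selector values 8-11 (sign replication) are not modeled. Assumes <cstdint>.
static inline uint32_t modelVPermB32(uint32_t Src0, uint32_t Src1,
                                     uint32_t Sel) {
  uint64_t Pair = (uint64_t(Src0) << 32) | Src1; // Src1 occupies bytes 0-3
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t S = (Sel >> (8 * I)) & 0xff;
    uint32_t Byte = 0; // 0x0c (and unmodeled values) produce zero here
    if (S <= 7)
      Byte = uint32_t((Pair >> (8 * S)) & 0xff);
    else if (S == 0xff)
      Byte = 0xff;
    Result |= Byte << (8 * I);
  }
  return Result;
}
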
13365// Find the source and byte position from a node.
13366// \p DestByte is the byte position of the dest of the or that the src
13367// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13368// dest of the or byte. \p Depth tracks how many recursive iterations we have
13369// performed.
13370static const std::optional<ByteProvider<SDValue>>
13371calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13372 unsigned Depth = 0) {
13373 // We may need to recursively traverse a series of SRLs
13374 if (Depth >= 6)
13375 return std::nullopt;
13376
13377 if (Op.getValueSizeInBits() < 8)
13378 return std::nullopt;
13379
13380 if (Op.getValueType().isVector())
13381 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13382
13383 switch (Op->getOpcode()) {
13384 case ISD::TRUNCATE: {
13385 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13386 }
13387
13388 case ISD::SIGN_EXTEND:
13389 case ISD::ZERO_EXTEND:
13390 case ISD::SIGN_EXTEND_INREG: {
13391 SDValue NarrowOp = Op->getOperand(0);
13392 auto NarrowVT = NarrowOp.getValueType();
13393 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13394 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13395 NarrowVT = VTSign->getVT();
13396 }
13397 if (!NarrowVT.isByteSized())
13398 return std::nullopt;
13399 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13400
13401 if (SrcIndex >= NarrowByteWidth)
13402 return std::nullopt;
13403 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13404 }
13405
13406 case ISD::SRA:
13407 case ISD::SRL: {
13408 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13409 if (!ShiftOp)
13410 return std::nullopt;
13411
13412 uint64_t BitShift = ShiftOp->getZExtValue();
13413
13414 if (BitShift % 8 != 0)
13415 return std::nullopt;
13416
13417 SrcIndex += BitShift / 8;
13418
13419 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13420 }
13421
13422 default: {
13423 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13424 }
13425 }
13426 llvm_unreachable("fully handled switch");
13427}
13428
13429// For a byte position in the result of an Or, traverse the tree and find the
13430// node (and the byte of the node) which ultimately provides this {Or,
13431// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13432// the byte position of the Op that corresponds with the originally requested
13433// byte of the Or \p Depth tracks how many recursive iterations we have
13434// performed. \p StartingIndex is the originally requested byte of the Or
13435static const std::optional<ByteProvider<SDValue>>
13436calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13437 unsigned StartingIndex = 0) {
13438 // Finding Src tree of RHS of or typically requires at least 1 additional
13439 // depth
13440 if (Depth > 6)
13441 return std::nullopt;
13442
13443 unsigned BitWidth = Op.getScalarValueSizeInBits();
13444 if (BitWidth % 8 != 0)
13445 return std::nullopt;
13446 if (Index > BitWidth / 8 - 1)
13447 return std::nullopt;
13448
13449 bool IsVec = Op.getValueType().isVector();
13450 switch (Op.getOpcode()) {
13451 case ISD::OR: {
13452 if (IsVec)
13453 return std::nullopt;
13454
13455 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13456 StartingIndex);
13457 if (!RHS)
13458 return std::nullopt;
13459 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13460 StartingIndex);
13461 if (!LHS)
13462 return std::nullopt;
13463 // A well formed Or will have two ByteProviders for each byte, one of which
13464 // is constant zero
13465 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13466 return std::nullopt;
13467 if (!LHS || LHS->isConstantZero())
13468 return RHS;
13469 if (!RHS || RHS->isConstantZero())
13470 return LHS;
13471 return std::nullopt;
13472 }
13473
13474 case ISD::AND: {
13475 if (IsVec)
13476 return std::nullopt;
13477
13478 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13479 if (!BitMaskOp)
13480 return std::nullopt;
13481
13482 uint32_t BitMask = BitMaskOp->getZExtValue();
13483 // Bits we expect for our StartingIndex
13484 uint32_t IndexMask = 0xFF << (Index * 8);
13485
13486 if ((IndexMask & BitMask) != IndexMask) {
13487 // If the result of the and partially provides the byte, then it
13488 // is not well formatted
13489 if (IndexMask & BitMask)
13490 return std::nullopt;
13491 return ByteProvider<SDValue>::getConstantZero();
13492 }
13493
13494 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13495 }
13496
13497 case ISD::FSHR: {
13498 if (IsVec)
13499 return std::nullopt;
13500
13501 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13502 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13503 if (!ShiftOp || Op.getValueType().isVector())
13504 return std::nullopt;
13505
13506 uint64_t BitsProvided = Op.getValueSizeInBits();
13507 if (BitsProvided % 8 != 0)
13508 return std::nullopt;
13509
13510 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13511 if (BitShift % 8)
13512 return std::nullopt;
13513
13514 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13515 uint64_t ByteShift = BitShift / 8;
13516
13517 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13518 uint64_t BytesProvided = BitsProvided / 8;
13519 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13520 NewIndex %= BytesProvided;
13521 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13522 }
13523
13524 case ISD::SRA:
13525 case ISD::SRL: {
13526 if (IsVec)
13527 return std::nullopt;
13528
13529 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13530 if (!ShiftOp)
13531 return std::nullopt;
13532
13533 uint64_t BitShift = ShiftOp->getZExtValue();
13534 if (BitShift % 8)
13535 return std::nullopt;
13536
13537 auto BitsProvided = Op.getScalarValueSizeInBits();
13538 if (BitsProvided % 8 != 0)
13539 return std::nullopt;
13540
13541 uint64_t BytesProvided = BitsProvided / 8;
13542 uint64_t ByteShift = BitShift / 8;
13543 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13544 // If the byte we are trying to provide (as tracked by index) falls in this
13545 // range, then the SRL provides the byte. The byte of interest of the src of
13546 // the SRL is Index + ByteShift
13547 return BytesProvided - ByteShift > Index
13548 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13549 Index + ByteShift)
13550 : ByteProvider<SDValue>::getConstantZero();
13551 }
13552
13553 case ISD::SHL: {
13554 if (IsVec)
13555 return std::nullopt;
13556
13557 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13558 if (!ShiftOp)
13559 return std::nullopt;
13560
13561 uint64_t BitShift = ShiftOp->getZExtValue();
13562 if (BitShift % 8 != 0)
13563 return std::nullopt;
13564 uint64_t ByteShift = BitShift / 8;
13565
13566 // If we are shifting by an amount greater than (or equal to)
13567 // the index we are trying to provide, then it provides 0s. If not,
13568 // then these bytes are not definitively 0s, and the corresponding byte
13569 // of interest is Index - ByteShift of the src
13570 return Index < ByteShift
13571 ? ByteProvider<SDValue>::getConstantZero()
13572 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13573 Depth + 1, StartingIndex);
13574 }
13575 case ISD::ANY_EXTEND:
13576 case ISD::SIGN_EXTEND:
13577 case ISD::ZERO_EXTEND:
13578 case ISD::SIGN_EXTEND_INREG:
13579 case ISD::AssertZext:
13580 case ISD::AssertSext: {
13581 if (IsVec)
13582 return std::nullopt;
13583
13584 SDValue NarrowOp = Op->getOperand(0);
13585 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13586 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13587 Op->getOpcode() == ISD::AssertZext ||
13588 Op->getOpcode() == ISD::AssertSext) {
13589 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13590 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13591 }
13592 if (NarrowBitWidth % 8 != 0)
13593 return std::nullopt;
13594 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13595
13596 if (Index >= NarrowByteWidth)
13597 return Op.getOpcode() == ISD::ZERO_EXTEND
13598 ? std::optional<ByteProvider<SDValue>>(
13599 ByteProvider<SDValue>::getConstantZero())
13600 : std::nullopt;
13601 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13602 }
13603
13604 case ISD::TRUNCATE: {
13605 if (IsVec)
13606 return std::nullopt;
13607
13608 uint64_t NarrowByteWidth = BitWidth / 8;
13609
13610 if (NarrowByteWidth >= Index) {
13611 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13612 StartingIndex);
13613 }
13614
13615 return std::nullopt;
13616 }
13617
13618 case ISD::CopyFromReg: {
13619 if (BitWidth / 8 > Index)
13620 return calculateSrcByte(Op, StartingIndex, Index);
13621
13622 return std::nullopt;
13623 }
13624
13625 case ISD::LOAD: {
13626 auto *L = cast<LoadSDNode>(Op.getNode());
13627
13628 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13629 if (NarrowBitWidth % 8 != 0)
13630 return std::nullopt;
13631 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13632
13633 // If the width of the load does not reach the byte we are trying to provide
13634 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13635 // question
13636 if (Index >= NarrowByteWidth) {
13637 return L->getExtensionType() == ISD::ZEXTLOAD
13638 ? std::optional<ByteProvider<SDValue>>(
13639 ByteProvider<SDValue>::getConstantZero())
13640 : std::nullopt;
13641 }
13642
13643 if (NarrowByteWidth > Index) {
13644 return calculateSrcByte(Op, StartingIndex, Index);
13645 }
13646
13647 return std::nullopt;
13648 }
13649
13650 case ISD::BSWAP: {
13651 if (IsVec)
13652 return std::nullopt;
13653
13654 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13655 Depth + 1, StartingIndex);
13656 }
13657
13658 case ISD::EXTRACT_VECTOR_ELT: {
13659 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13660 if (!IdxOp)
13661 return std::nullopt;
13662 auto VecIdx = IdxOp->getZExtValue();
13663 auto ScalarSize = Op.getScalarValueSizeInBits();
13664 if (ScalarSize < 32)
13665 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13666 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13667 StartingIndex, Index);
13668 }
13669
13670 case AMDGPUISD::PERM: {
13671 if (IsVec)
13672 return std::nullopt;
13673
13674 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13675 if (!PermMask)
13676 return std::nullopt;
13677
13678 auto IdxMask =
13679 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13680 if (IdxMask > 0x07 && IdxMask != 0x0c)
13681 return std::nullopt;
13682
13683 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13684 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13685
13686 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13687 : ByteProvider<SDValue>(
13688 ByteProvider<SDValue>::getConstantZero());
13689 }
13690
13691 default: {
13692 return std::nullopt;
13693 }
13694 }
13695
13696 llvm_unreachable("fully handled switch");
13697}
13698
13699// Returns true if the Operand is a scalar and is 16 bits
13700static bool isExtendedFrom16Bits(SDValue &Operand) {
13701
13702 switch (Operand.getOpcode()) {
13703 case ISD::ANY_EXTEND:
13704 case ISD::SIGN_EXTEND:
13705 case ISD::ZERO_EXTEND: {
13706 auto OpVT = Operand.getOperand(0).getValueType();
13707 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13708 }
13709 case ISD::LOAD: {
13710 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13711 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13712 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13713 ExtType == ISD::EXTLOAD) {
13714 auto MemVT = L->getMemoryVT();
13715 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13716 }
13717 return L->getMemoryVT().getSizeInBits() == 16;
13718 }
13719 default:
13720 return false;
13721 }
13722}
13723
13724// Returns true if the mask matches consecutive bytes, and the first byte
13725// begins at a power of 2 byte offset from 0th byte
13726static bool addresses16Bits(int Mask) {
13727 int Low8 = Mask & 0xff;
13728 int Hi8 = (Mask & 0xff00) >> 8;
13729
13730 assert(Low8 < 8 && Hi8 < 8);
13731 // Are the bytes contiguous in the order of increasing addresses.
13732 bool IsConsecutive = (Hi8 - Low8 == 1);
13733 // Is the first byte at a location that is aligned for 16 bit instructions?
13734 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13735 // In this case, we still need code to extract the 16 bit operand, so it
13736 // is better to use i8 v_perm
13737 bool Is16Aligned = !(Low8 % 2);
13738
13739 return IsConsecutive && Is16Aligned;
13740}
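
// Illustrative examples for the check above: the 16-bit selector 0x0504 takes
// two consecutive bytes starting at an even byte and so maps to a plain
// 16-bit access, while 0x0605 starts at an odd byte and does not.
static_assert(((0x0504 >> 8) & 0xff) - (0x0504 & 0xff) == 1 &&
                  (0x0504 & 0xff) % 2 == 0,
              "0x0504 addresses an aligned 16-bit half");
static_assert((0x0605 & 0xff) % 2 != 0,
              "0x0605 starts at an odd byte and is not 16-bit addressable");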
13741
13742// Do not lower into v_perm if the operands are actually 16 bit
13743// and the selected bits (based on PermMask) correspond with two
13744// easily addressable 16 bit operands.
13745static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13746 SDValue &OtherOp) {
13747 int Low16 = PermMask & 0xffff;
13748 int Hi16 = (PermMask & 0xffff0000) >> 16;
13749
13750 auto TempOp = peekThroughBitcasts(Op);
13751 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13752
13753 auto OpIs16Bit =
13754 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13755 if (!OpIs16Bit)
13756 return true;
13757
13758 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13759 isExtendedFrom16Bits(TempOtherOp);
13760 if (!OtherOpIs16Bit)
13761 return true;
13762
13763 // Do we cleanly address both
13764 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13765}
13766
13767static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13768 unsigned DWordOffset) {
13769 SDValue Ret;
13770
13771 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13772 // ByteProvider must be at least 8 bits
13773 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13774
13775 if (TypeSize <= 32)
13776 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13777
13778 if (Src.getValueType().isVector()) {
13779 auto ScalarTySize = Src.getScalarValueSizeInBits();
13780 auto ScalarTy = Src.getValueType().getScalarType();
13781 if (ScalarTySize == 32) {
13782 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13783 DAG.getConstant(DWordOffset, SL, MVT::i32));
13784 }
13785 if (ScalarTySize > 32) {
13786 Ret = DAG.getNode(
13787 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13788 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13789 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13790 if (ShiftVal)
13791 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13792 DAG.getConstant(ShiftVal, SL, MVT::i32));
13793 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13794 }
13795
13796 assert(ScalarTySize < 32);
13797 auto NumElements = TypeSize / ScalarTySize;
13798 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13799 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13800 auto NumElementsIn32 = 32 / ScalarTySize;
13801 auto NumAvailElements = DWordOffset < Trunc32Elements
13802 ? NumElementsIn32
13803 : NumElements - NormalizedTrunc;
13804
13804
13805 SmallVector<SDValue, 4> VecSrcs;
13806 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13807 NumAvailElements);
13808
13809 Ret = DAG.getBuildVector(
13810 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13811 VecSrcs);
13812 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13813 }
13814
13815 /// Scalar Type
13816 auto ShiftVal = 32 * DWordOffset;
13817 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13818 DAG.getConstant(ShiftVal, SL, MVT::i32));
13819 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13820}
13821
13822static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13823 SelectionDAG &DAG = DCI.DAG;
13824 [[maybe_unused]] EVT VT = N->getValueType(0);
13825 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13826
13827 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13828 assert(VT == MVT::i32);
13829 for (int i = 0; i < 4; i++) {
13830 // Find the ByteProvider that provides the ith byte of the result of OR
13831 std::optional<ByteProvider<SDValue>> P =
13832 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13833 // TODO support constantZero
13834 if (!P || P->isConstantZero())
13835 return SDValue();
13836
13837 PermNodes.push_back(*P);
13838 }
13839 if (PermNodes.size() != 4)
13840 return SDValue();
13841
13842 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13843 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13844 uint64_t PermMask = 0x00000000;
13845 for (size_t i = 0; i < PermNodes.size(); i++) {
13846 auto PermOp = PermNodes[i];
13847 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13848 // by sizeof(Src2) = 4
13849 int SrcByteAdjust = 4;
13850
13851 // If the Src uses a byte from a different DWORD, then it corresponds
13852 // with a different source
13853 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13854 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13855 if (SecondSrc)
13856 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13857 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13858 return SDValue();
13859
13860 // Set the index of the second distinct Src node
13861 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13862 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13863 SrcByteAdjust = 0;
13864 }
13865 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13866 assert(!DAG.getDataLayout().isBigEndian());
13867 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13868 }
13869 SDLoc DL(N);
13870 SDValue Op = *PermNodes[FirstSrc.first].Src;
13871 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13872 assert(Op.getValueSizeInBits() == 32);
13873
13874 // Check that we are not just extracting the bytes in order from an op
13875 if (!SecondSrc) {
13876 int Low16 = PermMask & 0xffff;
13877 int Hi16 = (PermMask & 0xffff0000) >> 16;
13878
13879 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13880 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13881
13882 // The perm op would really just produce Op. So combine into Op
13883 if (WellFormedLow && WellFormedHi)
13884 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13885 }
13886
13887 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13888
13889 if (SecondSrc) {
13890 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13891 assert(OtherOp.getValueSizeInBits() == 32);
13892 }
13893
13894 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13895
13896 assert(Op.getValueType().isByteSized() &&
13897 OtherOp.getValueType().isByteSized());
13898
13899 // If the ultimate src is less than 32 bits, then we will only be
13900 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13901 // CalculateByteProvider would not have returned Op as source if we
13902 // used a byte that is outside its ValueType. Thus, we are free to
13903 // ANY_EXTEND as the extended bits are dont-cares.
13904 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13905 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13906
13907 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13908 DAG.getConstant(PermMask, DL, MVT::i32));
13909 }
13910 return SDValue();
13911}
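
// Illustrative note on the selector assembled above: bytes taken from the
// first source dword are offset by 4 (SrcByteAdjust), so an OR whose four
// result bytes come from one dword in order produces the identity selector
// recognised by the "well formed" check.
static_assert(((4u << 0) | (5u << 8) | (6u << 16) | (7u << 24)) == 0x07060504u,
              "in-order bytes from the first source give selector 0x07060504");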
13912
13913SDValue SITargetLowering::performOrCombine(SDNode *N,
13914 DAGCombinerInfo &DCI) const {
13915 SelectionDAG &DAG = DCI.DAG;
13916 SDValue LHS = N->getOperand(0);
13917 SDValue RHS = N->getOperand(1);
13918
13919 EVT VT = N->getValueType(0);
13920 if (VT == MVT::i1) {
13921 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13922 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13923 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13924 SDValue Src = LHS.getOperand(0);
13925 if (Src != RHS.getOperand(0))
13926 return SDValue();
13927
13928 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13929 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13930 if (!CLHS || !CRHS)
13931 return SDValue();
13932
13933 // Only 10 bits are used.
13934 static const uint32_t MaxMask = 0x3ff;
13935
13936 uint32_t NewMask =
13937 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13938 SDLoc DL(N);
13939 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13940 DAG.getConstant(NewMask, DL, MVT::i32));
13941 }
13942
13943 return SDValue();
13944 }
13945
13946 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13947 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13948 LHS.getOpcode() == AMDGPUISD::PERM &&
13949 isa<ConstantSDNode>(LHS.getOperand(2))) {
13950 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13951 if (!Sel)
13952 return SDValue();
13953
13954 Sel |= LHS.getConstantOperandVal(2);
13955 SDLoc DL(N);
13956 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13957 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13958 }
13959
13960 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13961 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13962 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13963 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13964
13965 // If all the uses of an or need to extract the individual elements, do not
13966 // attempt to lower into v_perm
13967 auto usesCombinedOperand = [](SDNode *OrUse) {
13968 // If we have any non-vectorized use, then it is a candidate for v_perm
13969 if (OrUse->getOpcode() != ISD::BITCAST ||
13970 !OrUse->getValueType(0).isVector())
13971 return true;
13972
13973 // If we have any non-vectorized use, then it is a candidate for v_perm
13974 for (auto *VUser : OrUse->users()) {
13975 if (!VUser->getValueType(0).isVector())
13976 return true;
13977
13978 // If the use of a vector is a store, then combining via a v_perm
13979 // is beneficial.
13980 // TODO -- whitelist more uses
13981 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13982 if (VUser->getOpcode() == VectorwiseOp)
13983 return true;
13984 }
13985 return false;
13986 };
13987
13988 if (!any_of(N->users(), usesCombinedOperand))
13989 return SDValue();
13990
13991 uint32_t LHSMask = getPermuteMask(LHS);
13992 uint32_t RHSMask = getPermuteMask(RHS);
13993
13994 if (LHSMask != ~0u && RHSMask != ~0u) {
13995 // Canonicalize the expression in an attempt to have fewer unique masks
13996 // and therefore fewer registers used to hold the masks.
13997 if (LHSMask > RHSMask) {
13998 std::swap(LHSMask, RHSMask);
13999 std::swap(LHS, RHS);
14000 }
14001
14002 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14003 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14004 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14005 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14006
14007 // Check if we need to combine values from two sources within a byte.
14008 if (!(LHSUsedLanes & RHSUsedLanes) &&
14009 // If we select high and lower word keep it for SDWA.
14010 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14011 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14012 // Kill zero bytes selected by other mask. Zero value is 0xc.
14013 LHSMask &= ~RHSUsedLanes;
14014 RHSMask &= ~LHSUsedLanes;
14015 // Add 4 to each active LHS lane
14016 LHSMask |= LHSUsedLanes & 0x04040404;
14017 // Combine masks
14018 uint32_t Sel = LHSMask | RHSMask;
14019 SDLoc DL(N);
14020
14021 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14022 RHS.getOperand(0),
14023 DAG.getConstant(Sel, DL, MVT::i32));
14024 }
14025 }
14026 if (LHSMask == ~0u || RHSMask == ~0u) {
14027 if (SDValue Perm = matchPERM(N, DCI))
14028 return Perm;
14029 }
14030 }
14031
14032 // Detect identity v2i32 OR and replace with identity source node.
14033 // Specifically an Or that has operands constructed from the same source node
14034 // via extract_vector_elt and build_vector. I.E.
14035 // v2i32 or(
14036 // v2i32 build_vector(
14037 // i32 extract_elt(%IdentitySrc, 0),
14038 // i32 0
14039 // ),
14040 // v2i32 build_vector(
14041 // i32 0,
14042 // i32 extract_elt(%IdentitySrc, 1)
14043 // ) )
14044 // =>
14045 // v2i32 %IdentitySrc
14046
14047 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14048 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14049
14050 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14051 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14052
14053 // Test for and normalise build vectors.
14054 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14055
14056 // Get the extract_vector_element operands.
14057 SDValue LEVE = LHS->getOperand(0);
14058 SDValue REVE = RHS->getOperand(1);
14059
14060 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14061 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14062 // Check that different elements from the same vector are
14063 // extracted.
14064 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14065 LEVE->getOperand(1) != REVE->getOperand(1)) {
14066 SDValue IdentitySrc = LEVE.getOperand(0);
14067 return IdentitySrc;
14068 }
14069 }
14070 }
14071 }
14072
14073 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14074 return SDValue();
14075
14076 // TODO: This could be a generic combine with a predicate for extracting the
14077 // high half of an integer being free.
14078
14079 // (or i64:x, (zero_extend i32:y)) ->
14080 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14081 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14082 RHS.getOpcode() != ISD::ZERO_EXTEND)
14083 std::swap(LHS, RHS);
14084
14085 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14086 SDValue ExtSrc = RHS.getOperand(0);
14087 EVT SrcVT = ExtSrc.getValueType();
14088 if (SrcVT == MVT::i32) {
14089 SDLoc SL(N);
14090 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14091 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14092
14093 DCI.AddToWorklist(LowOr.getNode());
14094 DCI.AddToWorklist(HiBits.getNode());
14095
14096 SDValue Vec =
14097 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14098 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14099 }
14100 }
14101
14102 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14103 if (CRHS) {
14104 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14105 N->getOperand(0), CRHS))
14106 return Split;
14107 }
14108
14109 return SDValue();
14110}
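
// Illustrative sketch for the i64 rewrite above: a zero-extended 32-bit value
// cannot affect the high half, so only the low 32 bits of x need the OR.
// Helper name is invented; assumes <cstdint>.
static inline uint64_t modelOrWithZext32(uint64_t X, uint32_t Y) {
  uint32_t Lo = uint32_t(X) | Y;    // OR on the low half only
  uint32_t Hi = uint32_t(X >> 32);  // high half passes through
  return (uint64_t(Hi) << 32) | Lo; // equals X | uint64_t(Y)
}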
14111
14112SDValue SITargetLowering::performXorCombine(SDNode *N,
14113 DAGCombinerInfo &DCI) const {
14114 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14115 return RV;
14116
14117 SDValue LHS = N->getOperand(0);
14118 SDValue RHS = N->getOperand(1);
14119
14120 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14121 SelectionDAG &DAG = DCI.DAG;
14122
14123 EVT VT = N->getValueType(0);
14124 if (CRHS && VT == MVT::i64) {
14125 if (SDValue Split =
14126 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14127 return Split;
14128 }
14129
14130 // v2i32 (xor (vselect cc, x, y), K) ->
14131 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14132 // replaced with source modifiers when the select is lowered to CNDMASK.
14133 unsigned Opc = LHS.getOpcode();
14134 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14135 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14136 CRHS && CRHS->getAPIntValue().isSignMask()) {
14137 SDValue CC = LHS->getOperand(0);
14138 SDValue TRUE = LHS->getOperand(1);
14139 SDValue FALSE = LHS->getOperand(2);
14140 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14141 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14142 SDValue XSelect =
14143 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14144 return XSelect;
14145 }
14146
14147 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14148 // fneg-like xors into 64-bit select.
14149 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14150 // This looks like an fneg, try to fold as a source modifier.
14151 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14152 shouldFoldFNegIntoSrc(N, LHS)) {
14153 // xor (select c, a, b), 0x80000000 ->
14154 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14155 SDLoc DL(N);
14156 SDValue CastLHS =
14157 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14158 SDValue CastRHS =
14159 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14160 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14161 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14162 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14163 LHS->getOperand(0), FNegLHS, FNegRHS);
14164 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14165 }
14166 }
14167
14168 return SDValue();
14169}
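
// Illustrative sketch for the sign-mask xor folds above: flipping bit 31 of
// the integer pattern negates the f32 value it carries, which is what lets
// the xor become an fneg source modifier. Assumes <cstdint> and <cstring>.
static inline float modelFNegViaXor(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u; // xor with the sign mask
  float Res;
  std::memcpy(&Res, &Bits, sizeof(Res));
  return Res; // bitwise identical to -X
}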
14170
14171SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14172 DAGCombinerInfo &DCI) const {
14173 if (!Subtarget->has16BitInsts() ||
14174 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14175 return SDValue();
14176
14177 EVT VT = N->getValueType(0);
14178 if (VT != MVT::i32)
14179 return SDValue();
14180
14181 SDValue Src = N->getOperand(0);
14182 if (Src.getValueType() != MVT::i16)
14183 return SDValue();
14184
14185 return SDValue();
14186}
14187
14188SDValue
14189SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14190 DAGCombinerInfo &DCI) const {
14191 SDValue Src = N->getOperand(0);
14192 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14193
14194 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14195 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14196 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14197 VTSign->getVT() == MVT::i8) ||
14198 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14199 VTSign->getVT() == MVT::i16))) {
14200 assert(Subtarget->hasScalarSubwordLoads() &&
14201 "s_buffer_load_{u8, i8} are supported "
14202 "in GFX12 (or newer) architectures.");
14203 EVT VT = Src.getValueType();
14204 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14205 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14206 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14207 SDLoc DL(N);
14208 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14209 SDValue Ops[] = {
14210 Src.getOperand(0), // source register
14211 Src.getOperand(1), // offset
14212 Src.getOperand(2) // cachePolicy
14213 };
14214 auto *M = cast<MemSDNode>(Src);
14215 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14216 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14217 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14218 return LoadVal;
14219 }
14220 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14221 VTSign->getVT() == MVT::i8) ||
14222 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14223 VTSign->getVT() == MVT::i16)) &&
14224 Src.hasOneUse()) {
14225 auto *M = cast<MemSDNode>(Src);
14226 SDValue Ops[] = {Src.getOperand(0), // Chain
14227 Src.getOperand(1), // rsrc
14228 Src.getOperand(2), // vindex
14229 Src.getOperand(3), // voffset
14230 Src.getOperand(4), // soffset
14231 Src.getOperand(5), // offset
14232 Src.getOperand(6), Src.getOperand(7)};
14233 // replace with BUFFER_LOAD_BYTE/SHORT
14234 SDVTList ResList =
14235 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14236 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14237 ? AMDGPUISD::BUFFER_LOAD_BYTE
14238 : AMDGPUISD::BUFFER_LOAD_SHORT;
14239 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14240 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14241 return DCI.DAG.getMergeValues(
14242 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14243 }
14244 return SDValue();
14245}
14246
14247SDValue SITargetLowering::performClassCombine(SDNode *N,
14248 DAGCombinerInfo &DCI) const {
14249 SelectionDAG &DAG = DCI.DAG;
14250 SDValue Mask = N->getOperand(1);
14251
14252 // fp_class x, 0 -> false
14253 if (isNullConstant(Mask))
14254 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14255
14256 if (N->getOperand(0).isUndef())
14257 return DAG.getUNDEF(MVT::i1);
14258
14259 return SDValue();
14260}
14261
14262SDValue SITargetLowering::performRcpCombine(SDNode *N,
14263 DAGCombinerInfo &DCI) const {
14264 EVT VT = N->getValueType(0);
14265 SDValue N0 = N->getOperand(0);
14266
14267 if (N0.isUndef()) {
14268 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14269 SDLoc(N), VT);
14270 }
14271
14272 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14273 N0.getOpcode() == ISD::SINT_TO_FP)) {
14274 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14275 N->getFlags());
14276 }
14277
14278 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14279 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14280 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14281 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14282 N->getFlags());
14283 }
14284
14285 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14286}
14287
14288bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14289 unsigned MaxDepth) const {
14290 unsigned Opcode = Op.getOpcode();
14291 if (Opcode == ISD::FCANONICALIZE)
14292 return true;
14293
14294 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14295 const auto &F = CFP->getValueAPF();
14296 if (F.isNaN() && F.isSignaling())
14297 return false;
14298 if (!F.isDenormal())
14299 return true;
14300
14301 DenormalMode Mode =
14302 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14303 return Mode == DenormalMode::getIEEE();
14304 }
14305
14306 // If source is a result of another standard FP operation it is already in
14307 // canonical form.
14308 if (MaxDepth == 0)
14309 return false;
14310
14311 switch (Opcode) {
14312 // These will flush denorms if required.
14313 case ISD::FADD:
14314 case ISD::FSUB:
14315 case ISD::FMUL:
14316 case ISD::FCEIL:
14317 case ISD::FFLOOR:
14318 case ISD::FMA:
14319 case ISD::FMAD:
14320 case ISD::FSQRT:
14321 case ISD::FDIV:
14322 case ISD::FREM:
14323 case ISD::FP_ROUND:
14324 case ISD::FP_EXTEND:
14325 case ISD::FP16_TO_FP:
14326 case ISD::FP_TO_FP16:
14327 case ISD::BF16_TO_FP:
14328 case ISD::FP_TO_BF16:
14329 case ISD::FLDEXP:
14332 case AMDGPUISD::RCP:
14333 case AMDGPUISD::RSQ:
14337 case AMDGPUISD::LOG:
14338 case AMDGPUISD::EXP:
14342 case AMDGPUISD::FRACT:
14349 case AMDGPUISD::SIN_HW:
14350 case AMDGPUISD::COS_HW:
14351 return true;
14352
14353 // It can/will be lowered or combined as a bit operation.
14354 // Need to check their input recursively to handle.
14355 case ISD::FNEG:
14356 case ISD::FABS:
14357 case ISD::FCOPYSIGN:
14358 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14359
14360 case ISD::AND:
14361 if (Op.getValueType() == MVT::i32) {
14362 // Be careful as we only know it is a bitcast floating point type. It
14363 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14364 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14365 // is valid to optimize for all types.
14366 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14367 if (RHS->getZExtValue() == 0xffff0000) {
14368 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14369 }
14370 }
14371 }
14372 break;
14373
14374 case ISD::FSIN:
14375 case ISD::FCOS:
14376 case ISD::FSINCOS:
14377 return Op.getValueType().getScalarType() != MVT::f16;
14378
14379 case ISD::FMINNUM:
14380 case ISD::FMAXNUM:
14381 case ISD::FMINNUM_IEEE:
14382 case ISD::FMAXNUM_IEEE:
14383 case ISD::FMINIMUM:
14384 case ISD::FMAXIMUM:
14385 case ISD::FMINIMUMNUM:
14386 case ISD::FMAXIMUMNUM:
14387 case AMDGPUISD::CLAMP:
14388 case AMDGPUISD::FMED3:
14389 case AMDGPUISD::FMAX3:
14390 case AMDGPUISD::FMIN3:
14391 case AMDGPUISD::FMAXIMUM3:
14392 case AMDGPUISD::FMINIMUM3: {
14393 // FIXME: Shouldn't treat the generic operations different based these.
14394 // However, we aren't really required to flush the result from
14395 // minnum/maxnum..
14396
14397 // snans will be quieted, so we only need to worry about denormals.
14398 if (Subtarget->supportsMinMaxDenormModes() ||
14399 // FIXME: denormalsEnabledForType is broken for dynamic
14400 denormalsEnabledForType(DAG, Op.getValueType()))
14401 return true;
14402
14403 // Flushing may be required.
14404 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14405 // targets need to check their input recursively.
14406
14407 // FIXME: Does this apply with clamp? It's implemented with max.
14408 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14409 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14410 return false;
14411 }
14412
14413 return true;
14414 }
14415 case ISD::SELECT: {
14416 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14417 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14418 }
14419 case ISD::BUILD_VECTOR: {
14420 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14421 SDValue SrcOp = Op.getOperand(i);
14422 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14423 return false;
14424 }
14425
14426 return true;
14427 }
14428 case ISD::EXTRACT_VECTOR_ELT:
14429 case ISD::EXTRACT_SUBVECTOR: {
14430 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14431 }
14432 case ISD::INSERT_VECTOR_ELT: {
14433 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14434 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14435 }
14436 case ISD::UNDEF:
14437 // Could be anything.
14438 return false;
14439
14440 case ISD::BITCAST:
14441 // TODO: This is incorrect as it loses track of the operand's type. We may
14442 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14443 // same bits that are canonicalized in one type need not be in the other.
14444 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14445 case ISD::TRUNCATE: {
14446 // Hack around the mess we make when legalizing extract_vector_elt.
14447 if (Op.getValueType() == MVT::i16) {
14448 SDValue TruncSrc = Op.getOperand(0);
14449 if (TruncSrc.getValueType() == MVT::i32 &&
14450 TruncSrc.getOpcode() == ISD::BITCAST &&
14451 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14452 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14453 }
14454 }
14455 return false;
14456 }
14458 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14459 // TODO: Handle more intrinsics
14460 switch (IntrinsicID) {
14461 case Intrinsic::amdgcn_cvt_pkrtz:
14462 case Intrinsic::amdgcn_cubeid:
14463 case Intrinsic::amdgcn_frexp_mant:
14464 case Intrinsic::amdgcn_fdot2:
14465 case Intrinsic::amdgcn_rcp:
14466 case Intrinsic::amdgcn_rsq:
14467 case Intrinsic::amdgcn_rsq_clamp:
14468 case Intrinsic::amdgcn_rcp_legacy:
14469 case Intrinsic::amdgcn_rsq_legacy:
14470 case Intrinsic::amdgcn_trig_preop:
14471 case Intrinsic::amdgcn_tanh:
14472 case Intrinsic::amdgcn_log:
14473 case Intrinsic::amdgcn_exp2:
14474 case Intrinsic::amdgcn_sqrt:
14475 return true;
14476 default:
14477 break;
14478 }
14479
14480 break;
14481 }
14482 default:
14483 break;
14484 }
14485
14486 // FIXME: denormalsEnabledForType is broken for dynamic
14487 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14488 DAG.isKnownNeverSNaN(Op);
14489}
14490
14492 unsigned MaxDepth) const {
14493 const MachineRegisterInfo &MRI = MF.getRegInfo();
14494 MachineInstr *MI = MRI.getVRegDef(Reg);
14495 unsigned Opcode = MI->getOpcode();
14496
14497 if (Opcode == AMDGPU::G_FCANONICALIZE)
14498 return true;
14499
14500 std::optional<FPValueAndVReg> FCR;
14501 // Constant splat (can be padded with undef) or scalar constant.
14503 if (FCR->Value.isSignaling())
14504 return false;
14505 if (!FCR->Value.isDenormal())
14506 return true;
14507
14508 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14509 return Mode == DenormalMode::getIEEE();
14510 }
14511
14512 if (MaxDepth == 0)
14513 return false;
14514
14515 switch (Opcode) {
14516 case AMDGPU::G_FADD:
14517 case AMDGPU::G_FSUB:
14518 case AMDGPU::G_FMUL:
14519 case AMDGPU::G_FCEIL:
14520 case AMDGPU::G_FFLOOR:
14521 case AMDGPU::G_FRINT:
14522 case AMDGPU::G_FNEARBYINT:
14523 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14524 case AMDGPU::G_INTRINSIC_TRUNC:
14525 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14526 case AMDGPU::G_FMA:
14527 case AMDGPU::G_FMAD:
14528 case AMDGPU::G_FSQRT:
14529 case AMDGPU::G_FDIV:
14530 case AMDGPU::G_FREM:
14531 case AMDGPU::G_FPOW:
14532 case AMDGPU::G_FPEXT:
14533 case AMDGPU::G_FLOG:
14534 case AMDGPU::G_FLOG2:
14535 case AMDGPU::G_FLOG10:
14536 case AMDGPU::G_FPTRUNC:
14537 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14538 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14539 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14540 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14541 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14542 return true;
14543 case AMDGPU::G_FNEG:
14544 case AMDGPU::G_FABS:
14545 case AMDGPU::G_FCOPYSIGN:
14546 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14547 case AMDGPU::G_FMINNUM:
14548 case AMDGPU::G_FMAXNUM:
14549 case AMDGPU::G_FMINNUM_IEEE:
14550 case AMDGPU::G_FMAXNUM_IEEE:
14551 case AMDGPU::G_FMINIMUM:
14552 case AMDGPU::G_FMAXIMUM:
14553 case AMDGPU::G_FMINIMUMNUM:
14554 case AMDGPU::G_FMAXIMUMNUM: {
14555 if (Subtarget->supportsMinMaxDenormModes() ||
14556 // FIXME: denormalsEnabledForType is broken for dynamic
14557 denormalsEnabledForType(MRI.getType(Reg), MF))
14558 return true;
14559
14560 [[fallthrough]];
14561 }
14562 case AMDGPU::G_BUILD_VECTOR:
14563 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14564 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14565 return false;
14566 return true;
14567 case AMDGPU::G_INTRINSIC:
14568 case AMDGPU::G_INTRINSIC_CONVERGENT:
14569 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14570 case Intrinsic::amdgcn_fmul_legacy:
14571 case Intrinsic::amdgcn_fmad_ftz:
14572 case Intrinsic::amdgcn_sqrt:
14573 case Intrinsic::amdgcn_fmed3:
14574 case Intrinsic::amdgcn_sin:
14575 case Intrinsic::amdgcn_cos:
14576 case Intrinsic::amdgcn_log:
14577 case Intrinsic::amdgcn_exp2:
14578 case Intrinsic::amdgcn_log_clamp:
14579 case Intrinsic::amdgcn_rcp:
14580 case Intrinsic::amdgcn_rcp_legacy:
14581 case Intrinsic::amdgcn_rsq:
14582 case Intrinsic::amdgcn_rsq_clamp:
14583 case Intrinsic::amdgcn_rsq_legacy:
14584 case Intrinsic::amdgcn_div_scale:
14585 case Intrinsic::amdgcn_div_fmas:
14586 case Intrinsic::amdgcn_div_fixup:
14587 case Intrinsic::amdgcn_fract:
14588 case Intrinsic::amdgcn_cvt_pkrtz:
14589 case Intrinsic::amdgcn_cubeid:
14590 case Intrinsic::amdgcn_cubema:
14591 case Intrinsic::amdgcn_cubesc:
14592 case Intrinsic::amdgcn_cubetc:
14593 case Intrinsic::amdgcn_frexp_mant:
14594 case Intrinsic::amdgcn_fdot2:
14595 case Intrinsic::amdgcn_trig_preop:
14596 case Intrinsic::amdgcn_tanh:
14597 return true;
14598 default:
14599 break;
14600 }
14601
14602 [[fallthrough]];
14603 default:
14604 return false;
14605 }
14606
14607 llvm_unreachable("invalid operation");
14608}
14609
14610// Constant fold canonicalize.
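// For example: with denormals flushed (PreserveSign mode) a denormal constant
// folds to +/-0.0, a signaling NaN folds to the canonical quiet NaN, and any
// constant that is already canonical is returned unchanged.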
14611SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14612 const SDLoc &SL, EVT VT,
14613 const APFloat &C) const {
14614 // Flush denormals to 0 if not enabled.
14615 if (C.isDenormal()) {
14616 DenormalMode Mode =
14617 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14618 if (Mode == DenormalMode::getPreserveSign()) {
14619 return DAG.getConstantFP(
14620 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14621 }
14622
14623 if (Mode != DenormalMode::getIEEE())
14624 return SDValue();
14625 }
14626
14627 if (C.isNaN()) {
14628 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14629 if (C.isSignaling()) {
14630 // Quiet a signaling NaN.
14631 // FIXME: Is this supposed to preserve payload bits?
14632 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14633 }
14634
14635 // Make sure it is the canonical NaN bitpattern.
14636 //
14637 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14638 // immediate?
14639 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14640 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14641 }
14642
14643 // Already canonical.
14644 return DAG.getConstantFP(C, SL, VT);
14645}
14646
14648 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14649}
14650
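// Combine for ISD::FCANONICALIZE: constant fold the input (e.g.
// fcanonicalize undef -> qNaN) and, for legal v2f16 build_vectors, push the
// canonicalize into the individual elements.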
14651SDValue
14652SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14653 DAGCombinerInfo &DCI) const {
14654 SelectionDAG &DAG = DCI.DAG;
14655 SDValue N0 = N->getOperand(0);
14656 EVT VT = N->getValueType(0);
14657
14658 // fcanonicalize undef -> qnan
14659 if (N0.isUndef()) {
14661 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14662 }
14663
14664 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14665 EVT VT = N->getValueType(0);
14666 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14667 }
14668
14669 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14670 // (fcanonicalize k)
14671 //
14672 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14673
14674 // TODO: This could be better with wider vectors that will be split to v2f16,
14675 // and to consider uses since there aren't that many packed operations.
14676 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14677 isTypeLegal(MVT::v2f16)) {
14678 SDLoc SL(N);
14679 SDValue NewElts[2];
14680 SDValue Lo = N0.getOperand(0);
14681 SDValue Hi = N0.getOperand(1);
14682 EVT EltVT = Lo.getValueType();
14683
14685 for (unsigned I = 0; I != 2; ++I) {
14686 SDValue Op = N0.getOperand(I);
14687 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14688 NewElts[I] =
14689 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14690 } else if (Op.isUndef()) {
14691 // Handled below based on what the other operand is.
14692 NewElts[I] = Op;
14693 } else {
14694 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14695 }
14696 }
14697
14698 // If one half is undef, and one is constant, prefer a splat vector rather
14699 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14700 // cheaper to use and may be free with a packed operation.
14701 if (NewElts[0].isUndef()) {
14702 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14703 ? NewElts[1]
14704 : DAG.getConstantFP(0.0f, SL, EltVT);
14705 }
14707
14708 if (NewElts[1].isUndef()) {
14709 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14710 ? NewElts[0]
14711 : DAG.getConstantFP(0.0f, SL, EltVT);
14712 }
14713
14714 return DAG.getBuildVector(VT, SL, NewElts);
14715 }
14716 }
14717
14718 return SDValue();
14719}
14720
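// Map a binary min/max opcode to the corresponding AMDGPU three-operand
// min3/max3 opcode, e.g. ISD::SMAX -> AMDGPUISD::SMAX3.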
14721static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14722 switch (Opc) {
14723 case ISD::FMAXNUM:
14724 case ISD::FMAXNUM_IEEE:
14725 case ISD::FMAXIMUMNUM:
14726 return AMDGPUISD::FMAX3;
14727 case ISD::FMAXIMUM:
14728 return AMDGPUISD::FMAXIMUM3;
14729 case ISD::SMAX:
14730 return AMDGPUISD::SMAX3;
14731 case ISD::UMAX:
14732 return AMDGPUISD::UMAX3;
14733 case ISD::FMINNUM:
14734 case ISD::FMINNUM_IEEE:
14735 case ISD::FMINIMUMNUM:
14736 return AMDGPUISD::FMIN3;
14737 case ISD::FMINIMUM:
14738 return AMDGPUISD::FMINIMUM3;
14739 case ISD::SMIN:
14740 return AMDGPUISD::SMIN3;
14741 case ISD::UMIN:
14742 return AMDGPUISD::UMIN3;
14743 default:
14744 llvm_unreachable("Not a min/max opcode");
14745 }
14746}
14747
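// Try to form an integer med3 from a min/max pair with constant bounds, e.g.
// min(max(x, K0), K1) with K0 < K1 becomes smed3(x, K0, K1) (or umed3 for the
// unsigned opcodes).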
14748SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14749 const SDLoc &SL, SDValue Src,
14750 SDValue MinVal,
14751 SDValue MaxVal,
14752 bool Signed) const {
14753
14754 // med3 comes from
14755 // min(max(x, K0), K1), K0 < K1
14756 // max(min(x, K0), K1), K1 < K0
14757 //
14758 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14759 // min/max op.
14760 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14761 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14762
14763 if (!MinK || !MaxK)
14764 return SDValue();
14765
14766 if (Signed) {
14767 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14768 return SDValue();
14769 } else {
14770 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14771 return SDValue();
14772 }
14773
14774 EVT VT = MinK->getValueType(0);
14775 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14776 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14777 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14778
14779 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14780 // not available, but this is unlikely to be profitable as constants
14781 // will often need to be materialized & extended, especially on
14782 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14783 return SDValue();
14784}
14785
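// Return the FP constant if the operand is a scalar ConstantFP or a constant
// FP splat build_vector; otherwise return nullptr.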
14788 return C;
14789
14791 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14792 return C;
14793 }
14794
14795 return nullptr;
14796}
14797
14798SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14799 const SDLoc &SL, SDValue Op0,
14800 SDValue Op1) const {
14801 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14802 if (!K1)
14803 return SDValue();
14804
14805 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14806 if (!K0)
14807 return SDValue();
14808
14809 // Ordered >= (although NaN inputs should have folded away by now).
14810 if (K0->getValueAPF() > K1->getValueAPF())
14811 return SDValue();
14812
14813 // med3 with a nan input acts like
14814 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14815 //
14816 // So the result depends on whether the IEEE mode bit is enabled or not with a
14817 // signaling nan input.
14818 // ieee=1
14819 // s0 snan: yields s2
14820 // s1 snan: yields s2
14821 // s2 snan: qnan
14822
14823 // s0 qnan: min(s1, s2)
14824 // s1 qnan: min(s0, s2)
14825 // s2 qnan: min(s0, s1)
14826
14827 // ieee=0
14828 // s0 snan: min(s1, s2)
14829 // s1 snan: min(s0, s2)
14830 // s2 snan: qnan
14831
14832 // s0 qnan: min(s1, s2)
14833 // s1 qnan: min(s0, s2)
14834 // s2 qnan: min(s0, s1)
14835 const MachineFunction &MF = DAG.getMachineFunction();
14836 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14837
14838 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
14839 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
14840 // can only form it if op0 is fmaxnum_ieee and IEEE=1.
14841 EVT VT = Op0.getValueType();
14842 if (Info->getMode().DX10Clamp) {
14843 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14844 // hardware fmed3 behavior converting to a min.
14845 // FIXME: Should this be allowing -0.0?
14846 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14847 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14848 }
14849
14850 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14851 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14852 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14853 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14854 // then give the other result, which is different from med3 with a NaN
14855 // input.
14856 SDValue Var = Op0.getOperand(0);
14857 if (!DAG.isKnownNeverSNaN(Var))
14858 return SDValue();
14859
14860 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14861
14862 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14863 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14864 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14865 SDValue(K0, 0), SDValue(K1, 0));
14866 }
14867 }
14868
14869 return SDValue();
14870}
14871
14872/// \return true if the subtarget supports minimum3 and maximum3 with the given
14873/// base min/max opcode \p Opc for type \p VT.
14874static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14875 EVT VT) {
14876 switch (Opc) {
14877 case ISD::FMINNUM:
14878 case ISD::FMAXNUM:
14879 case ISD::FMINNUM_IEEE:
14880 case ISD::FMAXNUM_IEEE:
14881 case ISD::FMINIMUMNUM:
14882 case ISD::FMAXIMUMNUM:
14885 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14886 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14887 case ISD::FMINIMUM:
14888 case ISD::FMAXIMUM:
14889 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14890 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14891 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14892 case ISD::SMAX:
14893 case ISD::SMIN:
14894 case ISD::UMAX:
14895 case ISD::UMIN:
14896 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14897 default:
14898 return false;
14899 }
14900
14901 llvm_unreachable("not a min/max opcode");
14902}
14903
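// Combine chained min/max operations into the three-operand min3/max3 forms
// and clamp-like patterns into med3 where the subtarget supports it, e.g.
// max(max(a, b), c) -> max3(a, b, c) when the inner max has a single use.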
14904SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14905 DAGCombinerInfo &DCI) const {
14906 SelectionDAG &DAG = DCI.DAG;
14907
14908 EVT VT = N->getValueType(0);
14909 unsigned Opc = N->getOpcode();
14910 SDValue Op0 = N->getOperand(0);
14911 SDValue Op1 = N->getOperand(1);
14912
14913 // Only do this if the inner op has one use, since otherwise this would just
14914 // increase register pressure for no benefit.
14915
14916 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14917 // max(max(a, b), c) -> max3(a, b, c)
14918 // min(min(a, b), c) -> min3(a, b, c)
14919 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14920 SDLoc DL(N);
14921 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14922 Op0.getOperand(0), Op0.getOperand(1), Op1);
14923 }
14924
14925 // Try commuted.
14926 // max(a, max(b, c)) -> max3(a, b, c)
14927 // min(a, min(b, c)) -> min3(a, b, c)
14928 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14929 SDLoc DL(N);
14930 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14931 Op0, Op1.getOperand(0), Op1.getOperand(1));
14932 }
14933 }
14934
14935 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14936 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14937 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14938 if (SDValue Med3 = performIntMed3ImmCombine(
14939 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14940 return Med3;
14941 }
14942 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14943 if (SDValue Med3 = performIntMed3ImmCombine(
14944 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14945 return Med3;
14946 }
14947
14948 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14949 if (SDValue Med3 = performIntMed3ImmCombine(
14950 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14951 return Med3;
14952 }
14953 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14954 if (SDValue Med3 = performIntMed3ImmCombine(
14955 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14956 return Med3;
14957 }
14958
14959 // if !is_snan(x):
14960 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14961 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14962 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14963 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14964 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14965 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14966 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14968 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14969 (VT == MVT::f32 || VT == MVT::f64 ||
14970 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14971 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14972 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14973 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14974 Op0.hasOneUse()) {
14975 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14976 return Res;
14977 }
14978
14979 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14980 // for some types, but at a higher cost since they are implemented with a
14981 // three-operand form.
14982 const SDNodeFlags Flags = N->getFlags();
14983 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14984 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14985 unsigned NewOpc =
14986 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14987 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14988 }
14989
14990 return SDValue();
14991}
14992
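// Return true if the two operands are the constants 0.0 and 1.0 in either
// order, i.e. the bounds of a clamp to [0.0, 1.0].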
14996 // FIXME: Should this be allowing -0.0?
14997 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14998 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14999 }
15000 }
15001
15002 return false;
15003}
15004
15005// FIXME: Should only worry about snans for version with chain.
15006SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15007 DAGCombinerInfo &DCI) const {
15008 EVT VT = N->getValueType(0);
15009 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15010 // NaNs. With a NaN input, the order of the operands may change the result.
15011
15012 SelectionDAG &DAG = DCI.DAG;
15013 SDLoc SL(N);
15014
15015 SDValue Src0 = N->getOperand(0);
15016 SDValue Src1 = N->getOperand(1);
15017 SDValue Src2 = N->getOperand(2);
15018
15019 if (isClampZeroToOne(Src0, Src1)) {
15020 // const_a, const_b, x -> clamp is safe in all cases including signaling
15021 // nans.
15022 // FIXME: Should this be allowing -0.0?
15023 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15024 }
15025
15026 const MachineFunction &MF = DAG.getMachineFunction();
15027 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15028
15029 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15030 // handling no dx10-clamp?
15031 if (Info->getMode().DX10Clamp) {
15032 // If NaNs are clamped to 0, we are free to reorder the inputs.
15033
15034 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15035 std::swap(Src0, Src1);
15036
15037 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15038 std::swap(Src1, Src2);
15039
15040 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15041 std::swap(Src0, Src1);
15042
15043 if (isClampZeroToOne(Src1, Src2))
15044 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15045 }
15046
15047 return SDValue();
15048}
15049
15050SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15051 DAGCombinerInfo &DCI) const {
15052 SDValue Src0 = N->getOperand(0);
15053 SDValue Src1 = N->getOperand(1);
15054 if (Src0.isUndef() && Src1.isUndef())
15055 return DCI.DAG.getUNDEF(N->getValueType(0));
15056 return SDValue();
15057}
15058
15059// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15060// expanded into a set of cmp/select instructions.
15062 unsigned NumElem,
15063 bool IsDivergentIdx,
15064 const GCNSubtarget *Subtarget) {
15066 return false;
15067
15068 unsigned VecSize = EltSize * NumElem;
15069
15070 // Sub-dword vectors of size 2 dwords or less have a better implementation.
15071 if (VecSize <= 64 && EltSize < 32)
15072 return false;
15073
15074 // Always expand the remaining sub-dword operations, otherwise they will be
15075 // lowered via memory.
15076 if (EltSize < 32)
15077 return true;
15078
15079 // Always do this if var-idx is divergent, otherwise it will become a loop.
15080 if (IsDivergentIdx)
15081 return true;
15082
15083 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15084 unsigned NumInsts = NumElem /* Number of compares */ +
15085 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
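  // For example, a <8 x i32> vector gives NumInsts = 8 + 8 = 16, which is
  // expanded in VGPR-index mode (16 <= 16) but left to movrel otherwise
  // (16 > 15).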
15086
15087 // On some architectures (GFX9) movrel is not available and it's better
15088 // to expand.
15089 if (Subtarget->useVGPRIndexMode())
15090 return NumInsts <= 16;
15091
15092 // If movrel is available, use it instead of expanding for vectors of 8
15093 // elements.
15094 if (Subtarget->hasMovrel())
15095 return NumInsts <= 15;
15096
15097 return true;
15098}
15099
15101 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15102 if (isa<ConstantSDNode>(Idx))
15103 return false;
15104
15105 SDValue Vec = N->getOperand(0);
15106 EVT VecVT = Vec.getValueType();
15107 EVT EltVT = VecVT.getVectorElementType();
15108 unsigned EltSize = EltVT.getSizeInBits();
15109 unsigned NumElem = VecVT.getVectorNumElements();
15110
15112 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15113}
15114
15115SDValue
15116SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15117 DAGCombinerInfo &DCI) const {
15118 SDValue Vec = N->getOperand(0);
15119 SelectionDAG &DAG = DCI.DAG;
15120
15121 EVT VecVT = Vec.getValueType();
15122 EVT VecEltVT = VecVT.getVectorElementType();
15123 EVT ResVT = N->getValueType(0);
15124
15125 unsigned VecSize = VecVT.getSizeInBits();
15126 unsigned VecEltSize = VecEltVT.getSizeInBits();
15127
15128 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15130 SDLoc SL(N);
15131 SDValue Idx = N->getOperand(1);
15132 SDValue Elt =
15133 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15134 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15135 }
15136
15137 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15138 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15139 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15140 // depending on the shift operand. See e.g. performSraCombine().
15141 // This combine ensures that the optimisation is compatible with v2i32
15142 // legalised AND.
15143 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15144 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15145
15147 if (!C || C->getZExtValue() != 0x1f)
15148 return SDValue();
15149
15150 SDLoc SL(N);
15151 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15152 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15153 Vec->getOperand(0), N->getOperand(1));
15154 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15155 DAG.ReplaceAllUsesWith(N, A.getNode());
15156 }
15157
15158 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15159 // =>
15160 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15161 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15162 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15163 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15164 SDLoc SL(N);
15165 SDValue Idx = N->getOperand(1);
15166 unsigned Opc = Vec.getOpcode();
15167
15168 switch (Opc) {
15169 default:
15170 break;
15171 // TODO: Support other binary operations.
15172 case ISD::FADD:
15173 case ISD::FSUB:
15174 case ISD::FMUL:
15175 case ISD::ADD:
15176 case ISD::UMIN:
15177 case ISD::UMAX:
15178 case ISD::SMIN:
15179 case ISD::SMAX:
15180 case ISD::FMAXNUM:
15181 case ISD::FMINNUM:
15182 case ISD::FMAXNUM_IEEE:
15183 case ISD::FMINNUM_IEEE:
15184 case ISD::FMAXIMUM:
15185 case ISD::FMINIMUM: {
15186 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15187 Vec.getOperand(0), Idx);
15188 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15189 Vec.getOperand(1), Idx);
15190
15191 DCI.AddToWorklist(Elt0.getNode());
15192 DCI.AddToWorklist(Elt1.getNode());
15193 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15194 }
15195 }
15196 }
15197
15198 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15200 SDLoc SL(N);
15201 SDValue Idx = N->getOperand(1);
15202 SDValue V;
15203 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15204 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15205 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15206 if (I == 0)
15207 V = Elt;
15208 else
15209 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15210 }
15211 return V;
15212 }
15213
15214 if (!DCI.isBeforeLegalize())
15215 return SDValue();
15216
15217 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15218 // elements. This exposes more load reduction opportunities by replacing
15219 // multiple small extract_vector_elements with a single 32-bit extract.
15220 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15221 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15222 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15223 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15224
15225 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15226 unsigned EltIdx = BitIndex / 32;
15227 unsigned LeftoverBitIdx = BitIndex % 32;
15228 SDLoc SL(N);
15229
15230 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15231 DCI.AddToWorklist(Cast.getNode());
15232
15233 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15234 DAG.getConstant(EltIdx, SL, MVT::i32));
15235 DCI.AddToWorklist(Elt.getNode());
15236 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15237 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15238 DCI.AddToWorklist(Srl.getNode());
15239
15240 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15241 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15242 DCI.AddToWorklist(Trunc.getNode());
15243
15244 if (VecEltVT == ResVT) {
15245 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15246 }
15247
15248 assert(ResVT.isScalarInteger());
15249 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15250 }
15251
15252 return SDValue();
15253}
15254
15255SDValue
15256SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15257 DAGCombinerInfo &DCI) const {
15258 SDValue Vec = N->getOperand(0);
15259 SDValue Idx = N->getOperand(2);
15260 EVT VecVT = Vec.getValueType();
15261 EVT EltVT = VecVT.getVectorElementType();
15262
15263 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15264 // => BUILD_VECTOR n x select (e, const-idx)
15266 return SDValue();
15267
15268 SelectionDAG &DAG = DCI.DAG;
15269 SDLoc SL(N);
15270 SDValue Ins = N->getOperand(1);
15271 EVT IdxVT = Idx.getValueType();
15272
15274 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15275 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15276 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15277 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15278 Ops.push_back(V);
15279 }
15280
15281 return DAG.getBuildVector(VecVT, SL, Ops);
15282}
15283
15284/// Return the source of an fp_extend from f16 to f32, or a converted FP
15285/// constant.
15287 if (Src.getOpcode() == ISD::FP_EXTEND &&
15288 Src.getOperand(0).getValueType() == MVT::f16) {
15289 return Src.getOperand(0);
15290 }
15291
15292 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15293 APFloat Val = CFP->getValueAPF();
15294 bool LosesInfo = true;
15296 if (!LosesInfo)
15297 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15298 }
15299
15300 return SDValue();
15301}
15302
15303SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15304 DAGCombinerInfo &DCI) const {
15305 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15306 "combine only useful on gfx8");
15307
15308 SDValue TruncSrc = N->getOperand(0);
15309 EVT VT = N->getValueType(0);
15310 if (VT != MVT::f16)
15311 return SDValue();
15312
15313 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15314 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15315 return SDValue();
15316
15317 SelectionDAG &DAG = DCI.DAG;
15318 SDLoc SL(N);
15319
15320 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15321 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15322 // casting back.
15323
15324 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15325 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15326 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15327 if (!A)
15328 return SDValue();
15329
15330 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15331 if (!B)
15332 return SDValue();
15333
15334 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15335 if (!C)
15336 return SDValue();
15337
15338 // This changes signaling nan behavior. If an input is a signaling nan, it
15339 // would have been quieted by the fpext originally. We don't care because
15340 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15341 // we would be worse off than just doing the promotion.
15342 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15343 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15344 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15345 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15346}
15347
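// Choose the opcode used to contract (fadd (fmul a, b), c): FMAD when the
// type's denormals are ignored (v_mad_f32/v_mad_f16 do not handle denormals),
// FMA when contraction is allowed and FMA is fast on the subtarget, or 0 to
// leave the nodes unfused.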
15348unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15349 const SDNode *N0,
15350 const SDNode *N1) const {
15351 EVT VT = N0->getValueType(0);
15352
15353 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15354 // support denormals ever.
15355 if (((VT == MVT::f32 &&
15357 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15360 return ISD::FMAD;
15361
15362 const TargetOptions &Options = DAG.getTarget().Options;
15363 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15364 (N0->getFlags().hasAllowContract() &&
15365 N1->getFlags().hasAllowContract())) &&
15367 return ISD::FMA;
15368 }
15369
15370 return 0;
15371}
15372
15373// For a reassociatable opcode perform:
15374// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15375SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15376 SelectionDAG &DAG) const {
15377 EVT VT = N->getValueType(0);
15378 if (VT != MVT::i32 && VT != MVT::i64)
15379 return SDValue();
15380
15381 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15382 return SDValue();
15383
15384 unsigned Opc = N->getOpcode();
15385 SDValue Op0 = N->getOperand(0);
15386 SDValue Op1 = N->getOperand(1);
15387
15388 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15389 return SDValue();
15390
15391 if (Op0->isDivergent())
15392 std::swap(Op0, Op1);
15393
15394 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15395 return SDValue();
15396
15397 SDValue Op2 = Op1.getOperand(1);
15398 Op1 = Op1.getOperand(0);
15399 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15400 return SDValue();
15401
15402 if (Op1->isDivergent())
15403 std::swap(Op1, Op2);
15404
15405 SDLoc SL(N);
15406 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15407 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15408}
15409
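// Emit an AMDGPU mad_i64_i32/mad_u64_u32 node (selected by \p Signed)
// computing N0 * N1 + N2 with 32-bit factors and a 64-bit accumulator, then
// truncate the 64-bit result back to VT.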
15410static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15411 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15413 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15414 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15415 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15416}
15417
15418// Fold
15419// y = lshr i64 x, 32
15420// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15421// with Const.hi == -1
15422// To
15423 // res = mad_u64_u32 y.lo, Const.lo, x.lo
15425 SDValue MulLHS, SDValue MulRHS,
15426 SDValue AddRHS) {
15427 if (MulRHS.getOpcode() == ISD::SRL)
15428 std::swap(MulLHS, MulRHS);
15429
15430 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15431 return SDValue();
15432
15433 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15434 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15435 MulLHS.getOperand(0) != AddRHS)
15436 return SDValue();
15437
15439 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15440 return SDValue();
15441
15442 SDValue ConstMul =
15443 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15444 return getMad64_32(DAG, SL, MVT::i64,
15445 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15446 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15447}
15448
15449// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15450// multiplies, if any.
15451//
15452// Full 64-bit multiplies that feed into an addition are lowered here instead
15453// of using the generic expansion. The generic expansion ends up with
15454// a tree of ADD nodes that prevents us from using the "add" part of the
15455// MAD instruction. The expansion produced here results in a chain of ADDs
15456// instead of a tree.
15457SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15458 DAGCombinerInfo &DCI) const {
15459 assert(N->isAnyAdd());
15460
15461 SelectionDAG &DAG = DCI.DAG;
15462 EVT VT = N->getValueType(0);
15463 SDLoc SL(N);
15464 SDValue LHS = N->getOperand(0);
15465 SDValue RHS = N->getOperand(1);
15466
15467 if (VT.isVector())
15468 return SDValue();
15469
15470 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15471 // result in scalar registers for uniform values.
15472 if (!N->isDivergent() && Subtarget->hasSMulHi())
15473 return SDValue();
15474
15475 unsigned NumBits = VT.getScalarSizeInBits();
15476 if (NumBits <= 32 || NumBits > 64)
15477 return SDValue();
15478
15479 if (LHS.getOpcode() != ISD::MUL) {
15480 assert(RHS.getOpcode() == ISD::MUL);
15481 std::swap(LHS, RHS);
15482 }
15483
15484 // Avoid the fold if it would unduly increase the number of multiplies due to
15485 // multiple uses, except on hardware with full-rate multiply-add (which is
15486 // part of full-rate 64-bit ops).
15487 if (!Subtarget->hasFullRate64Ops()) {
15488 unsigned NumUsers = 0;
15489 for (SDNode *User : LHS->users()) {
15490 // There is a use that does not feed into addition, so the multiply can't
15491 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15492 if (!User->isAnyAdd())
15493 return SDValue();
15494
15495 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15496 // MUL + 3xADD + 3xADDC over 3xMAD.
15497 ++NumUsers;
15498 if (NumUsers >= 3)
15499 return SDValue();
15500 }
15501 }
15502
15503 SDValue MulLHS = LHS.getOperand(0);
15504 SDValue MulRHS = LHS.getOperand(1);
15505 SDValue AddRHS = RHS;
15506
15507 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15508 return FoldedMAD;
15509
15510 // Always check whether operands are small unsigned values, since that
15511 // knowledge is useful in more cases. Check for small signed values only if
15512 // doing so can unlock a shorter code sequence.
15513 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15514 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15515
15516 bool MulSignedLo = false;
15517 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15518 MulSignedLo =
15519 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15520 }
15521
15522 // The operands and final result all have the same number of bits. If
15523 // operands need to be extended, they can be extended with garbage. The
15524 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15525 // truncated away in the end.
15526 if (VT != MVT::i64) {
15527 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15528 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15529 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15530 }
15531
15532 // The basic code generated is conceptually straightforward. Pseudo code:
15533 //
15534 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15535 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15536 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15537 //
15538 // The second and third lines are optional, depending on whether the factors
15539 // are {sign,zero}-extended or not.
15540 //
15541 // The actual DAG is noisier than the pseudo code, but only due to
15542 // instructions that disassemble values into low and high parts, and
15543 // assemble the final result.
15544 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15545
15546 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15547 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15548 SDValue Accum =
15549 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15550
15551 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15552 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15553
15554 if (!MulLHSUnsigned32) {
15555 auto MulLHSHi =
15556 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15557 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15558 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15559 }
15560
15561 if (!MulRHSUnsigned32) {
15562 auto MulRHSHi =
15563 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15564 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15565 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15566 }
15567
15568 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15569 Accum = DAG.getBitcast(MVT::i64, Accum);
15570 }
15571
15572 if (VT != MVT::i64)
15573 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15574 return Accum;
15575}
15576
15577SDValue
15578SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15579 DAGCombinerInfo &DCI) const {
15580 SDValue RHS = N->getOperand(1);
15581 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15582 if (!CRHS)
15583 return SDValue();
15584
15585 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15586 // common.
15587 uint64_t Val = CRHS->getZExtValue();
15588 if (countr_zero(Val) >= 32) {
15589 SelectionDAG &DAG = DCI.DAG;
15590 SDLoc SL(N);
15591 SDValue LHS = N->getOperand(0);
15592
15593 // Avoid carry machinery if we know the low half of the add does not
15594 // contribute to the final result.
15595 //
15596 // add i64:x, K if computeTrailingZeros(K) >= 32
15597 // => build_pair (add x.hi, K.hi), x.lo
15598
15599 // Breaking the 64-bit add here with this strange constant is unlikely
15600 // to interfere with addressing mode patterns.
15601
15602 SDValue Hi = getHiHalf64(LHS, DAG);
15603 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15604 unsigned Opcode = N->getOpcode();
15605 if (Opcode == ISD::PTRADD)
15606 Opcode = ISD::ADD;
15607 SDValue AddHi =
15608 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15609
15610 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15611 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15612 }
15613
15614 return SDValue();
15615}
15616
15617 // Collect the ultimate src of each of the mul node's operands, and confirm
15618 // each operand is at most 8 bits wide (a single byte).
15619static std::optional<ByteProvider<SDValue>>
15620handleMulOperand(const SDValue &MulOperand) {
15621 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15622 if (!Byte0 || Byte0->isConstantZero()) {
15623 return std::nullopt;
15624 }
15625 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15626 if (Byte1 && !Byte1->isConstantZero()) {
15627 return std::nullopt;
15628 }
15629 return Byte0;
15630}
15631
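// Merge two v_perm byte-select masks. The 0x0c selector produces a constant
// zero byte, so for each byte the non-zero selector wins and 0x0c survives
// only where both masks select zero.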
15632static unsigned addPermMasks(unsigned First, unsigned Second) {
15633 unsigned FirstCs = First & 0x0c0c0c0c;
15634 unsigned SecondCs = Second & 0x0c0c0c0c;
15635 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15636 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15637
15638 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15639 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15640 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15641 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15642
15643 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15644}
15645
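// One dword-sized dot-product source: the SDValue it comes from, the v_perm
// mask that selects its bytes, and the dword offset within that value.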
15646struct DotSrc {
15648 int64_t PermMask;
15650};
15651
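// Record the byte providers for one mul of the chain into Src0s/Src1s,
// merging the perm mask into an existing entry when the same source dword is
// already present, and appending a new entry otherwise.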
15655 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15656
15657 assert(Src0.Src.has_value() && Src1.Src.has_value());
15658 // Src0s and Src1s are empty, just place arbitrarily.
15659 if (Step == 0) {
15660 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15661 Src0.SrcOffset / 4});
15662 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15663 Src1.SrcOffset / 4});
15664 return;
15665 }
15666
15667 for (int BPI = 0; BPI < 2; BPI++) {
15668 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15669 if (BPI == 1) {
15670 BPP = {Src1, Src0};
15671 }
15672 unsigned ZeroMask = 0x0c0c0c0c;
15673 unsigned FMask = 0xFF << (8 * (3 - Step));
15674
15675 unsigned FirstMask =
15676 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15677 unsigned SecondMask =
15678 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15679 // Attempt to find a Src vector which contains our SDValue; if so, add our
15680 // perm mask to the existing one. If we are unable to find a match for the
15681 // first SDValue, attempt to find a match for the second.
15682 int FirstGroup = -1;
15683 for (int I = 0; I < 2; I++) {
15684 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15685 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15686 return IterElt.SrcOp == *BPP.first.Src &&
15687 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15688 };
15689
15690 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15691 if (Match != Srcs.end()) {
15692 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15693 FirstGroup = I;
15694 break;
15695 }
15696 }
15697 if (FirstGroup != -1) {
15698 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15699 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15700 return IterElt.SrcOp == *BPP.second.Src &&
15701 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15702 };
15703 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15704 if (Match != Srcs.end()) {
15705 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15706 } else
15707 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15708 return;
15709 }
15710 }
15711
15712 // If we have made it here, then we could not find a match in Src0s or Src1s
15713 // for either Src0 or Src1, so just place them arbitrarily.
15714
15715 unsigned ZeroMask = 0x0c0c0c0c;
15716 unsigned FMask = 0xFF << (8 * (3 - Step));
15717
15718 Src0s.push_back(
15719 {*Src0.Src,
15720 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15721 Src0.SrcOffset / 4});
15722 Src1s.push_back(
15723 {*Src1.Src,
15724 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15725 Src1.SrcOffset / 4});
15726}
15727
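// Combine the collected DotSrc entries into a single i32 operand for the dot,
// using v_perm to gather the selected bytes and OR to merge partial results.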
15729 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15730 bool IsAny) {
15731
15732 // If we only have one source, just permute it accordingly.
15733 if (Srcs.size() == 1) {
15734 auto *Elt = Srcs.begin();
15735 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15736
15737 // v_perm will produce the original value
15738 if (Elt->PermMask == 0x3020100)
15739 return EltOp;
15740
15741 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15742 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15743 }
15744
15745 auto *FirstElt = Srcs.begin();
15746 auto *SecondElt = std::next(FirstElt);
15747
15749
15750 // If we have multiple sources in the chain, combine them via perms (using
15751 // calculated perm mask) and Ors.
15752 while (true) {
15753 auto FirstMask = FirstElt->PermMask;
15754 auto SecondMask = SecondElt->PermMask;
15755
15756 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15757 unsigned FirstPlusFour = FirstMask | 0x04040404;
15758 // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
15759 // original 0x0C.
15760 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15761
15762 auto PermMask = addPermMasks(FirstMask, SecondMask);
15763 auto FirstVal =
15764 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15765 auto SecondVal =
15766 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15767
15768 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15769 SecondVal,
15770 DAG.getConstant(PermMask, SL, MVT::i32)));
15771
15772 FirstElt = std::next(SecondElt);
15773 if (FirstElt == Srcs.end())
15774 break;
15775
15776 SecondElt = std::next(FirstElt);
15777 // If we only have a FirstElt, then just combine that into the cumulative
15778 // source node.
15779 if (SecondElt == Srcs.end()) {
15780 auto EltOp =
15781 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15782
15783 Perms.push_back(
15784 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15785 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15786 break;
15787 }
15788 }
15789
15790 assert(Perms.size() == 1 || Perms.size() == 2);
15791 return Perms.size() == 2
15792 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15793 : Perms[0];
15794}
15795
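// For chains shorter than 4, shift each mask down to the bytes actually used
// and mark the remaining byte selectors as constant zero (0x0c).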
15796static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15797 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15798 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15799 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15800 EntryMask += ZeroMask;
15801 }
15802}
15803
15804static bool isMul(const SDValue Op) {
15805 auto Opcode = Op.getOpcode();
15806
15807 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15808 Opcode == AMDGPUISD::MUL_I24);
15809}
15810
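// Decide whether the dot4 should use signed or unsigned semantics based on
// what is known about the sign/zero-extension of the two mul operands;
// std::nullopt means the operands disagree and no dot4 should be formed.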
15811static std::optional<bool>
15813 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15814 const SDValue &S1Op, const SelectionDAG &DAG) {
15815 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15816 // of the dot4 are irrelevant.
15817 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15818 return false;
15819
15820 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15821 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15822 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15823 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15824 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15825 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15826
15827 assert(!(S0IsUnsigned && S0IsSigned));
15828 assert(!(S1IsUnsigned && S1IsSigned));
15829
15830 // There are 9 possible permutations of
15831 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15832
15833 // In two permutations, the sign bits are known to be the same for both Ops,
15834 // so simply return Signed / Unsigned corresponding to the MSB
15835
15836 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15837 return S0IsSigned;
15838
15839 // In another two permutations, the sign bits are known to be opposite. In
15840 // this case return std::nullopt to indicate a bad match.
15841
15842 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15843 return std::nullopt;
15844
15845 // In the remaining five permutations, we don't know the value of the sign
15846 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15847 // the upper bits must be extension bits. Thus, the only ways for the sign
15848 // bit to be unknown are if it was sign extended from an unknown value, or if
15849 // it was any-extended. In either case, it is correct to use the signed
15850 // version of dot4.
15851
15852 // In two of these permutations, we know the sign bit is set for
15853 // one op, and the other is unknown. It is okay to use the signed version of
15854 // dot4.
15855 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15856 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15857 return true;
15858
15859 // In one such permutation, we don't know either of the sign bits. It is okay
15860 // to use the signed version of dot4.
15861 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15862 return true;
15863
15864 // In two of these permutations, we know the sign bit is unset for
15865 // one op, and the other is unknown. Return std::nullopt to indicate a
15866 // bad match.
15867 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15868 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15869 return std::nullopt;
15870
15871 llvm_unreachable("Fully covered condition");
15872}
15873
15874SDValue SITargetLowering::performAddCombine(SDNode *N,
15875 DAGCombinerInfo &DCI) const {
15876 SelectionDAG &DAG = DCI.DAG;
15877 EVT VT = N->getValueType(0);
15878 SDLoc SL(N);
15879 SDValue LHS = N->getOperand(0);
15880 SDValue RHS = N->getOperand(1);
15881
15882 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15883 if (Subtarget->hasMad64_32()) {
15884 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15885 return Folded;
15886 }
15887 }
15888
15889 if (SDValue V = reassociateScalarOps(N, DAG)) {
15890 return V;
15891 }
15892
15893 if (VT == MVT::i64) {
15894 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15895 return Folded;
15896 }
15897
15898 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15899 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15900 SDValue TempNode(N, 0);
15901 std::optional<bool> IsSigned;
15905
15906 // Match the v_dot4 tree, while collecting src nodes.
15907 int ChainLength = 0;
15908 for (int I = 0; I < 4; I++) {
15909 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15910 if (MulIdx == -1)
15911 break;
15912 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15913 if (!Src0)
15914 break;
15915 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15916 if (!Src1)
15917 break;
15918
15919 auto IterIsSigned = checkDot4MulSignedness(
15920 TempNode->getOperand(MulIdx), *Src0, *Src1,
15921 TempNode->getOperand(MulIdx)->getOperand(0),
15922 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15923 if (!IterIsSigned)
15924 break;
15925 if (!IsSigned)
15926 IsSigned = *IterIsSigned;
15927 if (*IterIsSigned != *IsSigned)
15928 break;
15929 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15930 auto AddIdx = 1 - MulIdx;
15931 // Allow the special case where add (add (mul24, 0), mul24) has been folded
15932 // into add (mul24, mul24).
15933 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15934 Src2s.push_back(TempNode->getOperand(AddIdx));
15935 auto Src0 =
15936 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15937 if (!Src0)
15938 break;
15939 auto Src1 =
15940 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15941 if (!Src1)
15942 break;
15943 auto IterIsSigned = checkDot4MulSignedness(
15944 TempNode->getOperand(AddIdx), *Src0, *Src1,
15945 TempNode->getOperand(AddIdx)->getOperand(0),
15946 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15947 if (!IterIsSigned)
15948 break;
15949 assert(IsSigned);
15950 if (*IterIsSigned != *IsSigned)
15951 break;
15952 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15953 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15954 ChainLength = I + 2;
15955 break;
15956 }
15957
15958 TempNode = TempNode->getOperand(AddIdx);
15959 Src2s.push_back(TempNode);
15960 ChainLength = I + 1;
15961 if (TempNode->getNumOperands() < 2)
15962 break;
15963 LHS = TempNode->getOperand(0);
15964 RHS = TempNode->getOperand(1);
15965 }
15966
15967 if (ChainLength < 2)
15968 return SDValue();
15969
15970 // Masks were constructed with the assumption that we would find a chain of
15971 // length 4. If not, then we need to zero out the unused high bytes (via a
15972 // perm selector of 0x0c) so they do not affect the dot calculation.
15973 if (ChainLength < 4) {
15974 fixMasks(Src0s, ChainLength);
15975 fixMasks(Src1s, ChainLength);
15976 }
15977
15978 SDValue Src0, Src1;
15979
15980 // If we are just using a single source for both, and have permuted the
15981 // bytes consistently, we can just use the sources without permuting
15982 // (commutation).
15983 bool UseOriginalSrc = false;
15984 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15985 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15986 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15987 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15988 SmallVector<unsigned, 4> SrcBytes;
15989 auto Src0Mask = Src0s.begin()->PermMask;
15990 SrcBytes.push_back(Src0Mask & 0xFF000000);
15991 bool UniqueEntries = true;
15992 for (auto I = 1; I < 4; I++) {
15993 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15994
15995 if (is_contained(SrcBytes, NextByte)) {
15996 UniqueEntries = false;
15997 break;
15998 }
15999 SrcBytes.push_back(NextByte);
16000 }
16001
16002 if (UniqueEntries) {
16003 UseOriginalSrc = true;
16004
16005 auto *FirstElt = Src0s.begin();
16006 auto FirstEltOp =
16007 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16008
16009 auto *SecondElt = Src1s.begin();
16010 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16011 SecondElt->DWordOffset);
16012
16013 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16014 MVT::getIntegerVT(32));
16015 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16016 MVT::getIntegerVT(32));
16017 }
16018 }
16019
16020 if (!UseOriginalSrc) {
16021 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16022 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16023 }
16024
16025 assert(IsSigned);
16026 SDValue Src2 =
16027 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16028
16029 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16030 : Intrinsic::amdgcn_udot4,
16031 SL, MVT::i64);
16032
16033 assert(!VT.isVector());
16034 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16035 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16036
16037 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16038 }
16039
16040 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16041 return SDValue();
16042
16043 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16044 // add x, sext (setcc) => usubo_carry x, 0, setcc
16045 unsigned Opc = LHS.getOpcode();
16048 std::swap(RHS, LHS);
16049
16050 Opc = RHS.getOpcode();
16051 switch (Opc) {
16052 default:
16053 break;
16054 case ISD::ZERO_EXTEND:
16055 case ISD::SIGN_EXTEND:
16056 case ISD::ANY_EXTEND: {
16057 auto Cond = RHS.getOperand(0);
16058 // If this won't be a real VOPC output, we would still need to insert an
16059 // extra instruction anyway.
16060 if (!isBoolSGPR(Cond))
16061 break;
16062 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16063 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16065 return DAG.getNode(Opc, SL, VTList, Args);
16066 }
16067 case ISD::UADDO_CARRY: {
16068 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16069 if (!isNullConstant(RHS.getOperand(1)))
16070 break;
16071 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16072 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16073 }
16074 }
16075 return SDValue();
16076}
16077
16078SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16079 DAGCombinerInfo &DCI) const {
16080 SelectionDAG &DAG = DCI.DAG;
16081 SDLoc DL(N);
16082 EVT VT = N->getValueType(0);
16083 SDValue N0 = N->getOperand(0);
16084 SDValue N1 = N->getOperand(1);
16085
16086 // The following folds transform PTRADDs into regular arithmetic in cases
16087 // where the PTRADD wouldn't be folded as an immediate offset into memory
16088 // instructions anyway. They are target-specific in that other targets might
16089 // prefer to not lose information about the pointer arithmetic.
16090
16091 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16092 // Adapted from DAGCombiner::visitADDLikeCommutative.
16093 SDValue V, K;
16094 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16095 SDNodeFlags ShlFlags = N1->getFlags();
16096 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16097 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16098 // preserved.
16099 SDNodeFlags NewShlFlags =
16100 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16101 ? SDNodeFlags::NoSignedWrap
16102 : SDNodeFlags();
16103 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16104 DCI.AddToWorklist(Inner.getNode());
16105 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16106 }
16107
16108 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16109 // performAddCombine.
16110 if (N1.getOpcode() == ISD::MUL) {
16111 if (Subtarget->hasMad64_32()) {
16112 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16113 return Folded;
16114 }
16115 }
16116
16117 // If the 32 low bits of the constant are all zero, there is nothing to fold
16118 // into an immediate offset, so it's better to eliminate the unnecessary
16119 // addition for the lower 32 bits than to preserve the PTRADD.
16120 // Analogous to a fold in performAddCombine.
16121 if (VT == MVT::i64) {
16122 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16123 return Folded;
16124 }
16125
16126 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16127 return SDValue();
16128
16129 SDValue X = N0;
16130 SDValue Y = N1.getOperand(0);
16131 SDValue Z = N1.getOperand(1);
16132 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16133 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16134
16135 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16136 Y->isDivergent() != Z->isDivergent()) {
16137 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16138 // y are uniform and z isn't.
16139 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16140 // z are uniform and y isn't.
16141 // The goal is to push uniform operands up in the computation, so that they
16142 // can be handled with scalar operations. We can't use reassociateScalarOps
16143 // for this since it requires two identical commutative operations to
16144 // reassociate.
16145 if (Y->isDivergent())
16146 std::swap(Y, Z);
16147 // If both additions in the original were NUW, reassociation preserves that.
16148 SDNodeFlags ReassocFlags =
16149 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16150 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16151 DCI.AddToWorklist(UniformInner.getNode());
16152 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16153 }
16154
16155 return SDValue();
16156}
16157
16158SDValue SITargetLowering::performSubCombine(SDNode *N,
16159 DAGCombinerInfo &DCI) const {
16160 SelectionDAG &DAG = DCI.DAG;
16161 EVT VT = N->getValueType(0);
16162
16163 if (VT == MVT::i64) {
16164 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16165 return Folded;
16166 }
16167
16168 if (VT != MVT::i32)
16169 return SDValue();
16170
16171 SDLoc SL(N);
16172 SDValue LHS = N->getOperand(0);
16173 SDValue RHS = N->getOperand(1);
16174
16175 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16176 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16177 unsigned Opc = RHS.getOpcode();
16178 switch (Opc) {
16179 default:
16180 break;
16181 case ISD::ZERO_EXTEND:
16182 case ISD::SIGN_EXTEND:
16183 case ISD::ANY_EXTEND: {
16184 auto Cond = RHS.getOperand(0);
16185 // If this won't be a real VOPC output, we would still need to insert an
16186 // extra instruction anyway.
16187 if (!isBoolSGPR(Cond))
16188 break;
16189 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16190 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16191 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16192 return DAG.getNode(Opc, SL, VTList, Args);
16193 }
16194 }
16195
16196 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16197 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16198 if (!isNullConstant(LHS.getOperand(1)))
16199 return SDValue();
16200 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16201 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16202 }
16203 return SDValue();
16204}
16205
16206SDValue
16207SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16208 DAGCombinerInfo &DCI) const {
16209
16210 if (N->getValueType(0) != MVT::i32)
16211 return SDValue();
16212
16213 if (!isNullConstant(N->getOperand(1)))
16214 return SDValue();
16215
16216 SelectionDAG &DAG = DCI.DAG;
16217 SDValue LHS = N->getOperand(0);
16218
16219 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16220 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16221 unsigned LHSOpc = LHS.getOpcode();
16222 unsigned Opc = N->getOpcode();
16223 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16224 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16225 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16226 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16227 }
16228 return SDValue();
16229}
16230
16231SDValue SITargetLowering::performFAddCombine(SDNode *N,
16232 DAGCombinerInfo &DCI) const {
16233 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16234 return SDValue();
16235
16236 SelectionDAG &DAG = DCI.DAG;
16237 EVT VT = N->getValueType(0);
16238
16239 SDLoc SL(N);
16240 SDValue LHS = N->getOperand(0);
16241 SDValue RHS = N->getOperand(1);
16242
16243 // These should really be instruction patterns, but writing patterns with
16244 // source modifiers is a pain.
16245
16246 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16247 if (LHS.getOpcode() == ISD::FADD) {
16248 SDValue A = LHS.getOperand(0);
16249 if (A == LHS.getOperand(1)) {
16250 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16251 if (FusedOp != 0) {
16252 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16253 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16254 }
16255 }
16256 }
16257
16258 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16259 if (RHS.getOpcode() == ISD::FADD) {
16260 SDValue A = RHS.getOperand(0);
16261 if (A == RHS.getOperand(1)) {
16262 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16263 if (FusedOp != 0) {
16264 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16265 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16266 }
16267 }
16268 }
16269
16270 return SDValue();
16271}
16272
16273SDValue SITargetLowering::performFSubCombine(SDNode *N,
16274 DAGCombinerInfo &DCI) const {
16275 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16276 return SDValue();
16277
16278 SelectionDAG &DAG = DCI.DAG;
16279 SDLoc SL(N);
16280 EVT VT = N->getValueType(0);
16281 assert(!VT.isVector());
16282
16283 // Try to get the fneg to fold into the source modifier. This undoes generic
16284 // DAG combines and folds them into the mad.
16285 //
16286 // Only do this if we are not trying to support denormals. v_mad_f32 does
16287 // not support denormals ever.
16288 SDValue LHS = N->getOperand(0);
16289 SDValue RHS = N->getOperand(1);
16290 if (LHS.getOpcode() == ISD::FADD) {
16291 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16292 SDValue A = LHS.getOperand(0);
16293 if (A == LHS.getOperand(1)) {
16294 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16295 if (FusedOp != 0) {
16296 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16297 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16298
16299 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16300 }
16301 }
16302 }
16303
16304 if (RHS.getOpcode() == ISD::FADD) {
16305 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16306
16307 SDValue A = RHS.getOperand(0);
16308 if (A == RHS.getOperand(1)) {
16309 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16310 if (FusedOp != 0) {
16311 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16312 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16313 }
16314 }
16315 }
16316
16317 return SDValue();
16318}
16319
16320SDValue SITargetLowering::performFDivCombine(SDNode *N,
16321 DAGCombinerInfo &DCI) const {
16322 SelectionDAG &DAG = DCI.DAG;
16323 SDLoc SL(N);
16324 EVT VT = N->getValueType(0);
16325 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16326 return SDValue();
16327
16328 SDValue LHS = N->getOperand(0);
16329 SDValue RHS = N->getOperand(1);
16330
16331 SDNodeFlags Flags = N->getFlags();
16332 SDNodeFlags RHSFlags = RHS->getFlags();
16333 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16334 !RHS->hasOneUse())
16335 return SDValue();
16336
16337 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16338 bool IsNegative = false;
16339 if (CLHS->isExactlyValue(1.0) ||
16340 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16341 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16342 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16343 if (RHS.getOpcode() == ISD::FSQRT) {
16344 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16345 SDValue Rsq =
16346 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16347 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16348 }
16349 }
16350 }
16351
16352 return SDValue();
16353}
16354
16355SDValue SITargetLowering::performFMulCombine(SDNode *N,
16356 DAGCombinerInfo &DCI) const {
16357 SelectionDAG &DAG = DCI.DAG;
16358 EVT VT = N->getValueType(0);
16359 EVT ScalarVT = VT.getScalarType();
16360 EVT IntVT = VT.changeElementType(MVT::i32);
16361
16362 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16363 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16364 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16365 return SDValue();
16366 }
16367
16368 SDValue LHS = N->getOperand(0);
16369 SDValue RHS = N->getOperand(1);
16370
16371 // It is cheaper to materialize i32 inline constants than f16 or f64
16372 // (or even non-inline f32) values; this can be done via ldexp, as shown
16373 // below:
16374 //
16375 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16376 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16377 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
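 // For example, with A = 2.0 (2^1) and B = 0.5 (2^-1):
 // fmul x, (select y, 2.0, 0.5) -> ldexp(x, (select i32 y, 1, -1))
 // (for f32 this particular pair is skipped below, since both values are
 // inline constants).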
16378 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16379 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16380 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16381 if (!TrueNode)
16382 return SDValue();
16383 const ConstantFPSDNode *FalseNode =
16384 isConstOrConstSplatFP(RHS.getOperand(2));
16385 if (!FalseNode)
16386 return SDValue();
16387
16388 if (TrueNode->isNegative() != FalseNode->isNegative())
16389 return SDValue();
16390
16391 // For f32, only non-inline constants should be transformed.
16392 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16393 if (ScalarVT == MVT::f32 &&
16394 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16395 TII->isInlineConstant(FalseNode->getValueAPF()))
16396 return SDValue();
16397
16398 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16399 if (TrueNodeExpVal == INT_MIN)
16400 return SDValue();
16401 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16402 if (FalseNodeExpVal == INT_MIN)
16403 return SDValue();
16404
16405 SDLoc SL(N);
16406 SDValue SelectNode =
16407 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16408 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16409 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16410
16411 LHS = TrueNode->isNegative()
16412 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16413 : LHS;
16414
16415 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16416 }
16417
16418 return SDValue();
16419}
16420
16421SDValue SITargetLowering::performFMACombine(SDNode *N,
16422 DAGCombinerInfo &DCI) const {
16423 SelectionDAG &DAG = DCI.DAG;
16424 EVT VT = N->getValueType(0);
16425 SDLoc SL(N);
16426
16427 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16428 return SDValue();
16429
16430 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16431 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
16432 SDValue Op1 = N->getOperand(0);
16433 SDValue Op2 = N->getOperand(1);
16434 SDValue FMA = N->getOperand(2);
16435
16436 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16437 Op2.getOpcode() != ISD::FP_EXTEND)
16438 return SDValue();
16439
16440 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16441 // regardless of the denorm mode setting. Therefore,
16442 // fp-contract is sufficient to allow generating fdot2.
16443 const TargetOptions &Options = DAG.getTarget().Options;
16444 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16445 (N->getFlags().hasAllowContract() &&
16446 FMA->getFlags().hasAllowContract())) {
16447 Op1 = Op1.getOperand(0);
16448 Op2 = Op2.getOperand(0);
16449 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16450 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16451 return SDValue();
16452
16453 SDValue Vec1 = Op1.getOperand(0);
16454 SDValue Idx1 = Op1.getOperand(1);
16455 SDValue Vec2 = Op2.getOperand(0);
16456
16457 SDValue FMAOp1 = FMA.getOperand(0);
16458 SDValue FMAOp2 = FMA.getOperand(1);
16459 SDValue FMAAcc = FMA.getOperand(2);
16460
16461 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16462 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16463 return SDValue();
16464
16465 FMAOp1 = FMAOp1.getOperand(0);
16466 FMAOp2 = FMAOp2.getOperand(0);
16467 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16468 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16469 return SDValue();
16470
16471 SDValue Vec3 = FMAOp1.getOperand(0);
16472 SDValue Vec4 = FMAOp2.getOperand(0);
16473 SDValue Idx2 = FMAOp1.getOperand(1);
16474
16475 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16476 // Idx1 and Idx2 cannot be the same.
16477 Idx1 == Idx2)
16478 return SDValue();
16479
16480 if (Vec1 == Vec2 || Vec3 == Vec4)
16481 return SDValue();
16482
16483 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16484 return SDValue();
16485
16486 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16487 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16488 DAG.getTargetConstant(0, SL, MVT::i1));
16489 }
16490 }
16491 return SDValue();
16492}
16493
16494SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16495 DAGCombinerInfo &DCI) const {
16496 SelectionDAG &DAG = DCI.DAG;
16497 SDLoc SL(N);
16498
16499 SDValue LHS = N->getOperand(0);
16500 SDValue RHS = N->getOperand(1);
16501 EVT VT = LHS.getValueType();
16502 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16503
16504 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16505 if (!CRHS) {
16506 CRHS = dyn_cast<ConstantSDNode>(LHS);
16507 if (CRHS) {
16508 std::swap(LHS, RHS);
16509 CC = getSetCCSwappedOperands(CC);
16510 }
16511 }
16512
16513 if (CRHS) {
16514 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16515 isBoolSGPR(LHS.getOperand(0))) {
16516 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16517 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16518 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16519 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16520 if ((CRHS->isAllOnes() &&
16521 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16522 (CRHS->isZero() &&
16523 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16524 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16525 DAG.getAllOnesConstant(SL, MVT::i1));
16526 if ((CRHS->isAllOnes() &&
16527 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16528 (CRHS->isZero() &&
16529 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16530 return LHS.getOperand(0);
16531 }
16532
16533 const APInt &CRHSVal = CRHS->getAPIntValue();
16534 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16535 LHS.getOpcode() == ISD::SELECT &&
16536 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16537 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16538 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16539 isBoolSGPR(LHS.getOperand(0))) {
16540 // Given CT != FT:
16541 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16542 // setcc (select cc, CT, CF), CF, ne => cc
16543 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16544 // setcc (select cc, CT, CF), CT, eq => cc
16545 const APInt &CT = LHS.getConstantOperandAPInt(1);
16546 const APInt &CF = LHS.getConstantOperandAPInt(2);
16547
16548 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16549 (CT == CRHSVal && CC == ISD::SETNE))
16550 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16551 DAG.getAllOnesConstant(SL, MVT::i1));
16552 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16553 (CT == CRHSVal && CC == ISD::SETEQ))
16554 return LHS.getOperand(0);
16555 }
16556 }
16557
16558 if (VT != MVT::f32 && VT != MVT::f64 &&
16559 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16560 return SDValue();
16561
16562 // Match isinf/isfinite pattern
16563 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16564 // (fcmp one (fabs x), inf) -> (fp_class x,
16565 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16566 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16567 LHS.getOpcode() == ISD::FABS) {
16568 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16569 if (!CRHS)
16570 return SDValue();
16571
16572 const APFloat &APF = CRHS->getValueAPF();
16573 if (APF.isInfinity() && !APF.isNegative()) {
16574 const unsigned IsInfMask =
16575 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16576 const unsigned IsFiniteMask =
16577 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16578 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16579 SIInstrFlags::P_SUBNORMAL;
16580 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16581 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16582 DAG.getConstant(Mask, SL, MVT::i32));
16583 }
16584 }
16585
16586 return SDValue();
16587}
16588
16589SDValue
16590SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16591 DAGCombinerInfo &DCI) const {
16592 SelectionDAG &DAG = DCI.DAG;
16593 SDLoc SL(N);
16594 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16595
16596 SDValue Src = N->getOperand(0);
16597 SDValue Shift = N->getOperand(0);
16598
16599 // TODO: Extend type shouldn't matter (assuming legal types).
16600 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16601 Shift = Shift.getOperand(0);
16602
16603 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16604 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16605 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16606 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16607 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16608 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
16609 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16610 SDValue Shifted = DAG.getZExtOrTrunc(
16611 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16612
16613 unsigned ShiftOffset = 8 * Offset;
16614 if (Shift.getOpcode() == ISD::SHL)
16615 ShiftOffset -= C->getZExtValue();
16616 else
16617 ShiftOffset += C->getZExtValue();
16618
16619 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16620 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16621 MVT::f32, Shifted);
16622 }
16623 }
16624 }
16625
16626 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16627 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16628 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16629 // We simplified Src. If this node is not dead, visit it again so it is
16630 // folded properly.
16631 if (N->getOpcode() != ISD::DELETED_NODE)
16632 DCI.AddToWorklist(N);
16633 return SDValue(N, 0);
16634 }
16635
16636 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16637 if (SDValue DemandedSrc =
16638 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16639 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16640
16641 return SDValue();
16642}
16643
16644SDValue SITargetLowering::performClampCombine(SDNode *N,
16645 DAGCombinerInfo &DCI) const {
16646 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16647 if (!CSrc)
16648 return SDValue();
16649
16650 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16651 const APFloat &F = CSrc->getValueAPF();
16652 APFloat Zero = APFloat::getZero(F.getSemantics());
16653 if (F < Zero ||
16654 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16655 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16656 }
16657
16658 APFloat One(F.getSemantics(), "1.0");
16659 if (F > One)
16660 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16661
16662 return SDValue(CSrc, 0);
16663}
16664
16665SDValue SITargetLowering::performSelectCombine(SDNode *N,
16666 DAGCombinerInfo &DCI) const {
16667
16668 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16669 // integer).
16670 // Detect when CMP and SELECT use the same constant and fold them to avoid
16671 // loading the constant twice. Specifically handles patterns like:
16672 // %cmp = icmp eq i32 %val, 4242
16673 // %sel = select i1 %cmp, i32 4242, i32 %other
16674 // It can be optimized to reuse %val instead of 4242 in select.
16675 SDValue Cond = N->getOperand(0);
16676 SDValue TrueVal = N->getOperand(1);
16677 SDValue FalseVal = N->getOperand(2);
16678
16679 // Check if condition is a comparison.
16680 if (Cond.getOpcode() != ISD::SETCC)
16681 return SDValue();
16682
16683 SDValue LHS = Cond.getOperand(0);
16684 SDValue RHS = Cond.getOperand(1);
16685 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16686
16687 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16688 bool isInteger = LHS.getValueType().isInteger();
16689
16690 // Handle simple floating-point and integer types only.
16691 if (!isFloatingPoint && !isInteger)
16692 return SDValue();
16693
16694 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16695 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16696 if (!isEquality && !isNonEquality)
16697 return SDValue();
16698
16699 SDValue ArgVal, ConstVal;
16700 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16701 (isInteger && isa<ConstantSDNode>(RHS))) {
16702 ConstVal = RHS;
16703 ArgVal = LHS;
16704 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16705 (isInteger && isa<ConstantSDNode>(LHS))) {
16706 ConstVal = LHS;
16707 ArgVal = RHS;
16708 } else {
16709 return SDValue();
16710 }
16711
16712 // Skip optimization for inlinable immediates.
16713 if (isFloatingPoint) {
16714 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16715 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16716 return SDValue();
16717 } else {
16718 if (AMDGPU::isInlinableIntLiteral(
16719 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16720 return SDValue();
16721 }
16722
16723 // For equality and non-equality comparisons, patterns:
16724 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16725 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16726 if (!(isEquality && TrueVal == ConstVal) &&
16727 !(isNonEquality && FalseVal == ConstVal))
16728 return SDValue();
16729
16730 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16731 SDValue SelectRHS =
16732 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16733 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16734 SelectLHS, SelectRHS);
16735}
16736
16737SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16738 DAGCombinerInfo &DCI) const {
16739 switch (N->getOpcode()) {
16740 case ISD::ADD:
16741 case ISD::SUB:
16742 case ISD::SHL:
16743 case ISD::SRL:
16744 case ISD::SRA:
16745 case ISD::AND:
16746 case ISD::OR:
16747 case ISD::XOR:
16748 case ISD::MUL:
16749 case ISD::SETCC:
16750 case ISD::SELECT:
16751 case ISD::SMIN:
16752 case ISD::SMAX:
16753 case ISD::UMIN:
16754 case ISD::UMAX:
16755 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16756 return Res;
16757 break;
16758 default:
16759 break;
16760 }
16761
16762 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16763 return SDValue();
16764
16765 switch (N->getOpcode()) {
16766 case ISD::ADD:
16767 return performAddCombine(N, DCI);
16768 case ISD::PTRADD:
16769 return performPtrAddCombine(N, DCI);
16770 case ISD::SUB:
16771 return performSubCombine(N, DCI);
16772 case ISD::UADDO_CARRY:
16773 case ISD::USUBO_CARRY:
16774 return performAddCarrySubCarryCombine(N, DCI);
16775 case ISD::FADD:
16776 return performFAddCombine(N, DCI);
16777 case ISD::FSUB:
16778 return performFSubCombine(N, DCI);
16779 case ISD::FDIV:
16780 return performFDivCombine(N, DCI);
16781 case ISD::FMUL:
16782 return performFMulCombine(N, DCI);
16783 case ISD::SETCC:
16784 return performSetCCCombine(N, DCI);
16785 case ISD::SELECT:
16786 if (auto Res = performSelectCombine(N, DCI))
16787 return Res;
16788 break;
16789 case ISD::FMAXNUM:
16790 case ISD::FMINNUM:
16791 case ISD::FMAXNUM_IEEE:
16792 case ISD::FMINNUM_IEEE:
16793 case ISD::FMAXIMUM:
16794 case ISD::FMINIMUM:
16795 case ISD::FMAXIMUMNUM:
16796 case ISD::FMINIMUMNUM:
16797 case ISD::SMAX:
16798 case ISD::SMIN:
16799 case ISD::UMAX:
16800 case ISD::UMIN:
16801 case AMDGPUISD::FMIN_LEGACY:
16802 case AMDGPUISD::FMAX_LEGACY:
16803 return performMinMaxCombine(N, DCI);
16804 case ISD::FMA:
16805 return performFMACombine(N, DCI);
16806 case ISD::AND:
16807 return performAndCombine(N, DCI);
16808 case ISD::OR:
16809 return performOrCombine(N, DCI);
16810 case ISD::FSHR: {
16811 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16812 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16813 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16814 return matchPERM(N, DCI);
16815 }
16816 break;
16817 }
16818 case ISD::XOR:
16819 return performXorCombine(N, DCI);
16820 case ISD::ZERO_EXTEND:
16821 return performZeroExtendCombine(N, DCI);
16822 case ISD::SIGN_EXTEND_INREG:
16823 return performSignExtendInRegCombine(N, DCI);
16824 case AMDGPUISD::FP_CLASS:
16825 return performClassCombine(N, DCI);
16826 case ISD::FCANONICALIZE:
16827 return performFCanonicalizeCombine(N, DCI);
16828 case AMDGPUISD::RCP:
16829 return performRcpCombine(N, DCI);
16830 case ISD::FLDEXP:
16831 case AMDGPUISD::FRACT:
16832 case AMDGPUISD::RSQ:
16833 case AMDGPUISD::RCP_LEGACY:
16834 case AMDGPUISD::RCP_IFLAG:
16835 case AMDGPUISD::RSQ_CLAMP: {
16836 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16837 SDValue Src = N->getOperand(0);
16838 if (Src.isUndef())
16839 return Src;
16840 break;
16841 }
16842 case ISD::SINT_TO_FP:
16843 case ISD::UINT_TO_FP:
16844 return performUCharToFloatCombine(N, DCI);
16845 case ISD::FCOPYSIGN:
16846 return performFCopySignCombine(N, DCI);
16847 case AMDGPUISD::CVT_F32_UBYTE0:
16848 case AMDGPUISD::CVT_F32_UBYTE1:
16849 case AMDGPUISD::CVT_F32_UBYTE2:
16850 case AMDGPUISD::CVT_F32_UBYTE3:
16851 return performCvtF32UByteNCombine(N, DCI);
16852 case AMDGPUISD::FMED3:
16853 return performFMed3Combine(N, DCI);
16854 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16855 return performCvtPkRTZCombine(N, DCI);
16856 case AMDGPUISD::CLAMP:
16857 return performClampCombine(N, DCI);
16858 case ISD::SCALAR_TO_VECTOR: {
16859 SelectionDAG &DAG = DCI.DAG;
16860 EVT VT = N->getValueType(0);
16861
16862 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16863 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16864 SDLoc SL(N);
16865 SDValue Src = N->getOperand(0);
16866 EVT EltVT = Src.getValueType();
16867 if (EltVT != MVT::i16)
16868 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16869
16870 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16871 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16872 }
16873
16874 break;
16875 }
16876 case ISD::EXTRACT_VECTOR_ELT:
16877 return performExtractVectorEltCombine(N, DCI);
16878 case ISD::INSERT_VECTOR_ELT:
16879 return performInsertVectorEltCombine(N, DCI);
16880 case ISD::FP_ROUND:
16881 return performFPRoundCombine(N, DCI);
16882 case ISD::LOAD: {
16883 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16884 return Widened;
16885 [[fallthrough]];
16886 }
16887 default: {
16888 if (!DCI.isBeforeLegalize()) {
16889 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16890 return performMemSDNodeCombine(MemNode, DCI);
16891 }
16892
16893 break;
16894 }
16895 }
16896
16897 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16898}
16899
16900/// Helper function for adjustWritemask
16901static unsigned SubIdx2Lane(unsigned Idx) {
16902 switch (Idx) {
16903 default:
16904 return ~0u;
16905 case AMDGPU::sub0:
16906 return 0;
16907 case AMDGPU::sub1:
16908 return 1;
16909 case AMDGPU::sub2:
16910 return 2;
16911 case AMDGPU::sub3:
16912 return 3;
16913 case AMDGPU::sub4:
16914 return 4; // Possible with TFE/LWE
16915 }
16916}
16917
16918/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16919SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16920 SelectionDAG &DAG) const {
16921 unsigned Opcode = Node->getMachineOpcode();
16922
16923 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16924 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16925 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16926 return Node; // not implemented for D16
16927
16928 SDNode *Users[5] = {nullptr};
16929 unsigned Lane = 0;
16930 unsigned DmaskIdx =
16931 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16932 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16933 unsigned NewDmask = 0;
16934 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16935 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16936 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16937 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16938 unsigned TFCLane = 0;
16939 bool HasChain = Node->getNumValues() > 1;
16940
16941 if (OldDmask == 0) {
16942 // These are folded out, but on the chance it happens don't assert.
16943 return Node;
16944 }
16945
16946 unsigned OldBitsSet = llvm::popcount(OldDmask);
16947 // Work out which is the TFE/LWE lane if that is enabled.
16948 if (UsesTFC) {
16949 TFCLane = OldBitsSet;
16950 }
16951
16952 // Try to figure out the used register components
16953 for (SDUse &Use : Node->uses()) {
16954
16955 // Don't look at users of the chain.
16956 if (Use.getResNo() != 0)
16957 continue;
16958
16959 SDNode *User = Use.getUser();
16960
16961 // Abort if we can't understand the usage
16962 if (!User->isMachineOpcode() ||
16963 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16964 return Node;
16965
16966 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
16967 // Note that subregs are packed, i.e. Lane==0 is the first bit set
16968 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
16969 // set, etc.
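 // For example, with OldDmask = 0b1010, Lane 0 (sub0) corresponds to
 // component 1 (Y) and Lane 1 (sub1) to component 3 (W).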
16970 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
16971 if (Lane == ~0u)
16972 return Node;
16973
16974 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
16975 if (UsesTFC && Lane == TFCLane) {
16976 Users[Lane] = User;
16977 } else {
16978 // Set which texture component corresponds to the lane.
16979 unsigned Comp;
16980 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16981 Comp = llvm::countr_zero(Dmask);
16982 Dmask &= ~(1 << Comp);
16983 }
16984
16985 // Abort if we have more than one user per component.
16986 if (Users[Lane])
16987 return Node;
16988
16989 Users[Lane] = User;
16990 NewDmask |= 1 << Comp;
16991 }
16992 }
16993
16994 // Don't allow 0 dmask, as hardware assumes one channel enabled.
16995 bool NoChannels = !NewDmask;
16996 if (NoChannels) {
16997 if (!UsesTFC) {
16998 // No uses of the result and not using TFC. Then do nothing.
16999 return Node;
17000 }
17001 // If the original dmask has one channel - then nothing to do
17002 if (OldBitsSet == 1)
17003 return Node;
17004 // Use an arbitrary dmask - required for the instruction to work
17005 NewDmask = 1;
17006 }
17007 // Abort if there's no change
17008 if (NewDmask == OldDmask)
17009 return Node;
17010
17011 unsigned BitsSet = llvm::popcount(NewDmask);
17012
17013 // Check for TFE or LWE - increase the number of channels by one to account
17014 // for the extra return value
17015 // This will need adjustment for D16 if this is also included in
17016 // adjustWriteMask (this function), but at present D16 is excluded.
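 // E.g. a dmask with two bits set plus TFE requires three result registers:
 // two data dwords plus one dword for the TFE/LWE status write.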
17017 unsigned NewChannels = BitsSet + UsesTFC;
17018
17019 int NewOpcode =
17020 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17021 assert(NewOpcode != -1 &&
17022 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17023 "failed to find equivalent MIMG op");
17024
17025 // Adjust the writemask in the node
17026 SmallVector<SDValue, 12> Ops;
17027 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17028 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17029 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17030
17031 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17032
17033 MVT ResultVT = NewChannels == 1
17034 ? SVT
17035 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17036 : NewChannels == 5 ? 8
17037 : NewChannels);
17038 SDVTList NewVTList =
17039 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17040
17041 MachineSDNode *NewNode =
17042 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17043
17044 if (HasChain) {
17045 // Update chain.
17046 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17047 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17048 }
17049
17050 if (NewChannels == 1) {
17051 assert(Node->hasNUsesOfValue(1, 0));
17052 SDNode *Copy =
17053 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17054 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17055 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17056 return nullptr;
17057 }
17058
17059 // Update the users of the node with the new indices
17060 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17061 SDNode *User = Users[i];
17062 if (!User) {
17063 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17064 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17065 if (i || !NoChannels)
17066 continue;
17067 } else {
17068 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17069 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17070 if (NewUser != User) {
17071 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17072 DAG.RemoveDeadNode(User);
17073 }
17074 }
17075
17076 switch (Idx) {
17077 default:
17078 break;
17079 case AMDGPU::sub0:
17080 Idx = AMDGPU::sub1;
17081 break;
17082 case AMDGPU::sub1:
17083 Idx = AMDGPU::sub2;
17084 break;
17085 case AMDGPU::sub2:
17086 Idx = AMDGPU::sub3;
17087 break;
17088 case AMDGPU::sub3:
17089 Idx = AMDGPU::sub4;
17090 break;
17091 }
17092 }
17093
17094 DAG.RemoveDeadNode(Node);
17095 return nullptr;
17096}
17097
17098static bool isFrameIndexOp(SDValue Op) {
17099 if (Op.getOpcode() == ISD::AssertZext)
17100 Op = Op.getOperand(0);
17101
17102 return isa<FrameIndexSDNode>(Op);
17103}
17104
17105/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17106/// with frame index operands.
17107/// LLVM assumes that inputs to these instructions are registers.
17108SDNode *
17109SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17110 SelectionDAG &DAG) const {
17111 if (Node->getOpcode() == ISD::CopyToReg) {
17112 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17113 SDValue SrcVal = Node->getOperand(2);
17114
17115 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17116 // to try understanding copies to physical registers.
17117 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17118 SDLoc SL(Node);
17119 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17120 SDValue VReg = DAG.getRegister(
17121 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17122
17123 SDNode *Glued = Node->getGluedNode();
17124 SDValue ToVReg = DAG.getCopyToReg(
17125 Node->getOperand(0), SL, VReg, SrcVal,
17126 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17127 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17128 VReg, ToVReg.getValue(1));
17129 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17130 DAG.RemoveDeadNode(Node);
17131 return ToResultReg.getNode();
17132 }
17133 }
17134
17135 SmallVector<SDValue, 8> Ops;
17136 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17137 if (!isFrameIndexOp(Node->getOperand(i))) {
17138 Ops.push_back(Node->getOperand(i));
17139 continue;
17140 }
17141
17142 SDLoc DL(Node);
17143 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17144 Node->getOperand(i).getValueType(),
17145 Node->getOperand(i)),
17146 0));
17147 }
17148
17149 return DAG.UpdateNodeOperands(Node, Ops);
17150}
17151
17152/// Fold the instructions after selecting them.
17153/// Returns null if users were already updated.
17154SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17155 SelectionDAG &DAG) const {
17156 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17157 unsigned Opcode = Node->getMachineOpcode();
17158
17159 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17160 !TII->isGather4(Opcode) &&
17161 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17162 return adjustWritemask(Node, DAG);
17163 }
17164
17165 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17166 legalizeTargetIndependentNode(Node, DAG);
17167 return Node;
17168 }
17169
17170 switch (Opcode) {
17171 case AMDGPU::V_DIV_SCALE_F32_e64:
17172 case AMDGPU::V_DIV_SCALE_F64_e64: {
17173 // Satisfy the operand register constraint when one of the inputs is
17174 // undefined. Ordinarily each undef value will have its own implicit_def of
17175 // a vreg, so force these to use a single register.
17176 SDValue Src0 = Node->getOperand(1);
17177 SDValue Src1 = Node->getOperand(3);
17178 SDValue Src2 = Node->getOperand(5);
17179
17180 if ((Src0.isMachineOpcode() &&
17181 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17182 (Src0 == Src1 || Src0 == Src2))
17183 break;
17184
17185 MVT VT = Src0.getValueType().getSimpleVT();
17186 const TargetRegisterClass *RC =
17187 getRegClassFor(VT, Src0.getNode()->isDivergent());
17188
17189 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17190 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17191
17192 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17193 Src0, SDValue());
17194
17195 // src0 must be the same register as src1 or src2, even if the value is
17196 // undefined, so make sure we don't violate this constraint.
17197 if (Src0.isMachineOpcode() &&
17198 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17199 if (Src1.isMachineOpcode() &&
17200 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17201 Src0 = Src1;
17202 else if (Src2.isMachineOpcode() &&
17203 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17204 Src0 = Src2;
17205 else {
17206 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17207 Src0 = UndefReg;
17208 Src1 = UndefReg;
17209 }
17210 } else
17211 break;
17212
17213 SmallVector<SDValue, 9> Ops(Node->ops());
17214 Ops[1] = Src0;
17215 Ops[3] = Src1;
17216 Ops[5] = Src2;
17217 Ops.push_back(ImpDef.getValue(1));
17218 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17219 }
17220 default:
17221 break;
17222 }
17223
17224 return Node;
17225}
17226
17227// Any MIMG instructions that use tfe or lwe require an initialization of the
17228// result register that will be written in the case of a memory access failure.
17229// The required code is also added to tie this init code to the result of the
17230// img instruction.
17231void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17232 const SIInstrInfo *TII = Subtarget->getInstrInfo();
17233 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17234 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17235 MachineBasicBlock &MBB = *MI.getParent();
17236
17237 int DstIdx =
17238 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17239 unsigned InitIdx = 0;
17240
17241 if (TII->isImage(MI)) {
17242 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17243 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17244 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17245
17246 if (!TFE && !LWE) // intersect_ray
17247 return;
17248
17249 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17250 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17251 unsigned D16Val = D16 ? D16->getImm() : 0;
17252
17253 if (!TFEVal && !LWEVal)
17254 return;
17255
17256 // At least one of TFE or LWE are non-zero
17257 // We have to insert a suitable initialization of the result value and
17258 // tie this to the dest of the image instruction.
17259
17260 // Calculate which dword we have to initialize to 0.
17261 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17262
17263 // check that dmask operand is found.
17264 assert(MO_Dmask && "Expected dmask operand in instruction");
17265
17266 unsigned dmask = MO_Dmask->getImm();
17267 // Determine the number of active lanes taking into account the
17268 // Gather4 special case
17269 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17270
17271 bool Packed = !Subtarget->hasUnpackedD16VMem();
17272
17273 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17274
17275 // Abandon attempt if the dst size isn't large enough
17276 // - this is in fact an error but this is picked up elsewhere and
17277 // reported correctly.
17278 uint32_t DstSize =
17279 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17280 if (DstSize < InitIdx)
17281 return;
17282 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17283 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17284 } else {
17285 return;
17286 }
17287
17288 const DebugLoc &DL = MI.getDebugLoc();
17289
17290 // Create a register for the initialization value.
17291 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17292 unsigned NewDst = 0; // Final initialized value will be in here
17293
17294 // If PRTStrictNull feature is enabled (the default) then initialize
17295 // all the result registers to 0, otherwise just the error indication
17296 // register (VGPRn+1)
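 // E.g. for a 4-dword result, PRTStrictNull zero-initializes all four dwords;
 // otherwise only that final error-indication dword is initialized.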
17297 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17298 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17299
17300 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17301 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17302 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17303 // Initialize dword
17304 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17305 // clang-format off
17306 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17307 .addImm(0);
17308 // clang-format on
17309 // Insert into the super-reg
17310 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17311 .addReg(PrevDst)
17312 .addReg(SubReg)
17313 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17314
17315 PrevDst = NewDst;
17316 }
17317
17318 // Add as an implicit operand
17319 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17320
17321 // Tie the just added implicit operand to the dst
17322 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17323}
17324
17325/// Assign the register class depending on the number of
17326/// bits set in the writemask
17327void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17328 SDNode *Node) const {
17329 const SIInstrInfo *TII = Subtarget->getInstrInfo();
17330
17331 MachineFunction *MF = MI.getParent()->getParent();
17332 MachineRegisterInfo &MRI = MF->getRegInfo();
17333 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
17334
17335 if (TII->isVOP3(MI.getOpcode())) {
17336 // Make sure constant bus requirements are respected.
17337 TII->legalizeOperandsVOP3(MRI, MI);
17338
17339 // Prefer VGPRs over AGPRs in mAI instructions where possible.
17340 // This saves a chain-copy of registers and better balances register
17341 // use between vgpr and agpr, as agpr tuples tend to be big.
17342 if (!MI.getDesc().operands().empty()) {
17343 unsigned Opc = MI.getOpcode();
17344 bool HasAGPRs = Info->mayNeedAGPRs();
17345 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17346 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17347 for (auto I :
17348 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17349 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17350 if (I == -1)
17351 break;
17352 if ((I == Src2Idx) && (HasAGPRs))
17353 break;
17354 MachineOperand &Op = MI.getOperand(I);
17355 if (!Op.isReg() || !Op.getReg().isVirtual())
17356 continue;
17357 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17358 if (!TRI->hasAGPRs(RC))
17359 continue;
17360 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17361 if (!Src || !Src->isCopy() ||
17362 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17363 continue;
17364 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17365 // All uses of agpr64 and agpr32 can also accept vgpr except for
17366 // v_accvgpr_read, but we do not produce agpr reads during selection,
17367 // so no use checks are needed.
17368 MRI.setRegClass(Op.getReg(), NewRC);
17369 }
17370
17371 if (TII->isMAI(MI)) {
17372 // The ordinary src0, src1, src2 were legalized above.
17373 //
17374 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17375 // as a separate instruction.
17376 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17377 AMDGPU::OpName::scale_src0);
17378 if (Src0Idx != -1) {
17379 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17380 AMDGPU::OpName::scale_src1);
17381 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17382 TII->usesConstantBus(MRI, MI, Src1Idx))
17383 TII->legalizeOpWithMove(MI, Src1Idx);
17384 }
17385 }
17386
17387 if (!HasAGPRs)
17388 return;
17389
17390 // Resolve the rest of AV operands to AGPRs.
17391 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17392 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17393 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17394 if (TRI->isVectorSuperClass(RC)) {
17395 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17396 MRI.setRegClass(Src2->getReg(), NewRC);
17397 if (Src2->isTied())
17398 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17399 }
17400 }
17401 }
17402 }
17403
17404 return;
17405 }
17406
17407 if (TII->isImage(MI))
17408 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17409}
17410
17411static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17412 uint64_t Val) {
17413 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17414 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17415}
17416
17417MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17418 const SDLoc &DL,
17419 SDValue Ptr) const {
17420 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17421
17422 // Build the half of the subregister with the constants before building the
17423 // full 128-bit register. If we are building multiple resource descriptors,
17424 // this will allow CSEing of the 2-component register.
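 // SubRegHi holds dwords 2-3 of the descriptor, i.e. {0,
 // getDefaultRsrcDataFormat() >> 32}; it is placed into sub2_sub3 of the
 // final 128-bit REG_SEQUENCE below.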
17425 const SDValue Ops0[] = {
17426 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17427 buildSMovImm32(DAG, DL, 0),
17428 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17429 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17430 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17431
17432 SDValue SubRegHi = SDValue(
17433 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17434
17435 // Combine the constants and the pointer.
17436 const SDValue Ops1[] = {
17437 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17438 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17439 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17440
17441 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17442}
17443
17444/// Return a resource descriptor with the 'Add TID' bit enabled
17445/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17446/// of the resource descriptor) to create an offset, which is added to
17447/// the resource pointer.
17448MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17449 SDValue Ptr, uint32_t RsrcDword1,
17450 uint64_t RsrcDword2And3) const {
17451 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17452 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17453 if (RsrcDword1) {
17454 PtrHi =
17455 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17456 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17457 0);
17458 }
17459
17460 SDValue DataLo =
17461 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17462 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17463
17464 const SDValue Ops[] = {
17465 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17466 PtrLo,
17467 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17468 PtrHi,
17469 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17470 DataLo,
17471 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17472 DataHi,
17473 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17474
17475 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17476}
17477
17478//===----------------------------------------------------------------------===//
17479// SI Inline Assembly Support
17480//===----------------------------------------------------------------------===//
17481
17482std::pair<unsigned, const TargetRegisterClass *>
17483SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17484 StringRef Constraint,
17485 MVT VT) const {
17486 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17487
17488 const TargetRegisterClass *RC = nullptr;
17489 if (Constraint.size() == 1) {
17490 // Check if we cannot determine the bit size of the given value type. This
17491 // can happen, for example, in this situation where we have an empty struct
17492 // (size 0): `call void asm "", "v"({} poison)`-
17493 if (VT == MVT::Other)
17494 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17495 const unsigned BitWidth = VT.getSizeInBits();
17496 switch (Constraint[0]) {
17497 default:
17498 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17499 case 's':
17500 case 'r':
17501 switch (BitWidth) {
17502 case 16:
17503 RC = &AMDGPU::SReg_32RegClass;
17504 break;
17505 case 64:
17506 RC = &AMDGPU::SGPR_64RegClass;
17507 break;
17508 default:
17509 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17510 if (!RC)
17511 return std::pair(0U, nullptr);
17512 break;
17513 }
17514 break;
17515 case 'v':
17516 switch (BitWidth) {
17517 case 16:
17518 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17519 : &AMDGPU::VGPR_32_Lo256RegClass;
17520 break;
17521 default:
17522 RC = Subtarget->has1024AddressableVGPRs()
17523 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17524 : TRI->getVGPRClassForBitWidth(BitWidth);
17525 if (!RC)
17526 return std::pair(0U, nullptr);
17527 break;
17528 }
17529 break;
17530 case 'a':
17531 if (!Subtarget->hasMAIInsts())
17532 break;
17533 switch (BitWidth) {
17534 case 16:
17535 RC = &AMDGPU::AGPR_32RegClass;
17536 break;
17537 default:
17538 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17539 if (!RC)
17540 return std::pair(0U, nullptr);
17541 break;
17542 }
17543 break;
17544 }
17545 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17546 const unsigned BitWidth = VT.getSizeInBits();
17547 switch (BitWidth) {
17548 case 16:
17549 RC = &AMDGPU::AV_32RegClass;
17550 break;
17551 default:
17552 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17553 if (!RC)
17554 return std::pair(0U, nullptr);
17555 break;
17556 }
17557 }
17558
17559 // We actually support i128, i16 and f16 as inline parameters
17560 // even if they are not reported as legal
17561 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17562 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17563 return std::pair(0U, RC);
17564
17565 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17566 if (Kind != '\0') {
17567 if (Kind == 'v') {
17568 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17569 } else if (Kind == 's') {
17570 RC = &AMDGPU::SGPR_32RegClass;
17571 } else if (Kind == 'a') {
17572 RC = &AMDGPU::AGPR_32RegClass;
17573 }
17574
17575 if (RC) {
17576 if (NumRegs > 1) {
17577 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17578 return std::pair(0U, nullptr);
17579
17580 uint32_t Width = NumRegs * 32;
17581 // Prohibit constraints for register ranges with a width that does not
17582 // match the required type.
17583 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17584 return std::pair(0U, nullptr);
17585
17586 MCRegister Reg = RC->getRegister(Idx);
17587 if (SIRegisterInfo::isVGPRClass(RC))
17588 RC = TRI->getVGPRClassForBitWidth(Width);
17589 else if (SIRegisterInfo::isSGPRClass(RC))
17590 RC = TRI->getSGPRClassForBitWidth(Width);
17591 else if (SIRegisterInfo::isAGPRClass(RC))
17592 RC = TRI->getAGPRClassForBitWidth(Width);
17593 if (RC) {
17594 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17595 if (!Reg) {
17596 // The register class does not contain the requested register,
17597 // e.g., because it is an SGPR pair that would violate alignment
17598 // requirements.
17599 return std::pair(0U, nullptr);
17600 }
17601 return std::pair(Reg, RC);
17602 }
17603 }
17604
17605 // Check for lossy scalar/vector conversions.
17606 if (VT.isVector() && VT.getSizeInBits() != 32)
17607 return std::pair(0U, nullptr);
17608 if (Idx < RC->getNumRegs())
17609 return std::pair(RC->getRegister(Idx), RC);
17610 return std::pair(0U, nullptr);
17611 }
17612 }
17613
17614 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17615 if (Ret.first)
17616 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17617
17618 return Ret;
17619}
17620
17621static bool isImmConstraint(StringRef Constraint) {
17622 if (Constraint.size() == 1) {
17623 switch (Constraint[0]) {
17624 default:
17625 break;
17626 case 'I':
17627 case 'J':
17628 case 'A':
17629 case 'B':
17630 case 'C':
17631 return true;
17632 }
17633 } else if (Constraint == "DA" || Constraint == "DB") {
17634 return true;
17635 }
17636 return false;
17637}
17638
17639SITargetLowering::ConstraintType
17640SITargetLowering::getConstraintType(StringRef Constraint) const {
17641 if (Constraint.size() == 1) {
17642 switch (Constraint[0]) {
17643 default:
17644 break;
17645 case 's':
17646 case 'v':
17647 case 'a':
17648 return C_RegisterClass;
17649 }
17650 } else if (Constraint.size() == 2) {
17651 if (Constraint == "VA")
17652 return C_RegisterClass;
17653 }
17654 if (isImmConstraint(Constraint)) {
17655 return C_Other;
17656 }
17657 return TargetLowering::getConstraintType(Constraint);
17658}
17659
17660static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17662 Val = Val & maskTrailingOnes<uint64_t>(Size);
17663 }
17664 return Val;
17665}
17666
17667void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17668 StringRef Constraint,
17669 std::vector<SDValue> &Ops,
17670 SelectionDAG &DAG) const {
17671 if (isImmConstraint(Constraint)) {
17672 uint64_t Val;
17673 if (getAsmOperandConstVal(Op, Val) &&
17674 checkAsmConstraintVal(Op, Constraint, Val)) {
17675 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17676 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17677 }
17678 } else {
17679 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17680 }
17681}
17682
17683bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17684 unsigned Size = Op.getScalarValueSizeInBits();
17685 if (Size > 64)
17686 return false;
17687
17688 if (Size == 16 && !Subtarget->has16BitInsts())
17689 return false;
17690
17691 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17692 Val = C->getSExtValue();
17693 return true;
17694 }
17695 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17696 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17697 return true;
17698 }
17699 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17700 if (Size != 16 || Op.getNumOperands() != 2)
17701 return false;
17702 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17703 return false;
17704 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17705 Val = C->getSExtValue();
17706 return true;
17707 }
17708 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17709 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17710 return true;
17711 }
17712 }
17713
17714 return false;
17715}
17716
17717bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17718 uint64_t Val) const {
17719 if (Constraint.size() == 1) {
17720 switch (Constraint[0]) {
17721 case 'I':
17722 return AMDGPU::isInlinableIntLiteral(Val);
17723 case 'J':
17724 return isInt<16>(Val);
17725 case 'A':
17726 return checkAsmConstraintValA(Op, Val);
17727 case 'B':
17728 return isInt<32>(Val);
17729 case 'C':
17730 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17731 AMDGPU::isInlinableIntLiteral(Val);
17732 default:
17733 break;
17734 }
17735 } else if (Constraint.size() == 2) {
17736 if (Constraint == "DA") {
17737 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17738 int64_t LoBits = static_cast<int32_t>(Val);
17739 return checkAsmConstraintValA(Op, HiBits, 32) &&
17740 checkAsmConstraintValA(Op, LoBits, 32);
17741 }
17742 if (Constraint == "DB") {
17743 return true;
17744 }
17745 }
17746 llvm_unreachable("Invalid asm constraint");
17747}
17748
17749bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17750 unsigned MaxSize) const {
17751 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17752 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17753 if (Size == 16) {
17754 MVT VT = Op.getSimpleValueType();
17755 switch (VT.SimpleTy) {
17756 default:
17757 return false;
17758 case MVT::i16:
17759 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17760 case MVT::f16:
17761 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17762 case MVT::bf16:
17763 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17764 case MVT::v2i16:
17765 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17766 case MVT::v2f16:
17767 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17768 case MVT::v2bf16:
17769 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17770 }
17771 }
17772 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17773 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17774 return true;
17775 return false;
17776}
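// Illustrative note (not from the LLVM sources): as a concrete example of the
// 16-bit cases above, an f16 operand with the bit pattern 0x3C00 (1.0) is an
// inline constant on subtargets with 16-bit instructions, whereas an arbitrary
// pattern such as 0x1234 is not inlinable and is rejected here.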
17777
17778static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17779 switch (UnalignedClassID) {
17780 case AMDGPU::VReg_64RegClassID:
17781 return AMDGPU::VReg_64_Align2RegClassID;
17782 case AMDGPU::VReg_96RegClassID:
17783 return AMDGPU::VReg_96_Align2RegClassID;
17784 case AMDGPU::VReg_128RegClassID:
17785 return AMDGPU::VReg_128_Align2RegClassID;
17786 case AMDGPU::VReg_160RegClassID:
17787 return AMDGPU::VReg_160_Align2RegClassID;
17788 case AMDGPU::VReg_192RegClassID:
17789 return AMDGPU::VReg_192_Align2RegClassID;
17790 case AMDGPU::VReg_224RegClassID:
17791 return AMDGPU::VReg_224_Align2RegClassID;
17792 case AMDGPU::VReg_256RegClassID:
17793 return AMDGPU::VReg_256_Align2RegClassID;
17794 case AMDGPU::VReg_288RegClassID:
17795 return AMDGPU::VReg_288_Align2RegClassID;
17796 case AMDGPU::VReg_320RegClassID:
17797 return AMDGPU::VReg_320_Align2RegClassID;
17798 case AMDGPU::VReg_352RegClassID:
17799 return AMDGPU::VReg_352_Align2RegClassID;
17800 case AMDGPU::VReg_384RegClassID:
17801 return AMDGPU::VReg_384_Align2RegClassID;
17802 case AMDGPU::VReg_512RegClassID:
17803 return AMDGPU::VReg_512_Align2RegClassID;
17804 case AMDGPU::VReg_1024RegClassID:
17805 return AMDGPU::VReg_1024_Align2RegClassID;
17806 case AMDGPU::AReg_64RegClassID:
17807 return AMDGPU::AReg_64_Align2RegClassID;
17808 case AMDGPU::AReg_96RegClassID:
17809 return AMDGPU::AReg_96_Align2RegClassID;
17810 case AMDGPU::AReg_128RegClassID:
17811 return AMDGPU::AReg_128_Align2RegClassID;
17812 case AMDGPU::AReg_160RegClassID:
17813 return AMDGPU::AReg_160_Align2RegClassID;
17814 case AMDGPU::AReg_192RegClassID:
17815 return AMDGPU::AReg_192_Align2RegClassID;
17816 case AMDGPU::AReg_256RegClassID:
17817 return AMDGPU::AReg_256_Align2RegClassID;
17818 case AMDGPU::AReg_512RegClassID:
17819 return AMDGPU::AReg_512_Align2RegClassID;
17820 case AMDGPU::AReg_1024RegClassID:
17821 return AMDGPU::AReg_1024_Align2RegClassID;
17822 default:
17823 return -1;
17824 }
17825}
17826
17827// Figure out which registers should be reserved for stack access. Only after
17828// the function is legalized do we know all of the non-spill stack objects or if
17829// calls are present.
17833 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17834 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17835 const SIInstrInfo *TII = ST.getInstrInfo();
17836
17837 if (Info->isEntryFunction()) {
17838 // Callable functions have fixed registers used for stack access.
17839 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
17840 }
17841
17842 // TODO: Move this logic to getReservedRegs()
17843 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17844 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17845 Register SReg = ST.isWave32()
17846 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17847 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17848 &AMDGPU::SGPR_64RegClass);
17849 Info->setSGPRForEXECCopy(SReg);
17850
17851 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17852 Info->getStackPtrOffsetReg()));
17853 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17854 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17855
17856 // We need to worry about replacing the default register with itself in case
17857 // of MIR testcases missing the MFI.
17858 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17859 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17860
17861 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17862 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17863
17864 Info->limitOccupancy(MF);
17865
17866 if (ST.isWave32() && !MF.empty()) {
17867 for (auto &MBB : MF) {
17868 for (auto &MI : MBB) {
17869 TII->fixImplicitOperands(MI);
17870 }
17871 }
17872 }
17873
17874 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17875 // classes if required. Ideally the register class constraints would differ
17876 // per-subtarget, but there's no easy way to achieve that right now. This is
17877 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17878 // from using them as the register class for legal types.
17879 if (ST.needsAlignedVGPRs()) {
17880 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17881 const Register Reg = Register::index2VirtReg(I);
17882 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17883 if (!RC)
17884 continue;
17885 int NewClassID = getAlignedAGPRClassID(RC->getID());
17886 if (NewClassID != -1)
17887 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17888 }
17889 }
17890
17892}
17893
17895 KnownBits &Known,
17896 const APInt &DemandedElts,
17897 const SelectionDAG &DAG,
17898 unsigned Depth) const {
17899 Known.resetAll();
17900 unsigned Opc = Op.getOpcode();
17901 switch (Opc) {
17903 unsigned IID = Op.getConstantOperandVal(0);
17904 switch (IID) {
17905 case Intrinsic::amdgcn_mbcnt_lo:
17906 case Intrinsic::amdgcn_mbcnt_hi: {
17907 const GCNSubtarget &ST =
17909 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17910 // most 31 + src1.
17911 Known.Zero.setBitsFrom(
17912 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17913 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17914 Known = KnownBits::add(Known, Known2);
17915 return;
17916 }
17917 }
17918 break;
17919 }
17920 }
17922 Op, Known, DemandedElts, DAG, Depth);
17923}
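// Illustrative note (not from the LLVM sources): setBitsFrom(N) marks every
// bit at position >= N as known zero, so for amdgcn_mbcnt_hi the partial
// result is known to be at most 31 before the known bits of the src1 addend
// are folded in with KnownBits::add.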
17924
17926 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17928
17929 // Set the high bits to zero based on the maximum allowed scratch size per
17930 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17931 // calculation won't overflow, so assume the sign bit is never set.
17932 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17933}
17934
17936 GISelValueTracking &VT, KnownBits &Known,
17937 unsigned Dim) {
17938 unsigned MaxValue =
17939 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17940 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17941}
17942
17944 KnownBits &Known, const APInt &DemandedElts,
17945 unsigned BFEWidth, bool SExt, unsigned Depth) {
17947 const MachineOperand &Src1 = MI.getOperand(2);
17948
17949 unsigned Src1Cst = 0;
17950 if (Src1.isImm()) {
17951 Src1Cst = Src1.getImm();
17952 } else if (Src1.isReg()) {
17953 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17954 if (!Cst)
17955 return;
17956 Src1Cst = Cst->Value.getZExtValue();
17957 } else {
17958 return;
17959 }
17960
17961 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17962 // Width is always [22:16].
17963 const unsigned Offset =
17964 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17965 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
17966
17967 if (Width >= BFEWidth) // Ill-formed.
17968 return;
17969
17970 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
17971 Depth + 1);
17972
17973 Known = Known.extractBits(Width, Offset);
17974
17975 if (SExt)
17976 Known = Known.sext(BFEWidth);
17977 else
17978 Known = Known.zext(BFEWidth);
17979}
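// Illustrative sketch (not from the LLVM sources): a decoding example for the
// S_BFE src1 encoding handled above, with a made-up constant:
//   Src1Cst = 0x00080004  ->  Offset = 4 (bits [4:0]), Width = 8 (bits [22:16])
// The known bits of the source are then extracted at [11:4] and sign- or
// zero-extended back to the full 32/64-bit width.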
17980
17982 GISelValueTracking &VT, Register R, KnownBits &Known,
17983 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
17984 unsigned Depth) const {
17985 Known.resetAll();
17986 const MachineInstr *MI = MRI.getVRegDef(R);
17987 switch (MI->getOpcode()) {
17988 case AMDGPU::S_BFE_I32:
17989 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17990 /*SExt=*/true, Depth);
17991 case AMDGPU::S_BFE_U32:
17992 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17993 /*SExt=*/false, Depth);
17994 case AMDGPU::S_BFE_I64:
17995 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17996 /*SExt=*/true, Depth);
17997 case AMDGPU::S_BFE_U64:
17998 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17999 /*SExt=*/false, Depth);
18000 case AMDGPU::G_INTRINSIC:
18001 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18002 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18003 switch (IID) {
18004 case Intrinsic::amdgcn_workitem_id_x:
18005 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18006 break;
18007 case Intrinsic::amdgcn_workitem_id_y:
18008 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18009 break;
18010 case Intrinsic::amdgcn_workitem_id_z:
18011 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18012 break;
18013 case Intrinsic::amdgcn_mbcnt_lo:
18014 case Intrinsic::amdgcn_mbcnt_hi: {
18015 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18016 // most 31 + src1.
18017 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18018 ? getSubtarget()->getWavefrontSizeLog2()
18019 : 5);
18020 KnownBits Known2;
18021 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18022 Depth + 1);
18023 Known = KnownBits::add(Known, Known2);
18024 break;
18025 }
18026 case Intrinsic::amdgcn_groupstaticsize: {
18027 // We can report everything over the maximum size as 0. We can't report
18028 // based on the actual size because we don't know if it's accurate or not
18029 // at any given point.
18030 Known.Zero.setHighBits(
18031 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18032 break;
18033 }
18034 }
18035 break;
18036 }
18037 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18038 Known.Zero.setHighBits(24);
18039 break;
18040 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18041 Known.Zero.setHighBits(16);
18042 break;
18043 case AMDGPU::G_AMDGPU_SMED3:
18044 case AMDGPU::G_AMDGPU_UMED3: {
18045 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18046
18047 KnownBits Known2;
18048 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18049 if (Known2.isUnknown())
18050 break;
18051
18052 KnownBits Known1;
18053 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18054 if (Known1.isUnknown())
18055 break;
18056
18057 KnownBits Known0;
18058 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18059 if (Known0.isUnknown())
18060 break;
18061
18062 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18063 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18064 Known.One = Known0.One & Known1.One & Known2.One;
18065 break;
18066 }
18067 }
18068}
18069
18072 unsigned Depth) const {
18073 const MachineInstr *MI = MRI.getVRegDef(R);
18074 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18075 // FIXME: Can this move to generic code? What about the case where the call
18076 // site specifies a lower alignment?
18077 Intrinsic::ID IID = GI->getIntrinsicID();
18079 AttributeList Attrs =
18080 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18081 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18082 return *RetAlign;
18083 }
18084 return Align(1);
18085}
18086
18089 const Align CacheLineAlign = Align(64);
18090
18091 // Pre-GFX10 targets did not benefit from loop alignment
18092 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18093 getSubtarget()->hasInstFwdPrefetchBug())
18094 return PrefAlign;
18095
18096 // On GFX10 the I$ consists of 4 cache lines of 64 bytes each.
18097 // By default the prefetcher keeps one cache line behind and reads two ahead.
18098 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18099 // behind and one ahead.
18100 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
18101 // If the loop fits in 64 bytes it always spans no more than two cache lines and
18102 // does not need an alignment.
18103 // If the loop is at most 128 bytes we do not need to modify the prefetch;
18104 // if it is at most 192 bytes we need two lines behind.
18105
18107 const MachineBasicBlock *Header = ML->getHeader();
18108 if (Header->getAlignment() != PrefAlign)
18109 return Header->getAlignment(); // Already processed.
18110
18111 unsigned LoopSize = 0;
18112 for (const MachineBasicBlock *MBB : ML->blocks()) {
18113 // If an inner loop block is aligned, assume on average half of the alignment
18114 // size is added as nops.
18115 if (MBB != Header)
18116 LoopSize += MBB->getAlignment().value() / 2;
18117
18118 for (const MachineInstr &MI : *MBB) {
18119 LoopSize += TII->getInstSizeInBytes(MI);
18120 if (LoopSize > 192)
18121 return PrefAlign;
18122 }
18123 }
18124
18125 if (LoopSize <= 64)
18126 return PrefAlign;
18127
18128 if (LoopSize <= 128)
18129 return CacheLineAlign;
18130
18131 // If any of the parent loops is surrounded by prefetch instructions, do not
18132 // insert new ones for the inner loop, as that would reset the parent's settings.
18133 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18134 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18135 auto I = Exit->getFirstNonDebugInstr();
18136 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18137 return CacheLineAlign;
18138 }
18139 }
18140
18141 MachineBasicBlock *Pre = ML->getLoopPreheader();
18142 MachineBasicBlock *Exit = ML->getExitBlock();
18143
18144 if (Pre && Exit) {
18145 auto PreTerm = Pre->getFirstTerminator();
18146 if (PreTerm == Pre->begin() ||
18147 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18148 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18149 .addImm(1); // prefetch 2 lines behind PC
18150
18151 auto ExitHead = Exit->getFirstNonDebugInstr();
18152 if (ExitHead == Exit->end() ||
18153 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18154 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18155 .addImm(2); // prefetch 1 line behind PC
18156 }
18157
18158 return CacheLineAlign;
18159}
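// Illustrative sketch (not from the LLVM sources): a summary of the size
// thresholds used above, assuming the GFX10 prefetcher model described in the
// comments:
//   LoopSize <= 64   -> keep the preferred alignment, no S_INST_PREFETCH
//   LoopSize <= 128  -> align the loop header to the 64-byte cache line
//   LoopSize <= 192  -> align and bracket the loop with S_INST_PREFETCH
//   LoopSize  > 192  -> keep the preferred alignment (too large to benefit)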
18160
18162static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18163 assert(N->getOpcode() == ISD::CopyFromReg);
18164 do {
18165 // Follow the chain until we find an INLINEASM node.
18166 N = N->getOperand(0).getNode();
18167 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18168 return true;
18169 } while (N->getOpcode() == ISD::CopyFromReg);
18170 return false;
18171}
18172
18175 UniformityInfo *UA) const {
18176 switch (N->getOpcode()) {
18177 case ISD::CopyFromReg: {
18178 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18179 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18180 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18181 Register Reg = R->getReg();
18182
18183 // FIXME: Why does this need to consider isLiveIn?
18184 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18185 return !TRI->isSGPRReg(MRI, Reg);
18186
18187 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18188 return UA->isDivergent(V);
18189
18191 return !TRI->isSGPRReg(MRI, Reg);
18192 }
18193 case ISD::LOAD: {
18194 const LoadSDNode *L = cast<LoadSDNode>(N);
18195 unsigned AS = L->getAddressSpace();
18196 // A flat load may access private memory.
18198 }
18199 case ISD::CALLSEQ_END:
18200 return true;
18202 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18204 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18223 // Target-specific read-modify-write atomics are sources of divergence.
18224 return true;
18225 default:
18226 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18227 // Generic read-modify-write atomics are sources of divergence.
18228 return A->readMem() && A->writeMem();
18229 }
18230 return false;
18231 }
18232}
18233
18235 EVT VT) const {
18236 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18237 case MVT::f32:
18239 case MVT::f64:
18240 case MVT::f16:
18242 default:
18243 return false;
18244 }
18245}
18246
18248 LLT Ty, const MachineFunction &MF) const {
18249 switch (Ty.getScalarSizeInBits()) {
18250 case 32:
18251 return !denormalModeIsFlushAllF32(MF);
18252 case 64:
18253 case 16:
18254 return !denormalModeIsFlushAllF64F16(MF);
18255 default:
18256 return false;
18257 }
18258}
18259
18261 const APInt &DemandedElts,
18262 const SelectionDAG &DAG,
18263 bool SNaN,
18264 unsigned Depth) const {
18265 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18266 const MachineFunction &MF = DAG.getMachineFunction();
18268
18269 if (Info->getMode().DX10Clamp)
18270 return true; // Clamped to 0.
18271 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18272 }
18273
18275 DAG, SNaN, Depth);
18276}
18277
18278// On older subtargets, global FP atomic instructions have a hardcoded FP mode;
18279// they do not support FP32 denormals and only support v2f16/f64 denormals.
18281 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18282 return true;
18283
18285 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18286 if (DenormMode == DenormalMode::getPreserveSign())
18287 return true;
18288
18289 // TODO: Remove this.
18290 return RMW->getFunction()
18291 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18292 .getValueAsBool();
18293}
18294
18296 LLVMContext &Ctx = RMW->getContext();
18297 StringRef MemScope =
18298 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18299
18300 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18301 << "Hardware instruction generated for atomic "
18302 << RMW->getOperationName(RMW->getOperation())
18303 << " operation at memory scope " << MemScope;
18304}
18305
18306static bool isV2F16OrV2BF16(Type *Ty) {
18307 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18308 Type *EltTy = VT->getElementType();
18309 return VT->getNumElements() == 2 &&
18310 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18311 }
18312
18313 return false;
18314}
18315
18316static bool isV2F16(Type *Ty) {
18318 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18319}
18320
18321static bool isV2BF16(Type *Ty) {
18323 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18324}
18325
18326/// \return true if atomicrmw integer ops work for the type.
18327static bool isAtomicRMWLegalIntTy(Type *Ty) {
18328 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18329 unsigned BW = IT->getBitWidth();
18330 return BW == 32 || BW == 64;
18331 }
18332
18333 return false;
18334}
18335
18336/// \return true if this atomicrmw xchg type can be selected.
18337static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18338 Type *Ty = RMW->getType();
18339 if (isAtomicRMWLegalIntTy(Ty))
18340 return true;
18341
18342 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18343 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18344 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18345 return BW == 32 || BW == 64;
18346 }
18347
18348 if (Ty->isFloatTy() || Ty->isDoubleTy())
18349 return true;
18350
18352 return VT->getNumElements() == 2 &&
18353 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18354 }
18355
18356 return false;
18357}
18358
18359/// \returns true if it's valid to emit a native instruction for \p RMW, based
18360/// on the properties of the target memory.
18361static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18362 const AtomicRMWInst *RMW,
18363 bool HasSystemScope) {
18364 // The remote/fine-grained access logic is different from the integer
18365 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18366 // fine-grained access does not work, even for a device local allocation.
18367 //
18368 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18369 // allocations work.
18370 if (HasSystemScope) {
18372 RMW->hasMetadata("amdgpu.no.remote.memory"))
18373 return true;
18374 if (Subtarget.hasEmulatedSystemScopeAtomics())
18375 return true;
18377 return true;
18378
18379 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18380}
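// Illustrative sketch (not from the LLVM sources): the metadata queried above
// is attached to the atomicrmw in IR, e.g. (hypothetical):
//   %old = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent")
//          monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
//   !0 = !{}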
18381
18382/// \return Action to perform on AtomicRMWInsts for integer operations.
18389
18390/// Return if a flat address space atomicrmw can access private memory.
18392 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18393 return !MD ||
18395}
18396
18404
18407 unsigned AS = RMW->getPointerAddressSpace();
18408 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18410
18411 // 64-bit flat atomics that dynamically reside in private memory will silently
18412 // be dropped.
18413 //
18414 // Note that we will emit a new copy of the original atomic in the expansion,
18415 // which will be incrementally relegalized.
18416 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18417 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18418 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18421
18422 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18424 ORE.emit([=]() {
18425 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18426 });
18427 return Kind;
18428 };
18429
18430 auto SSID = RMW->getSyncScopeID();
18431 bool HasSystemScope =
18432 SSID == SyncScope::System ||
18433 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18434
18435 auto Op = RMW->getOperation();
18436 switch (Op) {
18438 // PCIe supports add and xchg for system atomics.
18439 return isAtomicRMWLegalXChgTy(RMW)
18442 case AtomicRMWInst::Add:
18443 // PCIe supports add and xchg for system atomics.
18445 case AtomicRMWInst::Sub:
18446 case AtomicRMWInst::And:
18447 case AtomicRMWInst::Or:
18448 case AtomicRMWInst::Xor:
18449 case AtomicRMWInst::Max:
18450 case AtomicRMWInst::Min:
18457 if (Subtarget->hasEmulatedSystemScopeAtomics())
18459
18460 // On most subtargets, for atomicrmw operations other than add/xchg,
18461 // whether or not the instructions will behave correctly depends on where
18462 // the address physically resides and what interconnect is used in the
18463 // system configuration. On some targets the instruction will be a nop,
18464 // and on others synchronization will only occur at a degraded device scope.
18465 //
18466 // If the allocation is known local to the device, the instructions should
18467 // work correctly.
18468 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18470
18471 // If fine-grained remote memory works at device scope, we don't need to
18472 // do anything.
18473 if (!HasSystemScope &&
18474 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18476
18477 // If we are targeting a remote allocated address, it depends what kind of
18478 // allocation the address belongs to.
18479 //
18480 // If the allocation is fine-grained (in host memory, or in PCIe peer
18481 // device memory), the operation will fail depending on the target.
18482 //
18483 // Note fine-grained host memory access does work on APUs or if XGMI is
18484 // used, but we do not know if we are targeting an APU or the system
18485 // configuration from the ISA version/target-cpu.
18486 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18488
18491 // Atomic sub/or/xor do not work over PCI express, but atomic add
18492 // does. InstCombine transforms these with 0 to or, so undo that.
18493 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18494 ConstVal && ConstVal->isNullValue())
18496 }
18497
18498 // If the allocation could be in remote, fine-grained memory, the rmw
18499 // instructions may fail. cmpxchg should work, so emit that. On some
18500 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18501 // even work, so you're out of luck anyway.
18502
18503 // In summary:
18504 //
18505 // Cases that may fail:
18506 // - fine-grained pinned host memory
18507 // - fine-grained migratable host memory
18508 // - fine-grained PCIe peer device
18509 //
18510 // Cases that should work, but may be treated overly conservatively.
18511 // - fine-grained host memory on an APU
18512 // - fine-grained XGMI peer device
18514 }
18515
18517 }
18518 case AtomicRMWInst::FAdd: {
18519 Type *Ty = RMW->getType();
18520
18521 // TODO: Handle REGION_ADDRESS
18522 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18523 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18524 // is fixed to round-to-nearest-even.
18525 //
18526 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18527 // round-to-nearest-even.
18528 //
18529 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18530 // suggests it is OK if the floating-point mode may not match the calling
18531 // thread.
18532 if (Ty->isFloatTy()) {
18533 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18535 }
18536
18537 if (Ty->isDoubleTy()) {
18538 // Ignores denormal mode, but we don't consider flushing mandatory.
18539 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18541 }
18542
18543 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18545
18547 }
18548
18549 // LDS atomics respect the denormal mode from the mode register.
18550 //
18551 // Traditionally f32 global/buffer memory atomics would unconditionally
18552 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18553 // flush.
18554 //
18555 // On targets with flat atomic fadd, denormals would flush depending on
18556 // whether the target address resides in LDS or global memory. We consider
18557 // this flat-maybe-flush as will-flush.
18558 if (Ty->isFloatTy() &&
18559 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18562
18563 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18564 // safe. The message phrasing also should be better.
18565 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18566 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18567 // gfx942, gfx12
18568 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18569 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18570 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18571 // gfx90a, gfx942, gfx12
18572 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18573 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18574
18575 // gfx942, gfx12
18576 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18577 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18578 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18579 // gfx90a, gfx942, gfx12
18580 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18581 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18582
18583 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18584 // buffer. gfx12 does have the buffer version.
18585 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18586 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18587 }
18588
18589 // global and flat atomic fadd f64: gfx90a, gfx942.
18590 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18591 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18592
18593 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18594 if (Ty->isFloatTy()) {
18595 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18596 // gfx11+.
18597 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18598 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18599 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18600 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18601 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18602 } else {
18603 // gfx908
18604 if (RMW->use_empty() &&
18605 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18606 isV2F16(Ty))
18607 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18608 }
18609 }
18610
18611 // flat atomic fadd f32: gfx942, gfx11+.
18612 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18613 if (Subtarget->hasFlatAtomicFaddF32Inst())
18614 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18615
18616 // If the address is in the flat address space and the type is float, we
18617 // try to expand it if the target supports both global and LDS atomic
18618 // fadd. This is required because the expansion emits an address-space
18619 // check: if the address is in the global address space we emit the
18620 // global atomic fadd, and if it is in the shared address space we emit
18621 // the LDS atomic fadd.
18622 if (Subtarget->hasLDSFPAtomicAddF32()) {
18623 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18625 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18627 }
18628 }
18629 }
18630
18632 }
18634 case AtomicRMWInst::FMax: {
18635 Type *Ty = RMW->getType();
18636
18637 // LDS float and double fmin/fmax were always supported.
18638 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18639 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18641 }
18642
18643 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18644 // For flat and global cases:
18645 // float, double in gfx7. Manual claims denormal support.
18646 // Removed in gfx8.
18647 // float, double restored in gfx10.
18648 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18649 //
18650 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18651 // no f32.
18652 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18653 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18654 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18655 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18656 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18657 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18659 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18660 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18661 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18662 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18663 }
18664 }
18665
18667 }
18670 default:
18672 }
18673
18674 llvm_unreachable("covered atomicrmw op switch");
18675}
18676
18683
18690
18693 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18694 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18696
18697 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18699
18700 const DataLayout &DL = CmpX->getDataLayout();
18701
18702 Type *ValTy = CmpX->getNewValOperand()->getType();
18703
18704 // If a 64-bit flat atomic may alias private, we need to avoid using the
18705 // atomic in the private case.
18706 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18708}
18709
18710const TargetRegisterClass *
18711SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18713 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18714 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18715 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18716 : &AMDGPU::SReg_32RegClass;
18717 if (!TRI->isSGPRClass(RC) && !isDivergent)
18718 return TRI->getEquivalentSGPRClass(RC);
18719 if (TRI->isSGPRClass(RC) && isDivergent)
18720 return TRI->getEquivalentVGPRClass(RC);
18721
18722 return RC;
18723}
18724
18725// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18726// uniform values (as produced by the mask results of control flow intrinsics)
18727// used outside of divergent blocks. The phi users need to also be treated as
18728// always uniform.
18729//
18730// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18731static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18732 unsigned WaveSize) {
18733 // FIXME: We assume we never cast the mask results of a control flow
18734 // intrinsic.
18735 // Early exit if the type won't be consistent as a compile time hack.
18736 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18737 if (!IT || IT->getBitWidth() != WaveSize)
18738 return false;
18739
18740 if (!isa<Instruction>(V))
18741 return false;
18742 if (!Visited.insert(V).second)
18743 return false;
18744 bool Result = false;
18745 for (const auto *U : V->users()) {
18747 if (V == U->getOperand(1)) {
18748 switch (Intrinsic->getIntrinsicID()) {
18749 default:
18750 Result = false;
18751 break;
18752 case Intrinsic::amdgcn_if_break:
18753 case Intrinsic::amdgcn_if:
18754 case Intrinsic::amdgcn_else:
18755 Result = true;
18756 break;
18757 }
18758 }
18759 if (V == U->getOperand(0)) {
18760 switch (Intrinsic->getIntrinsicID()) {
18761 default:
18762 Result = false;
18763 break;
18764 case Intrinsic::amdgcn_end_cf:
18765 case Intrinsic::amdgcn_loop:
18766 Result = true;
18767 break;
18768 }
18769 }
18770 } else {
18771 Result = hasCFUser(U, Visited, WaveSize);
18772 }
18773 if (Result)
18774 break;
18775 }
18776 return Result;
18777}
18778
18780 const Value *V) const {
18781 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18782 if (CI->isInlineAsm()) {
18783 // FIXME: This cannot give a correct answer. This should only trigger in
18784 // the case where inline asm returns mixed SGPR and VGPR results, used
18785 // outside the defining block. We don't have a specific result to
18786 // consider, so this assumes if any value is SGPR, the overall register
18787 // also needs to be SGPR.
18788 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18790 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18791 for (auto &TC : TargetConstraints) {
18792 if (TC.Type == InlineAsm::isOutput) {
18794 const TargetRegisterClass *RC =
18795 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18796 TC.ConstraintVT)
18797 .second;
18798 if (RC && SIRI->isSGPRClass(RC))
18799 return true;
18800 }
18801 }
18802 }
18803 }
18805 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18806}
18807
18809 for (SDUse &Use : N->uses()) {
18811 if (getBasePtrIndex(M) == Use.getOperandNo())
18812 return true;
18813 }
18814 }
18815 return false;
18816}
18817
18819 SDValue N1) const {
18820 if (!N0.hasOneUse())
18821 return false;
18822 // Prefer to keep N0 uniform when we have the opportunity.
18823 if (N0->isDivergent() || !N1->isDivergent())
18824 return true;
18825 // Check if we have a good chance to form the memory access pattern with the
18826 // base and offset
18827 return (DAG.isBaseWithConstantOffset(N0) &&
18829}
18830
18832 Register N0, Register N1) const {
18833 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18834}
18835
18838 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18840 if (I.getMetadata("amdgpu.noclobber"))
18841 Flags |= MONoClobber;
18842 if (I.getMetadata("amdgpu.last.use"))
18843 Flags |= MOLastUse;
18844 return Flags;
18845}
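// Illustrative note (not from the LLVM sources): the metadata checked above is
// placed on the IR load by AMDGPUAnnotateUniformValues, e.g. (hypothetical):
//   %v = load i32, ptr addrspace(1) %p, !amdgpu.noclobber !0
//   !0 = !{}
// and is translated here into the MONoClobber machine-memory-operand flag.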
18846
18848 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18849 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18850 if (User->getOpcode() != ISD::CopyToReg)
18851 return false;
18852 if (!Def->isMachineOpcode())
18853 return false;
18855 if (!MDef)
18856 return false;
18857
18858 unsigned ResNo = User->getOperand(Op).getResNo();
18859 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18860 return false;
18861 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18862 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18863 PhysReg = AMDGPU::SCC;
18864 const TargetRegisterClass *RC =
18865 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18866 Cost = RC->getCopyCost();
18867 return true;
18868 }
18869 return false;
18870}
18871
18873 Instruction *AI) const {
18874 // Given: atomicrmw fadd ptr %addr, float %val ordering
18875 //
18876 // With this expansion we produce the following code:
18877 // [...]
18878 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18879 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18880 //
18881 // atomicrmw.shared:
18882 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18883 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18884 // float %val ordering
18885 // br label %atomicrmw.phi
18886 //
18887 // atomicrmw.check.private:
18888 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18889 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18890 //
18891 // atomicrmw.private:
18892 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18893 // %loaded.private = load float, ptr addrspace(5) %cast.private
18894 // %val.new = fadd float %loaded.private, %val
18895 // store float %val.new, ptr addrspace(5) %cast.private
18896 // br label %atomicrmw.phi
18897 //
18898 // atomicrmw.global:
18899 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18900 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18901 // float %val ordering
18902 // br label %atomicrmw.phi
18903 //
18904 // atomicrmw.phi:
18905 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18906 // [ %loaded.private, %atomicrmw.private ],
18907 // [ %loaded.global, %atomicrmw.global ]
18908 // br label %atomicrmw.end
18909 //
18910 // atomicrmw.end:
18911 // [...]
18912 //
18913 //
18914 // For 64-bit atomics which may reside in private memory, we perform a simpler
18915 // version that only inserts the private check, and uses the flat operation.
18916
18917 IRBuilder<> Builder(AI);
18918 LLVMContext &Ctx = Builder.getContext();
18919
18920 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18921 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18923 Value *Addr = AI->getOperand(PtrOpIdx);
18924
18925 /// TODO: Only need to check private, then emit flat-known-not private (no
18926 /// need for shared block, or cast to global).
18928
18929 Align Alignment;
18930 if (RMW)
18931 Alignment = RMW->getAlign();
18932 else if (CX)
18933 Alignment = CX->getAlign();
18934 else
18935 llvm_unreachable("unhandled atomic operation");
18936
18937 // FullFlatEmulation is true if we need to issue the private, shared, and
18938 // global cases.
18939 //
18940 // If this is false, we are only dealing with the flat-targeting-private case,
18941 // where we only insert a check for private and still use the flat instruction
18942 // for global and shared.
18943
18944 bool FullFlatEmulation =
18945 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18946 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18947 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18948 RMW->getType()->isDoubleTy()));
18949
18950 // If the return value isn't used, do not introduce a false use in the phi.
18951 bool ReturnValueIsUsed = !AI->use_empty();
18952
18953 BasicBlock *BB = Builder.GetInsertBlock();
18954 Function *F = BB->getParent();
18955 BasicBlock *ExitBB =
18956 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18957 BasicBlock *SharedBB = nullptr;
18958
18959 BasicBlock *CheckPrivateBB = BB;
18960 if (FullFlatEmulation) {
18961 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18962 CheckPrivateBB =
18963 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18964 }
18965
18966 BasicBlock *PrivateBB =
18967 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
18968 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
18969 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
18970
18971 std::prev(BB->end())->eraseFromParent();
18972 Builder.SetInsertPoint(BB);
18973
18974 Value *LoadedShared = nullptr;
18975 if (FullFlatEmulation) {
18976 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18977 {Addr}, nullptr, "is.shared");
18978 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18979 Builder.SetInsertPoint(SharedBB);
18980 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18982
18983 Instruction *Clone = AI->clone();
18984 Clone->insertInto(SharedBB, SharedBB->end());
18985 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
18986 LoadedShared = Clone;
18987
18988 Builder.CreateBr(PhiBB);
18989 Builder.SetInsertPoint(CheckPrivateBB);
18990 }
18991
18992 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
18993 {Addr}, nullptr, "is.private");
18994 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
18995
18996 Builder.SetInsertPoint(PrivateBB);
18997
18998 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19000
19001 Value *LoadedPrivate;
19002 if (RMW) {
19003 LoadedPrivate = Builder.CreateAlignedLoad(
19004 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19005
19006 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19007 LoadedPrivate, RMW->getValOperand());
19008
19009 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19010 } else {
19011 auto [ResultLoad, Equal] =
19012 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19013 CX->getNewValOperand(), CX->getAlign());
19014
19015 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19016 ResultLoad, 0);
19017 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19018 }
19019
19020 Builder.CreateBr(PhiBB);
19021
19022 Builder.SetInsertPoint(GlobalBB);
19023
19024 // Continue using a flat instruction if we only emitted the check for private.
19025 Instruction *LoadedGlobal = AI;
19026 if (FullFlatEmulation) {
19027 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19029 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19030 }
19031
19032 AI->removeFromParent();
19033 AI->insertInto(GlobalBB, GlobalBB->end());
19034
19035 // The new atomicrmw may go through another round of legalization later.
19036 if (!FullFlatEmulation) {
19037 // We inserted the runtime check already, make sure we do not try to
19038 // re-expand this.
19039 // TODO: Should union with any existing metadata.
19040 MDBuilder MDB(F->getContext());
19041 MDNode *RangeNotPrivate =
19044 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19045 RangeNotPrivate);
19046 }
19047
19048 Builder.CreateBr(PhiBB);
19049
19050 Builder.SetInsertPoint(PhiBB);
19051
19052 if (ReturnValueIsUsed) {
19053 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19054 AI->replaceAllUsesWith(Loaded);
19055 if (FullFlatEmulation)
19056 Loaded->addIncoming(LoadedShared, SharedBB);
19057 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19058 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19059 Loaded->takeName(AI);
19060 }
19061
19062 Builder.CreateBr(ExitBB);
19063}
19064
19066 unsigned PtrOpIdx) {
19067 Value *PtrOp = I->getOperand(PtrOpIdx);
19070
19071 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19072 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19073 I->getIterator());
19074 I->setOperand(PtrOpIdx, ASCast);
19075}
19076
19079
19082
19085 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19086 ConstVal && ConstVal->isNullValue()) {
19087 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19089
19090 // We may still need the private-alias-flat handling below.
19091
19092 // TODO: Skip this for cases where we cannot access remote memory.
19093 }
19094 }
19095
19096 // The non-flat expansions should only perform the de-canonicalization of
19097 // identity values.
19099 return;
19100
19102}
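// Illustrative note (not from the LLVM sources): the de-canonicalization above
// rewrites, for example:
//   %old = atomicrmw or  ptr %p, i32 0 seq_cst
// back into
//   %old = atomicrmw add ptr %p, i32 0 seq_cst
// because PCIe supports system-scope atomic add (but not or), and adding zero
// is an equivalent read-modify-write no-op.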
19103
19110
19114
19116 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19117}
19118
19120 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19121 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19122
19124 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19125}
19126
19127LoadInst *
19129 IRBuilder<> Builder(AI);
19130 auto Order = AI->getOrdering();
19131
19132 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19133 // cache must be flushed if the atomic ordering had release semantics. This
19134 // does not necessarily require a fence; a release fence simply happens to
19135 // perform that flush. Avoid replacing an atomicrmw that has release semantics.
19136 if (isReleaseOrStronger(Order))
19137 return nullptr;
19138
19139 LoadInst *LI = Builder.CreateAlignedLoad(
19140 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19141 LI->setAtomic(Order, AI->getSyncScopeID());
19142 LI->copyMetadata(*AI);
19143 LI->takeName(AI);
19144 AI->replaceAllUsesWith(LI);
19145 AI->eraseFromParent();
19146 return LI;
19147}
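// Illustrative note (not from the LLVM sources): the rewrite above turns an
// idempotent RMW such as
//   %old = atomicrmw or ptr %p, i32 0 acquire
// into
//   %old = load atomic i32, ptr %p acquire, align 4
// but it is skipped for release-or-stronger orderings, since dropping the
// store side would also drop the cache flush that the release implies.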
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1254
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1251
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
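For orientation, a minimal sketch (not taken from this file) of how the APFloat factories and queries listed above are typically combined; the choice of fp32/fp16 semantics and of the largest-value conversion is purely illustrative:

#include "llvm/ADT/APFloat.h"
using namespace llvm;

// Build a few fp32 special values, inspect them, and narrow one to fp16.
static void apFloatSketch() {
  const fltSemantics &Sem = APFloat::IEEEsingle();
  APFloat NegInf = APFloat::getInf(Sem, /*Negative=*/true);
  APFloat Zero = APFloat::getZero(Sem);
  bool IsInf = NegInf.isInfinity();          // true
  bool IsNeg = NegInf.isNegative();          // true
  APInt Bits = Zero.bitcastToAPInt();        // all-zero 32-bit pattern
  bool LosesInfo = false;
  APFloat Big = APFloat::getLargest(Sem);
  Big.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
  (void)IsInf; (void)IsNeg; (void)Bits; (void)LosesInfo;
}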
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
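A similarly hedged sketch of the APInt bit-manipulation helpers above; the widths and bit positions are arbitrary examples:

#include "llvm/ADT/APInt.h"
using namespace llvm;

static void apIntSketch() {
  APInt Mask = APInt::getBitsSet(32, 8, 16);      // bits [8, 16) => 0x0000FF00
  unsigned TZ = Mask.countr_zero();               // 8
  APInt Hi = APInt::getHighBitsSet(32, 4);        // 0xF0000000
  APInt V(32, 0);
  V.setBitsFrom(31);                              // only the top bit set
  bool IsSign = V.isSignMask();                   // true
  (void)TZ; (void)Hi; (void)IsSign;
}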
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
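As an illustration of how an IR-level expansion hook can consult the AtomicRMWInst accessors above, here is a hedged sketch; the predicate, the address-space constant, and the chosen ordering are made up for the example and do not describe the SI policy:

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Illustrative classification of an atomicrmw by the properties the
// expansion hooks typically inspect.
static bool isPlainSystemScopeFAdd(const AtomicRMWInst *RMW) {
  if (RMW->getOperation() != AtomicRMWInst::FAdd)
    return false;
  unsigned AS = RMW->getPointerAddressSpace();   // address space of the pointer
  SyncScope::ID SSID = RMW->getSyncScopeID();    // e.g. SyncScope::System
  return AS == 1 /* placeholder address space */ &&
         SSID == SyncScope::System &&
         RMW->getOrdering() == AtomicOrdering::Monotonic;
}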
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
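A hedged sketch of the two ByteProvider factories above, assuming the ByteProvider<SDValue> instantiation used by the dot-product combines in this file; the offsets are arbitrary:

#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Describe byte 0 of a value Op, plus a byte known to be zero.
static void byteProviderSketch(SDValue Op) {
  auto FromOp = ByteProvider<SDValue>::getSrc(Op, /*ByteOffset=*/0,
                                              /*VectorOffset=*/0);
  auto Zero = ByteProvider<SDValue>::getConstantZero();
  bool HasSource = FromOp.Src.has_value();   // true; Zero.Src is std::nullopt
  (void)HasSource; (void)Zero;
}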
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
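A minimal sketch, under assumed inputs, of how the CCState allocation helpers above are typically driven when handing out argument registers; the candidate array is a placeholder, not the SI allocation order:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

// Reserve the first still-free register from a candidate list.
static MCRegister takeFirstFree(CCState &CCInfo,
                                ArrayRef<MCPhysReg> Candidates) {
  unsigned Idx = CCInfo.getFirstUnallocated(Candidates);
  if (Idx == Candidates.size())
    return MCRegister();                        // all candidates already taken
  return CCInfo.AllocateReg(Candidates[Idx]);   // marks it (and aliases) allocated
}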
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:208
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
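A small sketch of the DataLayout queries above; Ty stands for whatever IR type is being lowered:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static void layoutSketch(const DataLayout &DL, Type *Ty) {
  Align ABIAlign = DL.getABITypeAlign(Ty);   // minimum ABI-required alignment
  TypeSize Size = DL.getTypeAllocSize(Ty);   // allocation size in bytes, incl. padding
  bool BigEndian = DL.isBigEndian();
  (void)ABIAlign; (void)Size; (void)BigEndian;
}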
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value corresponding to the live-in virtual register.
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
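For the GlobalISel-facing hooks, a minimal sketch of the LLT constructors and queries listed above; the address space and widths are arbitrary examples:

#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

static void lltSketch() {
  LLT S32 = LLT::scalar(32);                        // plain 32-bit scalar
  LLT P1 = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);
  unsigned ScalarBits = S32.getScalarSizeInBits();  // 32
  TypeSize PtrBits = P1.getSizeInBits();            // 64
  (void)ScalarBits; (void)PtrBits;
}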
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
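A hedged sketch of pairing MDBuilder::createRange with Instruction::setMetadata (listed earlier in this index) to attach !range metadata; the [0, 1024) bounds are arbitrary for the example:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

// Attach !range [0, 1024) to an instruction I that produces an i32.
static void attachRangeSketch(Instruction *I) {
  MDBuilder MDB(I->getContext());
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
  I->setMetadata(LLVMContext::MD_range, Range);
}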
Metadata node.
Definition Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1441
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
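A short sketch of the MVT queries above; v4i16 is just an example type:

#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

static void mvtSketch() {
  MVT V4I16 = MVT::getVectorVT(MVT::i16, 4);
  MVT Scalar = V4I16.getScalarType();     // MVT::i16
  bool Fits = Scalar.bitsLE(MVT::i32);    // true
  TypeSize Bits = V4I16.getSizeInBits();  // 64 bits
  TypeSize Bytes = V4I16.getStoreSize();  // 8 bytes
  (void)Scalar; (void)Fits; (void)Bits; (void)Bytes;
}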
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
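A minimal sketch of the BuildMI/MachineInstrBuilder chaining used by custom inserters; TargetOpc, DstReg and SrcReg are placeholders rather than SI opcodes or registers:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

static void buildMISketch(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator I, const DebugLoc &DL,
                          const TargetInstrInfo *TII, unsigned TargetOpc,
                          Register DstReg, Register SrcReg) {
  // Emit "DstReg = TargetOpc SrcReg, 0" before iterator I.
  BuildMI(MBB, I, DL, TII->get(TargetOpc), DstReg).addReg(SrcReg).addImm(0);
}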
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
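A hedged sketch of building a MachineMemOperand with the flags above, using the MachineFunction::getMachineMemOperand overload listed earlier; the 32-bit size and 4-byte alignment are assumptions for the example:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// MMO for an invariant, dereferenceable 32-bit load at an already-described address.
static MachineMemOperand *makeLoadMMO(MachineFunction &MF,
                                      MachinePointerInfo PtrInfo) {
  auto Flags = MachineMemOperand::MOLoad |
               MachineMemOperand::MODereferenceable |
               MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, Flags, LLT::scalar(32), Align(4));
}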
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
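A two-line sketch of the Register helpers above:

#include "llvm/CodeGen/Register.h"
using namespace llvm;

static void registerSketch() {
  Register V0 = Register::index2VirtReg(0);  // first virtual register number
  bool IsPhys = V0.isPhysical();             // false: it is virtual
  (void)IsPhys;
}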
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
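As an illustration of the SDValue/SDNode accessors above, a hedged sketch of the kind of structural check a DAG combine performs; the (add x, x) pattern is chosen only for the example:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Match (add x, x) whose result has exactly one user.
static bool isSingleUseSelfAdd(SDValue V) {
  if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
    return false;
  return V.getOperand(0) == V.getOperand(1);
}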
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g. INSERT_SUBREG) that are not handled by tablegen'd patterns.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
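A small illustrative sketch, assuming DAG and DL are in scope: materializing the constant vector <2 x i32> <1, 2> from scalar constants.

  // Hypothetical fragment building a fixed two-element vector.
  SDValue Elts[] = {DAG.getConstant(1, DL, MVT::i32),
                    DAG.getConstant(2, DL, MVT::i32)};
  SDValue Vec = DAG.getBuildVector(MVT::v2i32, DL, Elts);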
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
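For illustration only, assuming DAG, DL, a chain Chain, an i32 value Val, and a pointer Ptr are in scope: emitting a 4-byte-aligned store and capturing the resulting chain.

  // Hypothetical fragment; the returned value is the new chain token.
  SDValue StoreChain =
      DAG.getStore(Chain, DL, Val, Ptr, MachinePointerInfo(), Align(4));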
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...

LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
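An illustrative sketch, assuming DAG and DL are in scope: splatting a single f32 constant across all four lanes of a v4f32.

  // Hypothetical fragment producing <4 x float> <0.5, 0.5, 0.5, 0.5>.
  SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f32);
  SDValue Splat = DAG.getSplatBuildVector(MVT::v4f32, DL, Half);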
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
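A sketch of a typical use, assuming DAG and an i32 SDValue N are in scope: proving the high half of N is zero before a combine narrows it.

  // Hypothetical fragment: ask whether bits [16, 31] of N are known zero.
  APInt HighHalf = APInt::getHighBitsSet(32, 16);
  if (DAG.MaskedValueIsZero(N, HighHalf)) {
    // N behaves like a zero-extended i16 here; a narrower op would be safe.
  }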
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
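A minimal sketch, assuming DAG, DL, and an i64 value Wide are in scope: splitting a 64-bit scalar into its 32-bit halves before a word-at-a-time expansion.

  // Hypothetical fragment; Lo holds bits [0, 31], Hi holds bits [32, 63].
  auto [Lo, Hi] = DAG.SplitScalar(Wide, DL, MVT::i32, MVT::i32);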
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
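A self-contained illustration of StringSwitch (the helper name and the mapping are made up for this sketch): mapping a textual mode name to a small integer with a default for unrecognized input.

  #include "llvm/ADT/StringSwitch.h"

  // Hypothetical helper, not taken from this file.
  static int parseRoundModeName(llvm::StringRef Name) {
    return llvm::StringSwitch<int>(Name)
        .Case("rtz", 0)  // round toward zero
        .Case("rte", 1)  // round to nearest even
        .Case("rtp", 2)  // round toward positive infinity
        .Default(-1);    // unrecognized
  }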
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
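For illustration, a sketch of how a TargetLowering subclass constructor (such as the one this file defines) typically uses setOperationAction; the particular opcode/type pairs here are only examples.

  // Hypothetical fragment from a target's *TargetLowering constructor.
  setOperationAction(ISD::SDIV, MVT::i64, Expand); // legalizer must expand it
  setOperationAction(ISD::FDIV, MVT::f32, Custom); // route to LowerOperation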
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target and, if not, what action should be used to make them valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op: at this point, only the DemandedBits bits of Op's result are used downstream, so attempt to simplify Op using that information.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:130
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values intended for floating-point operands.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
The same overflow-aware node as [SU]ADDO, but for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
The same overflow-aware node as [SU]ADDO, but for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
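A self-contained illustration of the isInt<N> template: checking whether an immediate fits a signed 16-bit field, the kind of test offset-folding code performs.

  #include "llvm/Support/MathExtras.h"

  static_assert(llvm::isInt<16>(0x7fff), "32767 fits in a signed 16-bit field");
  static_assert(!llvm::isInt<16>(0x8000), "32768 does not fit");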
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:557
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
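An illustrative helper (hypothetical, not from this file) combining isShiftedMask_64 with countr_zero and popcount to recover the offset/width pair that a bitfield-extract style pattern would need.

  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"

  // Returns true and fills Offset/Width when Mask is one contiguous run of 1s.
  static bool decodeShiftedMask(uint64_t Mask, unsigned &Offset, unsigned &Width) {
    if (!llvm::isShiftedMask_64(Mask))
      return false;
    Offset = llvm::countr_zero(Mask); // position of the lowest set bit
    Width = llvm::popcount(Mask);     // number of contiguous ones
    return true;
  }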
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:222
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
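A self-contained sketch of Lo_32/Hi_32 (the helper name is made up): splitting a 64-bit immediate into the two 32-bit halves that a pair of 32-bit moves would materialize.

  #include <cstdint>
  #include <utility>
  #include "llvm/Support/MathExtras.h"

  // Hypothetical helper: {low half, high half} of a 64-bit immediate.
  static std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
    return {llvm::Lo_32(Imm), llvm::Hi_32(Imm)};
  }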
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
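A self-contained sketch contrasting alignTo with alignDown (values chosen only for illustration): rounding a byte count up or down to an 8-byte boundary.

  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"

  static void alignmentDemo() {
    uint64_t Up = llvm::alignTo(10, llvm::Align(8)); // 16
    uint64_t Down = llvm::alignDown(10, 8);          // 8
    (void)Up; (void)Down;
  }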
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs